gpu_info_nvcuda.c 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243
  1. #ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
  2. #include <string.h>
  3. #include "gpu_info_nvcuda.h"
  4. void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
  5. CUresult ret;
  6. resp->err = NULL;
  7. resp->num_devices = 0;
  8. resp->cudaErr = CUDA_SUCCESS;
  9. const int buflen = 256;
  10. char buf[buflen + 1];
  11. int i;
  12. struct lookup {
  13. char *s;
  14. void **p;
  15. } l[] = {
  16. {"cuInit", (void *)&resp->ch.cuInit},
  17. {"cuDriverGetVersion", (void *)&resp->ch.cuDriverGetVersion},
  18. {"cuDeviceGetCount", (void *)&resp->ch.cuDeviceGetCount},
  19. {"cuDeviceGet", (void *)&resp->ch.cuDeviceGet},
  20. {"cuDeviceGetAttribute", (void *)&resp->ch.cuDeviceGetAttribute},
  21. {"cuDeviceGetUuid", (void *)&resp->ch.cuDeviceGetUuid},
  22. {"cuDeviceGetName", (void *)&resp->ch.cuDeviceGetName},
  23. {"cuCtxCreate_v3", (void *)&resp->ch.cuCtxCreate_v3},
  24. {"cuMemGetInfo_v2", (void *)&resp->ch.cuMemGetInfo_v2},
  25. {"cuCtxDestroy", (void *)&resp->ch.cuCtxDestroy},
  26. {NULL, NULL},
  27. };
  28. resp->ch.handle = LOAD_LIBRARY(nvcuda_lib_path, RTLD_LAZY);
  29. if (!resp->ch.handle) {
  30. char *msg = LOAD_ERR();
  31. LOG(resp->ch.verbose, "library %s load err: %s\n", nvcuda_lib_path, msg);
  32. snprintf(buf, buflen,
  33. "Unable to load %s library to query for Nvidia GPUs: %s",
  34. nvcuda_lib_path, msg);
  35. free(msg);
  36. resp->err = strdup(buf);
  37. resp->cudaErr = -1;
  38. return;
  39. }
  40. for (i = 0; l[i].s != NULL; i++) {
  41. *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
  42. if (!*(l[i].p)) {
  43. char *msg = LOAD_ERR();
  44. LOG(resp->ch.verbose, "dlerr: %s\n", msg);
  45. UNLOAD_LIBRARY(resp->ch.handle);
  46. resp->ch.handle = NULL;
  47. snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
  48. msg);
  49. free(msg);
  50. resp->err = strdup(buf);
  51. resp->cudaErr = -1;
  52. return;
  53. }
  54. }
  55. ret = (*resp->ch.cuInit)(0);
  56. if (ret != CUDA_SUCCESS) {
  57. LOG(resp->ch.verbose, "cuInit err: %d\n", ret);
  58. UNLOAD_LIBRARY(resp->ch.handle);
  59. resp->ch.handle = NULL;
  60. snprintf(buf, buflen, "cuda driver library init failure: %d", ret);
  61. resp->err = strdup(buf);
  62. resp->cudaErr = ret;
  63. return;
  64. }
  65. int version = 0;
  66. resp->ch.driver_major = 0;
  67. resp->ch.driver_minor = 0;
  68. // Report driver version if we're in verbose mode, ignore errors
  69. ret = (*resp->ch.cuDriverGetVersion)(&version);
  70. if (ret != CUDA_SUCCESS) {
  71. LOG(resp->ch.verbose, "cuDriverGetVersion failed: %d\n", ret);
  72. } else {
  73. resp->ch.driver_major = version / 1000;
  74. resp->ch.driver_minor = (version - (resp->ch.driver_major * 1000)) / 10;
  75. LOG(resp->ch.verbose, "CUDA driver version: %d.%d\n", resp->ch.driver_major, resp->ch.driver_minor);
  76. }
  77. ret = (*resp->ch.cuDeviceGetCount)(&resp->num_devices);
  78. if (ret != CUDA_SUCCESS) {
  79. LOG(resp->ch.verbose, "cuDeviceGetCount err: %d\n", ret);
  80. UNLOAD_LIBRARY(resp->ch.handle);
  81. resp->ch.handle = NULL;
  82. snprintf(buf, buflen, "unable to get device count: %d", ret);
  83. resp->err = strdup(buf);
  84. resp->cudaErr = ret;
  85. return;
  86. }
  87. }
  // Size limit passed to snprintf for the error-message scratch buffer in
  // nvcuda_bootstrap; the buffer itself is declared with buflen + 1 bytes.
  // NOTE(review): a `const int` is not an integer constant expression in C, so
  // arrays sized from it are variable-length arrays.
  const int buflen = 256;
  89. void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) {
  90. resp->err = NULL;
  91. nvcudaMemory_t memInfo = {0,0};
  92. CUresult ret;
  93. CUdevice device = -1;
  94. CUcontext ctx = NULL;
  95. char buf[buflen + 1];
  96. CUuuid uuid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
  97. if (h.handle == NULL) {
  98. resp->err = strdup("cuda driver library handle isn't initialized");
  99. return;
  100. }
  101. ret = (*h.cuDeviceGet)(&device, i);
  102. if (ret != CUDA_SUCCESS) {
  103. snprintf(buf, buflen, "cuda driver library device failed to initialize");
  104. resp->err = strdup(buf);
  105. return;
  106. }
  107. int major = 0;
  108. int minor = 0;
  109. ret = (*h.cuDeviceGetAttribute)(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
  110. if (ret != CUDA_SUCCESS) {
  111. LOG(h.verbose, "[%d] device major lookup failure: %d\n", i, ret);
  112. } else {
  113. ret = (*h.cuDeviceGetAttribute)(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
  114. if (ret != CUDA_SUCCESS) {
  115. LOG(h.verbose, "[%d] device minor lookup failure: %d\n", i, ret);
  116. } else {
  117. resp->minor = minor;
  118. resp->major = major;
  119. }
  120. }
  121. ret = (*h.cuDeviceGetUuid)(&uuid, device);
  122. if (ret != CUDA_SUCCESS) {
  123. LOG(h.verbose, "[%d] device uuid lookup failure: %d\n", i, ret);
  124. snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", i);
  125. } else {
  126. // GPU-d110a105-ac29-1d54-7b49-9c90440f215b
  127. snprintf(&resp->gpu_id[0], GPU_ID_LEN,
  128. "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
  129. uuid.bytes[0],
  130. uuid.bytes[1],
  131. uuid.bytes[2],
  132. uuid.bytes[3],
  133. uuid.bytes[4],
  134. uuid.bytes[5],
  135. uuid.bytes[6],
  136. uuid.bytes[7],
  137. uuid.bytes[8],
  138. uuid.bytes[9],
  139. uuid.bytes[10],
  140. uuid.bytes[11],
  141. uuid.bytes[12],
  142. uuid.bytes[13],
  143. uuid.bytes[14],
  144. uuid.bytes[15]
  145. );
  146. }
  147. ret = (*h.cuDeviceGetName)(&resp->gpu_name[0], GPU_NAME_LEN, device);
  148. if (ret != CUDA_SUCCESS) {
  149. LOG(h.verbose, "[%d] device name lookup failure: %d\n", i, ret);
  150. resp->gpu_name[0] = '\0';
  151. }
  152. // To get memory we have to set (and release) a context
  153. ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
  154. if (ret != CUDA_SUCCESS) {
  155. snprintf(buf, buflen, "cuda driver library failed to get device context %d", ret);
  156. resp->err = strdup(buf);
  157. return;
  158. }
  159. ret = (*h.cuMemGetInfo_v2)(&memInfo.free, &memInfo.total);
  160. if (ret != CUDA_SUCCESS) {
  161. snprintf(buf, buflen, "cuda driver library device memory info lookup failure %d", ret);
  162. resp->err = strdup(buf);
  163. // Best effort on failure...
  164. (*h.cuCtxDestroy)(ctx);
  165. return;
  166. }
  167. resp->total = memInfo.total;
  168. resp->free = memInfo.free;
  169. LOG(h.verbose, "[%s] CUDA totalMem %lu mb\n", resp->gpu_id, resp->total / 1024 / 1024);
  170. LOG(h.verbose, "[%s] CUDA freeMem %lu mb\n", resp->gpu_id, resp->free / 1024 / 1024);
  171. LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);
  172. ret = (*h.cuCtxDestroy)(ctx);
  173. if (ret != CUDA_SUCCESS) {
  174. LOG(1, "cuda driver library failed to release device context %d", ret);
  175. }
  176. }
  177. void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free, uint64_t *total) {
  178. CUresult ret;
  179. CUcontext ctx = NULL;
  180. CUdevice device = -1;
  181. *free = 0;
  182. *total = 0;
  183. ret = (*h.cuDeviceGet)(&device, i);
  184. if (ret != CUDA_SUCCESS) {
  185. LOG(1, "cuda driver library device failed to initialize");
  186. return;
  187. }
  188. // To get memory we have to set (and release) a context
  189. ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
  190. if (ret != CUDA_SUCCESS) {
  191. LOG(1, "cuda driver library failed to get device context %d", ret);
  192. return;
  193. }
  194. ret = (*h.cuMemGetInfo_v2)(free, total);
  195. if (ret != CUDA_SUCCESS) {
  196. LOG(1, "cuda driver library device memory info lookup failure %d", ret);
  197. // Best effort on failure...
  198. (*h.cuCtxDestroy)(ctx);
  199. return;
  200. }
  201. ret = (*h.cuCtxDestroy)(ctx);
  202. if (ret != CUDA_SUCCESS) {
  203. LOG(1, "cuda driver library failed to release device context %d", ret);
  204. }
  205. }
  206. void nvcuda_release(nvcuda_handle_t h) {
  207. LOG(h.verbose, "releasing cuda driver library\n");
  208. UNLOAD_LIBRARY(h.handle);
  209. // TODO and other context release logic?
  210. h.handle = NULL;
  211. }
  212. #endif // __APPLE__