gpu_info_cudart.c 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183
  1. #ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
  2. #include <string.h>
  3. #include "gpu_info_cudart.h"
  4. void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
  5. cudartReturn_t ret;
  6. resp->err = NULL;
  7. resp->num_devices = 0;
  8. const int buflen = 256;
  9. char buf[buflen + 1];
  10. int i;
  11. struct lookup {
  12. char *s;
  13. void **p;
  14. } l[] = {
  15. {"cudaSetDevice", (void *)&resp->ch.cudaSetDevice},
  16. {"cudaDeviceSynchronize", (void *)&resp->ch.cudaDeviceSynchronize},
  17. {"cudaDeviceReset", (void *)&resp->ch.cudaDeviceReset},
  18. {"cudaMemGetInfo", (void *)&resp->ch.cudaMemGetInfo},
  19. {"cudaGetDeviceCount", (void *)&resp->ch.cudaGetDeviceCount},
  20. {"cudaDeviceGetAttribute", (void *)&resp->ch.cudaDeviceGetAttribute},
  21. {"cudaDriverGetVersion", (void *)&resp->ch.cudaDriverGetVersion},
  22. {"cudaGetDeviceProperties", (void *)&resp->ch.cudaGetDeviceProperties},
  23. {NULL, NULL},
  24. };
  25. resp->ch.handle = LOAD_LIBRARY(cudart_lib_path, RTLD_LAZY);
  26. if (!resp->ch.handle) {
  27. char *msg = LOAD_ERR();
  28. LOG(resp->ch.verbose, "library %s load err: %s\n", cudart_lib_path, msg);
  29. snprintf(buf, buflen,
  30. "Unable to load %s library to query for Nvidia GPUs: %s",
  31. cudart_lib_path, msg);
  32. free(msg);
  33. resp->err = strdup(buf);
  34. return;
  35. }
  36. for (i = 0; l[i].s != NULL; i++) {
  37. *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
  38. if (!*(l[i].p)) {
  39. char *msg = LOAD_ERR();
  40. LOG(resp->ch.verbose, "dlerr: %s\n", msg);
  41. UNLOAD_LIBRARY(resp->ch.handle);
  42. resp->ch.handle = NULL;
  43. snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
  44. msg);
  45. free(msg);
  46. resp->err = strdup(buf);
  47. return;
  48. }
  49. }
  50. ret = (*resp->ch.cudaSetDevice)(0);
  51. if (ret != CUDART_SUCCESS) {
  52. LOG(resp->ch.verbose, "cudaSetDevice err: %d\n", ret);
  53. UNLOAD_LIBRARY(resp->ch.handle);
  54. resp->ch.handle = NULL;
  55. if (ret == CUDA_ERROR_INSUFFICIENT_DRIVER) {
  56. resp->err = strdup("your nvidia driver is too old or missing. If you have a CUDA GPU please upgrade to run ollama");
  57. return;
  58. }
  59. snprintf(buf, buflen, "cudart init failure: %d", ret);
  60. resp->err = strdup(buf);
  61. return;
  62. }
  63. int version = 0;
  64. cudartDriverVersion_t driverVersion;
  65. driverVersion.major = 0;
  66. driverVersion.minor = 0;
  67. // Report driver version if we're in verbose mode, ignore errors
  68. ret = (*resp->ch.cudaDriverGetVersion)(&version);
  69. if (ret != CUDART_SUCCESS) {
  70. LOG(resp->ch.verbose, "cudaDriverGetVersion failed: %d\n", ret);
  71. } else {
  72. driverVersion.major = version / 1000;
  73. driverVersion.minor = (version - (driverVersion.major * 1000)) / 10;
  74. LOG(resp->ch.verbose, "CUDA driver version: %d-%d\n", driverVersion.major, driverVersion.minor);
  75. }
  76. ret = (*resp->ch.cudaGetDeviceCount)(&resp->num_devices);
  77. if (ret != CUDART_SUCCESS) {
  78. LOG(resp->ch.verbose, "cudaGetDeviceCount err: %d\n", ret);
  79. UNLOAD_LIBRARY(resp->ch.handle);
  80. resp->ch.handle = NULL;
  81. snprintf(buf, buflen, "unable to get device count: %d", ret);
  82. resp->err = strdup(buf);
  83. return;
  84. }
  85. }
  86. void cudart_bootstrap(cudart_handle_t h, int i, mem_info_t *resp) {
  87. resp->err = NULL;
  88. cudartMemory_t memInfo = {0,0,0};
  89. cudartReturn_t ret;
  90. const int buflen = 256;
  91. char buf[buflen + 1];
  92. if (h.handle == NULL) {
  93. resp->err = strdup("cudart handle isn't initialized");
  94. return;
  95. }
  96. ret = (*h.cudaSetDevice)(i);
  97. if (ret != CUDART_SUCCESS) {
  98. snprintf(buf, buflen, "cudart device failed to initialize");
  99. resp->err = strdup(buf);
  100. return;
  101. }
  102. cudaDeviceProp_t props;
  103. ret = (*h.cudaGetDeviceProperties)(&props, i);
  104. if (ret != CUDART_SUCCESS) {
  105. LOG(h.verbose, "[%d] device properties lookup failure: %d\n", i, ret);
  106. snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", i);
  107. resp->major = 0;
  108. resp->minor = 0;
  109. } else {
  110. int allNull = 1;
  111. for (int j = 0; j < 16; j++) {
  112. if (props.uuid.bytes[j] != 0) {
  113. allNull = 0;
  114. break;
  115. }
  116. }
  117. if (allNull != 0) {
  118. snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", i);
  119. } else {
  120. // GPU-d110a105-ac29-1d54-7b49-9c90440f215b
  121. snprintf(&resp->gpu_id[0], GPU_ID_LEN,
  122. "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
  123. props.uuid.bytes[0],
  124. props.uuid.bytes[1],
  125. props.uuid.bytes[2],
  126. props.uuid.bytes[3],
  127. props.uuid.bytes[4],
  128. props.uuid.bytes[5],
  129. props.uuid.bytes[6],
  130. props.uuid.bytes[7],
  131. props.uuid.bytes[8],
  132. props.uuid.bytes[9],
  133. props.uuid.bytes[10],
  134. props.uuid.bytes[11],
  135. props.uuid.bytes[12],
  136. props.uuid.bytes[13],
  137. props.uuid.bytes[14],
  138. props.uuid.bytes[15]
  139. );
  140. }
  141. resp->major = props.major;
  142. resp->minor = props.minor;
  143. // TODO add other useful properties from props
  144. }
  145. ret = (*h.cudaMemGetInfo)(&memInfo.free, &memInfo.total);
  146. if (ret != CUDART_SUCCESS) {
  147. snprintf(buf, buflen, "cudart device memory info lookup failure %d", ret);
  148. resp->err = strdup(buf);
  149. return;
  150. }
  151. resp->total = memInfo.total;
  152. resp->free = memInfo.free;
  153. resp->used = memInfo.used;
  154. LOG(h.verbose, "[%s] CUDA totalMem %lu\n", resp->gpu_id, resp->total);
  155. LOG(h.verbose, "[%s] CUDA freeMem %lu\n", resp->gpu_id, resp->free);
  156. LOG(h.verbose, "[%s] CUDA usedMem %lu\n", resp->gpu_id, resp->used);
  157. LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);
  158. }
  159. void cudart_release(cudart_handle_t h) {
  160. LOG(h.verbose, "releasing cudart library\n");
  161. UNLOAD_LIBRARY(h.handle);
  162. h.handle = NULL;
  163. }
  164. #endif // __APPLE__