gpu_info_nvml.c 2.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104
  1. #ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
  2. #include <string.h>
  3. #include "gpu_info_nvml.h"
  4. void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
  5. nvmlReturn_t ret;
  6. resp->err = NULL;
  7. const int buflen = 256;
  8. char buf[buflen + 1];
  9. int i;
  10. struct lookup {
  11. char *s;
  12. void **p;
  13. } l[] = {
  14. {"nvmlInit_v2", (void *)&resp->ch.nvmlInit_v2},
  15. {"nvmlShutdown", (void *)&resp->ch.nvmlShutdown},
  16. {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.nvmlDeviceGetHandleByIndex},
  17. {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.nvmlDeviceGetMemoryInfo},
  18. {NULL, NULL},
  19. };
  20. resp->ch.handle = LOAD_LIBRARY(nvml_lib_path, RTLD_LAZY);
  21. if (!resp->ch.handle) {
  22. char *msg = LOAD_ERR();
  23. LOG(resp->ch.verbose, "library %s load err: %s\n", nvml_lib_path, msg);
  24. snprintf(buf, buflen,
  25. "Unable to load %s library to query for Nvidia GPUs: %s",
  26. nvml_lib_path, msg);
  27. free(msg);
  28. resp->err = strdup(buf);
  29. return;
  30. }
  31. // TODO once we've squashed the remaining corner cases remove this log
  32. // LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", nvml_lib_path);
  33. for (i = 0; l[i].s != NULL; i++) {
  34. // TODO once we've squashed the remaining corner cases remove this log
  35. // LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
  36. *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
  37. if (!*(l[i].p)) {
  38. resp->ch.handle = NULL;
  39. char *msg = LOAD_ERR();
  40. LOG(resp->ch.verbose, "dlerr: %s\n", msg);
  41. UNLOAD_LIBRARY(resp->ch.handle);
  42. snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
  43. msg);
  44. free(msg);
  45. resp->err = strdup(buf);
  46. return;
  47. }
  48. }
  49. ret = (*resp->ch.nvmlInit_v2)();
  50. if (ret != NVML_SUCCESS) {
  51. LOG(resp->ch.verbose, "nvmlInit_v2 err: %d\n", ret);
  52. UNLOAD_LIBRARY(resp->ch.handle);
  53. resp->ch.handle = NULL;
  54. snprintf(buf, buflen, "nvml vram init failure: %d", ret);
  55. resp->err = strdup(buf);
  56. return;
  57. }
  58. }
  59. void nvml_get_free(nvml_handle_t h, int device_id, uint64_t *free, uint64_t *total, uint64_t *used) {
  60. nvmlDevice_t device;
  61. nvmlMemory_t memInfo = {0};
  62. nvmlReturn_t ret;
  63. ret = (*h.nvmlDeviceGetHandleByIndex)(device_id, &device);
  64. if (ret != NVML_SUCCESS) {
  65. LOG(1, "unable to get device handle %d: %d", device_id, ret);
  66. *free = 0;
  67. return;
  68. }
  69. ret = (*h.nvmlDeviceGetMemoryInfo)(device, &memInfo);
  70. if (ret != NVML_SUCCESS) {
  71. LOG(1, "device memory info lookup failure %d: %d", device_id, ret);
  72. *free = 0;
  73. return;
  74. }
  75. *free = memInfo.free;
  76. *total = memInfo.total;
  77. *used = memInfo.used;
  78. }
  79. void nvml_release(nvml_handle_t h) {
  80. LOG(h.verbose, "releasing nvml library\n");
  81. nvmlReturn_t ret;
  82. ret = (*h.nvmlShutdown)();
  83. if (ret != NVML_SUCCESS) {
  84. LOG(1, "error during nvmlShutdown %d", ret);
  85. }
  86. UNLOAD_LIBRARY(h.handle);
  87. h.handle = NULL;
  88. }
  89. #endif // __APPLE__