gpu_info_oneapi.c 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260
  1. #ifndef __APPLE__
  2. #include "gpu_info_oneapi.h"
  3. #include <string.h>
  4. void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) {
  5. ze_result_t ret;
  6. resp->err = NULL;
  7. resp->oh.devices = NULL;
  8. resp->oh.num_devices = NULL;
  9. resp->oh.drivers = NULL;
  10. resp->oh.num_drivers = 0;
  11. const int buflen = 256;
  12. char buf[buflen + 1];
  13. int i, d;
  14. struct lookup {
  15. char *s;
  16. void **p;
  17. } l[] = {
  18. {"zesInit", (void *)&resp->oh.zesInit},
  19. {"zesDriverGet", (void *)&resp->oh.zesDriverGet},
  20. {"zesDeviceGet", (void *)&resp->oh.zesDeviceGet},
  21. {"zesDeviceGetProperties", (void *)&resp->oh.zesDeviceGetProperties},
  22. {"zesDeviceEnumMemoryModules",
  23. (void *)&resp->oh.zesDeviceEnumMemoryModules},
  24. {"zesMemoryGetProperties", (void *)&resp->oh.zesMemoryGetProperties},
  25. {"zesMemoryGetState", (void *)&resp->oh.zesMemoryGetState},
  26. {NULL, NULL},
  27. };
  28. resp->oh.handle = LOAD_LIBRARY(oneapi_lib_path, RTLD_LAZY);
  29. if (!resp->oh.handle) {
  30. char *msg = LOAD_ERR();
  31. snprintf(buf, buflen,
  32. "Unable to load %s library to query for Intel GPUs: %s\n",
  33. oneapi_lib_path, msg);
  34. free(msg);
  35. resp->err = strdup(buf);
  36. return;
  37. }
  38. // TODO once we've squashed the remaining corner cases remove this log
  39. LOG(resp->oh.verbose,
  40. "wiring Level-Zero management library functions in %s\n",
  41. oneapi_lib_path);
  42. for (i = 0; l[i].s != NULL; i++) {
  43. // TODO once we've squashed the remaining corner cases remove this log
  44. LOG(resp->oh.verbose, "dlsym: %s\n", l[i].s);
  45. *l[i].p = LOAD_SYMBOL(resp->oh.handle, l[i].s);
  46. if (!*(l[i].p)) {
  47. resp->oh.handle = NULL;
  48. char *msg = LOAD_ERR();
  49. LOG(resp->oh.verbose, "dlerr: %s\n", msg);
  50. UNLOAD_LIBRARY(resp->oh.handle);
  51. snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s, msg);
  52. free(msg);
  53. resp->err = strdup(buf);
  54. return;
  55. }
  56. }
  57. LOG(resp->oh.verbose, "calling zesInit\n");
  58. ret = (*resp->oh.zesInit)(0);
  59. if (ret != ZE_RESULT_SUCCESS) {
  60. LOG(resp->oh.verbose, "zesInit err: %x\n", ret);
  61. snprintf(buf, buflen, "oneapi vram init failure: %x", ret);
  62. resp->err = strdup(buf);
  63. oneapi_release(resp->oh);
  64. return;
  65. }
  66. LOG(resp->oh.verbose, "calling zesDriverGet\n");
  67. ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, NULL);
  68. if (ret != ZE_RESULT_SUCCESS) {
  69. LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret);
  70. snprintf(buf, buflen, "unable to get driver count: %x", ret);
  71. resp->err = strdup(buf);
  72. oneapi_release(resp->oh);
  73. return;
  74. }
  75. LOG(resp->oh.verbose, "oneapi driver count: %d\n", resp->oh.num_drivers);
  76. resp->oh.drivers = malloc(resp->oh.num_drivers * sizeof(zes_driver_handle_t));
  77. resp->oh.num_devices = malloc(resp->oh.num_drivers * sizeof(uint32_t));
  78. memset(&resp->oh.num_devices[0], 0, resp->oh.num_drivers * sizeof(uint32_t));
  79. resp->oh.devices =
  80. malloc(resp->oh.num_drivers * sizeof(zes_device_handle_t *));
  81. ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, &resp->oh.drivers[0]);
  82. if (ret != ZE_RESULT_SUCCESS) {
  83. LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret);
  84. snprintf(buf, buflen, "unable to get driver count: %x", ret);
  85. resp->err = strdup(buf);
  86. oneapi_release(resp->oh);
  87. return;
  88. }
  89. for (d = 0; d < resp->oh.num_drivers; d++) {
  90. LOG(resp->oh.verbose, "calling zesDeviceGet count %d: %p\n", d, resp->oh.drivers[d]);
  91. ret = (*resp->oh.zesDeviceGet)(resp->oh.drivers[d],
  92. &resp->oh.num_devices[d], NULL);
  93. if (ret != ZE_RESULT_SUCCESS) {
  94. LOG(resp->oh.verbose, "zesDeviceGet err: %x\n", ret);
  95. snprintf(buf, buflen, "unable to get device count: %x", ret);
  96. resp->err = strdup(buf);
  97. oneapi_release(resp->oh);
  98. return;
  99. }
  100. resp->oh.devices[d] =
  101. malloc(resp->oh.num_devices[d] * sizeof(zes_device_handle_t));
  102. ret = (*resp->oh.zesDeviceGet)(
  103. resp->oh.drivers[d], &resp->oh.num_devices[d], resp->oh.devices[d]);
  104. if (ret != ZE_RESULT_SUCCESS) {
  105. LOG(resp->oh.verbose, "zesDeviceGet err: %x\n", ret);
  106. snprintf(buf, buflen, "unable to get device count: %x", ret);
  107. resp->err = strdup(buf);
  108. oneapi_release(resp->oh);
  109. return;
  110. }
  111. }
  112. return;
  113. }
  114. void oneapi_check_vram(oneapi_handle_t h, int driver, int device,
  115. mem_info_t *resp) {
  116. ze_result_t ret;
  117. resp->err = NULL;
  118. uint64_t totalMem = 0;
  119. uint64_t usedMem = 0;
  120. const int buflen = 256;
  121. char buf[buflen + 1];
  122. int i, d, m;
  123. if (h.handle == NULL) {
  124. resp->err = strdup("Level-Zero handle not initialized");
  125. return;
  126. }
  127. if (driver > h.num_drivers || device > h.num_devices[driver]) {
  128. resp->err = strdup("driver of device index out of bounds");
  129. return;
  130. }
  131. resp->total = 0;
  132. resp->free = 0;
  133. zes_device_ext_properties_t ext_props;
  134. ext_props.stype = ZES_STRUCTURE_TYPE_DEVICE_EXT_PROPERTIES;
  135. ext_props.pNext = NULL;
  136. zes_device_properties_t props;
  137. props.stype = ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES;
  138. props.pNext = &ext_props;
  139. ret = (*h.zesDeviceGetProperties)(h.devices[driver][device], &props);
  140. if (ret != ZE_RESULT_SUCCESS) {
  141. snprintf(buf, buflen, "unable to get device properties: %d", ret);
  142. resp->err = strdup(buf);
  143. return;
  144. }
  145. snprintf(&resp->gpu_name[0], GPU_NAME_LEN, "%s", props.modelName);
  146. // TODO this needs to map to ONEAPI_DEVICE_SELECTOR syntax
  147. // (this is probably wrong...)
  148. // TODO - the driver isn't included - what if there are multiple drivers?
  149. snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", device);
  150. if (h.verbose) {
  151. // When in verbose mode, report more information about
  152. // the card we discover.
  153. LOG(h.verbose, "[%d:%d] oneAPI device name: %s\n", driver, device,
  154. props.modelName);
  155. LOG(h.verbose, "[%d:%d] oneAPI brand: %s\n", driver, device,
  156. props.brandName);
  157. LOG(h.verbose, "[%d:%d] oneAPI vendor: %s\n", driver, device,
  158. props.vendorName);
  159. LOG(h.verbose, "[%d:%d] oneAPI S/N: %s\n", driver, device,
  160. props.serialNumber);
  161. LOG(h.verbose, "[%d:%d] oneAPI board number: %s\n", driver, device,
  162. props.boardNumber);
  163. }
  164. // TODO
  165. // Compute Capability equivalent in resp->major, resp->minor, resp->patch
  166. uint32_t memCount = 0;
  167. ret = (*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount,
  168. NULL);
  169. if (ret != ZE_RESULT_SUCCESS) {
  170. snprintf(buf, buflen, "unable to enumerate Level-Zero memory modules: %x",
  171. ret);
  172. resp->err = strdup(buf);
  173. return;
  174. }
  175. LOG(h.verbose, "discovered %d Level-Zero memory modules\n", memCount);
  176. zes_mem_handle_t *mems = malloc(memCount * sizeof(zes_mem_handle_t));
  177. (*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount, mems);
  178. for (m = 0; m < memCount; m++) {
  179. zes_mem_state_t state;
  180. state.stype = ZES_STRUCTURE_TYPE_MEM_STATE;
  181. state.pNext = NULL;
  182. ret = (*h.zesMemoryGetState)(mems[m], &state);
  183. if (ret != ZE_RESULT_SUCCESS) {
  184. snprintf(buf, buflen, "unable to get memory state: %x", ret);
  185. resp->err = strdup(buf);
  186. free(mems);
  187. return;
  188. }
  189. resp->total += state.size;
  190. resp->free += state.free;
  191. }
  192. free(mems);
  193. }
  194. void oneapi_release(oneapi_handle_t h) {
  195. int d;
  196. LOG(h.verbose, "releasing oneapi library\n");
  197. for (d = 0; d < h.num_drivers; d++) {
  198. if (h.devices != NULL && h.devices[d] != NULL) {
  199. free(h.devices[d]);
  200. }
  201. }
  202. if (h.devices != NULL) {
  203. free(h.devices);
  204. h.devices = NULL;
  205. }
  206. if (h.num_devices != NULL) {
  207. free(h.num_devices);
  208. h.num_devices = NULL;
  209. }
  210. if (h.drivers != NULL) {
  211. free(h.drivers);
  212. h.drivers = NULL;
  213. }
  214. h.num_drivers = 0;
  215. UNLOAD_LIBRARY(h.handle);
  216. h.handle = NULL;
  217. }
  218. int oneapi_get_device_count(oneapi_handle_t h, int driver) {
  219. if (h.handle == NULL || h.num_devices == NULL) {
  220. return 0;
  221. }
  222. if (driver > h.num_drivers) {
  223. return 0;
  224. }
  225. return (int)h.num_devices[driver];
  226. }
  227. #endif // __APPLE__