amd_windows.go 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192
  1. package gpu
  2. import (
  3. "bytes"
  4. "fmt"
  5. "log/slog"
  6. "os"
  7. "path/filepath"
  8. "slices"
  9. "strconv"
  10. "strings"
  11. "github.com/ollama/ollama/envconfig"
  12. "github.com/ollama/ollama/format"
  13. )
  const (
  	// iGPUName is the exact device name used to recognise integrated Radeon
  	// GPUs.
  	//
  	// TODO We're looking for this exact name to detect iGPUs since
  	// hipGetDeviceProperties never reports integrated==true
  	iGPUName = "AMD Radeon(TM) Graphics"
  )
  var (
  	// ROCmLibGlobs is used to validate if the given ROCm lib is usable.
  	// This is not sufficient to discern v5 vs v6.
  	ROCmLibGlobs = []string{"hipblas.dll", "rocblas"}

  	// RocmStandardLocations lists the fixed ROCm directories declared for
  	// Windows.
  	// TODO glob?
  	RocmStandardLocations = []string{"C:\\Program Files\\AMD\\ROCm\\6.1\\bin"}
  )
  23. func AMDGetGPUInfo() []RocmGPUInfo {
  24. resp := []RocmGPUInfo{}
  25. hl, err := NewHipLib()
  26. if err != nil {
  27. slog.Debug(err.Error())
  28. return nil
  29. }
  30. defer hl.Release()
  31. driverMajor, driverMinor, err := hl.AMDDriverVersion()
  32. if err != nil {
  33. // For now this is benign, but we may eventually need to fail compatibility checks
  34. slog.Debug("error looking up amd driver version", "error", err)
  35. }
  36. // Note: the HIP library automatically handles subsetting to any HIP_VISIBLE_DEVICES the user specified
  37. count := hl.HipGetDeviceCount()
  38. if count == 0 {
  39. return nil
  40. }
  41. libDir, err := AMDValidateLibDir()
  42. if err != nil {
  43. slog.Warn("unable to verify rocm library, will use cpu", "error", err)
  44. return nil
  45. }
  46. var supported []string
  47. gfxOverride := envconfig.HsaOverrideGfxVersion
  48. if gfxOverride == "" {
  49. supported, err = GetSupportedGFX(libDir)
  50. if err != nil {
  51. slog.Warn("failed to lookup supported GFX types, falling back to CPU mode", "error", err)
  52. return nil
  53. }
  54. } else {
  55. slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride)
  56. }
  57. slog.Debug("detected hip devices", "count", count)
  58. // TODO how to determine the underlying device ID when visible devices is causing this to subset?
  59. for i := range count {
  60. err = hl.HipSetDevice(i)
  61. if err != nil {
  62. slog.Warn("set device", "id", i, "error", err)
  63. continue
  64. }
  65. props, err := hl.HipGetDeviceProperties(i)
  66. if err != nil {
  67. slog.Warn("get properties", "id", i, "error", err)
  68. continue
  69. }
  70. n := bytes.IndexByte(props.Name[:], 0)
  71. name := string(props.Name[:n])
  72. // TODO is UUID actually populated on windows?
  73. // Can luid be used on windows for setting visible devices (and is it actually set?)
  74. n = bytes.IndexByte(props.GcnArchName[:], 0)
  75. gfx := string(props.GcnArchName[:n])
  76. slog.Debug("hip device", "id", i, "name", name, "gfx", gfx)
  77. //slog.Info(fmt.Sprintf("[%d] Integrated: %d", i, props.iGPU)) // DOESN'T REPORT CORRECTLY! Always 0
  78. // TODO Why isn't props.iGPU accurate!?
  79. if strings.EqualFold(name, iGPUName) {
  80. slog.Info("unsupported Radeon iGPU detected skipping", "id", i, "name", name, "gfx", gfx)
  81. continue
  82. }
  83. if gfxOverride == "" {
  84. if !slices.Contains[[]string, string](supported, gfx) {
  85. slog.Warn("amdgpu is not supported", "gpu", i, "gpu_type", gfx, "library", libDir, "supported_types", supported)
  86. // TODO - consider discrete markdown just for ROCM troubleshooting?
  87. slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage")
  88. continue
  89. } else {
  90. slog.Debug("amdgpu is supported", "gpu", i, "gpu_type", gfx)
  91. }
  92. }
  93. freeMemory, totalMemory, err := hl.HipMemGetInfo()
  94. if err != nil {
  95. slog.Warn("get mem info", "id", i, "error", err)
  96. continue
  97. }
  98. // iGPU detection, remove this check once we can support an iGPU variant of the rocm library
  99. if totalMemory < IGPUMemLimit {
  100. slog.Info("amdgpu appears to be an iGPU, skipping", "gpu", i, "total", format.HumanBytes2(totalMemory))
  101. continue
  102. }
  103. slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
  104. slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
  105. gpuInfo := RocmGPUInfo{
  106. GpuInfo: GpuInfo{
  107. Library: "rocm",
  108. memInfo: memInfo{
  109. TotalMemory: totalMemory,
  110. FreeMemory: freeMemory,
  111. },
  112. // Free memory reporting on Windows is not reliable until we bump to ROCm v6.2
  113. UnreliableFreeMemory: true,
  114. ID: strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices
  115. DependencyPath: libDir,
  116. MinimumMemory: rocmMinimumMemory,
  117. Name: name,
  118. Compute: gfx,
  119. DriverMajor: driverMajor,
  120. DriverMinor: driverMinor,
  121. },
  122. index: i,
  123. }
  124. resp = append(resp, gpuInfo)
  125. }
  126. return resp
  127. }
  128. func AMDValidateLibDir() (string, error) {
  129. libDir, err := commonAMDValidateLibDir()
  130. if err == nil {
  131. return libDir, nil
  132. }
  133. // Installer payload (if we're running from some other location)
  134. localAppData := os.Getenv("LOCALAPPDATA")
  135. appDir := filepath.Join(localAppData, "Programs", "Ollama")
  136. rocmTargetDir := filepath.Join(appDir, "rocm")
  137. if rocmLibUsable(rocmTargetDir) {
  138. slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
  139. return rocmTargetDir, nil
  140. }
  141. // Should not happen on windows since we include it in the installer, but stand-alone binary might hit this
  142. slog.Warn("amdgpu detected, but no compatible rocm library found. Please install ROCm")
  143. return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
  144. }
  145. func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
  146. if len(gpus) == 0 {
  147. return nil
  148. }
  149. hl, err := NewHipLib()
  150. if err != nil {
  151. slog.Debug(err.Error())
  152. return nil
  153. }
  154. defer hl.Release()
  155. for i := range gpus {
  156. err := hl.HipSetDevice(gpus[i].index)
  157. if err != nil {
  158. return err
  159. }
  160. freeMemory, _, err := hl.HipMemGetInfo()
  161. if err != nil {
  162. slog.Warn("get mem info", "id", i, "error", err)
  163. continue
  164. }
  165. slog.Debug("updating rocm free memory", "gpu", gpus[i].ID, "name", gpus[i].Name, "before", format.HumanBytes2(gpus[i].FreeMemory), "now", format.HumanBytes2(freeMemory))
  166. gpus[i].FreeMemory = freeMemory
  167. }
  168. return nil
  169. }