memory_test.go

package llm

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"os"
	"testing"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/envconfig"
	"github.com/ollama/ollama/gpu"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)
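
// TestEstimateGPULayers builds a small dummy GGUF model on disk and checks
// EstimateGPULayers against a CPU-only configuration and a set of asymmetric
// dual-GPU configurations.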
func TestEstimateGPULayers(t *testing.T) {
	envconfig.Debug = true
	modelName := "dummy"
	f, err := os.CreateTemp(t.TempDir(), modelName)
	require.NoError(t, err)
	defer f.Close()

	gguf := NewGGUFV3(binary.LittleEndian)
	inputLayerCount := 5
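
	// One attention tensor per block plus the output tensor, so the model has
	// inputLayerCount+1 offloadable layers in total.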
	tensors := []Tensor{
		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
	}
	assert.Len(t, tensors, inputLayerCount+1)
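
	// Encode just enough llama metadata for LoadModel and the estimator to
	// work with.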
	err = gguf.Encode(f, KV{
		"general.architecture":          "llama",
		"general.name":                  "name",
		"llama.context_length":          uint32(32),
		"llama.embedding_length":        uint32(4096),
		"llama.block_count":             uint32(inputLayerCount),
		"llama.attention.head_count":    uint32(32),
		"llama.attention.head_count_kv": uint32(32),
		"tokenizer.ggml.tokens":         []string{" "},
		"tokenizer.ggml.scores":         []float32{0},
		"tokenizer.ggml.token_type":     []int32{0},
	}, tensors)
	require.NoError(t, err)

	ggml, err := LoadModel(f.Name(), 0)
	if err != nil {
		t.Fatal(err)
	}

	// Simple CPU scenario
	gpus := []gpu.GpuInfo{
		{
			Library: "cpu",
		},
	}
	projectors := []string{}
	opts := api.DefaultOptions()
	t.Run("cpu", func(t *testing.T) {
		estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
		assert.Equal(t, 0, estimate.Layers)
		assert.Equal(t, uint64(0), estimate.Graph)
	})

	// derived from the dummy ggml file above
	graphPartialOffload := uint64(202377216)
	graphFullOffload := uint64(171968512)
	layerSize := uint64(33554436)
	projectorSize := uint64(0)
	memoryLayerOutput := uint64(4)

	// Dual CUDA scenario with asymmetry
	gpuMinimumMemory := uint64(2048)
	gpus = []gpu.GpuInfo{
		{
			Library:       "cuda",
			MinimumMemory: gpuMinimumMemory,
		},
		{
			Library:       "cuda",
			MinimumMemory: gpuMinimumMemory,
		},
	}

	// Scenario table: layers' worth of free space on GPU0 and GPU1, and the
	// expected number of layers placed on each.
	for i, s := range []struct {
		layer0, layer1   uint64
		expect0, expect1 uint64
	}{
		{1, 1, 1, 1},
		{2, 1, 2, 1},
		{2, 2, 2, 2},
		{1, 2, 1, 2},
		{3, 3, 3, 3},
		{4, 4, 3, 3},
		{6, 6, 3, 3},
		{0, 3, 0, 3},
	} {
		t.Run(fmt.Sprintf("%v", s), func(t *testing.T) {
			gpus[0].FreeMemory = 0
			gpus[1].FreeMemory = 0
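
			// The projector (empty here) is charged to GPU0; the output layer
			// goes to GPU0 unless it hosts no layers, in which case it moves
			// to GPU1.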
			gpus[0].FreeMemory += projectorSize
			if s.layer0 > 0 {
				gpus[0].FreeMemory += memoryLayerOutput
			} else {
				gpus[1].FreeMemory += memoryLayerOutput
			}
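
			// Give each GPU its minimum reserve, room for its target layer
			// count (plus one layer of headroom and one byte), and the larger
			// of the two graph allocations.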
			gpus[0].FreeMemory += gpuMinimumMemory + layerSize + s.layer0*layerSize + 1
			gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s.layer1*layerSize + 1
			gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload)
			gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload)

			estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
			assert.Equal(t, int(s.expect0+s.expect1), estimate.Layers, "scenario %d: %v", i, s)
			assert.Equal(t, fmt.Sprintf("%d,%d", s.expect0, s.expect1), estimate.TensorSplit, "scenario %d: %v", i, s)
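
			// Cross-check the per-GPU allocations against the reported totals.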
			var layerSums uint64
			for _, b := range estimate.GPUSizes {
				layerSums += b
			}
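
			// A partial offload leaves part of the model in system memory, so
			// VRAM use must stay below the total; a full offload means VRAM
			// accounts for the entire model.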
			if estimate.Layers < inputLayerCount+1 {
				assert.Less(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
				assert.Equal(t, estimate.VRAMSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
			} else {
				assert.Equal(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
				assert.Equal(t, estimate.TotalSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
			}
		})
	}
}