123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131 |
- package llm
- import (
- "bytes"
- "encoding/binary"
- "fmt"
- "os"
- "testing"
- "github.com/ollama/ollama/api"
- "github.com/ollama/ollama/envconfig"
- "github.com/ollama/ollama/gpu"
- "github.com/stretchr/testify/assert"
- "github.com/stretchr/testify/require"
- )
- func TestEstimateGPULayers(t *testing.T) {
- envconfig.Debug = true
- modelName := "dummy"
- f, err := os.CreateTemp(t.TempDir(), modelName)
- require.NoError(t, err)
- defer f.Close()
- gguf := NewGGUFV3(binary.LittleEndian)
- inputLayerCount := 5
- tensors := []Tensor{
- {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
- {Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
- {Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
- {Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
- {Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
- {Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
- }
- assert.Len(t, tensors, inputLayerCount+1)
- err = gguf.Encode(f, KV{
- "general.architecture": "llama",
- "general.name": "name",
- "llama.context_length": uint32(32),
- "llama.embedding_length": uint32(4096),
- "llama.block_count": uint32(inputLayerCount),
- "llama.attention.head_count": uint32(32),
- "llama.attention.head_count_kv": uint32(32),
- "tokenizer.ggml.tokens": []string{" "},
- "tokenizer.ggml.scores": []float32{0},
- "tokenizer.ggml.token_type": []int32{0},
- }, tensors)
- require.NoError(t, err)
- ggml, err := LoadModel(f.Name(), 0)
- if err != nil {
- t.Fatal(err)
- }
- // Simple CPU scenario
- gpus := []gpu.GpuInfo{
- {
- Library: "cpu",
- },
- }
- projectors := []string{}
- opts := api.DefaultOptions()
- t.Run("cpu", func(t *testing.T) {
- estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
- assert.Equal(t, 0, estimate.Layers)
- assert.Equal(t, uint64(0), estimate.Graph)
- })
- // derived from the dummy ggml file above
- graphPartialOffload := uint64(202377216)
- graphFullOffload := uint64(171968512)
- layerSize := uint64(33554436)
- projectorSize := uint64(0)
- memoryLayerOutput := uint64(4)
- // Dual CUDA scenario with assymetry
- gpuMinimumMemory := uint64(2048)
- gpus = []gpu.GpuInfo{
- {
- Library: "cuda",
- MinimumMemory: gpuMinimumMemory,
- },
- {
- Library: "cuda",
- MinimumMemory: gpuMinimumMemory,
- },
- }
- // Nested array: GPU0 layer space, GPU1 layer space, expected gpu0, expected gpu1
- for i, s := range []struct {
- layer0, layer1 uint64
- expect0, expect1 uint64
- }{
- {1, 1, 1, 1},
- {2, 1, 2, 1},
- {2, 2, 2, 2},
- {1, 2, 1, 2},
- {3, 3, 3, 3},
- {4, 4, 3, 3},
- {6, 6, 3, 3},
- {0, 3, 0, 3},
- } {
- t.Run(fmt.Sprintf("%v", s), func(t *testing.T) {
- gpus[0].FreeMemory = 0
- gpus[1].FreeMemory = 0
- gpus[0].FreeMemory += projectorSize
- if s.layer0 > 0 {
- gpus[0].FreeMemory += memoryLayerOutput
- } else {
- gpus[1].FreeMemory += memoryLayerOutput
- }
- gpus[0].FreeMemory += gpuMinimumMemory + layerSize + s.layer0*layerSize + 1
- gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s.layer1*layerSize + 1
- gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload)
- gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload)
- estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
- assert.Equal(t, int(s.expect0+s.expect1), estimate.Layers, "scenario %d: %v", i, s)
- assert.Equal(t, fmt.Sprintf("%d,%d", s.expect0, s.expect1), estimate.TensorSplit, "scenario %d: %v", i, s)
- var layerSums uint64
- for _, b := range estimate.GPUSizes {
- layerSums += b
- }
- if estimate.Layers < inputLayerCount+1 {
- assert.Less(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
- assert.Equal(t, estimate.VRAMSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
- } else {
- assert.Equal(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
- assert.Equal(t, estimate.TotalSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
- }
- })
- }
- }
|