From f6c15d00c58a09e7802bf8c7b872b23f59b213b6 Mon Sep 17 00:00:00 2001 From: Zhongkai Fu Date: Thu, 9 Apr 2026 00:30:04 -0700 Subject: [PATCH 1/3] support jpg format --- InferenceConsole/Program.cs | 2 +- InferenceEngine/InferenceEngine.csproj | 4 + .../Models/Gemma3/ImageProcessor.cs | 56 ++++++++++- .../Models/Qwen35/ImageProcessor.cs | 12 +-- InferenceWeb.Tests/ImageProcessorTests.cs | 97 +++++++++++++++++++ InferenceWeb/ModelService.cs | 2 +- 6 files changed, 158 insertions(+), 15 deletions(-) create mode 100644 InferenceWeb.Tests/ImageProcessorTests.cs diff --git a/InferenceConsole/Program.cs b/InferenceConsole/Program.cs index 38f134f..8f0d07f 100644 --- a/InferenceConsole/Program.cs +++ b/InferenceConsole/Program.cs @@ -628,7 +628,7 @@ static string RunInference(ModelBase model, string rawText, List imagePa var tokenCounts = new int[imagePaths.Count]; for (int i = 0; i < imagePaths.Count; i++) { - var (width, height) = Qwen35ImageProcessor.ReadPngDimensions(imagePaths[i]); + var (width, height) = Qwen35ImageProcessor.ReadImageDimensions(imagePaths[i]); tokenCounts[i] = processor.ComputeImageTokenCount(height, width); var (gridH, gridW) = processor.GetPatchGrid(height, width); var (resizedH, resizedW) = processor.SmartResize(height, width); diff --git a/InferenceEngine/InferenceEngine.csproj b/InferenceEngine/InferenceEngine.csproj index 548c5db..6c04c0d 100644 --- a/InferenceEngine/InferenceEngine.csproj +++ b/InferenceEngine/InferenceEngine.csproj @@ -12,7 +12,11 @@ + + + + diff --git a/InferenceEngine/Models/Gemma3/ImageProcessor.cs b/InferenceEngine/Models/Gemma3/ImageProcessor.cs index def272e..02c66f8 100644 --- a/InferenceEngine/Models/Gemma3/ImageProcessor.cs +++ b/InferenceEngine/Models/Gemma3/ImageProcessor.cs @@ -9,6 +9,7 @@ // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the BSD-3-Clause License for more details. using System; using System.IO; +using StbImageSharp; namespace InferenceEngine { @@ -52,15 +53,43 @@ public float[] ProcessImage(string imagePath) internal static byte[] DecodeImageToRGBA(byte[] fileBytes, out int width, out int height) { - if (fileBytes.Length >= 8 && fileBytes[0] == 0x89 && fileBytes[1] == 0x50) + if (IsPng(fileBytes)) return DecodePNG(fileBytes, out width, out height); - if (fileBytes.Length >= 2 && fileBytes[0] == 0xFF && fileBytes[1] == 0xD8) + if (IsJpeg(fileBytes)) return DecodeJPEG(fileBytes, out width, out height); throw new NotSupportedException("Only PNG and JPEG image formats are supported"); } + internal static (int width, int height) ReadImageDimensions(string imagePath) + { + byte[] fileBytes = File.ReadAllBytes(imagePath); + + if (IsPng(fileBytes)) + return ReadPngDimensions(fileBytes); + + if (IsJpeg(fileBytes)) + { + DecodeJPEG(fileBytes, out int width, out int height); + return (width, height); + } + + throw new NotSupportedException("Only PNG and JPEG image formats are supported"); + } + + private static bool IsPng(byte[] fileBytes) => + fileBytes.Length >= 8 && + fileBytes[0] == 0x89 && + fileBytes[1] == 0x50 && + fileBytes[2] == 0x4E && + fileBytes[3] == 0x47; + + private static bool IsJpeg(byte[] fileBytes) => + fileBytes.Length >= 2 && + fileBytes[0] == 0xFF && + fileBytes[1] == 0xD8; + private static byte[] DecodePNG(byte[] data, out int width, out int height) { using var ms = new MemoryStream(data); @@ -179,6 +208,16 @@ private static byte[] DecodePNG(byte[] data, out int width, out int height) return rgba; } + private static (int width, int height) ReadPngDimensions(byte[] data) + { + if (data.Length < 24 || !IsPng(data)) + throw new InvalidDataException("Not a PNG file"); + + int width = (data[16] << 24) | (data[17] << 16) | (data[18] << 8) | data[19]; + int height = (data[20] << 24) | (data[21] << 16) | (data[22] << 8) | data[23]; + return (width, height); + } + private static byte PaethPredictor(byte a, byte b, byte c) { int p = a + b - c; @@ -196,8 +235,17 @@ private static int ReadBigEndianInt32(BinaryReader reader) private static byte[] DecodeJPEG(byte[] data, out int width, out int height) { - throw new NotSupportedException( - "JPEG decoding not implemented. Please convert image to PNG format."); + try + { + ImageResult decoded = ImageResult.FromMemory(data, ColorComponents.RedGreenBlueAlpha); + width = decoded.Width; + height = decoded.Height; + return decoded.Data; + } + catch (Exception ex) + { + throw new InvalidDataException("Failed to decode JPEG image.", ex); + } } internal static byte[] CompositeOverWhite(byte[] rgba, int width, int height) diff --git a/InferenceEngine/Models/Qwen35/ImageProcessor.cs b/InferenceEngine/Models/Qwen35/ImageProcessor.cs index c7c2a40..af94030 100644 --- a/InferenceEngine/Models/Qwen35/ImageProcessor.cs +++ b/InferenceEngine/Models/Qwen35/ImageProcessor.cs @@ -8,7 +8,6 @@ // TensorSharp is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the BSD-3-Clause License for more details. using System; -using System.Buffers.Binary; using System.IO; namespace InferenceEngine @@ -34,14 +33,9 @@ public Qwen35ImageProcessor(int patchSize = 14, int mergeSize = 2, LongestEdge = longestEdge; } - public static (int width, int height) ReadPngDimensions(string path) + public static (int width, int height) ReadImageDimensions(string path) { - using var stream = File.OpenRead(path); - Span header = stackalloc byte[24]; - stream.Read(header); - int width = BinaryPrimitives.ReadInt32BigEndian(header.Slice(16, 4)); - int height = BinaryPrimitives.ReadInt32BigEndian(header.Slice(20, 4)); - return (width, height); + return Gemma3ImageProcessor.ReadImageDimensions(path); } public (int height, int width) SmartResize(int height, int width) @@ -79,7 +73,7 @@ public int ComputeImageTokenCount(int origHeight, int origWidth) public int ComputeImageTokenCount(string imagePath) { - var (width, height) = ReadPngDimensions(imagePath); + var (width, height) = ReadImageDimensions(imagePath); return ComputeImageTokenCount(height, width); } diff --git a/InferenceWeb.Tests/ImageProcessorTests.cs b/InferenceWeb.Tests/ImageProcessorTests.cs new file mode 100644 index 0000000..92dd8ca --- /dev/null +++ b/InferenceWeb.Tests/ImageProcessorTests.cs @@ -0,0 +1,97 @@ +using InferenceEngine; + +namespace InferenceWeb.Tests; + +public class ImageProcessorTests +{ + private const string EmbeddedJpegBase64 = + "/9j/4AAQSkZJRgABAQEAYABgAAD/2wBDAAMCAgMCAgMDAwMEAwMEBQgFBQQEBQoHBwYIDAoMDAsKCwsNDhIQDQ4RDgsLEBYQERMUFRUVDA8XGBYUGBIUFRT/2wBDAQMEBAUEBQkFBQkUDQsNFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBT/wAARCAACAAIDASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwD7V/Z2+C3w91v9n74ZajqPgTwzf6heeF9MuLm7utHt5JZ5XtImd3dkJZmJJJJySSTRRRXyOL/3ip/if5nwmO/3qr/il+bP/9k="; + + [Fact] + public void Gemma3ImageProcessorProcessImageSupportsJpeg() + { + string path = WriteEmbeddedJpeg(); + try + { + var processor = new Gemma3ImageProcessor(imageSize: 32); + float[] pixels = processor.ProcessImage(path); + + Assert.Equal(3 * 32 * 32, pixels.Length); + Assert.All(pixels, value => Assert.InRange(value, -1f, 1f)); + } + finally + { + File.Delete(path); + } + } + + [Fact] + public void Gemma4ImageProcessorProcessImageSupportsJpeg() + { + string path = WriteEmbeddedJpeg(); + try + { + var processor = new Gemma4ImageProcessor(patchSize: 1, nMerge: 1, minTokens: 1, maxTokens: 4); + var (pixels, width, height) = processor.ProcessImage(path); + + Assert.Equal(2, width); + Assert.Equal(2, height); + Assert.Equal(12, pixels.Length); + Assert.All(pixels, value => Assert.InRange(value, -1f, 1f)); + } + finally + { + File.Delete(path); + } + } + + [Fact] + public void Qwen35ImageProcessorComputeImageTokenCountSupportsJpeg() + { + string path = WriteEmbeddedJpeg(); + try + { + var processor = new Qwen35ImageProcessor(patchSize: 1, mergeSize: 1, shortestEdge: 1, longestEdge: 16); + int tokenCount = processor.ComputeImageTokenCount(path); + + Assert.Equal(4, tokenCount); + } + finally + { + File.Delete(path); + } + } + + [Fact] + public void UserSuppliedJpegSmokeTestWhenConfigured() + { + string? path = Environment.GetEnvironmentVariable("TENSORSHARP_JPEG_SMOKE_PATH"); + if (string.IsNullOrWhiteSpace(path) || !File.Exists(path)) + return; + + var gemma3 = new Gemma3ImageProcessor(imageSize: 32); + float[] gemma3Pixels = gemma3.ProcessImage(path); + + var (width, height) = Qwen35ImageProcessor.ReadImageDimensions(path); + var qwen = new Qwen35ImageProcessor(patchSize: 1, mergeSize: 1, shortestEdge: 1, longestEdge: width * height); + int qwenTokens = qwen.ComputeImageTokenCount(path); + + var gemma4 = new Gemma4ImageProcessor(patchSize: 1, nMerge: 1, minTokens: 1, maxTokens: width * height); + var (gemma4Pixels, gemma4Width, gemma4Height) = gemma4.ProcessImage(path); + + Assert.Equal(3 * 32 * 32, gemma3Pixels.Length); + Assert.True(width > 0); + Assert.True(height > 0); + Assert.True(qwenTokens > 0); + Assert.Equal(width, gemma4Width); + Assert.Equal(height, gemma4Height); + Assert.Equal(3 * width * height, gemma4Pixels.Length); + } + + private static string WriteEmbeddedJpeg() + { + string path = Path.Combine(Path.GetTempPath(), $"{Guid.NewGuid():N}.jpg"); + File.WriteAllBytes(path, Convert.FromBase64String(EmbeddedJpegBase64)); + return path; + } +} diff --git a/InferenceWeb/ModelService.cs b/InferenceWeb/ModelService.cs index 5404584..9996425 100644 --- a/InferenceWeb/ModelService.cs +++ b/InferenceWeb/ModelService.cs @@ -290,7 +290,7 @@ private List ProcessImages(ChatMessage msg, List inputTokens, string a var tokenCounts = new int[msg.ImagePaths.Count]; for (int i = 0; i < msg.ImagePaths.Count; i++) { - var (w, h) = Qwen35ImageProcessor.ReadPngDimensions(msg.ImagePaths[i]); + var (w, h) = Qwen35ImageProcessor.ReadImageDimensions(msg.ImagePaths[i]); tokenCounts[i] = processor.ComputeImageTokenCount(h, w); } inputTokens = ChatTemplate.ExpandImageTokens(inputTokens, imagePadId, tokenCounts); From 4bb80c793895deb579688cb9878f23464765162b Mon Sep 17 00:00:00 2001 From: Zhongkai Fu Date: Thu, 9 Apr 2026 08:00:04 -0700 Subject: [PATCH 2/3] Improve audio/video/image encoder performance --- InferenceEngine/ModelBase.cs | 12 + InferenceEngine/Models/Gemma3/Gemma3Model.cs | 12 +- .../Models/Gemma3/Gemma3VisionEncoder.cs | 134 ++--- .../Models/Gemma3/ImageProcessor.cs | 99 +++- .../Models/Gemma4/Gemma4AudioEncoder.cs | 85 +-- .../Models/Gemma4/Gemma4AudioPreprocessor.cs | 91 +-- .../Models/Gemma4/Gemma4ImageProcessor.cs | 6 +- InferenceEngine/Models/Gemma4/Gemma4Model.cs | 4 + .../Models/Gemma4/Gemma4VisionEncoder.cs | 222 ++++--- InferenceEngine/Models/Qwen3/Qwen3Model.cs | 2 + .../Models/Qwen35/ImageProcessor.cs | 6 +- InferenceEngine/Models/Qwen35/Qwen35Model.cs | 2 + .../Models/Qwen35/Qwen35VisionEncoder.cs | 342 +++++------ InferenceWeb/Program.cs | 4 + README.md | 2 +- TensorSharp.GGML.Native/build-linux.sh | 55 +- TensorSharp.GGML.Native/ggml_ops.cpp | 555 ++++++++++-------- TensorSharp.GGML/GgmlBasicOps.cs | 1 + TensorSharp.GGML/GgmlNative.cs | 30 +- TensorSharp.GGML/TensorSharp.GGML.csproj | 2 +- readme_cn.md | 2 +- 21 files changed, 961 insertions(+), 707 deletions(-) diff --git a/InferenceEngine/ModelBase.cs b/InferenceEngine/ModelBase.cs index f867729..d69022f 100644 --- a/InferenceEngine/ModelBase.cs +++ b/InferenceEngine/ModelBase.cs @@ -626,6 +626,7 @@ protected void CopyToCache(Tensor cache, Tensor src, int startPos, int seqLen) { using var cacheSlice = cache.Narrow(1, startPos, seqLen); Ops.Copy(cacheSlice, src); + InvalidateTensorDeviceCache(cache); } protected Tensor ExpandKVHeads(Tensor cache, int groupSize, int totalSeqLen) @@ -653,6 +654,9 @@ protected unsafe void CopyToCacheDecode(Tensor kCache, Tensor kTensor, Buffer.MemoryCopy(kSrc + srcOffset, kCachePtr + cacheOffset, headBytes, headBytes); Buffer.MemoryCopy(vSrc + srcOffset, vCachePtr + cacheOffset, headBytes, headBytes); } + + InvalidateTensorDeviceCache(kCache); + InvalidateTensorDeviceCache(vCache); } protected unsafe void AttentionDecodePureCS(Tensor q, Tensor kCache, Tensor vCache, @@ -718,6 +722,14 @@ private static IntPtr GetStoragePtr(Tensor t) throw new NotSupportedException("Requires GgmlStorage or CpuStorage"); } + protected void InvalidateTensorDeviceCache(Tensor tensor) + { + if (!IsGgmlBackend || tensor == null) + return; + + GgmlBasicOps.InvalidateHostBuffer(GetStoragePtr(tensor)); + } + public abstract float[] Forward(int[] tokens); public abstract void ResetKVCache(); diff --git a/InferenceEngine/Models/Gemma3/Gemma3Model.cs b/InferenceEngine/Models/Gemma3/Gemma3Model.cs index 9ab0f8e..f676505 100644 --- a/InferenceEngine/Models/Gemma3/Gemma3Model.cs +++ b/InferenceEngine/Models/Gemma3/Gemma3Model.cs @@ -127,8 +127,16 @@ public override void ResetKVCache() _cacheSeqLen = 0; if (_kvCacheK != null) { - foreach (var k in _kvCacheK) Ops.Fill(k, 0f); - foreach (var v in _kvCacheV) Ops.Fill(v, 0f); + foreach (var k in _kvCacheK) + { + Ops.Fill(k, 0f); + InvalidateTensorDeviceCache(k); + } + foreach (var v in _kvCacheV) + { + Ops.Fill(v, 0f); + InvalidateTensorDeviceCache(v); + } } } diff --git a/InferenceEngine/Models/Gemma3/Gemma3VisionEncoder.cs b/InferenceEngine/Models/Gemma3/Gemma3VisionEncoder.cs index 9eea4ab..389744b 100644 --- a/InferenceEngine/Models/Gemma3/Gemma3VisionEncoder.cs +++ b/InferenceEngine/Models/Gemma3/Gemma3VisionEncoder.cs @@ -11,13 +11,16 @@ using System.Collections.Generic; using TensorSharp; using TensorSharp.Cpu; +using TensorSharp.GGML; namespace InferenceEngine { public class Gemma3VisionEncoder : IDisposable { private readonly Dictionary _weights = new(); + private readonly Dictionary _transposedWeights = new(); private readonly IAllocator _allocator; + private readonly bool _useNativeAttention; private readonly int _imageSize; private readonly int _patchSize; @@ -35,6 +38,7 @@ public class Gemma3VisionEncoder : IDisposable public Gemma3VisionEncoder(string mmProjPath, IAllocator allocator) { _allocator = allocator; + _useNativeAttention = allocator is GgmlAllocator; var gguf = new GgufFile(mmProjPath); _imageSize = (int)gguf.GetUint32("clip.vision.image_size", 896); @@ -219,7 +223,16 @@ private Tensor VisionSelfAttention(Tensor input, string prefix, int numPatches, float scale = 1f / MathF.Sqrt(headDim); - // Reshape [numPatches, hiddenSize] -> [numHeads, numPatches, headDim] + if (_useNativeAttention) + { + using var q4 = q.View(1, numPatches, _numHeads, headDim); + using var k4 = k.View(1, numPatches, _numHeads, headDim); + using var v4 = v.View(1, numPatches, _numHeads, headDim); + using var attn4 = Ops.ScaledDotProductAttention(null, q4, k4, v4, null, scale); + using var flat = attn4.View(numPatches, _hiddenSize); + return LinearForwardWithBias(flat, $"{prefix}.attn_out.weight", $"{prefix}.attn_out.bias"); + } + using var qReshaped = q.View(numPatches, _numHeads, headDim); using var kReshaped = k.View(numPatches, _numHeads, headDim); using var vReshaped = v.View(numPatches, _numHeads, headDim); @@ -231,49 +244,30 @@ private Tensor VisionSelfAttention(Tensor input, string prefix, int numPatches, using var kHeads = Ops.NewContiguous(kT0); using var vHeads = Ops.NewContiguous(vT0); - // Batched Q @ K^T -> [numHeads, numPatches, numPatches] using var kT = kHeads.Transpose(1, 2); var scores = new Tensor(_allocator, DType.Float32, _numHeads, numPatches, numPatches); Ops.AddmmBatch(scores, 0, scores, scale, qHeads, kT); - Ops.Softmax(scores, scores); - // Batched softmax @ V -> [numHeads, numPatches, headDim] var attnOutput = new Tensor(_allocator, DType.Float32, _numHeads, numPatches, headDim); Ops.AddmmBatch(attnOutput, 0, attnOutput, 1.0f, scores, vHeads); scores.Dispose(); - // Reshape back: [numHeads, numPatches, headDim] -> [numPatches, hiddenSize] using var transposed = attnOutput.Transpose(0, 1); using var contiguous = Ops.NewContiguous(transposed); - using var flat = contiguous.View(numPatches, _hiddenSize); - using var flatContig = Ops.NewContiguous(flat); + using var flatContig = contiguous.View(numPatches, _hiddenSize); attnOutput.Dispose(); return LinearForwardWithBias(flatContig, $"{prefix}.attn_out.weight", $"{prefix}.attn_out.bias"); } - private unsafe Tensor VisionMLP(Tensor input, string prefix) + private Tensor VisionMLP(Tensor input, string prefix) { using var fc1Out = LinearForwardWithBias(input, $"{prefix}.ffn_down.weight", $"{prefix}.ffn_down.bias"); - - ApplyGELU(fc1Out); - + Ops.GELU(fc1Out, fc1Out); return LinearForwardWithBias(fc1Out, $"{prefix}.ffn_up.weight", $"{prefix}.ffn_up.bias"); } - private unsafe void ApplyGELU(Tensor t) - { - float* ptr = GetFloatPtr(t); - int count = (int)t.ElementCount(); - for (int i = 0; i < count; i++) - { - double x = ptr[i]; - double cdf = 0.5 * (1.0 + Math.Tanh(Math.Sqrt(2.0 / Math.PI) * (x + 0.044715 * x * x * x))); - ptr[i] = (float)(x * cdf); - } - } - /// /// Multi-modal projector: vision output → text space. /// Steps: reshape to 2D grid → average pool → RMSNorm → linear projection. @@ -354,91 +348,25 @@ private unsafe Tensor LinearForwardWithBias(Tensor input, string weightName, str Tensor contiguousInput = input.IsContiguous() ? null : Ops.NewContiguous(input); Tensor src = contiguousInput ?? input; - - using var wT = weight.Transpose(); - Ops.Addmm(result, 0, result, 1.0f, src, wT); + Ops.Addmm(result, 0, result, 1.0f, src, GetOrCreateTransposedWeight(weightName)); contiguousInput?.Dispose(); if (_weights.TryGetValue(biasName, out var bias)) - { - float* rPtr = GetFloatPtr(result); - float* bPtr = GetFloatPtr(bias); - for (int s = 0; s < seqLen; s++) - { - float* row = rPtr + s * outDim; - for (int d = 0; d < outDim; d++) - row[d] += bPtr[d]; - } - } + Ops.Add(result, result, bias); return result; } - private unsafe Tensor LayerNormOp(Tensor input, string weightName, string biasName) + private Tensor LayerNormOp(Tensor input, string weightName, string biasName) { - int rows = (int)input.Sizes[0]; - int dim = (int)input.Sizes[1]; - var result = new Tensor(_allocator, DType.Float32, rows, dim); - - float* src = GetFloatPtr(input); - float* dst = GetFloatPtr(result); - float* w = GetFloatPtr(_weights[weightName]); - float* b = _weights.ContainsKey(biasName) ? GetFloatPtr(_weights[biasName]) : null; - - for (int r = 0; r < rows; r++) - { - float* srcRow = src + r * dim; - float* dstRow = dst + r * dim; - - float mean = 0; - for (int i = 0; i < dim; i++) - mean += srcRow[i]; - mean /= dim; - - float variance = 0; - for (int i = 0; i < dim; i++) - { - float diff = srcRow[i] - mean; - variance += diff * diff; - } - variance /= dim; - - float invStd = 1f / MathF.Sqrt(variance + _eps); - for (int i = 0; i < dim; i++) - { - float normalized = (srcRow[i] - mean) * invStd; - dstRow[i] = w[i] * normalized + (b != null ? b[i] : 0f); - } - } - - return result; + _weights.TryGetValue(biasName, out var bias); + return Ops.LayerNorm(null, input, _weights[weightName], bias, _eps); } - private unsafe Tensor RMSNormOp(Tensor input, string weightName) + private Tensor RMSNormOp(Tensor input, string weightName) { - int rows = (int)input.Sizes[0]; - int dim = (int)input.Sizes[1]; - var result = new Tensor(_allocator, DType.Float32, rows, dim); - - float* src = GetFloatPtr(input); - float* dst = GetFloatPtr(result); - float* w = GetFloatPtr(_weights[weightName]); - - for (int r = 0; r < rows; r++) - { - float* srcRow = src + r * dim; - float* dstRow = dst + r * dim; - - float sumSq = 0; - for (int i = 0; i < dim; i++) - sumSq += srcRow[i] * srcRow[i]; - float rms = 1f / MathF.Sqrt(sumSq / dim + _eps); - for (int i = 0; i < dim; i++) - dstRow[i] = w[i] * srcRow[i] * rms; - } - - return result; + return Ops.RMSNorm(null, input, _weights[weightName], null, _eps); } private unsafe void DumpTensor(Tensor t, string label, int numRows) @@ -466,8 +394,22 @@ private unsafe void DumpTensor(Tensor t, string label, int numRows) throw new NotSupportedException("Requires GgmlStorage or CpuStorage"); } + private Tensor GetOrCreateTransposedWeight(string weightName) + { + if (_transposedWeights.TryGetValue(weightName, out var transposed)) + return transposed; + + using var weightViewT = _weights[weightName].Transpose(); + transposed = Ops.NewContiguous(weightViewT); + _transposedWeights[weightName] = transposed; + return transposed; + } + public void Dispose() { + foreach (var w in _transposedWeights.Values) + w.Dispose(); + _transposedWeights.Clear(); foreach (var w in _weights.Values) w.Dispose(); _weights.Clear(); diff --git a/InferenceEngine/Models/Gemma3/ImageProcessor.cs b/InferenceEngine/Models/Gemma3/ImageProcessor.cs index 02c66f8..bfea928 100644 --- a/InferenceEngine/Models/Gemma3/ImageProcessor.cs +++ b/InferenceEngine/Models/Gemma3/ImageProcessor.cs @@ -9,6 +9,7 @@ // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the BSD-3-Clause License for more details. using System; using System.IO; +using System.Threading.Tasks; using StbImageSharp; namespace InferenceEngine @@ -43,12 +44,7 @@ public float[] ProcessImage(string imagePath) byte[] fileBytes = File.ReadAllBytes(imagePath); int origWidth, origHeight; byte[] rgba = DecodeImageToRGBA(fileBytes, out origWidth, out origHeight); - - byte[] composited = CompositeOverWhite(rgba, origWidth, origHeight); - - byte[] resized = BilinearResize(composited, origWidth, origHeight, ImageSize, ImageSize); - - return PackChannelFirst(resized, ImageSize, ImageSize); + return ResizeRgbaToChannelFirstNormalized(rgba, origWidth, origHeight, ImageSize, ImageSize); } internal static byte[] DecodeImageToRGBA(byte[] fileBytes, out int width, out int height) @@ -251,24 +247,30 @@ private static byte[] DecodeJPEG(byte[] data, out int width, out int height) internal static byte[] CompositeOverWhite(byte[] rgba, int width, int height) { byte[] result = new byte[width * height * 4]; - for (int i = 0; i < width * height; i++) + Parallel.For(0, height, y => { - int a = rgba[i * 4 + 3]; - if (a == 255) - { - result[i * 4] = rgba[i * 4]; - result[i * 4 + 1] = rgba[i * 4 + 1]; - result[i * 4 + 2] = rgba[i * 4 + 2]; - } - else + int srcRow = y * width * 4; + for (int x = 0; x < width; x++) { - float alpha = a / 255f; - result[i * 4] = (byte)(rgba[i * 4] * alpha + 255 * (1 - alpha)); - result[i * 4 + 1] = (byte)(rgba[i * 4 + 1] * alpha + 255 * (1 - alpha)); - result[i * 4 + 2] = (byte)(rgba[i * 4 + 2] * alpha + 255 * (1 - alpha)); + int pixBase = srcRow + x * 4; + int a = rgba[pixBase + 3]; + if (a == 255) + { + result[pixBase] = rgba[pixBase]; + result[pixBase + 1] = rgba[pixBase + 1]; + result[pixBase + 2] = rgba[pixBase + 2]; + } + else + { + float alpha = a / 255f; + result[pixBase] = (byte)(rgba[pixBase] * alpha + 255 * (1 - alpha)); + result[pixBase + 1] = (byte)(rgba[pixBase + 1] * alpha + 255 * (1 - alpha)); + result[pixBase + 2] = (byte)(rgba[pixBase + 2] * alpha + 255 * (1 - alpha)); + } + + result[pixBase + 3] = 255; } - result[i * 4 + 3] = 255; - } + }); return result; } @@ -278,7 +280,7 @@ internal static byte[] BilinearResize(byte[] rgba, int srcW, int srcH, int dstW, double xRatio = (double)srcW / dstW; double yRatio = (double)srcH / dstH; - for (int dy = 0; dy < dstH; dy++) + Parallel.For(0, dstH, dy => { double srcY = (dy + 0.5) * yRatio - 0.5; int y0 = Math.Max(0, (int)srcY); @@ -305,11 +307,62 @@ internal static byte[] BilinearResize(byte[] rgba, int srcW, int srcH, int dstW, } result[(dy * dstW + dx) * 4 + 3] = 255; } - } + }); + + return result; + } + + internal static float[] ResizeRgbaToChannelFirstNormalized(byte[] rgba, int srcW, int srcH, int dstW, int dstH) + { + int pixels = dstW * dstH; + float[] result = new float[3 * pixels]; + double xRatio = (double)srcW / dstW; + double yRatio = (double)srcH / dstH; + + Parallel.For(0, dstH, dy => + { + double srcY = (dy + 0.5) * yRatio - 0.5; + int y0 = Math.Max(0, (int)srcY); + int y1 = Math.Min(srcH - 1, y0 + 1); + double fy = srcY - y0; + + for (int dx = 0; dx < dstW; dx++) + { + double srcX = (dx + 0.5) * xRatio - 0.5; + int x0 = Math.Max(0, (int)srcX); + int x1 = Math.Min(srcW - 1, x0 + 1); + double fx = srcX - x0; + + int dstIdx = dy * dstW + dx; + result[dstIdx] = BilinearSampleNormalized(rgba, srcW, x0, y0, x1, y1, fx, fy, 0); + result[pixels + dstIdx] = BilinearSampleNormalized(rgba, srcW, x0, y0, x1, y1, fx, fy, 1); + result[2 * pixels + dstIdx] = BilinearSampleNormalized(rgba, srcW, x0, y0, x1, y1, fx, fy, 2); + } + }); return result; } + private static float BilinearSampleNormalized(byte[] rgba, int srcW, int x0, int y0, int x1, int y1, + double fx, double fy, int channel) + { + float v00 = CompositeChannelToNormalized(rgba, (y0 * srcW + x0) * 4, channel); + float v01 = CompositeChannelToNormalized(rgba, (y0 * srcW + x1) * 4, channel); + float v10 = CompositeChannelToNormalized(rgba, (y1 * srcW + x0) * 4, channel); + float v11 = CompositeChannelToNormalized(rgba, (y1 * srcW + x1) * 4, channel); + + double v = v00 * (1 - fx) * (1 - fy) + v01 * fx * (1 - fy) + + v10 * (1 - fx) * fy + v11 * fx * fy; + return Math.Clamp((float)v, -1f, 1f); + } + + private static float CompositeChannelToNormalized(byte[] rgba, int pixelBase, int channel) + { + float alpha = rgba[pixelBase + 3] / 255f; + float composited = rgba[pixelBase + channel] * alpha + 255f * (1f - alpha); + return composited / 255f * 2f - 1f; + } + /// /// Pack RGBA pixels into channel-first float format [R..., G..., B...] normalized with mean/std. /// Matches Ollama's pack(): channel-first with (pixel/255 - mean) / std. diff --git a/InferenceEngine/Models/Gemma4/Gemma4AudioEncoder.cs b/InferenceEngine/Models/Gemma4/Gemma4AudioEncoder.cs index 3baa0ee..51e5280 100644 --- a/InferenceEngine/Models/Gemma4/Gemma4AudioEncoder.cs +++ b/InferenceEngine/Models/Gemma4/Gemma4AudioEncoder.cs @@ -11,12 +11,14 @@ using System.Collections.Generic; using TensorSharp; using TensorSharp.Cpu; +using TensorSharp.GGML; namespace InferenceEngine { public class Gemma4AudioEncoder : IDisposable { private readonly Dictionary _weights = new(); + private readonly Dictionary _transposedWeights = new(); private readonly IAllocator _allocator; private readonly int _hiddenSize; @@ -42,9 +44,11 @@ private struct ClampParams public bool HasClamp; } private readonly Dictionary _clampParams = new(); + private readonly Dictionary _positionEmbeddingCache = new(); private bool _useOllamaNames; private Tensor _onesForNorm; + private readonly float[] _causalMask; public int ProjectionDim => _projectionDim; @@ -77,6 +81,7 @@ public Gemma4AudioEncoder(string mmProjPath, IAllocator allocator) gguf.Dispose(); _useOllamaNames = _weights.ContainsKey("a.blk.0.ln1.weight"); + _causalMask = BuildCausalValidMask(); Console.WriteLine($" GGUF naming: {(_useOllamaNames ? "Ollama" : "mmproj/Unsloth")}"); } @@ -197,8 +202,7 @@ public unsafe Tensor Encode(float[] melData, int numFrames) int outDim = (int)sscpWeight.Sizes[0]; hidDim = outDim; hiddenTensor = new Tensor(_allocator, DType.Float32, t1Out, hidDim); - using (var wT = sscpWeight.Transpose()) - Ops.Addmm(hiddenTensor, 0, hiddenTensor, 1f, projTensor, wT); + Ops.Addmm(hiddenTensor, 0, hiddenTensor, 1f, projTensor, GetOrCreateTransposedWeight(sscpWeightName)); projTensor.Dispose(); string biasName = sscpWeightName.Replace(".weight", ".bias"); @@ -215,13 +219,11 @@ public unsafe Tensor Encode(float[] melData, int numFrames) Console.Write($" proj=[{seqLen},{hidDim}]"); // Build causal-valid mask - float[] causalMask = BuildCausalValidMask(); - // Conformer blocks for (int i = 0; i < _numLayers; i++) { Console.Write($"\r Audio conformer block {i + 1}/{_numLayers}... "); - hiddenTensor = ConformerBlock(hiddenTensor, i, seqLen, hidDim, causalMask); + hiddenTensor = ConformerBlock(hiddenTensor, i, seqLen, hidDim, _causalMask); } Console.Write("\r Audio conformer done. \n"); @@ -230,8 +232,7 @@ public unsafe Tensor Encode(float[] melData, int numFrames) { int outDim = (int)outProjWeight.Sizes[0]; var outProj = new Tensor(_allocator, DType.Float32, seqLen, outDim); - using (var wT = outProjWeight.Transpose()) - Ops.Addmm(outProj, 0, outProj, 1f, hiddenTensor, wT); + Ops.Addmm(outProj, 0, outProj, 1f, hiddenTensor, GetOrCreateTransposedWeight("a.output_proj.weight")); if (_weights.TryGetValue("a.output_proj.bias", out var outProjBias)) AddBias(outProj, outProjBias, seqLen, outDim); hiddenTensor.Dispose(); @@ -252,8 +253,7 @@ public unsafe Tensor Encode(float[] melData, int numFrames) { int fcOutDim = (int)fcWeight.Sizes[0]; var fcOut = new Tensor(_allocator, DType.Float32, seqLen, fcOutDim); - using (var wT = fcWeight.Transpose()) - Ops.Addmm(fcOut, 0, fcOut, 1f, hiddenTensor, wT); + Ops.Addmm(fcOut, 0, fcOut, 1f, hiddenTensor, GetOrCreateTransposedWeight(fcWeightName)); string fcBiasName = fcWeightName.Replace(".weight", ".bias"); if (_weights.TryGetValue(fcBiasName, out var fcBias)) @@ -524,51 +524,49 @@ private unsafe void ChunkedAttention(float[] qArr, float[] kPadded, float[] vPad for (int h = 0; h < _numHeads; h++) { + float[] logitsBuffer = new float[ctx]; for (int qi = 0; qi < cs; qi++) { int globalQIdx = chunkIdx * cs + qi; if (globalQIdx >= seqLen) { - // Padded position - zero output continue; } - float[] logits = new float[ctx]; + Span logits = logitsBuffer; + int qOffset = globalQIdx * hidDim + h * _headDim; for (int ci = 0; ci < ctx; ci++) { - // Content-content: q[qi] dot k[ci] + int actualTime = chunkIdx * cs + ci - padLeft; + bool causalOK = causalMask[qi * ctx + ci] > 0; + bool validOK = actualTime >= 0 && actualTime < seqLen; + if (!causalOK || !validOK) + { + logits[ci] = -1e9f; + continue; + } + float dotCC = 0; - int qOffset = globalQIdx * hidDim + h * _headDim; - int kGlobalIdx = chunkIdx * cs + ci; // position in kPadded + int kGlobalIdx = chunkIdx * cs + ci; int kOffset = kGlobalIdx * hidDim + h * _headDim; for (int d = 0; d < _headDim; d++) dotCC += qArr[qOffset + d] * kPadded[kOffset + d]; - // Content-position: q[qi] dot posEmb[relPos] float dotCP = 0; - for (int d = 0; d < _headDim; d++) + int posIdx = RelativeShiftIndex(qi, ci, maxSpan); + if (posIdx >= 0 && posIdx < maxSpan) { - int posIdx = RelativeShiftIndex(qi, ci, maxSpan); - if (posIdx >= 0 && posIdx < maxSpan) - dotCP += qArr[qOffset + d] * posEmb[(posIdx * _numHeads + h) * _headDim + d]; + int posOffset = (posIdx * _numHeads + h) * _headDim; + for (int d = 0; d < _headDim; d++) + dotCP += qArr[qOffset + d] * posEmb[posOffset + d]; } logits[ci] = dotCC + dotCP; - - // Logit softcap logits[ci] = MathF.Tanh(logits[ci] / _logitCap) * _logitCap; - - // Apply mask - int actualTime = chunkIdx * cs + ci - padLeft; - bool causalOK = causalMask[qi * ctx + ci] > 0; - bool validOK = actualTime >= 0 && actualTime < seqLen; - if (!causalOK || !validOK) - logits[ci] = -1e9f; } - // Softmax float maxLogit = float.NegativeInfinity; for (int ci = 0; ci < ctx; ci++) if (logits[ci] > maxLogit) maxLogit = logits[ci]; @@ -582,7 +580,6 @@ private unsafe void ChunkedAttention(float[] qArr, float[] kPadded, float[] vPad for (int ci = 0; ci < ctx; ci++) logits[ci] *= invSum; - // Weighted sum of values int outOffset = globalQIdx * hidDim + h * _headDim; for (int d = 0; d < _headDim; d++) { @@ -610,6 +607,9 @@ private int RelativeShiftIndex(int queryInChunk, int contextIdx, int maxSpan) private float[] BuildPositionEmbeddings(string prefix, int maxSpan) { + if (_positionEmbeddingCache.TryGetValue(prefix, out var cached)) + return cached; + int halfDim = _hiddenSize / 2; double logInc = Math.Log(10000.0) / Math.Max(halfDim - 1, 1); @@ -627,7 +627,10 @@ private float[] BuildPositionEmbeddings(string prefix, int maxSpan) string relKey = ResolveName(prefix, "attn_k_rel") + ".weight"; if (!_weights.TryGetValue(relKey, out var relWeight)) + { + _positionEmbeddingCache[prefix] = sinEmb; return sinEmb; + } int relOutDim = (int)relWeight.Sizes[0]; int inDim = (int)relWeight.Sizes[1]; @@ -637,14 +640,14 @@ private float[] BuildPositionEmbeddings(string prefix, int maxSpan) using var sinSlice = sinTensor.Narrow(1, 0, inDim); using var sinContig = Ops.NewContiguous(sinSlice); - using var wT = relWeight.Transpose(); var result = new Tensor(_allocator, DType.Float32, maxSpan, relOutDim); - Ops.Addmm(result, 0, result, 1f, sinContig, wT); + Ops.Addmm(result, 0, result, 1f, sinContig, GetOrCreateTransposedWeight(relKey)); float[] projected = new float[maxSpan * relOutDim]; result.CopyToArray(projected); result.Dispose(); + _positionEmbeddingCache[prefix] = projected; return projected; } @@ -764,8 +767,7 @@ private Tensor AudioClippableLinearForward(Tensor input, string prefix, int seqL } var result = new Tensor(_allocator, DType.Float32, seqLen, outDim); - using (var wT = weight.Transpose()) - Ops.Addmm(result, 0, result, 1f, src, wT); + Ops.Addmm(result, 0, result, 1f, src, GetOrCreateTransposedWeight(weightName)); if (hasClamp && src != input) src.Dispose(); @@ -864,11 +866,26 @@ private float[] BuildCausalValidMask() throw new NotSupportedException("Requires GgmlStorage or CpuStorage"); } + private Tensor GetOrCreateTransposedWeight(string weightName) + { + if (_transposedWeights.TryGetValue(weightName, out var transposed)) + return transposed; + + using var weightViewT = _weights[weightName].Transpose(); + transposed = Ops.NewContiguous(weightViewT); + _transposedWeights[weightName] = transposed; + return transposed; + } + #endregion public void Dispose() { _onesForNorm?.Dispose(); + foreach (var w in _transposedWeights.Values) + w.Dispose(); + _transposedWeights.Clear(); + _positionEmbeddingCache.Clear(); foreach (var w in _weights.Values) w.Dispose(); _weights.Clear(); diff --git a/InferenceEngine/Models/Gemma4/Gemma4AudioPreprocessor.cs b/InferenceEngine/Models/Gemma4/Gemma4AudioPreprocessor.cs index db01c4b..325d53e 100644 --- a/InferenceEngine/Models/Gemma4/Gemma4AudioPreprocessor.cs +++ b/InferenceEngine/Models/Gemma4/Gemma4AudioPreprocessor.cs @@ -10,6 +10,7 @@ using System; using System.IO; using System.Numerics; +using System.Threading.Tasks; using NLayer; using NVorbis; @@ -27,6 +28,10 @@ public class Gemma4AudioPreprocessor private static readonly int FrameLength = (int)Math.Round(SampleRate * FrameLengthMs / 1000.0); // 320 private static readonly int HopLength = (int)Math.Round(SampleRate * HopLengthMs / 1000.0); // 160 + private static readonly int FftLength = ComputeFftLength(); + private static readonly int NumFreqBins = FftLength / 2 + 1; + private static readonly double[] HannWindow = BuildWindow(); + private static readonly float[] MelFilters = BuildMelFilterBank(NumFreqBins, MelBins, MinFrequency, MaxFrequency, SampleRate); public static float[] DecodeAudioFile(string path) { @@ -206,43 +211,27 @@ private static float[] ResampleLinear(float[] samples, int fromRate, int toRate) public static (float[] melData, int numFrames) ComputeMelSpectrogram(float[] samples) { - int fftLen = 1; - while (fftLen < FrameLength) fftLen <<= 1; - fftLen *= 2; // fft_overdrive - - double[] window = new double[FrameLength]; - double arg = Math.PI * 2.0 / FrameLength; - for (int i = 0; i < FrameLength; i++) - window[i] = 0.5 - 0.5 * Math.Cos(arg * (i + 0.5)); - - int numFreqBins = fftLen / 2 + 1; - float[] melFilters = BuildMelFilterBank(numFreqBins, MelBins, MinFrequency, MaxFrequency, SampleRate); - int frameSizeForUnfold = FrameLength + 1; int numFrames = (samples.Length - frameSizeForUnfold) / HopLength; if (numFrames <= 0) return (null, 0); float[] result = new float[numFrames * MelBins]; - Complex[] fftInput = new Complex[fftLen]; - - for (int f = 0; f < numFrames; f++) + if (numFrames < 8) { - int start = f * HopLength; - for (int i = 0; i < FrameLength; i++) - fftInput[i] = new Complex(samples[start + i] * window[i], 0); - for (int i = FrameLength; i < fftLen; i++) - fftInput[i] = Complex.Zero; - - FFT(fftInput); - - for (int m = 0; m < MelBins; m++) - { - double melVal = 0; - for (int k = 0; k < numFreqBins; k++) - melVal += fftInput[k].Magnitude * melFilters[k * MelBins + m]; - if (melVal < MelFloor) melVal = MelFloor; - result[f * MelBins + m] = (float)Math.Log(melVal); - } + var fftInput = new Complex[FftLength]; + for (int f = 0; f < numFrames; f++) + ComputeMelFrame(samples, f, fftInput, result); + } + else + { + Parallel.For(0, numFrames, + () => new Complex[FftLength], + (f, _, fftInput) => + { + ComputeMelFrame(samples, f, fftInput, result); + return fftInput; + }, + _ => { }); } return (result, numFrames); @@ -327,9 +316,6 @@ public static int ComputeAudioTokenCount(float[] samples) int padded = samples.Length + (128 - samples.Length % 128); samples = new float[padded]; } - int fftLen = 1; - while (fftLen < FrameLength) fftLen <<= 1; - fftLen *= 2; int frameSizeForUnfold = FrameLength + 1; int numFrames = (samples.Length - frameSizeForUnfold) / HopLength; if (numFrames <= 0) return 0; @@ -338,5 +324,42 @@ public static int ComputeAudioTokenCount(float[] samples) int tConv1 = (tConv0 + 2 - 3) / 2 + 1; return tConv1; } + + private static void ComputeMelFrame(float[] samples, int frameIndex, Complex[] fftInput, float[] result) + { + int start = frameIndex * HopLength; + for (int i = 0; i < FrameLength; i++) + fftInput[i] = new Complex(samples[start + i] * HannWindow[i], 0); + for (int i = FrameLength; i < FftLength; i++) + fftInput[i] = Complex.Zero; + + FFT(fftInput); + + int dstOffset = frameIndex * MelBins; + for (int m = 0; m < MelBins; m++) + { + double melVal = 0; + for (int k = 0; k < NumFreqBins; k++) + melVal += fftInput[k].Magnitude * MelFilters[k * MelBins + m]; + if (melVal < MelFloor) melVal = MelFloor; + result[dstOffset + m] = (float)Math.Log(melVal); + } + } + + private static int ComputeFftLength() + { + int fftLen = 1; + while (fftLen < FrameLength) fftLen <<= 1; + return fftLen * 2; + } + + private static double[] BuildWindow() + { + double[] window = new double[FrameLength]; + double arg = Math.PI * 2.0 / FrameLength; + for (int i = 0; i < FrameLength; i++) + window[i] = 0.5 - 0.5 * Math.Cos(arg * (i + 0.5)); + return window; + } } } diff --git a/InferenceEngine/Models/Gemma4/Gemma4ImageProcessor.cs b/InferenceEngine/Models/Gemma4/Gemma4ImageProcessor.cs index f53ff61..840abda 100644 --- a/InferenceEngine/Models/Gemma4/Gemma4ImageProcessor.cs +++ b/InferenceEngine/Models/Gemma4/Gemma4ImageProcessor.cs @@ -39,14 +39,12 @@ public Gemma4ImageProcessor(int patchSize = 16, int nMerge = 3, int minTokens = byte[] fileBytes = File.ReadAllBytes(imagePath); int origWidth, origHeight; byte[] rgba = Gemma3ImageProcessor.DecodeImageToRGBA(fileBytes, out origWidth, out origHeight); - byte[] composited = Gemma3ImageProcessor.CompositeOverWhite(rgba, origWidth, origHeight); int alignSize = PatchSize * NMerge; SmartResize(origWidth, origHeight, alignSize, out int targetW, out int targetH); - byte[] resized = Gemma3ImageProcessor.BilinearResize(composited, origWidth, origHeight, targetW, targetH); - - float[] pixels = PackChannelFirst(resized, targetW, targetH); + float[] pixels = Gemma3ImageProcessor.ResizeRgbaToChannelFirstNormalized( + rgba, origWidth, origHeight, targetW, targetH); return (pixels, targetW, targetH); } diff --git a/InferenceEngine/Models/Gemma4/Gemma4Model.cs b/InferenceEngine/Models/Gemma4/Gemma4Model.cs index 6fb1368..890e827 100644 --- a/InferenceEngine/Models/Gemma4/Gemma4Model.cs +++ b/InferenceEngine/Models/Gemma4/Gemma4Model.cs @@ -359,6 +359,8 @@ public override void ResetKVCache() if (_kvDonorMap.ContainsKey(l)) continue; Ops.Fill(_kvCacheK[l], 0f); Ops.Fill(_kvCacheV[l], 0f); + InvalidateTensorDeviceCache(_kvCacheK[l]); + InvalidateTensorDeviceCache(_kvCacheV[l]); cleared.Add(l); } } @@ -1684,6 +1686,8 @@ private unsafe void CopyToCacheCircular(Tensor cache, Tensor src, int startPos, Buffer.MemoryCopy(srcRow, dstRow, headBytes, headBytes); } } + + InvalidateTensorDeviceCache(cache); } private unsafe void ApplyCausalMask(Tensor scores, int queryLen, int totalKVLen, int windowSize) diff --git a/InferenceEngine/Models/Gemma4/Gemma4VisionEncoder.cs b/InferenceEngine/Models/Gemma4/Gemma4VisionEncoder.cs index 11133d5..304529f 100644 --- a/InferenceEngine/Models/Gemma4/Gemma4VisionEncoder.cs +++ b/InferenceEngine/Models/Gemma4/Gemma4VisionEncoder.cs @@ -11,13 +11,16 @@ using System.Collections.Generic; using TensorSharp; using TensorSharp.Cpu; +using TensorSharp.GGML; namespace InferenceEngine { public class Gemma4VisionEncoder : IDisposable { private readonly Dictionary _weights = new(); + private readonly Dictionary _transposedWeights = new(); private readonly IAllocator _allocator; + private readonly bool _useNativeAttention; private readonly int _hiddenSize; private readonly int _intermediateSize; @@ -36,13 +39,25 @@ private struct ClampParams } private readonly Dictionary _clampParams = new(); + private readonly Dictionary _ropeCache = new(); private Tensor _onesForNorm; + private sealed class Rope2DCache + { + public required int[] PosX { get; init; } + public required int[] PosY { get; init; } + public required float[] CosX { get; init; } + public required float[] SinX { get; init; } + public required float[] CosY { get; init; } + public required float[] SinY { get; init; } + } + public int ProjectionDim => _projectionDim; public Gemma4VisionEncoder(string mmProjPath, IAllocator allocator) { _allocator = allocator; + _useNativeAttention = allocator is GgmlAllocator; var gguf = new GgufFile(mmProjPath); _hiddenSize = (int)gguf.GetUint32("clip.vision.embedding_length", 768); @@ -126,22 +141,15 @@ public unsafe Tensor Encode(float[] pixelValues, int imgWidth, int imgHeight) int patchesY = imgHeight / _patchSize; int numPatches = patchesX * patchesY; int headDim = _hiddenSize / _numHeads; + Rope2DCache ropeCache = GetOrCreateRopeCache(patchesX, patchesY, headDim); var hidden = PatchEmbed(pixelValues, imgWidth, imgHeight, patchesX, patchesY); - AddPositionEmbedding2D(hidden, patchesX, patchesY, numPatches); - - int[] posXData = new int[numPatches]; - int[] posYData = new int[numPatches]; - for (int i = 0; i < numPatches; i++) - { - posXData[i] = i % patchesX; - posYData[i] = i / patchesX; - } + AddPositionEmbedding2D(hidden, ropeCache, numPatches); for (int i = 0; i < _blockCount; i++) { Console.Write($"\r Vision encoder block {i + 1}/{_blockCount}..."); - hidden = EncoderBlock(hidden, i, numPatches, headDim, posXData, posYData); + hidden = EncoderBlock(hidden, i, numPatches, headDim, ropeCache); } Console.WriteLine(" done"); @@ -194,48 +202,33 @@ private unsafe Tensor PatchEmbed(float[] pixelValues, int imgW, int imgH, int pa return result; } - private void AddPositionEmbedding2D(Tensor hidden, int patchesX, int patchesY, int numPatches) + private unsafe void AddPositionEmbedding2D(Tensor hidden, Rope2DCache ropeCache, int numPatches) { var posEmbd = _weights["v.position_embd.weight"]; + int maxPos = (int)posEmbd.Sizes[1]; + float* posPtr = GetFloatPtr(posEmbd); + float* xTable = posPtr; + float* yTable = posPtr + maxPos * _hiddenSize; + float* dstPtr = GetFloatPtr(hidden); - // posEmbd shape in TensorSharp: [2, maxPos, hiddenSize] - // tblX = posEmbd[0], tblY = posEmbd[1] - long maxPos = posEmbd.Sizes[1]; - Tensor tblXNarrow = posEmbd.Narrow(0, 0, 1); - Tensor tblX = tblXNarrow.View(maxPos, _hiddenSize); - tblXNarrow.Dispose(); - Tensor tblYNarrow = posEmbd.Narrow(0, 1, 1); - Tensor tblY = tblYNarrow.View(maxPos, _hiddenSize); - tblYNarrow.Dispose(); - - int[] xIndices = new int[numPatches]; - int[] yIndices = new int[numPatches]; - for (int py = 0; py < patchesY; py++) - for (int px = 0; px < patchesX; px++) - { - int idx = py * patchesX + px; - xIndices[idx] = px; - yIndices[idx] = py; - } - - using var xIdx = CreateIntTensor(xIndices, numPatches); - using var yIdx = CreateIntTensor(yIndices, numPatches); - using var xEmb = Ops.IndexSelect(null, tblX, xIdx); - using var yEmb = Ops.IndexSelect(null, tblY, yIdx); - Ops.Add(hidden, hidden, xEmb); - Ops.Add(hidden, hidden, yEmb); - tblX.Dispose(); - tblY.Dispose(); + for (int p = 0; p < numPatches; p++) + { + float* dstRow = dstPtr + p * _hiddenSize; + float* xRow = xTable + ropeCache.PosX[p] * _hiddenSize; + float* yRow = yTable + ropeCache.PosY[p] * _hiddenSize; + for (int d = 0; d < _hiddenSize; d++) + dstRow[d] += xRow[d] + yRow[d]; + } } private Tensor EncoderBlock(Tensor hidden, int blockIdx, int numPatches, int headDim, - int[] posXData, int[] posYData) + Rope2DCache ropeCache) { string prefix = $"v.blk.{blockIdx}"; using var attnNormed = RMSNormOp(hidden, $"{prefix}.ln1.weight"); using var attnOut = VisionSelfAttention(attnNormed, prefix, numPatches, headDim, - posXData, posYData); + ropeCache); using var postAttnNormed = RMSNormOp(attnOut, $"{prefix}.attn_post_norm.weight"); Ops.Add(postAttnNormed, postAttnNormed, hidden); @@ -252,13 +245,32 @@ private Tensor EncoderBlock(Tensor hidden, int blockIdx, int numPatches, int hea } private unsafe Tensor VisionSelfAttention(Tensor input, string prefix, int numPatches, int headDim, - int[] posXData, int[] posYData) + Rope2DCache ropeCache) { var q = ClippableLinear(input, $"{prefix}.attn_q"); var k = ClippableLinear(input, $"{prefix}.attn_k"); var v = ClippableLinear(input, $"{prefix}.attn_v"); - // Reshape to [numHeads, numPatches, headDim] + ApplyPerHeadRMSNorm(q, _weights[$"{prefix}.attn_q_norm.weight"], numPatches, headDim); + ApplyPerHeadRMSNorm(k, _weights[$"{prefix}.attn_k_norm.weight"], numPatches, headDim); + ApplyUnweightedRMSNorm(v, _numHeads * numPatches, headDim); + + Apply2DRoPE(q, ropeCache, numPatches, headDim); + Apply2DRoPE(k, ropeCache, numPatches, headDim); + + if (_useNativeAttention) + { + using var q4 = q.View(1, numPatches, _numHeads, headDim); + using var k4 = k.View(1, numPatches, _numHeads, headDim); + using var v4 = v.View(1, numPatches, _numHeads, headDim); + using var attn4 = Ops.ScaledDotProductAttention(null, q4, k4, v4, null, 1f); + using var flat = attn4.View(numPatches, _hiddenSize); + q.Dispose(); + k.Dispose(); + v.Dispose(); + return ClippableLinear(flat, $"{prefix}.attn_out"); + } + using var qR = q.View(numPatches, _numHeads, headDim); using var kR = k.View(numPatches, _numHeads, headDim); using var vR = v.View(numPatches, _numHeads, headDim); @@ -272,19 +284,6 @@ private unsafe Tensor VisionSelfAttention(Tensor input, string prefix, int numPa k.Dispose(); v.Dispose(); - // QK RMSNorm (weighted) - ApplyPerHeadRMSNorm(qHeads, _weights[$"{prefix}.attn_q_norm.weight"], _numHeads, numPatches, headDim); - ApplyPerHeadRMSNorm(kHeads, _weights[$"{prefix}.attn_k_norm.weight"], _numHeads, numPatches, headDim); - - // V RMSNorm (unweighted) - ApplyUnweightedRMSNorm(vHeads, _numHeads * numPatches, headDim); - - // 2D NeoX RoPE: split head dim in half, apply RoPE with X positions to first half, - // Y positions to second half - Apply2DRoPE(qHeads, posXData, posYData, _numHeads, numPatches, headDim); - Apply2DRoPE(kHeads, posXData, posYData, _numHeads, numPatches, headDim); - - // Attention: Q @ K^T (no scaling since QK norms handle it) using var kT = kHeads.Transpose(1, 2); var scores = new Tensor(_allocator, DType.Float32, _numHeads, numPatches, numPatches); Ops.AddmmBatch(scores, 0, scores, 1f, qHeads, kT); @@ -296,46 +295,38 @@ private unsafe Tensor VisionSelfAttention(Tensor input, string prefix, int numPa using var transposed = attnOutput.Transpose(0, 1); using var contiguous = Ops.NewContiguous(transposed); - using var flat = contiguous.View(numPatches, _hiddenSize); - using var flatContig = Ops.NewContiguous(flat); + using var flatContig = contiguous.View(numPatches, _hiddenSize); attnOutput.Dispose(); return ClippableLinear(flatContig, $"{prefix}.attn_out"); } - private unsafe void Apply2DRoPE(Tensor heads, int[] posX, int[] posY, - int numHeads, int numPatches, int headDim) + private unsafe void Apply2DRoPE(Tensor data, Rope2DCache ropeCache, int numPatches, int headDim) { - float* ptr = GetFloatPtr(heads); + float* ptr = GetFloatPtr(data); int halfDim = headDim / 2; int quarterDim = halfDim / 2; - for (int h = 0; h < numHeads; h++) + for (int p = 0; p < numPatches; p++) { - for (int p = 0; p < numPatches; p++) + int ropeBase = p * quarterDim; + for (int h = 0; h < _numHeads; h++) { - float* head = ptr + ((long)h * numPatches + p) * headDim; - - // First half: apply RoPE with X positions + float* head = ptr + ((long)p * _numHeads + h) * headDim; for (int j = 0; j < quarterDim; j++) { - float freq = (float)(1.0 / Math.Pow(_ropeTheta, 2.0 * j / halfDim)); - float angle = posX[p] * freq; - float cos = MathF.Cos(angle); - float sin = MathF.Sin(angle); + float cos = ropeCache.CosX[ropeBase + j]; + float sin = ropeCache.SinX[ropeBase + j]; float x0 = head[j]; float x1 = head[j + quarterDim]; head[j] = x0 * cos - x1 * sin; head[j + quarterDim] = x0 * sin + x1 * cos; } - // Second half: apply RoPE with Y positions for (int j = 0; j < quarterDim; j++) { - float freq = (float)(1.0 / Math.Pow(_ropeTheta, 2.0 * j / halfDim)); - float angle = posY[p] * freq; - float cos = MathF.Cos(angle); - float sin = MathF.Sin(angle); + float cos = ropeCache.CosY[ropeBase + j]; + float sin = ropeCache.SinY[ropeBase + j]; float x0 = head[halfDim + j]; float x1 = head[halfDim + j + quarterDim]; head[halfDim + j] = x0 * cos - x1 * sin; @@ -345,10 +336,9 @@ private unsafe void Apply2DRoPE(Tensor heads, int[] posX, int[] posY, } } - private void ApplyPerHeadRMSNorm(Tensor data, Tensor normWeight, - int numHeads, int numPatches, int headDim) + private void ApplyPerHeadRMSNorm(Tensor data, Tensor normWeight, int numPatches, int headDim) { - int total = numHeads * numPatches; + int total = _numHeads * numPatches; using var reshaped = data.View(total, headDim); Ops.RMSNorm(reshaped, reshaped, normWeight, null, _eps); } @@ -361,7 +351,8 @@ private void ApplyUnweightedRMSNorm(Tensor data, int numVectors, int dim) _onesForNorm = new Tensor(_allocator, DType.Float32, dim); Ops.Fill(_onesForNorm, 1f); } - Ops.RMSNorm(data, data, _onesForNorm, null, _eps); + using var reshaped = data.View(numVectors, dim); + Ops.RMSNorm(reshaped, reshaped, _onesForNorm, null, _eps); } private unsafe Tensor VisionMLP(Tensor input, string prefix) @@ -402,8 +393,7 @@ private unsafe Tensor ClippableLinear(Tensor input, string prefix) Clamp(src, cp.InMin, cp.InMax); var result = new Tensor(_allocator, DType.Float32, seqLen, outDim); - using var wT = weight.Transpose(); - Ops.Addmm(result, 0, result, 1f, src, wT); + Ops.Addmm(result, 0, result, 1f, src, GetOrCreateTransposedWeight(weightName)); contiguousInput?.Dispose(); @@ -487,8 +477,7 @@ private Tensor LinearProjection(Tensor input, string weightName) int outDim = (int)weight.Sizes[0]; var result = new Tensor(_allocator, DType.Float32, seqLen, outDim); - using var wT = weight.Transpose(); - Ops.Addmm(result, 0, result, 1f, input, wT); + Ops.Addmm(result, 0, result, 1f, input, GetOrCreateTransposedWeight(weightName)); return result; } @@ -514,12 +503,79 @@ private Tensor CreateIntTensor(int[] data, params long[] sizes) throw new NotSupportedException("Requires GgmlStorage or CpuStorage"); } + private Tensor GetOrCreateTransposedWeight(string weightName) + { + if (_transposedWeights.TryGetValue(weightName, out var transposed)) + return transposed; + + using var weightViewT = _weights[weightName].Transpose(); + transposed = Ops.NewContiguous(weightViewT); + _transposedWeights[weightName] = transposed; + return transposed; + } + + private Rope2DCache GetOrCreateRopeCache(int patchesX, int patchesY, int headDim) + { + long key = ((long)patchesX << 32) | (uint)patchesY; + if (_ropeCache.TryGetValue(key, out var cache)) + return cache; + + int numPatches = patchesX * patchesY; + int halfDim = headDim / 2; + int quarterDim = halfDim / 2; + int[] posX = new int[numPatches]; + int[] posY = new int[numPatches]; + float[] cosX = new float[numPatches * quarterDim]; + float[] sinX = new float[numPatches * quarterDim]; + float[] cosY = new float[numPatches * quarterDim]; + float[] sinY = new float[numPatches * quarterDim]; + float[] invFreq = new float[quarterDim]; + + for (int j = 0; j < quarterDim; j++) + invFreq[j] = (float)(1.0 / Math.Pow(_ropeTheta, 2.0 * j / halfDim)); + + for (int p = 0; p < numPatches; p++) + { + int x = p % patchesX; + int y = p / patchesX; + posX[p] = x; + posY[p] = y; + + int baseIdx = p * quarterDim; + for (int j = 0; j < quarterDim; j++) + { + float angleX = x * invFreq[j]; + float angleY = y * invFreq[j]; + cosX[baseIdx + j] = MathF.Cos(angleX); + sinX[baseIdx + j] = MathF.Sin(angleX); + cosY[baseIdx + j] = MathF.Cos(angleY); + sinY[baseIdx + j] = MathF.Sin(angleY); + } + } + + cache = new Rope2DCache + { + PosX = posX, + PosY = posY, + CosX = cosX, + SinX = sinX, + CosY = cosY, + SinY = sinY, + }; + _ropeCache[key] = cache; + return cache; + } + public void Dispose() { _onesForNorm?.Dispose(); + foreach (var w in _transposedWeights.Values) + w.Dispose(); + _transposedWeights.Clear(); foreach (var w in _weights.Values) w.Dispose(); _weights.Clear(); + _ropeCache.Clear(); } } } diff --git a/InferenceEngine/Models/Qwen3/Qwen3Model.cs b/InferenceEngine/Models/Qwen3/Qwen3Model.cs index 9bc87f9..75bf9a2 100644 --- a/InferenceEngine/Models/Qwen3/Qwen3Model.cs +++ b/InferenceEngine/Models/Qwen3/Qwen3Model.cs @@ -151,6 +151,8 @@ public override void ResetKVCache() { Ops.Fill(_kvCacheK[l], 0); Ops.Fill(_kvCacheV[l], 0); + InvalidateTensorDeviceCache(_kvCacheK[l]); + InvalidateTensorDeviceCache(_kvCacheV[l]); } _cacheSeqLen = 0; _linearTicks = _attnTicks = _normTicks = _embTicks = _lmHeadTicks = _logitsCopyTicks = 0; diff --git a/InferenceEngine/Models/Qwen35/ImageProcessor.cs b/InferenceEngine/Models/Qwen35/ImageProcessor.cs index af94030..a197845 100644 --- a/InferenceEngine/Models/Qwen35/ImageProcessor.cs +++ b/InferenceEngine/Models/Qwen35/ImageProcessor.cs @@ -91,12 +91,10 @@ public int ComputeImageTokenCount(string imagePath) { byte[] fileBytes = File.ReadAllBytes(imagePath); byte[] rgba = Gemma3ImageProcessor.DecodeImageToRGBA(fileBytes, out int origWidth, out int origHeight); - byte[] composited = Gemma3ImageProcessor.CompositeOverWhite(rgba, origWidth, origHeight); var (resizedH, resizedW) = SmartResize(origHeight, origWidth); - byte[] resized = Gemma3ImageProcessor.BilinearResize(composited, origWidth, origHeight, resizedW, resizedH); - - float[] pixels = PackChannelFirst(resized, resizedW, resizedH); + float[] pixels = Gemma3ImageProcessor.ResizeRgbaToChannelFirstNormalized( + rgba, origWidth, origHeight, resizedW, resizedH); return (pixels, resizedH, resizedW); } diff --git a/InferenceEngine/Models/Qwen35/Qwen35Model.cs b/InferenceEngine/Models/Qwen35/Qwen35Model.cs index 8a011e6..aaa03a3 100644 --- a/InferenceEngine/Models/Qwen35/Qwen35Model.cs +++ b/InferenceEngine/Models/Qwen35/Qwen35Model.cs @@ -197,6 +197,8 @@ public override void ResetKVCache() { Ops.Fill(_kvCacheK[l], 0); Ops.Fill(_kvCacheV[l], 0); + InvalidateTensorDeviceCache(_kvCacheK[l]); + InvalidateTensorDeviceCache(_kvCacheV[l]); } else { diff --git a/InferenceEngine/Models/Qwen35/Qwen35VisionEncoder.cs b/InferenceEngine/Models/Qwen35/Qwen35VisionEncoder.cs index b456eac..c4c21dc 100644 --- a/InferenceEngine/Models/Qwen35/Qwen35VisionEncoder.cs +++ b/InferenceEngine/Models/Qwen35/Qwen35VisionEncoder.cs @@ -11,13 +11,18 @@ using System.Collections.Generic; using TensorSharp; using TensorSharp.Cpu; +using TensorSharp.GGML; namespace InferenceEngine { public class Qwen35VisionEncoder : IDisposable { private readonly Dictionary _weights = new(); + private readonly Dictionary _transposedWeights = new(); + private readonly Dictionary _positionEmbeddingCache = new(); + private readonly Dictionary _ropeCache = new(); private readonly IAllocator _allocator; + private readonly bool _useNativeAttention; private readonly int _imageSize; private readonly int _patchSize; @@ -31,6 +36,12 @@ public class Qwen35VisionEncoder : IDisposable private readonly int _gridPerSide; private readonly float _ropeTheta; + private sealed class RopeCache + { + public required float[] CosTable { get; init; } + public required float[] SinTable { get; init; } + } + public int ProjectionDim => _projectionDim; public int PatchSize => _patchSize; public int SpatialMergeSize => _spatialMergeSize; @@ -38,6 +49,7 @@ public class Qwen35VisionEncoder : IDisposable public Qwen35VisionEncoder(string mmProjPath, IAllocator allocator) { _allocator = allocator; + _useNativeAttention = allocator is GgmlAllocator; var gguf = new GgufFile(mmProjPath); _imageSize = (int)gguf.GetUint32("clip.vision.image_size", 768); @@ -128,7 +140,7 @@ public unsafe Tensor Encode(float[] pixelValues, int resizedH, int resizedW) if (debug) DumpTensor(hidden, "After PatchEmbed (raster)", numPatches); // 2. Position embedding (bilinear interpolation, raster order) - AddPositionEmbedding(hidden, gridH, gridW, numPatches); + AddPositionEmbedding(hidden, gridH, gridW); if (debug) DumpTensor(hidden, "After PosEmbed (raster)", numPatches); // 3. Reorder from raster to block order @@ -137,19 +149,14 @@ public unsafe Tensor Encode(float[] pixelValues, int resizedH, int resizedW) if (debug) DumpTensor(blockOrdered, "After BlockReorder", numPatches); // 4. Build block-order grid coordinate arrays for RoPE - int[] gridY, gridX; - BuildBlockOrderCoords(gridH, gridW, out gridY, out gridX); - - // 5. Precompute RoPE cos/sin tables - float[] cosTable, sinTable; - ComputeRoPETables(gridY, gridX, numPatches, halfDim, out cosTable, out sinTable); + RopeCache ropeCache = GetOrCreateRopeCache(gridH, gridW, numPatches, halfDim); // 6. Encoder blocks for (int i = 0; i < _blockCount; i++) { Console.Write($"\r Vision encoder block {i + 1}/{_blockCount}..."); blockOrdered = EncoderBlock(blockOrdered, i, numPatches, headDim, halfDim, - cosTable, sinTable); + ropeCache.CosTable, ropeCache.SinTable); if (debug && (i == 0 || i == _blockCount - 1)) DumpTensor(blockOrdered, $"After block {i}", numPatches); } @@ -170,7 +177,7 @@ public unsafe Tensor Encode(float[] pixelValues, int resizedH, int resizedW) using var fc1 = LinearForwardWithBias(mergedContig, "mm.0.weight", "mm.0.bias"); mergedContig.Dispose(); - ApplyGELU(fc1); + Ops.GELU(fc1, fc1); var projected = LinearForwardWithBias(fc1, "mm.2.weight", "mm.2.bias"); if (debug) DumpTensor(projected, "Final projected", mergedPatches); @@ -235,49 +242,9 @@ private unsafe Tensor PatchEmbed(float[] pixelValues, int imgH, int imgW, int gr /// /// Add bilinearly-interpolated position embeddings (computed in raster order). /// - private unsafe void AddPositionEmbedding(Tensor hidden, int gridH, int gridW, int numPatches) + private void AddPositionEmbedding(Tensor hidden, int gridH, int gridW) { - var posEmbd = _weights["v.position_embd.weight"]; - float* posPtr = GetFloatPtr(posEmbd); - float* hidPtr = GetFloatPtr(hidden); - - float stepH = gridH > 1 ? (float)(_gridPerSide - 1) / (gridH - 1) : 0f; - float stepW = gridW > 1 ? (float)(_gridPerSide - 1) / (gridW - 1) : 0f; - - for (int h = 0; h < gridH; h++) - { - for (int w = 0; w < gridW; w++) - { - float y = h * stepH; - float x = w * stepW; - - int fy = (int)y, fx = (int)x; - int cy = Math.Min(fy + 1, _gridPerSide - 1); - int cx = Math.Min(fx + 1, _gridPerSide - 1); - float dy = y - fy, dx = x - fx; - - float w00 = (1 - dy) * (1 - dx); - float w01 = (1 - dy) * dx; - float w10 = dy * (1 - dx); - float w11 = dy * dx; - - int idx00 = fy * _gridPerSide + fx; - int idx01 = fy * _gridPerSide + cx; - int idx10 = cy * _gridPerSide + fx; - int idx11 = cy * _gridPerSide + cx; - - int patchIdx = h * gridW + w; - float* hidRow = hidPtr + patchIdx * _hiddenSize; - - float* p00 = posPtr + idx00 * _hiddenSize; - float* p01 = posPtr + idx01 * _hiddenSize; - float* p10 = posPtr + idx10 * _hiddenSize; - float* p11 = posPtr + idx11 * _hiddenSize; - - for (int d = 0; d < _hiddenSize; d++) - hidRow[d] += w00 * p00[d] + w01 * p01[d] + w10 * p10[d] + w11 * p11[d]; - } - } + Ops.Add(hidden, hidden, GetOrCreatePositionEmbedding(gridH, gridW)); } /// @@ -315,64 +282,6 @@ private unsafe Tensor ReorderToBlockOrder(Tensor input, int gridH, int gridW) return result; } - private void BuildBlockOrderCoords(int gridH, int gridW, out int[] gridY, out int[] gridX) - { - int numPatches = gridH * gridW; - gridY = new int[numPatches]; - gridX = new int[numPatches]; - int idx = 0; - for (int bh = 0; bh < gridH; bh += _spatialMergeSize) - { - for (int bw = 0; bw < gridW; bw += _spatialMergeSize) - { - for (int mh = 0; mh < _spatialMergeSize; mh++) - { - for (int mw = 0; mw < _spatialMergeSize; mw++) - { - gridY[idx] = bh + mh; - gridX[idx] = bw + mw; - idx++; - } - } - } - } - } - - /// - /// Precompute RoPE cos/sin tables for the vision encoder. - /// Interleaved y/x frequency bands matching Ollama's qwen3vl vision RoPE. - /// cosTable/sinTable: [numPatches * halfDim], row-major [patch, band]. - /// - private void ComputeRoPETables(int[] gridY, int[] gridX, int numPatches, int halfDim, - out float[] cosTable, out float[] sinTable) - { - int numBands = halfDim / 2; - cosTable = new float[numPatches * halfDim]; - sinTable = new float[numPatches * halfDim]; - - float[] invFreqs = new float[numBands]; - for (int j = 0; j < numBands; j++) - invFreqs[j] = 1f / MathF.Pow(_ropeTheta, (2f * j) / halfDim); - - for (int p = 0; p < numPatches; p++) - { - int y = gridY[p]; - int x = gridX[p]; - int baseIdx = p * halfDim; - - for (int j = 0; j < numBands; j++) - { - float angleY = y * invFreqs[j]; - float angleX = x * invFreqs[j]; - - cosTable[baseIdx + j * 2] = MathF.Cos(angleY); - sinTable[baseIdx + j * 2] = MathF.Sin(angleY); - cosTable[baseIdx + j * 2 + 1] = MathF.Cos(angleX); - sinTable[baseIdx + j * 2 + 1] = MathF.Sin(angleX); - } - } - } - private Tensor EncoderBlock(Tensor hidden, int blockIdx, int numPatches, int headDim, int halfDim, float[] cosTable, float[] sinTable) { @@ -416,7 +325,19 @@ private unsafe Tensor VisionSelfAttention(Tensor input, string prefix, int numPa float scale = 1f / MathF.Sqrt(headDim); - // Reshape [numPatches, hiddenSize] -> [numHeads, numPatches, headDim] + if (_useNativeAttention) + { + using var q4 = q.View(1, numPatches, _numHeads, headDim); + using var k4 = k.View(1, numPatches, _numHeads, headDim); + using var v4 = v.View(1, numPatches, _numHeads, headDim); + using var attn4 = Ops.ScaledDotProductAttention(null, q4, k4, v4, null, scale); + using var flat = attn4.View(numPatches, _hiddenSize); + q.Dispose(); + k.Dispose(); + v.Dispose(); + return LinearForwardWithBias(flat, $"{prefix}.attn_out.weight", $"{prefix}.attn_out.bias"); + } + using var qR = q.View(numPatches, _numHeads, headDim); using var kR = k.View(numPatches, _numHeads, headDim); using var vR = v.View(numPatches, _numHeads, headDim); @@ -431,23 +352,18 @@ private unsafe Tensor VisionSelfAttention(Tensor input, string prefix, int numPa k.Dispose(); v.Dispose(); - // Q @ K^T using var kT = kHeads.Transpose(1, 2); var scores = new Tensor(_allocator, DType.Float32, _numHeads, numPatches, numPatches); Ops.AddmmBatch(scores, 0, scores, scale, qHeads, kT); - Ops.Softmax(scores, scores); - // scores @ V var attnOutput = new Tensor(_allocator, DType.Float32, _numHeads, numPatches, headDim); Ops.AddmmBatch(attnOutput, 0, attnOutput, 1.0f, scores, vHeads); scores.Dispose(); - // Reshape back to [numPatches, hiddenSize] using var transposed = attnOutput.Transpose(0, 1); using var contiguous = Ops.NewContiguous(transposed); - using var flat = contiguous.View(numPatches, _hiddenSize); - using var flatContig = Ops.NewContiguous(flat); + using var flatContig = contiguous.View(numPatches, _hiddenSize); attnOutput.Dispose(); return LinearForwardWithBias(flatContig, $"{prefix}.attn_out.weight", $"{prefix}.attn_out.bias"); @@ -481,25 +397,13 @@ private unsafe void ApplyVisionRoPE(Tensor data, int numPatches, int headDim, in } } - private unsafe Tensor VisionMLP(Tensor input, string prefix) + private Tensor VisionMLP(Tensor input, string prefix) { using var fc1Out = LinearForwardWithBias(input, $"{prefix}.ffn_up.weight", $"{prefix}.ffn_up.bias"); - ApplyGELU(fc1Out); + Ops.GELU(fc1Out, fc1Out); return LinearForwardWithBias(fc1Out, $"{prefix}.ffn_down.weight", $"{prefix}.ffn_down.bias"); } - private unsafe void ApplyGELU(Tensor t) - { - float* ptr = GetFloatPtr(t); - int count = (int)t.ElementCount(); - for (int i = 0; i < count; i++) - { - double x = ptr[i]; - double cdf = 0.5 * (1.0 + Math.Tanh(Math.Sqrt(2.0 / Math.PI) * (x + 0.044715 * x * x * x))); - ptr[i] = (float)(x * cdf); - } - } - private unsafe Tensor LinearForwardWithBias(Tensor input, string weightName, string biasName) { var weight = _weights[weightName]; @@ -511,64 +415,20 @@ private unsafe Tensor LinearForwardWithBias(Tensor input, string weightName, str Tensor contiguousInput = input.IsContiguous() ? null : Ops.NewContiguous(input); Tensor src = contiguousInput ?? input; - using var wT = weight.Transpose(); - Ops.Addmm(result, 0, result, 1.0f, src, wT); + Ops.Addmm(result, 0, result, 1.0f, src, GetOrCreateTransposedWeight(weightName)); contiguousInput?.Dispose(); if (_weights.TryGetValue(biasName, out var bias)) - { - float* rPtr = GetFloatPtr(result); - float* bPtr = GetFloatPtr(bias); - for (int s = 0; s < seqLen; s++) - { - float* row = rPtr + s * outDim; - for (int d = 0; d < outDim; d++) - row[d] += bPtr[d]; - } - } + Ops.Add(result, result, bias); return result; } - private unsafe Tensor LayerNormOp(Tensor input, string weightName, string biasName) + private Tensor LayerNormOp(Tensor input, string weightName, string biasName) { - int rows = (int)input.Sizes[0]; - int dim = (int)input.Sizes[1]; - var result = new Tensor(_allocator, DType.Float32, rows, dim); - - float* src = GetFloatPtr(input); - float* dst = GetFloatPtr(result); - float* w = GetFloatPtr(_weights[weightName]); - float* b = _weights.ContainsKey(biasName) ? GetFloatPtr(_weights[biasName]) : null; - - for (int r = 0; r < rows; r++) - { - float* srcRow = src + r * dim; - float* dstRow = dst + r * dim; - - float mean = 0; - for (int i = 0; i < dim; i++) - mean += srcRow[i]; - mean /= dim; - - float variance = 0; - for (int i = 0; i < dim; i++) - { - float diff = srcRow[i] - mean; - variance += diff * diff; - } - variance /= dim; - - float invStd = 1f / MathF.Sqrt(variance + _eps); - for (int i = 0; i < dim; i++) - { - float normalized = (srcRow[i] - mean) * invStd; - dstRow[i] = w[i] * normalized + (b != null ? b[i] : 0f); - } - } - - return result; + _weights.TryGetValue(biasName, out var bias); + return Ops.LayerNorm(null, input, _weights[weightName], bias, _eps); } private unsafe void DumpTensor(Tensor t, string label, int numRows) @@ -596,11 +456,135 @@ private unsafe void DumpTensor(Tensor t, string label, int numRows) throw new NotSupportedException("Requires GgmlStorage or CpuStorage"); } + private Tensor GetOrCreateTransposedWeight(string weightName) + { + if (_transposedWeights.TryGetValue(weightName, out var transposed)) + return transposed; + + using var weightViewT = _weights[weightName].Transpose(); + transposed = Ops.NewContiguous(weightViewT); + _transposedWeights[weightName] = transposed; + return transposed; + } + + private unsafe Tensor GetOrCreatePositionEmbedding(int gridH, int gridW) + { + long key = ((long)gridH << 32) | (uint)gridW; + if (_positionEmbeddingCache.TryGetValue(key, out var cached)) + return cached; + + int numPatches = gridH * gridW; + cached = new Tensor(_allocator, DType.Float32, numPatches, _hiddenSize); + float* posPtr = GetFloatPtr(_weights["v.position_embd.weight"]); + float* dstPtr = GetFloatPtr(cached); + + float stepH = gridH > 1 ? (float)(_gridPerSide - 1) / (gridH - 1) : 0f; + float stepW = gridW > 1 ? (float)(_gridPerSide - 1) / (gridW - 1) : 0f; + + for (int h = 0; h < gridH; h++) + { + for (int w = 0; w < gridW; w++) + { + float y = h * stepH; + float x = w * stepW; + + int fy = (int)y; + int fx = (int)x; + int cy = Math.Min(fy + 1, _gridPerSide - 1); + int cx = Math.Min(fx + 1, _gridPerSide - 1); + float dy = y - fy; + float dx = x - fx; + + float w00 = (1 - dy) * (1 - dx); + float w01 = (1 - dy) * dx; + float w10 = dy * (1 - dx); + float w11 = dy * dx; + + int idx00 = fy * _gridPerSide + fx; + int idx01 = fy * _gridPerSide + cx; + int idx10 = cy * _gridPerSide + fx; + int idx11 = cy * _gridPerSide + cx; + + int patchIdx = h * gridW + w; + float* dstRow = dstPtr + patchIdx * _hiddenSize; + float* p00 = posPtr + idx00 * _hiddenSize; + float* p01 = posPtr + idx01 * _hiddenSize; + float* p10 = posPtr + idx10 * _hiddenSize; + float* p11 = posPtr + idx11 * _hiddenSize; + + for (int d = 0; d < _hiddenSize; d++) + dstRow[d] = w00 * p00[d] + w01 * p01[d] + w10 * p10[d] + w11 * p11[d]; + } + } + + _positionEmbeddingCache[key] = cached; + return cached; + } + + private RopeCache GetOrCreateRopeCache(int gridH, int gridW, int numPatches, int halfDim) + { + long key = ((long)gridH << 32) | (uint)gridW; + if (_ropeCache.TryGetValue(key, out var cache)) + return cache; + + int[] gridY = new int[numPatches]; + int[] gridX = new int[numPatches]; + int idx = 0; + for (int bh = 0; bh < gridH; bh += _spatialMergeSize) + { + for (int bw = 0; bw < gridW; bw += _spatialMergeSize) + { + for (int mh = 0; mh < _spatialMergeSize; mh++) + { + for (int mw = 0; mw < _spatialMergeSize; mw++) + { + gridY[idx] = bh + mh; + gridX[idx] = bw + mw; + idx++; + } + } + } + } + + int numBands = halfDim / 2; + float[] cosTable = new float[numPatches * halfDim]; + float[] sinTable = new float[numPatches * halfDim]; + float[] invFreqs = new float[numBands]; + for (int j = 0; j < numBands; j++) + invFreqs[j] = 1f / MathF.Pow(_ropeTheta, (2f * j) / halfDim); + + for (int p = 0; p < numPatches; p++) + { + int baseIdx = p * halfDim; + for (int j = 0; j < numBands; j++) + { + float angleY = gridY[p] * invFreqs[j]; + float angleX = gridX[p] * invFreqs[j]; + + cosTable[baseIdx + j * 2] = MathF.Cos(angleY); + sinTable[baseIdx + j * 2] = MathF.Sin(angleY); + cosTable[baseIdx + j * 2 + 1] = MathF.Cos(angleX); + sinTable[baseIdx + j * 2 + 1] = MathF.Sin(angleX); + } + } + + cache = new RopeCache { CosTable = cosTable, SinTable = sinTable }; + _ropeCache[key] = cache; + return cache; + } + public void Dispose() { + foreach (var w in _positionEmbeddingCache.Values) + w.Dispose(); + _positionEmbeddingCache.Clear(); + foreach (var w in _transposedWeights.Values) + w.Dispose(); + _transposedWeights.Clear(); foreach (var w in _weights.Values) w.Dispose(); _weights.Clear(); + _ropeCache.Clear(); } } } diff --git a/InferenceWeb/Program.cs b/InferenceWeb/Program.cs index 050d2ff..e34be88 100644 --- a/InferenceWeb/Program.cs +++ b/InferenceWeb/Program.cs @@ -20,9 +20,13 @@ using Microsoft.AspNetCore.Hosting; using Microsoft.AspNetCore.Http; using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging; var builder = WebApplication.CreateBuilder(args); +// Keep ASP.NET Core request logs quiet by default while still surfacing warnings and errors. +builder.Logging.AddFilter("Microsoft.AspNetCore", LogLevel.Warning); + builder.WebHost.ConfigureKestrel(options => { options.Limits.MaxRequestBodySize = 500 * 1024 * 1024; // 500 MB diff --git a/README.md b/README.md index fc5e98e..7564295 100644 --- a/README.md +++ b/README.md @@ -129,7 +129,7 @@ You can also request a CUDA-enabled native build from `dotnet build`: TENSORSHARP_GGML_NATIVE_ENABLE_CUDA=ON dotnet build InferenceConsole/InferenceConsole.csproj -c Release ``` -On macOS this compiles `libGgmlOps.dylib` with Metal GPU support. On Linux, `build-linux.sh` builds `libGgmlOps.so` with the GGML CPU backend by default, and `build-linux.sh --cuda` enables GGML_CUDA support for NVIDIA GPUs. The build output is automatically copied to the application's output directory. +On macOS this compiles `libGgmlOps.dylib` with Metal GPU support. On Linux, `build-linux.sh` preserves an existing CUDA-enabled build and auto-enables GGML_CUDA when a CUDA toolchain is detected; `build-linux.sh --cuda` and `TENSORSHARP_GGML_NATIVE_ENABLE_CUDA=ON` force CUDA explicitly. The build output is automatically copied to the application's output directory. ## Usage diff --git a/TensorSharp.GGML.Native/build-linux.sh b/TensorSharp.GGML.Native/build-linux.sh index d75e0ea..41f6dcb 100644 --- a/TensorSharp.GGML.Native/build-linux.sh +++ b/TensorSharp.GGML.Native/build-linux.sh @@ -3,10 +3,50 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" BUILD_DIR="${SCRIPT_DIR}/build" -ENABLE_CUDA="${TENSORSHARP_GGML_NATIVE_ENABLE_CUDA:-OFF}" +ENABLE_CUDA="${TENSORSHARP_GGML_NATIVE_ENABLE_CUDA:-}" BUILD_TESTS="${TENSORSHARP_GGML_NATIVE_BUILD_TESTS:-OFF}" EXTRA_CMAKE_ARGS=() +normalize_bool() { + local value="${1:-}" + case "${value}" in + ON|on|On|TRUE|true|True|YES|yes|Yes|1) + echo "ON" + ;; + OFF|off|Off|FALSE|false|False|NO|no|No|0) + echo "OFF" + ;; + *) + echo "" + ;; + esac +} + +has_cuda_toolkit() { + if command -v nvcc >/dev/null 2>&1; then + return 0 + fi + + local cuda_home="${CUDA_HOME:-${CUDA_PATH:-}}" + if [[ -n "${cuda_home}" && -x "${cuda_home}/bin/nvcc" ]]; then + return 0 + fi + + return 1 +} + +read_cached_cuda_setting() { + local cache_file="${BUILD_DIR}/CMakeCache.txt" + if [[ ! -f "${cache_file}" ]]; then + echo "" + return + fi + + local cached + cached="$(awk -F= '/^TENSORSHARP_GGML_NATIVE_ENABLE_CUDA:BOOL=/{print $2; exit}' "${cache_file}")" + normalize_bool "${cached}" +} + while (($# > 0)); do case "$1" in --cuda) @@ -25,6 +65,19 @@ while (($# > 0)); do shift done +ENABLE_CUDA="$(normalize_bool "${ENABLE_CUDA}")" +if [[ -z "${ENABLE_CUDA}" ]]; then + ENABLE_CUDA="$(read_cached_cuda_setting)" +fi +if [[ -z "${ENABLE_CUDA}" ]] && has_cuda_toolkit; then + ENABLE_CUDA="ON" +fi +if [[ -z "${ENABLE_CUDA}" ]]; then + ENABLE_CUDA="OFF" +fi + +echo "Configuring TensorSharp.GGML.Native (CUDA=${ENABLE_CUDA}, TESTS=${BUILD_TESTS})" + cmake -S "${SCRIPT_DIR}" -B "${BUILD_DIR}" \ -DCMAKE_BUILD_TYPE=Release \ -DTENSORSHARP_GGML_NATIVE_ENABLE_CUDA="${ENABLE_CUDA}" \ diff --git a/TensorSharp.GGML.Native/ggml_ops.cpp b/TensorSharp.GGML.Native/ggml_ops.cpp index cbd8d36..eb0691a 100644 --- a/TensorSharp.GGML.Native/ggml_ops.cpp +++ b/TensorSharp.GGML.Native/ggml_ops.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -526,6 +527,11 @@ namespace return static_cast(desc.dim0) * desc.dim1 * sizeof(float); } + std::size_t logical_row_bytes(const TensorView2DDesc& desc) + { + return static_cast(desc.dim1) * sizeof(float); + } + std::size_t logical_bytes(const TensorView3DDesc& desc) { return static_cast(desc.dim0) * desc.dim1 * desc.dim2 * sizeof(float); @@ -546,6 +552,40 @@ namespace return static_cast(desc.ne0) * desc.ne1 * desc.ne2 * desc.ne3 * sizeof(float); } + constexpr std::size_t k_ggml_cuda_max_copy_bytes = static_cast(std::numeric_limits::max()); + + std::size_t raw_row_bytes(const TensorView2DDesc& desc) + { + TensorView2DDesc row_desc = desc; + row_desc.dim0 = 1; + return required_raw_bytes(row_desc); + } + + TensorView2DDesc slice_rows_2d(const TensorView2DDesc& desc, int row_start, int row_count) + { + TensorView2DDesc slice = desc; + slice.data = static_cast(desc.data) + + static_cast(row_start) * + static_cast(desc.stride0) * + sizeof(float); + slice.dim0 = row_count; + slice.raw_bytes = static_cast(required_raw_bytes(slice)); + return slice; + } + + int limit_rows_for_cuda_copy(int current_limit, const TensorView2DDesc& desc) + { + if (current_limit <= 0) + return 0; + + const std::size_t per_row_bytes = std::max(logical_row_bytes(desc), raw_row_bytes(desc)); + if (per_row_bytes == 0 || per_row_bytes > k_ggml_cuda_max_copy_bytes) + return 0; + + const int limit = static_cast(k_ggml_cuda_max_copy_bytes / per_row_bytes); + return std::min(current_limit, std::max(1, limit)); + } + bool validate_desc(const TensorView2DDesc& desc, const char* name) { if (desc.data == nullptr) @@ -857,7 +897,8 @@ namespace std::size_t bytes, ggml_backend_buffer_t& out_buffer, void*& out_addr, - bool& out_needs_upload) + bool& out_needs_upload, + enum ggml_backend_buffer_usage usage = GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { out_buffer = nullptr; out_addr = nullptr; @@ -903,7 +944,7 @@ namespace if (out_buffer == nullptr) return false; - ggml_backend_buffer_set_usage(out_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + ggml_backend_buffer_set_usage(out_buffer, usage); out_addr = ggml_backend_buffer_get_base(out_buffer); out_needs_upload = true; @@ -1306,6 +1347,56 @@ namespace return 0; } + if (g_backend_type == BACKEND_TYPE_CUDA) + { + const bool needs_chunking = + logical_bytes(result_desc) > k_ggml_cuda_max_copy_bytes || + logical_bytes(m1_desc) > k_ggml_cuda_max_copy_bytes || + static_cast(result_desc.raw_bytes) > k_ggml_cuda_max_copy_bytes || + static_cast(m1_desc.raw_bytes) > k_ggml_cuda_max_copy_bytes || + (beta != 0.0f && ( + logical_bytes(src_desc) > k_ggml_cuda_max_copy_bytes || + static_cast(src_desc.raw_bytes) > k_ggml_cuda_max_copy_bytes)); + + if (needs_chunking) + { + int chunk_rows = rows; + chunk_rows = limit_rows_for_cuda_copy(chunk_rows, result_desc); + chunk_rows = limit_rows_for_cuda_copy(chunk_rows, m1_desc); + if (beta != 0.0f) + { + chunk_rows = limit_rows_for_cuda_copy(chunk_rows, src_desc); + if (src_desc.dim0 != rows) + chunk_rows = (chunk_rows / src_desc.dim0) * src_desc.dim0; + } + + if (chunk_rows <= 0) + { + set_last_error("GGML CUDA addmm received a row slice larger than the backend copy limit."); + return 0; + } + + if (chunk_rows < rows) + { + for (int row_start = 0; row_start < rows; row_start += chunk_rows) + { + const int row_count = std::min(chunk_rows, rows - row_start); + const TensorView2DDesc result_slice = slice_rows_2d(result_desc, row_start, row_count); + const TensorView2DDesc m1_slice = slice_rows_2d(m1_desc, row_start, row_count); + const TensorView2DDesc src_slice = beta == 0.0f + ? TensorView2DDesc{} + : (src_desc.dim0 == rows ? slice_rows_2d(src_desc, row_start, row_count) : src_desc); + + if (!addmm_f32_impl(result_slice, src_slice, m1_slice, m2_desc, beta, alpha)) + return 0; + } + + clear_last_error(); + return 1; + } + } + } + if (!can_map_standard_view(result_desc)) { set_last_error("Result tensor layout is not supported by the ggml addmm Metal path."); @@ -1531,6 +1622,44 @@ namespace return 0; } + if (g_backend_type == BACKEND_TYPE_CUDA) + { + const bool needs_chunking = + logical_bytes(result_desc) > k_ggml_cuda_max_copy_bytes || + logical_bytes(m1_desc) > k_ggml_cuda_max_copy_bytes || + static_cast(result_desc.raw_bytes) > k_ggml_cuda_max_copy_bytes || + static_cast(m1_desc.raw_bytes) > k_ggml_cuda_max_copy_bytes; + + if (needs_chunking) + { + int chunk_rows = rows; + chunk_rows = limit_rows_for_cuda_copy(chunk_rows, result_desc); + chunk_rows = limit_rows_for_cuda_copy(chunk_rows, m1_desc); + + if (chunk_rows <= 0) + { + set_last_error("GGML CUDA addmm_quant received a row slice larger than the backend copy limit."); + return 0; + } + + if (chunk_rows < rows) + { + for (int row_start = 0; row_start < rows; row_start += chunk_rows) + { + const int row_count = std::min(chunk_rows, rows - row_start); + const TensorView2DDesc result_slice = slice_rows_2d(result_desc, row_start, row_count); + const TensorView2DDesc m1_slice = slice_rows_2d(m1_desc, row_start, row_count); + + if (!addmm_quant_f32_impl(result_slice, m1_slice, m2_quant)) + return 0; + } + + clear_last_error(); + return 1; + } + } + } + const std::size_t ctx_size = 1024 * 1024; PooledContextHandle context; if (!context.init(ctx_size)) @@ -6960,6 +7089,11 @@ TSG_EXPORT void TSGgml_ClearHostBufferCache() g_host_buffer_cache.clear(); } +TSG_EXPORT void TSGgml_InvalidateHostBuffer(void* ptr) +{ + invalidate_cached_buffer(ptr); +} + TSG_EXPORT size_t TSGgml_RowSize(int ggml_type, int64_t ne) { if (ggml_type < 0 || ggml_type >= GGML_TYPE_COUNT || ne <= 0) @@ -7014,6 +7148,85 @@ TSG_EXPORT int TSGgml_DequantizeToF32(int ggml_type, const void* src, int64_t nu // ============================================================================ namespace { + std::size_t kv_cache_bytes(int kv_heads, int cache_size, int head_dim) + { + return static_cast(kv_heads) * + static_cast(cache_size) * + static_cast(head_dim) * + sizeof(float); + } + + ggml_tensor* view_kv_cache_window( + ggml_context* ctx, + ggml_tensor* cache, + int head_dim, + int cache_size, + int kv_heads, + int start_idx, + int length) + { + if (ctx == nullptr || cache == nullptr || head_dim <= 0 || cache_size <= 0 || kv_heads <= 0 || length <= 0) + return nullptr; + + start_idx %= cache_size; + if (start_idx < 0) + start_idx += cache_size; + + const std::size_t nb1 = static_cast(head_dim) * sizeof(float); + const std::size_t nb2 = static_cast(cache_size) * static_cast(head_dim) * sizeof(float); + + if (start_idx + length <= cache_size) + { + return ggml_view_3d( + ctx, + cache, + head_dim, + length, + kv_heads, + nb1, + nb2, + static_cast(start_idx) * static_cast(head_dim) * sizeof(float)); + } + + const int tail_length = cache_size - start_idx; + const int head_length = length - tail_length; + ggml_tensor* tail = ggml_view_3d( + ctx, + cache, + head_dim, + tail_length, + kv_heads, + nb1, + nb2, + static_cast(start_idx) * static_cast(head_dim) * sizeof(float)); + ggml_tensor* head = ggml_view_3d(ctx, cache, head_dim, head_length, kv_heads, nb1, nb2, 0); + if (tail == nullptr || head == nullptr) + return nullptr; + + return ggml_concat(ctx, tail, head, 1); + } + + void write_flat_kv_to_host_cache( + float* cache_data, + const float* flat_data, + int kv_heads, + int cache_size, + int head_dim, + int cache_pos) + { + if (cache_data == nullptr || flat_data == nullptr || kv_heads <= 0 || cache_size <= 0 || head_dim <= 0) + return; + + const std::size_t head_bytes = static_cast(head_dim) * sizeof(float); + for (int h = 0; h < kv_heads; ++h) + { + std::memcpy( + cache_data + static_cast(h) * cache_size * head_dim + static_cast(cache_pos) * head_dim, + flat_data + static_cast(h) * head_dim, + head_bytes); + } + } + int transformer_layer_decode_impl( float* hidden_data, int hidden_size, float* attn_norm_data, @@ -7037,23 +7250,6 @@ namespace const int totalSeqLen = position + 1; const float scale = 1.0f / std::sqrt(static_cast(head_dim)); - // Create contiguous copies of cached KV (strided cache → contiguous buffer) - std::vector k_cached_buf, v_cached_buf; - if (position > 0) - { - k_cached_buf.resize(static_cast(position) * kDim); - v_cached_buf.resize(static_cast(position) * kDim); - for (int h = 0; h < num_kv_heads; h++) - { - std::memcpy(k_cached_buf.data() + h * position * head_dim, - k_cache_data + h * max_seq_len * head_dim, - static_cast(position) * head_dim * sizeof(float)); - std::memcpy(v_cached_buf.data() + h * position * head_dim, - v_cache_data + h * max_seq_len * head_dim, - static_cast(position) * head_dim * sizeof(float)); - } - } - PooledContextHandle context; if (!context.init(2 * 1024 * 1024)) { @@ -7075,14 +7271,8 @@ namespace ggml_tensor* down_w = ggml_new_tensor_2d(ctx, static_cast(down_type), down_ne0, down_ne1); ggml_tensor* pos_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1); - - ggml_tensor* k_cached_t = nullptr; - ggml_tensor* v_cached_t = nullptr; - if (position > 0) - { - k_cached_t = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, head_dim, position, num_kv_heads); - v_cached_t = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, head_dim, position, num_kv_heads); - } + ggml_tensor* k_cache_base = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, head_dim, max_seq_len, num_kv_heads); + ggml_tensor* v_cache_base = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, head_dim, max_seq_len, num_kv_heads); // Output download targets ggml_tensor* hidden_out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); @@ -7130,18 +7320,24 @@ namespace ggml_tensor* k_rope_perm = ggml_permute(ctx, k_rope, 0, 2, 1, 3); ggml_tensor* v_3d = ggml_reshape_3d(ctx, v_raw, head_dim, num_kv_heads, 1); ggml_tensor* v_perm = ggml_permute(ctx, v_3d, 0, 2, 1, 3); - - ggml_tensor* k_full; - ggml_tensor* v_full; - if (position > 0) - { - k_full = ggml_concat(ctx, k_cached_t, ggml_cont(ctx, k_rope_perm), 1); - v_full = ggml_concat(ctx, v_cached_t, ggml_cont(ctx, v_perm), 1); - } - else + ggml_tensor* k_write = ggml_cont(ctx, k_rope_perm); + ggml_tensor* v_write = ggml_cont(ctx, v_perm); + ggml_tensor* k_cache_updated = ggml_set_1d_inplace( + ctx, + k_cache_base, + k_write, + static_cast(position) * static_cast(head_dim) * sizeof(float)); + ggml_tensor* v_cache_updated = ggml_set_1d_inplace( + ctx, + v_cache_base, + v_write, + static_cast(position) * static_cast(head_dim) * sizeof(float)); + ggml_tensor* k_full = view_kv_cache_window(ctx, k_cache_updated, head_dim, max_seq_len, num_kv_heads, 0, totalSeqLen); + ggml_tensor* v_full = view_kv_cache_window(ctx, v_cache_updated, head_dim, max_seq_len, num_kv_heads, 0, totalSeqLen); + if (k_full == nullptr || v_full == nullptr) { - k_full = ggml_cont(ctx, k_rope_perm); - v_full = ggml_cont(ctx, v_perm); + set_last_error("Failed to create KV cache views for transformer layer decode."); + return 0; } // 7. Flash attention (handles GQA broadcasting automatically) @@ -7200,7 +7396,8 @@ namespace std::vector upload_list; std::vector ephemeral_bufs; - auto bind_or_mark = [&](ggml_tensor* t, void* data, std::size_t bytes, bool cacheable) { + auto bind_or_mark = [&](ggml_tensor* t, void* data, std::size_t bytes, bool cacheable, + enum ggml_backend_buffer_usage usage = GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { if (t == nullptr || data == nullptr) return; @@ -7209,7 +7406,7 @@ namespace ggml_backend_buffer_t buf = nullptr; void* addr = nullptr; bool needs_upload = false; - if (try_get_cacheable_tensor_buffer(g_backend, dev, t, data, bytes, buf, addr, needs_upload)) + if (try_get_cacheable_tensor_buffer(g_backend, dev, t, data, bytes, buf, addr, needs_upload, usage)) { ggml_status st = ggml_backend_tensor_alloc(buf, t, addr); if (st == GGML_STATUS_SUCCESS) @@ -7247,12 +7444,8 @@ namespace bind_or_mark(ffn_norm_w, ffn_norm_data, static_cast(hidden_size) * sizeof(float), true); bind_or_mark(q_norm_w, q_norm_data, static_cast(head_dim) * sizeof(float), true); bind_or_mark(k_norm_w, k_norm_data, static_cast(head_dim) * sizeof(float), true); - - if (position > 0) - { - bind_or_mark(k_cached_t, k_cached_buf.data(), k_cached_buf.size() * sizeof(float), false); - bind_or_mark(v_cached_t, v_cached_buf.data(), v_cached_buf.size() * sizeof(float), false); - } + bind_or_mark(k_cache_base, k_cache_data, kv_cache_bytes(num_kv_heads, max_seq_len, head_dim), true, GGML_BACKEND_BUFFER_USAGE_COMPUTE); + bind_or_mark(v_cache_base, v_cache_data, kv_cache_bytes(num_kv_heads, max_seq_len, head_dim), true, GGML_BACKEND_BUFFER_USAGE_COMPUTE); // Allocate backend buffer for remaining tensors (intermediates + non-host-ptr tensors) BufferHandle buffer(ggml_backend_alloc_ctx_tensors(ctx, g_backend)); @@ -7289,15 +7482,8 @@ namespace ggml_backend_tensor_get(k_new_out, k_new_buf.data(), 0, static_cast(kDim) * sizeof(float)); ggml_backend_tensor_get(v_new_out, v_new_buf.data(), 0, static_cast(kDim) * sizeof(float)); - for (int h = 0; h < num_kv_heads; h++) - { - std::memcpy(k_cache_data + h * max_seq_len * head_dim + position * head_dim, - k_new_buf.data() + h * head_dim, - static_cast(head_dim) * sizeof(float)); - std::memcpy(v_cache_data + h * max_seq_len * head_dim + position * head_dim, - v_new_buf.data() + h * head_dim, - static_cast(head_dim) * sizeof(float)); - } + write_flat_kv_to_host_cache(k_cache_data, k_new_buf.data(), num_kv_heads, max_seq_len, head_dim, position); + write_flat_kv_to_host_cache(v_cache_data, v_new_buf.data(), num_kv_heads, max_seq_len, head_dim, position); clear_last_error(); return 1; @@ -7377,33 +7563,6 @@ TSG_EXPORT int TSGgml_TransformerModelDecode( const int totalSeqLen = position + 1; const float scale = 1.0f / std::sqrt(static_cast(head_dim)); - // Pre-copy cached KV for all layers - struct LayerKVCache { - std::vector k_buf; - std::vector v_buf; - }; - std::vector kv_caches(num_layers); - if (position > 0) - { - for (int l = 0; l < num_layers; l++) - { - auto& cache = kv_caches[l]; - cache.k_buf.resize(static_cast(position) * kDim); - cache.v_buf.resize(static_cast(position) * kDim); - float* kc = static_cast(k_cache_arr[l]); - float* vc = static_cast(v_cache_arr[l]); - for (int h = 0; h < num_kv_heads; h++) - { - std::memcpy(cache.k_buf.data() + h * position * head_dim, - kc + h * max_seq_len * head_dim, - static_cast(position) * head_dim * sizeof(float)); - std::memcpy(cache.v_buf.data() + h * position * head_dim, - vc + h * max_seq_len * head_dim, - static_cast(position) * head_dim * sizeof(float)); - } - } - } - // Large context for all layers const std::size_t ctx_size = 16 * 1024 * 1024; PooledContextHandle context; @@ -7428,8 +7587,8 @@ TSG_EXPORT int TSGgml_TransformerModelDecode( ggml_tensor* ffn_norm_w; ggml_tensor* gu_w; ggml_tensor* down_w; - ggml_tensor* k_cached_t; - ggml_tensor* v_cached_t; + ggml_tensor* k_cache_base; + ggml_tensor* v_cache_base; ggml_tensor* k_new_out; ggml_tensor* v_new_out; ggml_tensor* out_k_cpy; @@ -7448,19 +7607,10 @@ TSG_EXPORT int TSGgml_TransformerModelDecode( lt.ffn_norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); lt.gu_w = ggml_new_tensor_2d(ctx, static_cast(gu_type), gu_ne0, gu_ne1); lt.down_w = ggml_new_tensor_2d(ctx, static_cast(down_type), down_ne0, down_ne1); + lt.k_cache_base = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, head_dim, max_seq_len, num_kv_heads); + lt.v_cache_base = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, head_dim, max_seq_len, num_kv_heads); lt.k_new_out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, kDim); lt.v_new_out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, kDim); - - if (position > 0) - { - lt.k_cached_t = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, head_dim, position, num_kv_heads); - lt.v_cached_t = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, head_dim, position, num_kv_heads); - } - else - { - lt.k_cached_t = nullptr; - lt.v_cached_t = nullptr; - } } // Build computation graph: chain all layers @@ -7503,18 +7653,24 @@ TSG_EXPORT int TSGgml_TransformerModelDecode( ggml_tensor* k_rope_perm = ggml_permute(ctx, k_rope, 0, 2, 1, 3); ggml_tensor* v_3d = ggml_reshape_3d(ctx, v_raw, head_dim, num_kv_heads, 1); ggml_tensor* v_perm = ggml_permute(ctx, v_3d, 0, 2, 1, 3); - - ggml_tensor* k_full; - ggml_tensor* v_full; - if (position > 0) - { - k_full = ggml_concat(ctx, lt.k_cached_t, ggml_cont(ctx, k_rope_perm), 1); - v_full = ggml_concat(ctx, lt.v_cached_t, ggml_cont(ctx, v_perm), 1); - } - else + ggml_tensor* k_write = ggml_cont(ctx, k_rope_perm); + ggml_tensor* v_write = ggml_cont(ctx, v_perm); + ggml_tensor* k_cache_updated = ggml_set_1d_inplace( + ctx, + lt.k_cache_base, + k_write, + static_cast(position) * static_cast(head_dim) * sizeof(float)); + ggml_tensor* v_cache_updated = ggml_set_1d_inplace( + ctx, + lt.v_cache_base, + v_write, + static_cast(position) * static_cast(head_dim) * sizeof(float)); + ggml_tensor* k_full = view_kv_cache_window(ctx, k_cache_updated, head_dim, max_seq_len, num_kv_heads, 0, totalSeqLen); + ggml_tensor* v_full = view_kv_cache_window(ctx, v_cache_updated, head_dim, max_seq_len, num_kv_heads, 0, totalSeqLen); + if (k_full == nullptr || v_full == nullptr) { - k_full = ggml_cont(ctx, k_rope_perm); - v_full = ggml_cont(ctx, v_perm); + set_last_error("Failed to create KV cache views for transformer model decode."); + return 0; } // Flash attention @@ -7574,7 +7730,8 @@ TSG_EXPORT int TSGgml_TransformerModelDecode( std::vector upload_list; std::vector ephemeral_bufs; - auto bind_or_mark = [&](ggml_tensor* t, void* data, std::size_t bytes, bool cacheable) { + auto bind_or_mark = [&](ggml_tensor* t, void* data, std::size_t bytes, bool cacheable, + enum ggml_backend_buffer_usage usage = GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { if (t == nullptr || data == nullptr) return; @@ -7583,7 +7740,7 @@ TSG_EXPORT int TSGgml_TransformerModelDecode( ggml_backend_buffer_t buf = nullptr; void* addr = nullptr; bool needs_upload = false; - if (try_get_cacheable_tensor_buffer(g_backend, dev, t, data, bytes, buf, addr, needs_upload)) + if (try_get_cacheable_tensor_buffer(g_backend, dev, t, data, bytes, buf, addr, needs_upload, usage)) { ggml_status st = ggml_backend_tensor_alloc(buf, t, addr); if (st == GGML_STATUS_SUCCESS) @@ -7624,12 +7781,8 @@ TSG_EXPORT int TSGgml_TransformerModelDecode( bind_or_mark(lt.ffn_norm_w, ffn_norm_arr[l], static_cast(hidden_size) * sizeof(float), true); bind_or_mark(lt.q_norm_w, q_norm_arr[l], static_cast(head_dim) * sizeof(float), true); bind_or_mark(lt.k_norm_w, k_norm_arr[l], static_cast(head_dim) * sizeof(float), true); - - if (position > 0) - { - bind_or_mark(lt.k_cached_t, kv_caches[l].k_buf.data(), kv_caches[l].k_buf.size() * sizeof(float), false); - bind_or_mark(lt.v_cached_t, kv_caches[l].v_buf.data(), kv_caches[l].v_buf.size() * sizeof(float), false); - } + bind_or_mark(lt.k_cache_base, k_cache_arr[l], kv_cache_bytes(num_kv_heads, max_seq_len, head_dim), true, GGML_BACKEND_BUFFER_USAGE_COMPUTE); + bind_or_mark(lt.v_cache_base, v_cache_arr[l], kv_cache_bytes(num_kv_heads, max_seq_len, head_dim), true, GGML_BACKEND_BUFFER_USAGE_COMPUTE); } // Allocate backend buffer for intermediates @@ -7669,17 +7822,8 @@ TSG_EXPORT int TSGgml_TransformerModelDecode( ggml_backend_tensor_get(layers[l].k_new_out, k_new_buf.data(), 0, static_cast(kDim) * sizeof(float)); ggml_backend_tensor_get(layers[l].v_new_out, v_new_buf.data(), 0, static_cast(kDim) * sizeof(float)); - float* kc = static_cast(k_cache_arr[l]); - float* vc = static_cast(v_cache_arr[l]); - for (int h = 0; h < num_kv_heads; h++) - { - std::memcpy(kc + h * max_seq_len * head_dim + position * head_dim, - k_new_buf.data() + h * head_dim, - static_cast(head_dim) * sizeof(float)); - std::memcpy(vc + h * max_seq_len * head_dim + position * head_dim, - v_new_buf.data() + h * head_dim, - static_cast(head_dim) * sizeof(float)); - } + write_flat_kv_to_host_cache(static_cast(k_cache_arr[l]), k_new_buf.data(), num_kv_heads, max_seq_len, head_dim, position); + write_flat_kv_to_host_cache(static_cast(v_cache_arr[l]), v_new_buf.data(), num_kv_heads, max_seq_len, head_dim, position); } clear_last_error(); @@ -7753,7 +7897,7 @@ TSG_EXPORT int TSGgml_Gemma4ModelDecode( for (int l = 0; l < num_layers; l++) if (head_dim_arr[l] > maxHd) maxHd = head_dim_arr[l]; - // Prepare per-layer contiguous KV cache copies + // Prepare per-layer KV cache metadata struct LayerInfo { int hd; int kvHeads; @@ -7764,8 +7908,6 @@ TSG_EXPORT int TSGgml_Gemma4ModelDecode( bool isShared; int kvSource; int attendLen; - std::vector k_buf; - std::vector v_buf; }; std::vector li(num_layers); @@ -7786,55 +7928,6 @@ TSG_EXPORT int TSGgml_Gemma4ModelDecode( info.attendLen = info.isLocal ? std::min(totalSeqLen, sliding_window) : totalSeqLen; } - // Extract KV cache data: only for unique KV source layers (avoid duplicate copies) - std::unordered_map kvSrcDone; - for (int l = 0; l < num_layers; l++) - { - auto& info = li[l]; - int kvSrc = info.kvSource; - if (kvSrcDone.count(kvSrc)) continue; - kvSrcDone[kvSrc] = 1; - - int windowLen = info.attendLen - 1; - if (windowLen <= 0) continue; - - auto& srcInfo = li[kvSrc]; - srcInfo.k_buf.resize(static_cast(windowLen) * info.kDim); - srcInfo.v_buf.resize(static_cast(windowLen) * info.kDim); - float* kc = static_cast(k_cache_arr[kvSrc]); - float* vc = static_cast(v_cache_arr[kvSrc]); - - if (info.isLocal) - { - int start = (totalSeqLen > sliding_window) ? totalSeqLen - sliding_window : 0; - for (int h = 0; h < info.kvHeads; h++) - { - float* kHead = kc + h * info.cacheSize * info.hd; - float* vHead = vc + h * info.cacheSize * info.hd; - for (int p = 0; p < windowLen; p++) - { - int cacheIdx = (start + p) % info.cacheSize; - std::memcpy(srcInfo.k_buf.data() + (h * windowLen + p) * info.hd, - kHead + cacheIdx * info.hd, info.hd * sizeof(float)); - std::memcpy(srcInfo.v_buf.data() + (h * windowLen + p) * info.hd, - vHead + cacheIdx * info.hd, info.hd * sizeof(float)); - } - } - } - else - { - for (int h = 0; h < info.kvHeads; h++) - { - std::memcpy(srcInfo.k_buf.data() + h * windowLen * info.hd, - kc + h * info.cacheSize * info.hd, - static_cast(windowLen) * info.hd * sizeof(float)); - std::memcpy(srcInfo.v_buf.data() + h * windowLen * info.hd, - vc + h * info.cacheSize * info.hd, - static_cast(windowLen) * info.hd * sizeof(float)); - } - } - } - // Create GGML context const std::size_t ctx_size = 32 * 1024 * 1024; PooledContextHandle context; @@ -7906,12 +7999,10 @@ TSG_EXPORT int TSGgml_Gemma4ModelDecode( lt.v_new_out = nullptr; } - int windowLen = info.attendLen - 1; - // For shared layers, reuse donor's cached_t (set below after all layers created) - if (!info.isShared && windowLen > 0) + if (!info.isShared) { - lt.k_cached_t = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, info.hd, windowLen, info.kvHeads); - lt.v_cached_t = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, info.hd, windowLen, info.kvHeads); + lt.k_cached_t = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, info.hd, info.cacheSize, info.kvHeads); + lt.v_cached_t = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, info.hd, info.cacheSize, info.kvHeads); } else { @@ -7946,7 +8037,11 @@ TSG_EXPORT int TSGgml_Gemma4ModelDecode( // Build compute graph ggml_tensor* hidden = current; - // Track new K/V tensors produced by non-shared layers for concat with cached + // Track the active KV tensors produced by each donor layer. + std::vector layer_k_full(num_layers, nullptr); + std::vector layer_v_full(num_layers, nullptr); + + // Track new K/V tensors produced by non-shared layers for download. std::vector layer_k_new(num_layers, nullptr); std::vector layer_v_new(num_layers, nullptr); @@ -7998,18 +8093,29 @@ TSG_EXPORT int TSGgml_Gemma4ModelDecode( ggml_tensor* k_rope_perm = ggml_permute(ctx, k_rope_t, 0, 2, 1, 3); ggml_tensor* v_3d = ggml_reshape_3d(ctx, v_normed, info.hd, info.kvHeads, 1); ggml_tensor* v_perm = ggml_permute(ctx, v_3d, 0, 2, 1, 3); - - int windowLen = info.attendLen - 1; - if (windowLen > 0) + ggml_tensor* k_write = ggml_cont(ctx, k_rope_perm); + ggml_tensor* v_write = ggml_cont(ctx, v_perm); + const int cachePos = info.isLocal ? (position % info.cacheSize) : position; + const int activeStart = info.isLocal ? ((totalSeqLen - info.attendLen) % info.cacheSize) : 0; + ggml_tensor* k_cache_updated = ggml_set_1d_inplace( + ctx, + lt.k_cached_t, + k_write, + static_cast(cachePos) * static_cast(info.hd) * sizeof(float)); + ggml_tensor* v_cache_updated = ggml_set_1d_inplace( + ctx, + lt.v_cached_t, + v_write, + static_cast(cachePos) * static_cast(info.hd) * sizeof(float)); + k_full = view_kv_cache_window(ctx, k_cache_updated, info.hd, info.cacheSize, info.kvHeads, activeStart, info.attendLen); + v_full = view_kv_cache_window(ctx, v_cache_updated, info.hd, info.cacheSize, info.kvHeads, activeStart, info.attendLen); + if (k_full == nullptr || v_full == nullptr) { - k_full = ggml_concat(ctx, lt.k_cached_t, ggml_cont(ctx, k_rope_perm), 1); - v_full = ggml_concat(ctx, lt.v_cached_t, ggml_cont(ctx, v_perm), 1); - } - else - { - k_full = ggml_cont(ctx, k_rope_perm); - v_full = ggml_cont(ctx, v_perm); + set_last_error("Failed to create Gemma4 KV cache views."); + return 0; } + layer_k_full[l] = k_full; + layer_v_full[l] = v_full; // Store new K/V refs for KV output layer_k_new[l] = k_rope_t; @@ -8030,41 +8136,18 @@ TSG_EXPORT int TSGgml_Gemma4ModelDecode( // Use the donor layer's K/V (already computed earlier in the graph) int donor = info.kvSource; - auto& donorInfo = li[donor]; - int windowLen = info.attendLen - 1; - - if (layer_k_new[donor] != nullptr && windowLen > 0) - { - // Donor's new K/V were produced - concat with cached - ggml_tensor* dk_perm = ggml_permute(ctx, layer_k_new[donor], 0, 2, 1, 3); - ggml_tensor* dv_1d = layer_v_new[donor]; - ggml_tensor* dv_3d = ggml_reshape_3d(ctx, dv_1d, donorInfo.hd, donorInfo.kvHeads, 1); - ggml_tensor* dv_perm = ggml_permute(ctx, dv_3d, 0, 2, 1, 3); - k_full = ggml_concat(ctx, lt.k_cached_t, ggml_cont(ctx, dk_perm), 1); - v_full = ggml_concat(ctx, lt.v_cached_t, ggml_cont(ctx, dv_perm), 1); - } - else if (layer_k_new[donor] != nullptr) + k_full = layer_k_full[donor]; + v_full = layer_v_full[donor]; + if (k_full == nullptr || v_full == nullptr) { - ggml_tensor* dk_perm = ggml_permute(ctx, layer_k_new[donor], 0, 2, 1, 3); - ggml_tensor* dv_1d = layer_v_new[donor]; - ggml_tensor* dv_3d = ggml_reshape_3d(ctx, dv_1d, donorInfo.hd, donorInfo.kvHeads, 1); - ggml_tensor* dv_perm = ggml_permute(ctx, dv_3d, 0, 2, 1, 3); - k_full = ggml_cont(ctx, dk_perm); - v_full = ggml_cont(ctx, dv_perm); - } - else if (windowLen > 0) - { - k_full = lt.k_cached_t; - v_full = lt.v_cached_t; - } - else - { - // No cached data and no new data - should not happen set_last_error("Shared layer has no KV data available."); return 0; } } + layer_k_full[l] = k_full; + layer_v_full[l] = v_full; + // Manual attention: scores = softmax(K^T @ Q), output = V_T @ scores // Gemma4 uses QK-Norm (per-head RMSNorm on Q/K), so no 1/sqrt(d) scaling ggml_tensor* q_attn = ggml_permute(ctx, q_rope, 0, 2, 1, 3); @@ -8166,7 +8249,8 @@ TSG_EXPORT int TSGgml_Gemma4ModelDecode( std::vector upload_list; std::vector ephemeral_bufs; - auto bind_or_mark = [&](ggml_tensor* t, void* data, std::size_t bytes, bool cacheable) { + auto bind_or_mark = [&](ggml_tensor* t, void* data, std::size_t bytes, bool cacheable, + enum ggml_backend_buffer_usage usage = GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { if (t == nullptr || data == nullptr) return; if (cacheable && bytes >= 4096) @@ -8174,7 +8258,7 @@ TSG_EXPORT int TSGgml_Gemma4ModelDecode( ggml_backend_buffer_t buf = nullptr; void* addr = nullptr; bool needs_upload = false; - if (try_get_cacheable_tensor_buffer(g_backend, dev, t, data, bytes, buf, addr, needs_upload)) + if (try_get_cacheable_tensor_buffer(g_backend, dev, t, data, bytes, buf, addr, needs_upload, usage)) { ggml_status st = ggml_backend_tensor_alloc(buf, t, addr); if (st == GGML_STATUS_SUCCESS) @@ -8223,13 +8307,8 @@ TSG_EXPORT int TSGgml_Gemma4ModelDecode( if (!info.isShared) { - int windowLen = info.attendLen - 1; - if (windowLen > 0) - { - auto& srcInfo = li[info.kvSource]; - bind_or_mark(lt.k_cached_t, srcInfo.k_buf.data(), srcInfo.k_buf.size() * sizeof(float), false); - bind_or_mark(lt.v_cached_t, srcInfo.v_buf.data(), srcInfo.v_buf.size() * sizeof(float), false); - } + bind_or_mark(lt.k_cached_t, k_cache_arr[l], kv_cache_bytes(info.kvHeads, info.cacheSize, info.hd), true, GGML_BACKEND_BUFFER_USAGE_COMPUTE); + bind_or_mark(lt.v_cached_t, v_cache_arr[l], kv_cache_bytes(info.kvHeads, info.cacheSize, info.hd), true, GGML_BACKEND_BUFFER_USAGE_COMPUTE); } if (lt.ple_gate_w != nullptr) @@ -8291,24 +8370,14 @@ TSG_EXPORT int TSGgml_Gemma4ModelDecode( ggml_backend_tensor_get(layers[l].v_new_out, v_new_buf.data(), 0, static_cast(info.kDim) * sizeof(float)); - float* kc = static_cast(k_cache_arr[l]); - float* vc = static_cast(v_cache_arr[l]); - int cachePos; if (info.isLocal) cachePos = position % info.cacheSize; else cachePos = position; - for (int h = 0; h < info.kvHeads; h++) - { - std::memcpy(kc + h * info.cacheSize * info.hd + cachePos * info.hd, - k_new_buf.data() + h * info.hd, - static_cast(info.hd) * sizeof(float)); - std::memcpy(vc + h * info.cacheSize * info.hd + cachePos * info.hd, - v_new_buf.data() + h * info.hd, - static_cast(info.hd) * sizeof(float)); - } + write_flat_kv_to_host_cache(static_cast(k_cache_arr[l]), k_new_buf.data(), info.kvHeads, info.cacheSize, info.hd, cachePos); + write_flat_kv_to_host_cache(static_cast(v_cache_arr[l]), v_new_buf.data(), info.kvHeads, info.cacheSize, info.hd, cachePos); } clear_last_error(); diff --git a/TensorSharp.GGML/GgmlBasicOps.cs b/TensorSharp.GGML/GgmlBasicOps.cs index 3088f35..24a50c5 100644 --- a/TensorSharp.GGML/GgmlBasicOps.cs +++ b/TensorSharp.GGML/GgmlBasicOps.cs @@ -371,6 +371,7 @@ public static void AddmmQuantBatch(Tensor result, Tensor m1, IntPtr weightData, public static IntPtr AlignedAlloc(long size) => GgmlNative.AlignedAlloc(size); public static void AlignedFree(IntPtr ptr) => GgmlNative.AlignedFree(ptr); public static void ClearHostBufferCache() => GgmlNative.ClearHostBufferCache(); + public static void InvalidateHostBuffer(IntPtr ptr) => GgmlNative.InvalidateHostBuffer(ptr); public static void EnsureBackendAvailable(GgmlBackendType backendType) => GgmlNative.EnsureAvailable(backendType); public static void TransformerModelDecode( diff --git a/TensorSharp.GGML/GgmlNative.cs b/TensorSharp.GGML/GgmlNative.cs index 2d9d822..f0bb98d 100644 --- a/TensorSharp.GGML/GgmlNative.cs +++ b/TensorSharp.GGML/GgmlNative.cs @@ -415,6 +415,9 @@ private static extern int TSGgml_Gemma4ModelDecode( [DllImport(DllName, CallingConvention = CallingConventionType)] private static extern void TSGgml_ClearHostBufferCache(); + [DllImport(DllName, CallingConvention = CallingConventionType)] + private static extern void TSGgml_InvalidateHostBuffer(IntPtr ptr); + [DllImport(DllName, CallingConvention = CallingConventionType)] private static extern UIntPtr TSGgml_RowSize(int ggmlType, long ne); @@ -545,7 +548,7 @@ public static void EnsureAvailable(GgmlBackendType backendType) GgmlBackendType.Cuda => "ggml-cuda", _ => "ggml-cpu", }; - throw new InvalidOperationException($"Failed to initialize {backendName}. {GetLastErrorMessage("Build the native GGML bridge and ensure the requested GGML backend is available.")}"); + throw new InvalidOperationException($"Failed to initialize {backendName}. {GetBackendAvailabilityHint(backendType)}"); } } catch (DllNotFoundException ex) @@ -868,6 +871,12 @@ public static void ClearHostBufferCache() TSGgml_ClearHostBufferCache(); } + public static void InvalidateHostBuffer(IntPtr ptr) + { + if (ptr != IntPtr.Zero) + TSGgml_InvalidateHostBuffer(ptr); + } + /// Bytes for one row along ne[0]; 0 if type/shape invalid. internal static long RowSizeBytesOrZero(int ggmlType, long ne0) { @@ -1008,5 +1017,24 @@ private static string GetLastErrorMessage(string fallback) string message = errPtr == IntPtr.Zero ? null : Marshal.PtrToStringAnsi(errPtr); return string.IsNullOrWhiteSpace(message) ? fallback : message; } + + private static string GetBackendAvailabilityHint(GgmlBackendType backendType) + { + string defaultMessage = "Build the native GGML bridge and ensure the requested GGML backend is available."; + string backendMessage = GetLastErrorMessage(defaultMessage); + + if (backendType == GgmlBackendType.Cuda && OperatingSystem.IsLinux()) + { + const string rebuildHint = "Rebuild the native GGML bridge with CUDA enabled, for example: `bash TensorSharp.GGML.Native/build-linux.sh --cuda` or `TENSORSHARP_GGML_NATIVE_ENABLE_CUDA=ON dotnet build`."; + + if (string.IsNullOrWhiteSpace(backendMessage)) + return rebuildHint; + + if (backendMessage.Contains("not available in this build", StringComparison.OrdinalIgnoreCase)) + return $"{backendMessage} {rebuildHint}"; + } + + return backendMessage; + } } } diff --git a/TensorSharp.GGML/TensorSharp.GGML.csproj b/TensorSharp.GGML/TensorSharp.GGML.csproj index e6157cd..2d93ad3 100644 --- a/TensorSharp.GGML/TensorSharp.GGML.csproj +++ b/TensorSharp.GGML/TensorSharp.GGML.csproj @@ -34,7 +34,7 @@ - + diff --git a/readme_cn.md b/readme_cn.md index 64a472b..c2cab4c 100644 --- a/readme_cn.md +++ b/readme_cn.md @@ -129,7 +129,7 @@ bash build-linux.sh --cuda TENSORSHARP_GGML_NATIVE_ENABLE_CUDA=ON dotnet build InferenceConsole/InferenceConsole.csproj -c Release ``` -在 macOS 上会生成带 Metal GPU 支持的 `libGgmlOps.dylib`。在 Linux 上,`build-linux.sh` 默认生成带 GGML CPU 后端的 `libGgmlOps.so`,而 `build-linux.sh --cuda` 会启用面向 NVIDIA GPU 的 GGML_CUDA 支持。构建产物会自动复制到应用输出目录。 +在 macOS 上会生成带 Metal GPU 支持的 `libGgmlOps.dylib`。在 Linux 上,`build-linux.sh` 会保留已有的 CUDA 构建,并在检测到 CUDA 工具链时自动启用 GGML_CUDA;也可以通过 `build-linux.sh --cuda` 或 `TENSORSHARP_GGML_NATIVE_ENABLE_CUDA=ON` 显式启用。构建产物会自动复制到应用输出目录。 ## 使用方法 From 19572eaedccc56691b7856fe85d3e925400c6536 Mon Sep 17 00:00:00 2001 From: Zhongkai Fu Date: Thu, 9 Apr 2026 08:28:55 -0700 Subject: [PATCH 3/3] Make sampled evenly video frame configurable --- InferenceEngine/MediaHelper.cs | 66 +++++++++++++++++++++-- InferenceWeb.Tests/MediaHelperTests.cs | 73 ++++++++++++++++++++++++++ InferenceWeb/ModelService.cs | 57 +++++++++++++++++--- InferenceWeb/Program.cs | 1 + README.md | 1 + readme_cn.md | 1 + 6 files changed, 189 insertions(+), 10 deletions(-) create mode 100644 InferenceWeb.Tests/MediaHelperTests.cs diff --git a/InferenceEngine/MediaHelper.cs b/InferenceEngine/MediaHelper.cs index 2d9d106..34e899a 100644 --- a/InferenceEngine/MediaHelper.cs +++ b/InferenceEngine/MediaHelper.cs @@ -18,8 +18,26 @@ namespace InferenceEngine { public static class MediaHelper { - public static List ExtractVideoFrames(string videoPath, int maxFrames = 8, double fps = 1.0) + public const int DefaultVideoMaxFrames = 4; + + public static int GetConfiguredMaxVideoFrames(int fallback = DefaultVideoMaxFrames) + { + string raw = Environment.GetEnvironmentVariable("VIDEO_MAX_FRAMES"); + if (!string.IsNullOrWhiteSpace(raw) && + int.TryParse(raw, out int parsed) && + parsed > 0) + { + return parsed; + } + + return fallback > 0 ? fallback : DefaultVideoMaxFrames; + } + + public static List ExtractVideoFrames(string videoPath, int maxFrames = 0, double fps = 1.0) { + if (maxFrames <= 0) + maxFrames = GetConfiguredMaxVideoFrames(); + string tempDir = Path.Combine(Path.GetTempPath(), $"frames_{Guid.NewGuid():N}"); Directory.CreateDirectory(tempDir); @@ -33,14 +51,18 @@ public static List ExtractVideoFrames(string videoPath, int maxFrames = throw new Exception($"Invalid video: fps={videoFps}, frames={totalFrames}"); int frameInterval = Math.Max(1, (int)Math.Round(videoFps / fps)); + var candidateFrames = new List(); + for (int frameIdx = 0; frameIdx < totalFrames; frameIdx += frameInterval) + candidateFrames.Add(frameIdx); + + var selectedPositions = SelectEvenlySpacedIndices(candidateFrames.Count, maxFrames); var frames = new List(); using var mat = new Mat(); - for (int frameIdx = 0; frames.Count < maxFrames; frameIdx += frameInterval) + foreach (int pos in selectedPositions) { - if (frameIdx >= totalFrames) - break; + int frameIdx = candidateFrames[pos]; capture.Set(VideoCaptureProperties.PosFrames, frameIdx); if (!capture.Read(mat) || mat.Empty()) @@ -54,6 +76,42 @@ public static List ExtractVideoFrames(string videoPath, int maxFrames = return frames; } + public static List SelectEvenlySpacedIndices(int count, int maxCount) + { + var indices = new List(); + if (count <= 0 || maxCount <= 0) + return indices; + + if (count <= maxCount) + { + for (int i = 0; i < count; i++) + indices.Add(i); + return indices; + } + + if (maxCount == 1) + { + indices.Add(count / 2); + return indices; + } + + double step = (double)(count - 1) / (maxCount - 1); + int previous = -1; + for (int i = 0; i < maxCount; i++) + { + int idx = (int)Math.Round(i * step); + if (idx <= previous) + idx = previous + 1; + if (idx >= count) + idx = count - 1; + + indices.Add(idx); + previous = idx; + } + + return indices; + } + private static void SaveMatAsPng(Mat mat, string path) { int width = mat.Cols; diff --git a/InferenceWeb.Tests/MediaHelperTests.cs b/InferenceWeb.Tests/MediaHelperTests.cs new file mode 100644 index 0000000..4be19b3 --- /dev/null +++ b/InferenceWeb.Tests/MediaHelperTests.cs @@ -0,0 +1,73 @@ +using InferenceEngine; + +namespace InferenceWeb.Tests; + +public class MediaHelperTests +{ + private static readonly object EnvLock = new(); + + [Fact] + public void SelectEvenlySpacedIndicesReturnsAllIndicesWhenAlreadyUnderLimit() + { + var indices = MediaHelper.SelectEvenlySpacedIndices(count: 3, maxCount: 4); + + Assert.Equal(new[] { 0, 1, 2 }, indices); + } + + [Fact] + public void SelectEvenlySpacedIndicesIncludesEndpointsWhenDownsampling() + { + var indices = MediaHelper.SelectEvenlySpacedIndices(count: 8, maxCount: 4); + + Assert.Equal(4, indices.Count); + Assert.Equal(0, indices[0]); + Assert.Equal(7, indices[^1]); + Assert.Equal(new[] { 0, 2, 5, 7 }, indices); + } + + [Fact] + public void SelectEvenlySpacedIndicesUsesMiddleFrameWhenOnlyOneIsRequested() + { + var indices = MediaHelper.SelectEvenlySpacedIndices(count: 9, maxCount: 1); + + Assert.Equal(new[] { 4 }, indices); + } + + [Fact] + public void GetConfiguredMaxVideoFramesFallsBackToDefaultWhenUnset() + { + lock (EnvLock) + { + string? oldValue = Environment.GetEnvironmentVariable("VIDEO_MAX_FRAMES"); + try + { + Environment.SetEnvironmentVariable("VIDEO_MAX_FRAMES", null); + + Assert.Equal(MediaHelper.DefaultVideoMaxFrames, MediaHelper.GetConfiguredMaxVideoFrames()); + } + finally + { + Environment.SetEnvironmentVariable("VIDEO_MAX_FRAMES", oldValue); + } + } + } + + [Fact] + public void GetConfiguredMaxVideoFramesUsesPositiveEnvironmentOverride() + { + lock (EnvLock) + { + string? oldValue = Environment.GetEnvironmentVariable("VIDEO_MAX_FRAMES"); + try + { + Environment.SetEnvironmentVariable("VIDEO_MAX_FRAMES", "6"); + + Assert.Equal(6, MediaHelper.GetConfiguredMaxVideoFrames()); + } + finally + { + Environment.SetEnvironmentVariable("VIDEO_MAX_FRAMES", oldValue); + } + } + } +} diff --git a/InferenceWeb/ModelService.cs b/InferenceWeb/ModelService.cs index 9996425..81ffdcf 100644 --- a/InferenceWeb/ModelService.cs +++ b/InferenceWeb/ModelService.cs @@ -141,11 +141,12 @@ public async IAsyncEnumerable ChatStreamAsync( List tools = null, bool enableThinking = false) { string arch = _model.Config.Architecture; + var preparedHistory = PrepareHistoryForInference(history, arch); string rendered = ChatTemplate.RenderFromGgufTemplate( - _model.Config.ChatTemplate, history, addGenerationPrompt: true, + _model.Config.ChatTemplate, preparedHistory, addGenerationPrompt: true, architecture: arch, tools: tools, enableThinking: enableThinking); - var lastMsg = history.LastOrDefault(m => m.Role == "user"); + var lastMsg = preparedHistory.LastOrDefault(m => m.Role == "user"); bool hasMultimodal = HasMultimodalContent(lastMsg); float[] logits; @@ -399,12 +400,13 @@ public bool EnsureModelLoaded(string modelName, string modelDir, string defaultB new ChatMessage { Role = "user", Content = prompt, ImagePaths = imagePaths } }; + var preparedMessages = PrepareHistoryForInference(messages, arch); string rendered = ChatTemplate.RenderFromGgufTemplate( - _model.Config.ChatTemplate, messages, addGenerationPrompt: true, + _model.Config.ChatTemplate, preparedMessages, addGenerationPrompt: true, architecture: arch); var inputTokens = _model.Tokenizer.Encode(rendered, addSpecial: true); - var lastMsg = messages[0]; + var lastMsg = preparedMessages[0]; if (lastMsg.ImagePaths != null && lastMsg.ImagePaths.Count > 0) inputTokens = ProcessMultimodal(lastMsg, inputTokens, arch); @@ -465,11 +467,12 @@ public bool EnsureModelLoaded(string modelName, string modelDir, string defaultB List tools = null, bool enableThinking = false) { string arch = _model.Config.Architecture; + var preparedHistory = PrepareHistoryForInference(history, arch); string rendered = ChatTemplate.RenderFromGgufTemplate( - _model.Config.ChatTemplate, history, addGenerationPrompt: true, + _model.Config.ChatTemplate, preparedHistory, addGenerationPrompt: true, architecture: arch, tools: tools, enableThinking: enableThinking); - var lastMsg = history.LastOrDefault(m => m.Role == "user"); + var lastMsg = preparedHistory.LastOrDefault(m => m.Role == "user"); bool hasMultimodal = HasMultimodalContent(lastMsg); int promptTokenCount; @@ -559,6 +562,48 @@ private bool TryGetCacheSuffix(string rendered, out string suffixText) return false; } + private static List PrepareHistoryForInference(List history, string arch) + { + if (history == null || history.Count == 0) + return history; + + int lastUserIdx = history.FindLastIndex(m => m.Role == "user"); + if (lastUserIdx < 0) + return history; + + var normalized = NormalizeMessageForInference(history[lastUserIdx], arch); + if (ReferenceEquals(normalized, history[lastUserIdx])) + return history; + + var prepared = new List(history); + prepared[lastUserIdx] = normalized; + return prepared; + } + + private static ChatMessage NormalizeMessageForInference(ChatMessage msg, string arch) + { + int maxVideoFrames = MediaHelper.GetConfiguredMaxVideoFrames(); + if (arch != "gemma4" || !msg.IsVideo || msg.ImagePaths == null || msg.ImagePaths.Count <= maxVideoFrames) + return msg; + + var sampled = MediaHelper.SelectEvenlySpacedIndices(msg.ImagePaths.Count, maxVideoFrames) + .Select(i => msg.ImagePaths[i]) + .ToList(); + + Console.WriteLine($"[video] Downsampled {msg.ImagePaths.Count} frames to {sampled.Count} evenly spaced frames for Gemma4 prefill stability."); + + return new ChatMessage + { + Role = msg.Role, + Content = msg.Content, + ImagePaths = sampled, + AudioPaths = msg.AudioPaths != null ? new List(msg.AudioPaths) : null, + IsVideo = msg.IsVideo, + ToolCalls = msg.ToolCalls, + Thinking = msg.Thinking + }; + } + private static bool HasMultimodalContent(ChatMessage msg) { if (msg == null) return false; diff --git a/InferenceWeb/Program.cs b/InferenceWeb/Program.cs index e34be88..162efa7 100644 --- a/InferenceWeb/Program.cs +++ b/InferenceWeb/Program.cs @@ -1471,6 +1471,7 @@ static string ResolveModelPath(string modelName, string modelDir) } Console.WriteLine($"Model directory: {modelDir}"); +Console.WriteLine($"Video max frames: {MediaHelper.GetConfiguredMaxVideoFrames()}"); Console.WriteLine("Starting InferenceWeb on http://localhost:5000"); Console.WriteLine("API endpoints:"); Console.WriteLine(" GET / - Health check"); diff --git a/README.md b/README.md index 7564295..b2c2e9e 100644 --- a/README.md +++ b/README.md @@ -237,6 +237,7 @@ Open `http://localhost:5000` in your browser. The web interface supports: |---|---| | `MODEL_DIR` | Directory containing GGUF model files | | `BACKEND` | Compute backend: `cpu`, `ggml_cpu`, `ggml_metal`, or `ggml_cuda` (default: `ggml_metal` on macOS, `ggml_cpu` elsewhere) | +| `VIDEO_MAX_FRAMES` | Maximum evenly spaced video frames extracted for video prompts (default: `4`) | | `PORT` | HTTP port (default: `5000`) | ### HTTP APIs diff --git a/readme_cn.md b/readme_cn.md index c2cab4c..4d7c362 100644 --- a/readme_cn.md +++ b/readme_cn.md @@ -237,6 +237,7 @@ MODEL_DIR=./models BACKEND=ggml_cuda ./InferenceWeb |---|---| | `MODEL_DIR` | GGUF 模型文件所在目录 | | `BACKEND` | 计算后端:`cpu`、`ggml_cpu`、`ggml_metal` 或 `ggml_cuda`(默认:macOS 为 `ggml_metal`,其他平台为 `ggml_cpu`) | +| `VIDEO_MAX_FRAMES` | 视频提示词中均匀抽取的视频帧上限(默认:`4`) | | `PORT` | HTTP 端口(默认:`5000`) | ### HTTP API