From f6c15d00c58a09e7802bf8c7b872b23f59b213b6 Mon Sep 17 00:00:00 2001
From: Zhongkai Fu <fuzhongkai@gmail.com>
Date: Thu, 9 Apr 2026 00:30:04 -0700
Subject: [PATCH 1/3] support jpg format

---
 InferenceConsole/Program.cs                   |  2 +-
 InferenceEngine/InferenceEngine.csproj        |  4 +
 .../Models/Gemma3/ImageProcessor.cs           | 56 ++++++++++-
 .../Models/Qwen35/ImageProcessor.cs           | 12 +--
 InferenceWeb.Tests/ImageProcessorTests.cs     | 97 +++++++++++++++++++
 InferenceWeb/ModelService.cs                  |  2 +-
 6 files changed, 158 insertions(+), 15 deletions(-)
 create mode 100644 InferenceWeb.Tests/ImageProcessorTests.cs
diff --git a/InferenceConsole/Program.cs b/InferenceConsole/Program.cs
index 38f134f..8f0d07f 100644
--- a/InferenceConsole/Program.cs
+++ b/InferenceConsole/Program.cs
@@ -628,7 +628,7 @@ static string RunInference(ModelBase model, string rawText, List<string> imagePa
                         var tokenCounts = new int[imagePaths.Count];
                         for (int i = 0; i < imagePaths.Count; i++)
                         {
-                            var (width, height) = Qwen35ImageProcessor.ReadPngDimensions(imagePaths[i]);
+                            var (width, height) = Qwen35ImageProcessor.ReadImageDimensions(imagePaths[i]);
                             tokenCounts[i] = processor.ComputeImageTokenCount(height, width);
                             var (gridH, gridW) = processor.GetPatchGrid(height, width);
                             var (resizedH, resizedW) = processor.SmartResize(height, width);
diff --git a/InferenceEngine/InferenceEngine.csproj b/InferenceEngine/InferenceEngine.csproj
index 548c5db..6c04c0d 100644
--- a/InferenceEngine/InferenceEngine.csproj
+++ b/InferenceEngine/InferenceEngine.csproj
@@ -12,7 +12,11 @@
   <ItemGroup>
     <PackageReference Include="NLayer" Version="1.16.0" />
     <PackageReference Include="NVorbis" Version="0.10.5" />
+    <PackageReference Include="OneWare.OpenCvSharp4.runtime.ubuntu.24.04-x64" Version="4.13.0.18" />
     <PackageReference Include="OneWare.OpenCvSharp4.runtime.osx-arm64" Version="4.13.0.18" />
+    <PackageReference Include="OneWare.OpenCvSharp4.runtime.osx-x64" Version="4.13.0.18" />
+    <PackageReference Include="OneWare.OpenCvSharp4.runtime.win-x64" Version="4.13.0.18" />
     <PackageReference Include="OpenCvSharp4" Version="4.13.0.20260330" />
+    <PackageReference Include="StbImageSharp" Version="2.30.15" />
   </ItemGroup>
 </Project>
diff --git a/InferenceEngine/Models/Gemma3/ImageProcessor.cs b/InferenceEngine/Models/Gemma3/ImageProcessor.cs
index def272e..02c66f8 100644
--- a/InferenceEngine/Models/Gemma3/ImageProcessor.cs
+++ b/InferenceEngine/Models/Gemma3/ImageProcessor.cs
@@ -9,6 +9,7 @@
 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the BSD-3-Clause License for more details.
 using System;
 using System.IO;
+using StbImageSharp;
 
 namespace InferenceEngine
 {
@@ -52,15 +53,43 @@ public float[] ProcessImage(string imagePath)
 
         internal static byte[] DecodeImageToRGBA(byte[] fileBytes, out int width, out int height)
         {
-            if (fileBytes.Length >= 8 && fileBytes[0] == 0x89 && fileBytes[1] == 0x50)
+            if (IsPng(fileBytes))
                 return DecodePNG(fileBytes, out width, out height);
 
-            if (fileBytes.Length >= 2 && fileBytes[0] == 0xFF && fileBytes[1] == 0xD8)
+            if (IsJpeg(fileBytes))
                 return DecodeJPEG(fileBytes, out width, out height);
 
             throw new NotSupportedException("Only PNG and JPEG image formats are supported");
         }
 
+        /// <summary>Reads the pixel size of a PNG or JPEG file from its header, without decoding any pixel data.</summary>
+        /// <exception cref="NotSupportedException">The file is neither PNG nor JPEG.</exception>
+        /// <exception cref="InvalidDataException">The PNG or JPEG header could not be parsed.</exception>
+        internal static (int width, int height) ReadImageDimensions(string imagePath)
+        {
+            byte[] fileBytes = File.ReadAllBytes(imagePath);
+            if (IsPng(fileBytes))
+                return ReadPngDimensions(fileBytes);
+            if (!IsJpeg(fileBytes))
+                throw new NotSupportedException("Only PNG and JPEG image formats are supported");
+            // Header-only query: a full DecodeJPEG would allocate width*height*4 bytes just to discard them.
+            using var jpegStream = new MemoryStream(fileBytes);
+            ImageInfo info = ImageInfo.FromStream(jpegStream) ?? throw new InvalidDataException("Failed to read JPEG image header.");
+            return (info.Width, info.Height);
+        }
+
+        /// <summary>True when the buffer starts with the complete 8-byte PNG signature (89 50 4E 47 0D 0A 1A 0A).</summary>
+        private static bool IsPng(byte[] fileBytes) =>
+            fileBytes.Length >= 8 &&
+            fileBytes[0] == 0x89 && fileBytes[1] == 0x50 && fileBytes[2] == 0x4E && fileBytes[3] == 0x47 &&
+            // The CR-LF / EOF / LF tail exists to expose line-ending and truncation damage, so it is checked too.
+            fileBytes[4] == 0x0D && fileBytes[5] == 0x0A && fileBytes[6] == 0x1A && fileBytes[7] == 0x0A;
+
+        /// <summary>True when the buffer holds at least two bytes and they are FF D8 (the JPEG start-of-image marker).</summary>
+        private static bool IsJpeg(byte[] fileBytes) =>
+            // Negated "any mismatch" form; short-circuiting still keeps the index reads in bounds.
+            !(fileBytes.Length < 2 || fileBytes[0] != 0xFF || fileBytes[1] != 0xD8);
+
         private static byte[] DecodePNG(byte[] data, out int width, out int height)
         {
             using var ms = new MemoryStream(data);
@@ -179,6 +208,16 @@ private static byte[] DecodePNG(byte[] data, out int width, out int height)
             return rgba;
         }
 
+        /// <summary>Reads width/height from the big-endian fields at byte offsets 16-23; throws InvalidDataException for non-PNG, short, or non-positive-size data.</summary>
+        private static (int width, int height) ReadPngDimensions(byte[] data)
+        {
+            if (data.Length < 24 || !IsPng(data))
+                throw new InvalidDataException("Not a PNG file");
+            int width = (data[16] << 24) | (data[17] << 16) | (data[18] << 8) | data[19];
+            int height = (data[20] << 24) | (data[21] << 16) | (data[22] << 8) | data[23];
+            return width > 0 && height > 0 ? (width, height) : throw new InvalidDataException("Invalid PNG dimensions"); // a set top bit reads as negative
+        }
+
         private static byte PaethPredictor(byte a, byte b, byte c)
         {
             int p = a + b - c;
@@ -196,8 +235,17 @@ private static int ReadBigEndianInt32(BinaryReader reader)
 
         private static byte[] DecodeJPEG(byte[] data, out int width, out int height)
         {
-            throw new NotSupportedException(
-                "JPEG decoding not implemented. Please convert image to PNG format.");
+            try
+            {
+                ImageResult decoded = ImageResult.FromMemory(data, ColorComponents.RedGreenBlueAlpha);
+                width = decoded.Width;
+                height = decoded.Height;
+                return decoded.Data;
+            }
+            catch (Exception ex)
+            {
+                throw new InvalidDataException("Failed to decode JPEG image.", ex);
+            }
         }
 
         internal static byte[] CompositeOverWhite(byte[] rgba, int width, int height)
diff --git a/InferenceEngine/Models/Qwen35/ImageProcessor.cs b/InferenceEngine/Models/Qwen35/ImageProcessor.cs
index c7c2a40..af94030 100644
--- a/InferenceEngine/Models/Qwen35/ImageProcessor.cs
+++ b/InferenceEngine/Models/Qwen35/ImageProcessor.cs
@@ -8,7 +8,6 @@
 // TensorSharp is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the BSD-3-Clause License for more details.
 using System;
-using System.Buffers.Binary;
 using System.IO;
 
 namespace InferenceEngine
@@ -34,14 +33,9 @@ public Qwen35ImageProcessor(int patchSize = 14, int mergeSize = 2,
             LongestEdge = longestEdge;
         }
 
-        public static (int width, int height) ReadPngDimensions(string path)
+        public static (int width, int height) ReadImageDimensions(string path)
         {
-            using var stream = File.OpenRead(path);
-            Span<byte> header = stackalloc byte[24];
-            stream.Read(header);
-            int width = BinaryPrimitives.ReadInt32BigEndian(header.Slice(16, 4));
-            int height = BinaryPrimitives.ReadInt32BigEndian(header.Slice(20, 4));
-            return (width, height);
+            return Gemma3ImageProcessor.ReadImageDimensions(path);
         }
 
         public (int height, int width) SmartResize(int height, int width)
@@ -79,7 +73,7 @@ public int ComputeImageTokenCount(int origHeight, int origWidth)
 
         public int ComputeImageTokenCount(string imagePath)
         {
-            var (width, height) = ReadPngDimensions(imagePath);
+            var (width, height) = ReadImageDimensions(imagePath);
             return ComputeImageTokenCount(height, width);
         }
 
diff --git a/InferenceWeb.Tests/ImageProcessorTests.cs b/InferenceWeb.Tests/ImageProcessorTests.cs
new file mode 100644
index 0000000..92dd8ca
--- /dev/null
+++ b/InferenceWeb.Tests/ImageProcessorTests.cs
@@ -0,0 +1,97 @@
+using InferenceEngine;
+
+namespace InferenceWeb.Tests;
+
+public class ImageProcessorTests
+{
+    private const string EmbeddedJpegBase64 =
+        "/9j/4AAQSkZJRgABAQEAYABgAAD/2wBDAAMCAgMCAgMDAwMEAwMEBQgFBQQEBQoHBwYIDAoMDAsKCwsNDhIQDQ4RDgsLEBYQERMUFRUVDA8XGBYUGBIUFRT/2wBDAQMEBAUEBQkFBQkUDQsNFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBT/wAARCAACAAIDASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwD7V/Z2+C3w91v9n74ZajqPgTwzf6heeF9MuLm7utHt5JZ5XtImd3dkJZmJJJJySSTRRRXyOL/3ip/if5nwmO/3qr/il+bP/9k=";
+
+    // Gemma3's pipeline must turn the embedded JPEG fixture into 3 planes of edge x edge values, all within [-1, 1].
+    [Fact]
+    public void Gemma3ImageProcessorProcessImageSupportsJpeg()
+    {
+        const int edge = 32;
+        string jpegFile = WriteEmbeddedJpeg();
+        try
+        {
+            float[] planes = new Gemma3ImageProcessor(imageSize: edge).ProcessImage(jpegFile);
+            Assert.Equal(3 * edge * edge, planes.Length);
+            Assert.All(planes, sample => Assert.InRange(sample, -1f, 1f));
+        }
+        finally
+        {
+            File.Delete(jpegFile);
+        }
+    }
+
+    // Gemma4's processor must report the embedded JPEG fixture as 2x2 and return 12 values, all within [-1, 1].
+    [Fact]
+    public void Gemma4ImageProcessorProcessImageSupportsJpeg()
+    {
+        string jpegFile = WriteEmbeddedJpeg();
+        try
+        {
+            var gemma4 = new Gemma4ImageProcessor(patchSize: 1, nMerge: 1, minTokens: 1, maxTokens: 4);
+            var (planes, outWidth, outHeight) = gemma4.ProcessImage(jpegFile);
+            Assert.Equal(2, outWidth);
+            Assert.Equal(2, outHeight);
+            Assert.Equal(12, planes.Length);
+            Assert.All(planes, sample => Assert.InRange(sample, -1f, 1f));
+        }
+        finally
+        {
+            File.Delete(jpegFile);
+        }
+    }
+
+    // The path-based token count must accept a JPEG file instead of assuming a PNG header.
+    [Fact]
+    public void Qwen35ImageProcessorComputeImageTokenCountSupportsJpeg()
+    {
+        string jpegFile = WriteEmbeddedJpeg();
+        try
+        {
+            var qwen = new Qwen35ImageProcessor(patchSize: 1, mergeSize: 1, shortestEdge: 1, longestEdge: 16);
+            // Expected value is for the embedded fixture under 1x1 patches with merge size 1.
+            Assert.Equal(4, qwen.ComputeImageTokenCount(jpegFile));
+        }
+        finally
+        {
+            File.Delete(jpegFile);
+        }
+    }
+
+    // Optional smoke test: runs all three processors on the file named by TENSORSHARP_JPEG_SMOKE_PATH; passes with no checks when it is unset or missing.
+    [Fact]
+    public void UserSuppliedJpegSmokeTestWhenConfigured()
+    {
+        string? jpegFile = Environment.GetEnvironmentVariable("TENSORSHARP_JPEG_SMOKE_PATH");
+        if (string.IsNullOrWhiteSpace(jpegFile) || !File.Exists(jpegFile))
+            return;
+
+        float[] gemma3Planes = new Gemma3ImageProcessor(imageSize: 32).ProcessImage(jpegFile);
+        var (srcWidth, srcHeight) = Qwen35ImageProcessor.ReadImageDimensions(jpegFile);
+        int pixelCount = srcWidth * srcHeight;
+
+        var qwen = new Qwen35ImageProcessor(patchSize: 1, mergeSize: 1, shortestEdge: 1, longestEdge: pixelCount);
+        int qwenTokens = qwen.ComputeImageTokenCount(jpegFile);
+        var gemma4 = new Gemma4ImageProcessor(patchSize: 1, nMerge: 1, minTokens: 1, maxTokens: pixelCount);
+        var (gemma4Planes, gemma4Width, gemma4Height) = gemma4.ProcessImage(jpegFile);
+
+        Assert.Equal(3 * 32 * 32, gemma3Planes.Length);
+        Assert.True(srcWidth > 0);
+        Assert.True(srcHeight > 0);
+        Assert.True(qwenTokens > 0);
+        Assert.Equal(srcWidth, gemma4Width);
+        Assert.Equal(srcHeight, gemma4Height);
+        Assert.Equal(3 * pixelCount, gemma4Planes.Length);
+    }
+
+    private static string WriteEmbeddedJpeg() // Writes the embedded fixture to a uniquely named temp file and returns its path; callers delete it.
+    {
+        string tempFile = Path.Combine(Path.GetTempPath(), Guid.NewGuid().ToString("N") + ".jpg");
+        File.WriteAllBytes(tempFile, Convert.FromBase64String(EmbeddedJpegBase64));
+        return tempFile;
+    }
+}
diff --git a/InferenceWeb/ModelService.cs b/InferenceWeb/ModelService.cs
index 5404584..9996425 100644
--- a/InferenceWeb/ModelService.cs
+++ b/InferenceWeb/ModelService.cs
@@ -290,7 +290,7 @@ private List<int> ProcessImages(ChatMessage msg, List<int> inputTokens, string a
                     var tokenCounts = new int[msg.ImagePaths.Count];
                     for (int i = 0; i < msg.ImagePaths.Count; i++)
                     {
-                        var (w, h) = Qwen35ImageProcessor.ReadPngDimensions(msg.ImagePaths[i]);
+                        var (w, h) = Qwen35ImageProcessor.ReadImageDimensions(msg.ImagePaths[i]);
                         tokenCounts[i] = processor.ComputeImageTokenCount(h, w);
                     }
                     inputTokens = ChatTemplate.ExpandImageTokens(inputTokens, imagePadId, tokenCounts);

From 4bb80c793895deb579688cb9878f23464765162b Mon Sep 17 00:00:00 2001
From: Zhongkai Fu <fuzhongkai@gmail.com>
Date: Thu, 9 Apr 2026 08:00:04 -0700
Subject: [PATCH 2/3] Improve audio/video/image encoder performance

---
 InferenceEngine/ModelBase.cs                  |  12 +
 InferenceEngine/Models/Gemma3/Gemma3Model.cs  |  12 +-
 .../Models/Gemma3/Gemma3VisionEncoder.cs      | 134 ++---
 .../Models/Gemma3/ImageProcessor.cs           |  99 +++-
 .../Models/Gemma4/Gemma4AudioEncoder.cs       |  85 +--
 .../Models/Gemma4/Gemma4AudioPreprocessor.cs  |  91 +--
 .../Models/Gemma4/Gemma4ImageProcessor.cs     |   6 +-
 InferenceEngine/Models/Gemma4/Gemma4Model.cs  |   4 +
 .../Models/Gemma4/Gemma4VisionEncoder.cs      | 222 ++++---
 InferenceEngine/Models/Qwen3/Qwen3Model.cs    |   2 +
 .../Models/Qwen35/ImageProcessor.cs           |   6 +-
 InferenceEngine/Models/Qwen35/Qwen35Model.cs  |   2 +
 .../Models/Qwen35/Qwen35VisionEncoder.cs      | 342 +++++------
 InferenceWeb/Program.cs                       |   4 +
 README.md                                     |   2 +-
 TensorSharp.GGML.Native/build-linux.sh        |  55 +-
 TensorSharp.GGML.Native/ggml_ops.cpp          | 555 ++++++++++--------
 TensorSharp.GGML/GgmlBasicOps.cs              |   1 +
 TensorSharp.GGML/GgmlNative.cs                |  30 +-
 TensorSharp.GGML/TensorSharp.GGML.csproj      |   2 +-
 readme_cn.md                                  |   2 +-
 21 files changed, 961 insertions(+), 707 deletions(-)

diff --git a/InferenceEngine/ModelBase.cs b/InferenceEngine/ModelBase.cs
index f867729..d69022f 100644
--- a/InferenceEngine/ModelBase.cs
+++ b/InferenceEngine/ModelBase.cs
@@ -626,6 +626,7 @@ protected void CopyToCache(Tensor cache, Tensor src, int startPos, int seqLen)
         {
             using var cacheSlice = cache.Narrow(1, startPos, seqLen);
             Ops.Copy(cacheSlice, src);
+            InvalidateTensorDeviceCache(cache);
         }
 
         protected Tensor ExpandKVHeads(Tensor cache, int groupSize, int totalSeqLen)
@@ -653,6 +654,9 @@ protected unsafe void CopyToCacheDecode(Tensor kCache, Tensor kTensor,
                 Buffer.MemoryCopy(kSrc + srcOffset, kCachePtr + cacheOffset, headBytes, headBytes);
                 Buffer.MemoryCopy(vSrc + srcOffset, vCachePtr + cacheOffset, headBytes, headBytes);
             }
+
+            InvalidateTensorDeviceCache(kCache);
+            InvalidateTensorDeviceCache(vCache);
         }
 
         protected unsafe void AttentionDecodePureCS(Tensor q, Tensor kCache, Tensor vCache,
@@ -718,6 +722,14 @@ private static IntPtr GetStoragePtr(Tensor t)
             throw new NotSupportedException("Requires GgmlStorage or CpuStorage");
         }
 
+        /// <summary>Asks GgmlBasicOps to invalidate the host buffer backing <paramref name="tensor"/>; does nothing when the backend is not GGML or the tensor is null.</summary>
+        protected void InvalidateTensorDeviceCache(Tensor tensor)
+        {
+            // Positive form of the guard: both conditions must hold before the storage pointer is looked up.
+            if (IsGgmlBackend && tensor != null)
+                GgmlBasicOps.InvalidateHostBuffer(GetStoragePtr(tensor));
+        }
+
         public abstract float[] Forward(int[] tokens);
         public abstract void ResetKVCache();
 
diff --git a/InferenceEngine/Models/Gemma3/Gemma3Model.cs b/InferenceEngine/Models/Gemma3/Gemma3Model.cs
index 9ab0f8e..f676505 100644
--- a/InferenceEngine/Models/Gemma3/Gemma3Model.cs
+++ b/InferenceEngine/Models/Gemma3/Gemma3Model.cs
@@ -127,8 +127,16 @@ public override void ResetKVCache()
             _cacheSeqLen = 0;
             if (_kvCacheK != null)
             {
-                foreach (var k in _kvCacheK) Ops.Fill(k, 0f);
-                foreach (var v in _kvCacheV) Ops.Fill(v, 0f);
+                foreach (var k in _kvCacheK)
+                {
+                    Ops.Fill(k, 0f);
+                    InvalidateTensorDeviceCache(k);
+                }
+                foreach (var v in _kvCacheV)
+                {
+                    Ops.Fill(v, 0f);
+                    InvalidateTensorDeviceCache(v);
+                }
             }
         }
 
diff --git a/InferenceEngine/Models/Gemma3/Gemma3VisionEncoder.cs b/InferenceEngine/Models/Gemma3/Gemma3VisionEncoder.cs
index 9eea4ab..389744b 100644
--- a/InferenceEngine/Models/Gemma3/Gemma3VisionEncoder.cs
+++ b/InferenceEngine/Models/Gemma3/Gemma3VisionEncoder.cs
@@ -11,13 +11,16 @@
 using System.Collections.Generic;
 using TensorSharp;
 using TensorSharp.Cpu;
+using TensorSharp.GGML;
 
 namespace InferenceEngine
 {
     public class Gemma3VisionEncoder : IDisposable
     {
         private readonly Dictionary<string, Tensor> _weights = new();
+        private readonly Dictionary<string, Tensor> _transposedWeights = new();
         private readonly IAllocator _allocator;
+        private readonly bool _useNativeAttention;
 
         private readonly int _imageSize;
         private readonly int _patchSize;
@@ -35,6 +38,7 @@ public class Gemma3VisionEncoder : IDisposable
         public Gemma3VisionEncoder(string mmProjPath, IAllocator allocator)
         {
             _allocator = allocator;
+            _useNativeAttention = allocator is GgmlAllocator;
             var gguf = new GgufFile(mmProjPath);
 
             _imageSize = (int)gguf.GetUint32("clip.vision.image_size", 896);
@@ -219,7 +223,16 @@ private Tensor VisionSelfAttention(Tensor input, string prefix, int numPatches,
 
             float scale = 1f / MathF.Sqrt(headDim);
 
-            // Reshape [numPatches, hiddenSize] -> [numHeads, numPatches, headDim]
+            if (_useNativeAttention)
+            {
+                using var q4 = q.View(1, numPatches, _numHeads, headDim);
+                using var k4 = k.View(1, numPatches, _numHeads, headDim);
+                using var v4 = v.View(1, numPatches, _numHeads, headDim);
+                using var attn4 = Ops.ScaledDotProductAttention(null, q4, k4, v4, null, scale);
+                using var flat = attn4.View(numPatches, _hiddenSize);
+                return LinearForwardWithBias(flat, $"{prefix}.attn_out.weight", $"{prefix}.attn_out.bias");
+            }
+
             using var qReshaped = q.View(numPatches, _numHeads, headDim);
             using var kReshaped = k.View(numPatches, _numHeads, headDim);
             using var vReshaped = v.View(numPatches, _numHeads, headDim);
@@ -231,49 +244,30 @@ private Tensor VisionSelfAttention(Tensor input, string prefix, int numPatches,
             using var kHeads = Ops.NewContiguous(kT0);
             using var vHeads = Ops.NewContiguous(vT0);
 
-            // Batched Q @ K^T -> [numHeads, numPatches, numPatches]
             using var kT = kHeads.Transpose(1, 2);
             var scores = new Tensor(_allocator, DType.Float32, _numHeads, numPatches, numPatches);
             Ops.AddmmBatch(scores, 0, scores, scale, qHeads, kT);
-
             Ops.Softmax(scores, scores);
 
-            // Batched softmax @ V -> [numHeads, numPatches, headDim]
             var attnOutput = new Tensor(_allocator, DType.Float32, _numHeads, numPatches, headDim);
             Ops.AddmmBatch(attnOutput, 0, attnOutput, 1.0f, scores, vHeads);
             scores.Dispose();
 
-            // Reshape back: [numHeads, numPatches, headDim] -> [numPatches, hiddenSize]
             using var transposed = attnOutput.Transpose(0, 1);
             using var contiguous = Ops.NewContiguous(transposed);
-            using var flat = contiguous.View(numPatches, _hiddenSize);
-            using var flatContig = Ops.NewContiguous(flat);
+            using var flatContig = contiguous.View(numPatches, _hiddenSize);
             attnOutput.Dispose();
 
             return LinearForwardWithBias(flatContig, $"{prefix}.attn_out.weight", $"{prefix}.attn_out.bias");
         }
 
-        private unsafe Tensor VisionMLP(Tensor input, string prefix)
+        private Tensor VisionMLP(Tensor input, string prefix)
         {
             using var fc1Out = LinearForwardWithBias(input, $"{prefix}.ffn_down.weight", $"{prefix}.ffn_down.bias");
-
-            ApplyGELU(fc1Out);
-
+            Ops.GELU(fc1Out, fc1Out);
             return LinearForwardWithBias(fc1Out, $"{prefix}.ffn_up.weight", $"{prefix}.ffn_up.bias");
         }
 
-        private unsafe void ApplyGELU(Tensor t)
-        {
-            float* ptr = GetFloatPtr(t);
-            int count = (int)t.ElementCount();
-            for (int i = 0; i < count; i++)
-            {
-                double x = ptr[i];
-                double cdf = 0.5 * (1.0 + Math.Tanh(Math.Sqrt(2.0 / Math.PI) * (x + 0.044715 * x * x * x)));
-                ptr[i] = (float)(x * cdf);
-            }
-        }
-
         /// <summary>
         /// Multi-modal projector: vision output → text space.
         /// Steps: reshape to 2D grid → average pool → RMSNorm → linear projection.
@@ -354,91 +348,25 @@ private unsafe Tensor LinearForwardWithBias(Tensor input, string weightName, str
 
             Tensor contiguousInput = input.IsContiguous() ? null : Ops.NewContiguous(input);
             Tensor src = contiguousInput ?? input;
-
-            using var wT = weight.Transpose();
-            Ops.Addmm(result, 0, result, 1.0f, src, wT);
+            Ops.Addmm(result, 0, result, 1.0f, src, GetOrCreateTransposedWeight(weightName));
 
             contiguousInput?.Dispose();
 
             if (_weights.TryGetValue(biasName, out var bias))
-            {
-                float* rPtr = GetFloatPtr(result);
-                float* bPtr = GetFloatPtr(bias);
-                for (int s = 0; s < seqLen; s++)
-                {
-                    float* row = rPtr + s * outDim;
-                    for (int d = 0; d < outDim; d++)
-                        row[d] += bPtr[d];
-                }
-            }
+                Ops.Add(result, result, bias);
 
             return result;
         }
 
-        private unsafe Tensor LayerNormOp(Tensor input, string weightName, string biasName)
+        private Tensor LayerNormOp(Tensor input, string weightName, string biasName)
         {
-            int rows = (int)input.Sizes[0];
-            int dim = (int)input.Sizes[1];
-            var result = new Tensor(_allocator, DType.Float32, rows, dim);
-
-            float* src = GetFloatPtr(input);
-            float* dst = GetFloatPtr(result);
-            float* w = GetFloatPtr(_weights[weightName]);
-            float* b = _weights.ContainsKey(biasName) ? GetFloatPtr(_weights[biasName]) : null;
-
-            for (int r = 0; r < rows; r++)
-            {
-                float* srcRow = src + r * dim;
-                float* dstRow = dst + r * dim;
-
-                float mean = 0;
-                for (int i = 0; i < dim; i++)
-                    mean += srcRow[i];
-                mean /= dim;
-
-                float variance = 0;
-                for (int i = 0; i < dim; i++)
-                {
-                    float diff = srcRow[i] - mean;
-                    variance += diff * diff;
-                }
-                variance /= dim;
-
-                float invStd = 1f / MathF.Sqrt(variance + _eps);
-                for (int i = 0; i < dim; i++)
-                {
-                    float normalized = (srcRow[i] - mean) * invStd;
-                    dstRow[i] = w[i] * normalized + (b != null ? b[i] : 0f);
-                }
-            }
-
-            return result;
+            _weights.TryGetValue(biasName, out var bias);
+            return Ops.LayerNorm(null, input, _weights[weightName], bias, _eps);
         }
 
-        private unsafe Tensor RMSNormOp(Tensor input, string weightName)
+        private Tensor RMSNormOp(Tensor input, string weightName)
         {
-            int rows = (int)input.Sizes[0];
-            int dim = (int)input.Sizes[1];
-            var result = new Tensor(_allocator, DType.Float32, rows, dim);
-
-            float* src = GetFloatPtr(input);
-            float* dst = GetFloatPtr(result);
-            float* w = GetFloatPtr(_weights[weightName]);
-
-            for (int r = 0; r < rows; r++)
-            {
-                float* srcRow = src + r * dim;
-                float* dstRow = dst + r * dim;
-
-                float sumSq = 0;
-                for (int i = 0; i < dim; i++)
-                    sumSq += srcRow[i] * srcRow[i];
-                float rms = 1f / MathF.Sqrt(sumSq / dim + _eps);
-                for (int i = 0; i < dim; i++)
-                    dstRow[i] = w[i] * srcRow[i] * rms;
-            }
-
-            return result;
+            return Ops.RMSNorm(null, input, _weights[weightName], null, _eps);
         }
 
         private unsafe void DumpTensor(Tensor t, string label, int numRows)
@@ -466,8 +394,22 @@ private unsafe void DumpTensor(Tensor t, string label, int numRows)
             throw new NotSupportedException("Requires GgmlStorage or CpuStorage");
         }
 
+        /// <summary>Returns the contiguous transpose of the named weight, building it on first use and caching it in _transposedWeights.</summary>
+        private Tensor GetOrCreateTransposedWeight(string weightName)
+        {
+            if (!_transposedWeights.TryGetValue(weightName, out var cached))
+            {
+                using var view = _weights[weightName].Transpose();
+                _transposedWeights[weightName] = cached = Ops.NewContiguous(view);
+            }
+            return cached;
+        }
+
         public void Dispose()
         {
+            foreach (var w in _transposedWeights.Values)
+                w.Dispose();
+            _transposedWeights.Clear();
             foreach (var w in _weights.Values)
                 w.Dispose();
             _weights.Clear();
diff --git a/InferenceEngine/Models/Gemma3/ImageProcessor.cs b/InferenceEngine/Models/Gemma3/ImageProcessor.cs
index 02c66f8..bfea928 100644
--- a/InferenceEngine/Models/Gemma3/ImageProcessor.cs
+++ b/InferenceEngine/Models/Gemma3/ImageProcessor.cs
@@ -9,6 +9,7 @@
 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the BSD-3-Clause License for more details.
 using System;
 using System.IO;
+using System.Threading.Tasks;
 using StbImageSharp;
 
 namespace InferenceEngine
@@ -43,12 +44,7 @@ public float[] ProcessImage(string imagePath)
             byte[] fileBytes = File.ReadAllBytes(imagePath);
             int origWidth, origHeight;
             byte[] rgba = DecodeImageToRGBA(fileBytes, out origWidth, out origHeight);
-
-            byte[] composited = CompositeOverWhite(rgba, origWidth, origHeight);
-
-            byte[] resized = BilinearResize(composited, origWidth, origHeight, ImageSize, ImageSize);
-
-            return PackChannelFirst(resized, ImageSize, ImageSize);
+            return ResizeRgbaToChannelFirstNormalized(rgba, origWidth, origHeight, ImageSize, ImageSize);
         }
 
         internal static byte[] DecodeImageToRGBA(byte[] fileBytes, out int width, out int height)
@@ -251,24 +247,30 @@ private static byte[] DecodeJPEG(byte[] data, out int width, out int height)
         internal static byte[] CompositeOverWhite(byte[] rgba, int width, int height)
         {
             byte[] result = new byte[width * height * 4];
-            for (int i = 0; i < width * height; i++)
+            Parallel.For(0, height, y =>
             {
-                int a = rgba[i * 4 + 3];
-                if (a == 255)
-                {
-                    result[i * 4] = rgba[i * 4];
-                    result[i * 4 + 1] = rgba[i * 4 + 1];
-                    result[i * 4 + 2] = rgba[i * 4 + 2];
-                }
-                else
+                int srcRow = y * width * 4;
+                for (int x = 0; x < width; x++)
                 {
-                    float alpha = a / 255f;
-                    result[i * 4] = (byte)(rgba[i * 4] * alpha + 255 * (1 - alpha));
-                    result[i * 4 + 1] = (byte)(rgba[i * 4 + 1] * alpha + 255 * (1 - alpha));
-                    result[i * 4 + 2] = (byte)(rgba[i * 4 + 2] * alpha + 255 * (1 - alpha));
+                    int pixBase = srcRow + x * 4;
+                    int a = rgba[pixBase + 3];
+                    if (a == 255)
+                    {
+                        result[pixBase] = rgba[pixBase];
+                        result[pixBase + 1] = rgba[pixBase + 1];
+                        result[pixBase + 2] = rgba[pixBase + 2];
+                    }
+                    else
+                    {
+                        float alpha = a / 255f;
+                        result[pixBase] = (byte)(rgba[pixBase] * alpha + 255 * (1 - alpha));
+                        result[pixBase + 1] = (byte)(rgba[pixBase + 1] * alpha + 255 * (1 - alpha));
+                        result[pixBase + 2] = (byte)(rgba[pixBase + 2] * alpha + 255 * (1 - alpha));
+                    }
+
+                    result[pixBase + 3] = 255;
                 }
-                result[i * 4 + 3] = 255;
-            }
+            });
             return result;
         }
 
@@ -278,7 +280,7 @@ internal static byte[] BilinearResize(byte[] rgba, int srcW, int srcH, int dstW,
             double xRatio = (double)srcW / dstW;
             double yRatio = (double)srcH / dstH;
 
-            for (int dy = 0; dy < dstH; dy++)
+            Parallel.For(0, dstH, dy =>
             {
                 double srcY = (dy + 0.5) * yRatio - 0.5;
                 int y0 = Math.Max(0, (int)srcY);
@@ -305,11 +307,62 @@ internal static byte[] BilinearResize(byte[] rgba, int srcW, int srcH, int dstW,
                     }
                     result[(dy * dstW + dx) * 4 + 3] = 255;
                 }
-            }
+            });
+
+            return result;
+        }
+
+        internal static float[] ResizeRgbaToChannelFirstNormalized(byte[] rgba, int srcW, int srcH, int dstW, int dstH)
+        {
+            int pixels = dstW * dstH;
+            float[] result = new float[3 * pixels];
+            double xRatio = (double)srcW / dstW;
+            double yRatio = (double)srcH / dstH;
+
+            Parallel.For(0, dstH, dy =>
+            {
+                double srcY = (dy + 0.5) * yRatio - 0.5;
+                int y0 = Math.Max(0, (int)srcY);
+                int y1 = Math.Min(srcH - 1, y0 + 1);
+                double fy = srcY - y0;
+
+                for (int dx = 0; dx < dstW; dx++)
+                {
+                    double srcX = (dx + 0.5) * xRatio - 0.5;
+                    int x0 = Math.Max(0, (int)srcX);
+                    int x1 = Math.Min(srcW - 1, x0 + 1);
+                    double fx = srcX - x0;
+
+                    int dstIdx = dy * dstW + dx;
+                    result[dstIdx] = BilinearSampleNormalized(rgba, srcW, x0, y0, x1, y1, fx, fy, 0);
+                    result[pixels + dstIdx] = BilinearSampleNormalized(rgba, srcW, x0, y0, x1, y1, fx, fy, 1);
+                    result[2 * pixels + dstIdx] = BilinearSampleNormalized(rgba, srcW, x0, y0, x1, y1, fx, fy, 2);
+                }
+            });
 
             return result;
         }
 
+        private static float BilinearSampleNormalized(byte[] rgba, int srcW, int x0, int y0, int x1, int y1, // Blends one channel of the four corner pixels (x0|x1, y0|y1); result is clamped to [-1, 1].
+            double fx, double fy, int channel)
+        {
+            float v00 = CompositeChannelToNormalized(rgba, (y0 * srcW + x0) * 4, channel); // 4 bytes per pixel, row-major with srcW pixels per row
+            float v01 = CompositeChannelToNormalized(rgba, (y0 * srcW + x1) * 4, channel);
+            float v10 = CompositeChannelToNormalized(rgba, (y1 * srcW + x0) * 4, channel);
+            float v11 = CompositeChannelToNormalized(rgba, (y1 * srcW + x1) * 4, channel);
+
+            double v = v00 * (1 - fx) * (1 - fy) + v01 * fx * (1 - fy) + // fx weights toward x1, fy toward y1; accumulated in double
+                       v10 * (1 - fx) * fy + v11 * fx * fy;
+            return Math.Clamp((float)v, -1f, 1f); // NOTE(review): fx/fy are used as given; if a caller passes values outside [0, 1] the blend extrapolates and only this clamp bounds it - confirm against caller
+        }
+
+        /// <summary>Blends one channel of the pixel at pixelBase over white (255) by the pixel's alpha, then maps 0..255 onto -1..1.</summary>
+        private static float CompositeChannelToNormalized(byte[] rgba, int pixelBase, int channel)
+        {
+            float opacity = rgba[pixelBase + 3] / 255f;
+            return (rgba[pixelBase + channel] * opacity + 255f * (1f - opacity)) / 255f * 2f - 1f;
+        }
+
         /// <summary>
         /// Pack RGBA pixels into channel-first float format [R..., G..., B...] normalized with mean/std.
         /// Matches Ollama's pack(): channel-first with (pixel/255 - mean) / std.
diff --git a/InferenceEngine/Models/Gemma4/Gemma4AudioEncoder.cs b/InferenceEngine/Models/Gemma4/Gemma4AudioEncoder.cs
index 3baa0ee..51e5280 100644
--- a/InferenceEngine/Models/Gemma4/Gemma4AudioEncoder.cs
+++ b/InferenceEngine/Models/Gemma4/Gemma4AudioEncoder.cs
@@ -11,12 +11,14 @@
 using System.Collections.Generic;
 using TensorSharp;
 using TensorSharp.Cpu;
+using TensorSharp.GGML;
 
 namespace InferenceEngine
 {
     public class Gemma4AudioEncoder : IDisposable
     {
         private readonly Dictionary<string, Tensor> _weights = new();
+        private readonly Dictionary<string, Tensor> _transposedWeights = new();
         private readonly IAllocator _allocator;
 
         private readonly int _hiddenSize;
@@ -42,9 +44,11 @@ private struct ClampParams
             public bool HasClamp;
         }
         private readonly Dictionary<string, ClampParams> _clampParams = new();
+        private readonly Dictionary<string, float[]> _positionEmbeddingCache = new();
 
         private bool _useOllamaNames;
         private Tensor _onesForNorm;
+        private readonly float[] _causalMask;
 
         public int ProjectionDim => _projectionDim;
 
@@ -77,6 +81,7 @@ public Gemma4AudioEncoder(string mmProjPath, IAllocator allocator)
             gguf.Dispose();
 
             _useOllamaNames = _weights.ContainsKey("a.blk.0.ln1.weight");
+            _causalMask = BuildCausalValidMask();
             Console.WriteLine($"  GGUF naming: {(_useOllamaNames ? "Ollama" : "mmproj/Unsloth")}");
         }
 
@@ -197,8 +202,7 @@ public unsafe Tensor Encode(float[] melData, int numFrames)
                 int outDim = (int)sscpWeight.Sizes[0];
                 hidDim = outDim;
                 hiddenTensor = new Tensor(_allocator, DType.Float32, t1Out, hidDim);
-                using (var wT = sscpWeight.Transpose())
-                    Ops.Addmm(hiddenTensor, 0, hiddenTensor, 1f, projTensor, wT);
+                Ops.Addmm(hiddenTensor, 0, hiddenTensor, 1f, projTensor, GetOrCreateTransposedWeight(sscpWeightName));
                 projTensor.Dispose();
 
                 string biasName = sscpWeightName.Replace(".weight", ".bias");
@@ -215,13 +219,11 @@ public unsafe Tensor Encode(float[] melData, int numFrames)
             Console.Write($" proj=[{seqLen},{hidDim}]");
 
             // Build causal-valid mask
-            float[] causalMask = BuildCausalValidMask();
-
             // Conformer blocks
             for (int i = 0; i < _numLayers; i++)
             {
                 Console.Write($"\r  Audio conformer block {i + 1}/{_numLayers}...                    ");
-                hiddenTensor = ConformerBlock(hiddenTensor, i, seqLen, hidDim, causalMask);
+                hiddenTensor = ConformerBlock(hiddenTensor, i, seqLen, hidDim, _causalMask);
             }
             Console.Write("\r  Audio conformer done.                                         \n");
 
@@ -230,8 +232,7 @@ public unsafe Tensor Encode(float[] melData, int numFrames)
             {
                 int outDim = (int)outProjWeight.Sizes[0];
                 var outProj = new Tensor(_allocator, DType.Float32, seqLen, outDim);
-                using (var wT = outProjWeight.Transpose())
-                    Ops.Addmm(outProj, 0, outProj, 1f, hiddenTensor, wT);
+                Ops.Addmm(outProj, 0, outProj, 1f, hiddenTensor, GetOrCreateTransposedWeight("a.output_proj.weight"));
                 if (_weights.TryGetValue("a.output_proj.bias", out var outProjBias))
                     AddBias(outProj, outProjBias, seqLen, outDim);
                 hiddenTensor.Dispose();
@@ -252,8 +253,7 @@ public unsafe Tensor Encode(float[] melData, int numFrames)
             {
                 int fcOutDim = (int)fcWeight.Sizes[0];
                 var fcOut = new Tensor(_allocator, DType.Float32, seqLen, fcOutDim);
-                using (var wT = fcWeight.Transpose())
-                    Ops.Addmm(fcOut, 0, fcOut, 1f, hiddenTensor, wT);
+                Ops.Addmm(fcOut, 0, fcOut, 1f, hiddenTensor, GetOrCreateTransposedWeight(fcWeightName));
 
                 string fcBiasName = fcWeightName.Replace(".weight", ".bias");
                 if (_weights.TryGetValue(fcBiasName, out var fcBias))
@@ -524,51 +524,49 @@ private unsafe void ChunkedAttention(float[] qArr, float[] kPadded, float[] vPad
 
             for (int h = 0; h < _numHeads; h++)
             {
+                float[] logitsBuffer = new float[ctx];
                 for (int qi = 0; qi < cs; qi++)
                 {
                     int globalQIdx = chunkIdx * cs + qi;
                     if (globalQIdx >= seqLen)
                     {
-                        // Padded position - zero output
                         continue;
                     }
 
-                    float[] logits = new float[ctx];
+                    Span<float> logits = logitsBuffer;
+                    int qOffset = globalQIdx * hidDim + h * _headDim;
 
                     for (int ci = 0; ci < ctx; ci++)
                     {
-                        // Content-content: q[qi] dot k[ci]
+                        int actualTime = chunkIdx * cs + ci - padLeft;
+                        bool causalOK = causalMask[qi * ctx + ci] > 0;
+                        bool validOK = actualTime >= 0 && actualTime < seqLen;
+                        if (!causalOK || !validOK)
+                        {
+                            logits[ci] = -1e9f;
+                            continue;
+                        }
+
                         float dotCC = 0;
-                        int qOffset = globalQIdx * hidDim + h * _headDim;
-                        int kGlobalIdx = chunkIdx * cs + ci; // position in kPadded
+                        int kGlobalIdx = chunkIdx * cs + ci;
                         int kOffset = kGlobalIdx * hidDim + h * _headDim;
 
                         for (int d = 0; d < _headDim; d++)
                             dotCC += qArr[qOffset + d] * kPadded[kOffset + d];
 
-                        // Content-position: q[qi] dot posEmb[relPos]
                         float dotCP = 0;
-                        for (int d = 0; d < _headDim; d++)
+                        int posIdx = RelativeShiftIndex(qi, ci, maxSpan);
+                        if (posIdx >= 0 && posIdx < maxSpan)
                         {
-                            int posIdx = RelativeShiftIndex(qi, ci, maxSpan);
-                            if (posIdx >= 0 && posIdx < maxSpan)
-                                dotCP += qArr[qOffset + d] * posEmb[(posIdx * _numHeads + h) * _headDim + d];
+                            int posOffset = (posIdx * _numHeads + h) * _headDim;
+                            for (int d = 0; d < _headDim; d++)
+                                dotCP += qArr[qOffset + d] * posEmb[posOffset + d];
                         }
 
                         logits[ci] = dotCC + dotCP;
-
-                        // Logit softcap
                         logits[ci] = MathF.Tanh(logits[ci] / _logitCap) * _logitCap;
-
-                        // Apply mask
-                        int actualTime = chunkIdx * cs + ci - padLeft;
-                        bool causalOK = causalMask[qi * ctx + ci] > 0;
-                        bool validOK = actualTime >= 0 && actualTime < seqLen;
-                        if (!causalOK || !validOK)
-                            logits[ci] = -1e9f;
                     }
 
-                    // Softmax
                     float maxLogit = float.NegativeInfinity;
                     for (int ci = 0; ci < ctx; ci++)
                         if (logits[ci] > maxLogit) maxLogit = logits[ci];
@@ -582,7 +580,6 @@ private unsafe void ChunkedAttention(float[] qArr, float[] kPadded, float[] vPad
                     for (int ci = 0; ci < ctx; ci++)
                         logits[ci] *= invSum;
 
-                    // Weighted sum of values
                     int outOffset = globalQIdx * hidDim + h * _headDim;
                     for (int d = 0; d < _headDim; d++)
                     {
@@ -610,6 +607,9 @@ private int RelativeShiftIndex(int queryInChunk, int contextIdx, int maxSpan)
 
         private float[] BuildPositionEmbeddings(string prefix, int maxSpan)
         {
+            if (_positionEmbeddingCache.TryGetValue(prefix, out var cached))
+                return cached;
+
             int halfDim = _hiddenSize / 2;
             double logInc = Math.Log(10000.0) / Math.Max(halfDim - 1, 1);
 
@@ -627,7 +627,10 @@ private float[] BuildPositionEmbeddings(string prefix, int maxSpan)
 
             string relKey = ResolveName(prefix, "attn_k_rel") + ".weight";
             if (!_weights.TryGetValue(relKey, out var relWeight))
+            {
+                _positionEmbeddingCache[prefix] = sinEmb;
                 return sinEmb;
+            }
 
             int relOutDim = (int)relWeight.Sizes[0];
             int inDim = (int)relWeight.Sizes[1];
@@ -637,14 +640,14 @@ private float[] BuildPositionEmbeddings(string prefix, int maxSpan)
 
             using var sinSlice = sinTensor.Narrow(1, 0, inDim);
             using var sinContig = Ops.NewContiguous(sinSlice);
-            using var wT = relWeight.Transpose();
             var result = new Tensor(_allocator, DType.Float32, maxSpan, relOutDim);
-            Ops.Addmm(result, 0, result, 1f, sinContig, wT);
+            Ops.Addmm(result, 0, result, 1f, sinContig, GetOrCreateTransposedWeight(relKey));
 
             float[] projected = new float[maxSpan * relOutDim];
             result.CopyToArray(projected);
             result.Dispose();
 
+            _positionEmbeddingCache[prefix] = projected;
             return projected;
         }
 
@@ -764,8 +767,7 @@ private Tensor AudioClippableLinearForward(Tensor input, string prefix, int seqL
             }
 
             var result = new Tensor(_allocator, DType.Float32, seqLen, outDim);
-            using (var wT = weight.Transpose())
-                Ops.Addmm(result, 0, result, 1f, src, wT);
+            Ops.Addmm(result, 0, result, 1f, src, GetOrCreateTransposedWeight(weightName));
 
             if (hasClamp && src != input) src.Dispose();
 
@@ -864,11 +866,26 @@ private float[] BuildCausalValidMask()
             throw new NotSupportedException("Requires GgmlStorage or CpuStorage");
         }
 
+        private Tensor GetOrCreateTransposedWeight(string weightName) // Returns the cached contiguous transpose of _weights[weightName], building it on first use.
+        {
+            if (_transposedWeights.TryGetValue(weightName, out var transposed))
+                return transposed; // cache hit: the returned tensor stays owned by the cache
+
+            using var weightViewT = _weights[weightName].Transpose(); // temporary, disposed when this method returns; the indexer throws if the name is unknown
+            transposed = Ops.NewContiguous(weightViewT); // NOTE(review): presumably an independent copy that outlives the disposed view - confirm
+            _transposedWeights[weightName] = transposed; // kept until Dispose() walks _transposedWeights
+            return transposed;
+        }
+
         #endregion
 
         public void Dispose()
         {
             _onesForNorm?.Dispose();
+            foreach (var w in _transposedWeights.Values)
+                w.Dispose();
+            _transposedWeights.Clear();
+            _positionEmbeddingCache.Clear();
             foreach (var w in _weights.Values)
                 w.Dispose();
             _weights.Clear();
diff --git a/InferenceEngine/Models/Gemma4/Gemma4AudioPreprocessor.cs b/InferenceEngine/Models/Gemma4/Gemma4AudioPreprocessor.cs
index db01c4b..325d53e 100644
--- a/InferenceEngine/Models/Gemma4/Gemma4AudioPreprocessor.cs
+++ b/InferenceEngine/Models/Gemma4/Gemma4AudioPreprocessor.cs
@@ -10,6 +10,7 @@
 using System;
 using System.IO;
 using System.Numerics;
+using System.Threading.Tasks;
 using NLayer;
 using NVorbis;
 
@@ -27,6 +28,10 @@ public class Gemma4AudioPreprocessor
 
         private static readonly int FrameLength = (int)Math.Round(SampleRate * FrameLengthMs / 1000.0); // 320
         private static readonly int HopLength = (int)Math.Round(SampleRate * HopLengthMs / 1000.0); // 160
+        private static readonly int FftLength = ComputeFftLength(); // 1024 when FrameLength is 320; must stay declared after FrameLength because static initializers run in textual order
+        private static readonly int NumFreqBins = FftLength / 2 + 1; // 513 when FftLength is 1024
+        private static readonly double[] HannWindow = BuildWindow(); // FrameLength window coefficients, computed once per process
+        private static readonly float[] MelFilters = BuildMelFilterBank(NumFreqBins, MelBins, MinFrequency, MaxFrequency, SampleRate); // read as [freqBin * MelBins + melBin]
 
         public static float[] DecodeAudioFile(string path)
         {
@@ -206,43 +211,27 @@ private static float[] ResampleLinear(float[] samples, int fromRate, int toRate)
 
         public static (float[] melData, int numFrames) ComputeMelSpectrogram(float[] samples)
         {
-            int fftLen = 1;
-            while (fftLen < FrameLength) fftLen <<= 1;
-            fftLen *= 2; // fft_overdrive
-
-            double[] window = new double[FrameLength];
-            double arg = Math.PI * 2.0 / FrameLength;
-            for (int i = 0; i < FrameLength; i++)
-                window[i] = 0.5 - 0.5 * Math.Cos(arg * (i + 0.5));
-
-            int numFreqBins = fftLen / 2 + 1;
-            float[] melFilters = BuildMelFilterBank(numFreqBins, MelBins, MinFrequency, MaxFrequency, SampleRate);
-
             int frameSizeForUnfold = FrameLength + 1;
             int numFrames = (samples.Length - frameSizeForUnfold) / HopLength;
             if (numFrames <= 0) return (null, 0);
 
             float[] result = new float[numFrames * MelBins];
-            Complex[] fftInput = new Complex[fftLen];
-
-            for (int f = 0; f < numFrames; f++)
+            if (numFrames < 8)
             {
-                int start = f * HopLength;
-                for (int i = 0; i < FrameLength; i++)
-                    fftInput[i] = new Complex(samples[start + i] * window[i], 0);
-                for (int i = FrameLength; i < fftLen; i++)
-                    fftInput[i] = Complex.Zero;
-
-                FFT(fftInput);
-
-                for (int m = 0; m < MelBins; m++)
-                {
-                    double melVal = 0;
-                    for (int k = 0; k < numFreqBins; k++)
-                        melVal += fftInput[k].Magnitude * melFilters[k * MelBins + m];
-                    if (melVal < MelFloor) melVal = MelFloor;
-                    result[f * MelBins + m] = (float)Math.Log(melVal);
-                }
+                var fftInput = new Complex[FftLength];
+                for (int f = 0; f < numFrames; f++)
+                    ComputeMelFrame(samples, f, fftInput, result);
+            }
+            else
+            {
+                Parallel.For(0, numFrames,
+                    () => new Complex[FftLength],
+                    (f, _, fftInput) =>
+                    {
+                        ComputeMelFrame(samples, f, fftInput, result);
+                        return fftInput;
+                    },
+                    _ => { });
             }
 
             return (result, numFrames);
@@ -327,9 +316,6 @@ public static int ComputeAudioTokenCount(float[] samples)
                 int padded = samples.Length + (128 - samples.Length % 128);
                 samples = new float[padded];
             }
-            int fftLen = 1;
-            while (fftLen < FrameLength) fftLen <<= 1;
-            fftLen *= 2;
             int frameSizeForUnfold = FrameLength + 1;
             int numFrames = (samples.Length - frameSizeForUnfold) / HopLength;
             if (numFrames <= 0) return 0;
@@ -338,5 +324,42 @@ public static int ComputeAudioTokenCount(float[] samples)
             int tConv1 = (tConv0 + 2 - 3) / 2 + 1;
             return tConv1;
         }
+
+        private static void ComputeMelFrame(float[] samples, int frameIndex, Complex[] fftInput, float[] result) // Writes the MelBins log-mel values of one frame into result; fftInput is caller-supplied scratch of length FftLength.
+        {
+            int start = frameIndex * HopLength; // first sample of this frame
+            for (int i = 0; i < FrameLength; i++)
+                fftInput[i] = new Complex(samples[start + i] * HannWindow[i], 0); // windowed sample as a purely real input
+            for (int i = FrameLength; i < FftLength; i++)
+                fftInput[i] = Complex.Zero; // zero-pad to FftLength; also wipes whatever the previous frame left in the scratch buffer
+
+            FFT(fftInput); // NOTE(review): the spectrum is read back from the same array below, so FFT is assumed to work in place - confirm
+
+            int dstOffset = frameIndex * MelBins; // each frame writes only its own MelBins-wide slice of result
+            for (int m = 0; m < MelBins; m++)
+            {
+                double melVal = 0;
+                for (int k = 0; k < NumFreqBins; k++)
+                    melVal += fftInput[k].Magnitude * MelFilters[k * MelBins + m]; // filter bank is read as [freq bin][mel bin]
+                if (melVal < MelFloor) melVal = MelFloor; // floor before taking the log
+                result[dstOffset + m] = (float)Math.Log(melVal);
+            }
+        }
+
+        private static int ComputeFftLength() // Returns twice the smallest power of two that is >= FrameLength.
+        {
+            int fftLen = 1;
+            while (fftLen < FrameLength) fftLen <<= 1; // round up to a power of two
+            return fftLen * 2; // doubling step ("fft_overdrive")
+        }
+
+        private static double[] BuildWindow() // Builds the FrameLength-point window that ComputeMelFrame multiplies into each frame.
+        {
+            double[] window = new double[FrameLength];
+            double arg = Math.PI * 2.0 / FrameLength; // angular step per sample
+            for (int i = 0; i < FrameLength; i++)
+                window[i] = 0.5 - 0.5 * Math.Cos(arg * (i + 0.5)); // raised cosine evaluated at half-sample offsets (i + 0.5)
+            return window;
+        }
     }
 }
diff --git a/InferenceEngine/Models/Gemma4/Gemma4ImageProcessor.cs b/InferenceEngine/Models/Gemma4/Gemma4ImageProcessor.cs
index f53ff61..840abda 100644
--- a/InferenceEngine/Models/Gemma4/Gemma4ImageProcessor.cs
+++ b/InferenceEngine/Models/Gemma4/Gemma4ImageProcessor.cs
@@ -39,14 +39,12 @@ public Gemma4ImageProcessor(int patchSize = 16, int nMerge = 3, int minTokens =
             byte[] fileBytes = File.ReadAllBytes(imagePath);
             int origWidth, origHeight;
             byte[] rgba = Gemma3ImageProcessor.DecodeImageToRGBA(fileBytes, out origWidth, out origHeight);
-            byte[] composited = Gemma3ImageProcessor.CompositeOverWhite(rgba, origWidth, origHeight);
 
             int alignSize = PatchSize * NMerge;
             SmartResize(origWidth, origHeight, alignSize, out int targetW, out int targetH);
 
-            byte[] resized = Gemma3ImageProcessor.BilinearResize(composited, origWidth, origHeight, targetW, targetH);
-
-            float[] pixels = PackChannelFirst(resized, targetW, targetH);
+            float[] pixels = Gemma3ImageProcessor.ResizeRgbaToChannelFirstNormalized(
+                rgba, origWidth, origHeight, targetW, targetH);
             return (pixels, targetW, targetH);
         }
 
diff --git a/InferenceEngine/Models/Gemma4/Gemma4Model.cs b/InferenceEngine/Models/Gemma4/Gemma4Model.cs
index 6fb1368..890e827 100644
--- a/InferenceEngine/Models/Gemma4/Gemma4Model.cs
+++ b/InferenceEngine/Models/Gemma4/Gemma4Model.cs
@@ -359,6 +359,8 @@ public override void ResetKVCache()
                 if (_kvDonorMap.ContainsKey(l)) continue;
                 Ops.Fill(_kvCacheK[l], 0f);
                 Ops.Fill(_kvCacheV[l], 0f);
+                InvalidateTensorDeviceCache(_kvCacheK[l]);
+                InvalidateTensorDeviceCache(_kvCacheV[l]);
                 cleared.Add(l);
             }
         }
@@ -1684,6 +1686,8 @@ private unsafe void CopyToCacheCircular(Tensor cache, Tensor src, int startPos,
                     Buffer.MemoryCopy(srcRow, dstRow, headBytes, headBytes);
                 }
             }
+
+            InvalidateTensorDeviceCache(cache);
         }
 
         private unsafe void ApplyCausalMask(Tensor scores, int queryLen, int totalKVLen, int windowSize)
diff --git a/InferenceEngine/Models/Gemma4/Gemma4VisionEncoder.cs b/InferenceEngine/Models/Gemma4/Gemma4VisionEncoder.cs
index 11133d5..304529f 100644
--- a/InferenceEngine/Models/Gemma4/Gemma4VisionEncoder.cs
+++ b/InferenceEngine/Models/Gemma4/Gemma4VisionEncoder.cs
@@ -11,13 +11,16 @@
 using System.Collections.Generic;
 using TensorSharp;
 using TensorSharp.Cpu;
+using TensorSharp.GGML;
 
 namespace InferenceEngine
 {
     public class Gemma4VisionEncoder : IDisposable
     {
         private readonly Dictionary<string, Tensor> _weights = new();
+        private readonly Dictionary<string, Tensor> _transposedWeights = new();
         private readonly IAllocator _allocator;
+        private readonly bool _useNativeAttention;
 
         private readonly int _hiddenSize;
         private readonly int _intermediateSize;
@@ -36,13 +39,25 @@ private struct ClampParams
         }
 
         private readonly Dictionary<string, ClampParams> _clampParams = new();
+        private readonly Dictionary<long, Rope2DCache> _ropeCache = new();
         private Tensor _onesForNorm;
 
+        private sealed class Rope2DCache // Per-patch-grid position indices and RoPE cos/sin tables, stored in _ropeCache.
+        {
+            public required int[] PosX { get; init; } // one entry per patch; used as the row index into the X position-embedding table
+            public required int[] PosY { get; init; } // one entry per patch; used as the row index into the Y position-embedding table
+            public required float[] CosX { get; init; } // read by Apply2DRoPE as [patch * quarterDim + j] for the first half of each head
+            public required float[] SinX { get; init; } // same indexing as CosX
+            public required float[] CosY { get; init; } // same indexing, applied to the second half of each head
+            public required float[] SinY { get; init; } // same indexing as CosY
+        }
+
         public int ProjectionDim => _projectionDim;
 
         public Gemma4VisionEncoder(string mmProjPath, IAllocator allocator)
         {
             _allocator = allocator;
+            _useNativeAttention = allocator is GgmlAllocator;
             var gguf = new GgufFile(mmProjPath);
 
             _hiddenSize = (int)gguf.GetUint32("clip.vision.embedding_length", 768);
@@ -126,22 +141,15 @@ public unsafe Tensor Encode(float[] pixelValues, int imgWidth, int imgHeight)
             int patchesY = imgHeight / _patchSize;
             int numPatches = patchesX * patchesY;
             int headDim = _hiddenSize / _numHeads;
+            Rope2DCache ropeCache = GetOrCreateRopeCache(patchesX, patchesY, headDim);
 
             var hidden = PatchEmbed(pixelValues, imgWidth, imgHeight, patchesX, patchesY);
-            AddPositionEmbedding2D(hidden, patchesX, patchesY, numPatches);
-
-            int[] posXData = new int[numPatches];
-            int[] posYData = new int[numPatches];
-            for (int i = 0; i < numPatches; i++)
-            {
-                posXData[i] = i % patchesX;
-                posYData[i] = i / patchesX;
-            }
+            AddPositionEmbedding2D(hidden, ropeCache, numPatches);
 
             for (int i = 0; i < _blockCount; i++)
             {
                 Console.Write($"\r  Vision encoder block {i + 1}/{_blockCount}...");
-                hidden = EncoderBlock(hidden, i, numPatches, headDim, posXData, posYData);
+                hidden = EncoderBlock(hidden, i, numPatches, headDim, ropeCache);
             }
             Console.WriteLine(" done");
 
@@ -194,48 +202,33 @@ private unsafe Tensor PatchEmbed(float[] pixelValues, int imgW, int imgH, int pa
             return result;
         }
 
-        private void AddPositionEmbedding2D(Tensor hidden, int patchesX, int patchesY, int numPatches)
+        private unsafe void AddPositionEmbedding2D(Tensor hidden, Rope2DCache ropeCache, int numPatches) // Adds the X and Y position-embedding rows of every patch to hidden, in place.
         {
             var posEmbd = _weights["v.position_embd.weight"];
+            int maxPos = (int)posEmbd.Sizes[1]; // posEmbd is laid out as [2, maxPos, hiddenSize]: X table first, then Y table
+            float* posPtr = GetFloatPtr(posEmbd); // NOTE(review): the raw pointer math below assumes posEmbd is contiguous Float32 - confirm
+            float* xTable = posPtr;
+            float* yTable = posPtr + maxPos * _hiddenSize; // the second table starts maxPos rows in
+            float* dstPtr = GetFloatPtr(hidden); // hidden is addressed as numPatches rows of _hiddenSize floats
 
-            // posEmbd shape in TensorSharp: [2, maxPos, hiddenSize]
-            // tblX = posEmbd[0], tblY = posEmbd[1]
-            long maxPos = posEmbd.Sizes[1];
-            Tensor tblXNarrow = posEmbd.Narrow(0, 0, 1);
-            Tensor tblX = tblXNarrow.View(maxPos, _hiddenSize);
-            tblXNarrow.Dispose();
-            Tensor tblYNarrow = posEmbd.Narrow(0, 1, 1);
-            Tensor tblY = tblYNarrow.View(maxPos, _hiddenSize);
-            tblYNarrow.Dispose();
-
-            int[] xIndices = new int[numPatches];
-            int[] yIndices = new int[numPatches];
-            for (int py = 0; py < patchesY; py++)
-                for (int px = 0; px < patchesX; px++)
-                {
-                    int idx = py * patchesX + px;
-                    xIndices[idx] = px;
-                    yIndices[idx] = py;
-                }
-
-            using var xIdx = CreateIntTensor(xIndices, numPatches);
-            using var yIdx = CreateIntTensor(yIndices, numPatches);
-            using var xEmb = Ops.IndexSelect(null, tblX, xIdx);
-            using var yEmb = Ops.IndexSelect(null, tblY, yIdx);
-            Ops.Add(hidden, hidden, xEmb);
-            Ops.Add(hidden, hidden, yEmb);
-            tblX.Dispose();
-            tblY.Dispose();
+            for (int p = 0; p < numPatches; p++)
+            {
+                float* dstRow = dstPtr + p * _hiddenSize;
+                float* xRow = xTable + ropeCache.PosX[p] * _hiddenSize; // NOTE(review): PosX/PosY are not range-checked against maxPos - confirm the patch grid can never exceed it
+                float* yRow = yTable + ropeCache.PosY[p] * _hiddenSize;
+                for (int d = 0; d < _hiddenSize; d++)
+                    dstRow[d] += xRow[d] + yRow[d]; // accumulate both embeddings into the hidden row
+            }
         }
 
         private Tensor EncoderBlock(Tensor hidden, int blockIdx, int numPatches, int headDim,
-            int[] posXData, int[] posYData)
+            Rope2DCache ropeCache)
         {
             string prefix = $"v.blk.{blockIdx}";
 
             using var attnNormed = RMSNormOp(hidden, $"{prefix}.ln1.weight");
             using var attnOut = VisionSelfAttention(attnNormed, prefix, numPatches, headDim,
-                posXData, posYData);
+                ropeCache);
             using var postAttnNormed = RMSNormOp(attnOut, $"{prefix}.attn_post_norm.weight");
 
             Ops.Add(postAttnNormed, postAttnNormed, hidden);
@@ -252,13 +245,32 @@ private Tensor EncoderBlock(Tensor hidden, int blockIdx, int numPatches, int hea
         }
 
         private unsafe Tensor VisionSelfAttention(Tensor input, string prefix, int numPatches, int headDim,
-            int[] posXData, int[] posYData)
+            Rope2DCache ropeCache)
         {
             var q = ClippableLinear(input, $"{prefix}.attn_q");
             var k = ClippableLinear(input, $"{prefix}.attn_k");
             var v = ClippableLinear(input, $"{prefix}.attn_v");
 
-            // Reshape to [numHeads, numPatches, headDim]
+            ApplyPerHeadRMSNorm(q, _weights[$"{prefix}.attn_q_norm.weight"], numPatches, headDim);
+            ApplyPerHeadRMSNorm(k, _weights[$"{prefix}.attn_k_norm.weight"], numPatches, headDim);
+            ApplyUnweightedRMSNorm(v, _numHeads * numPatches, headDim);
+
+            Apply2DRoPE(q, ropeCache, numPatches, headDim);
+            Apply2DRoPE(k, ropeCache, numPatches, headDim);
+
+            if (_useNativeAttention)
+            {
+                using var q4 = q.View(1, numPatches, _numHeads, headDim);
+                using var k4 = k.View(1, numPatches, _numHeads, headDim);
+                using var v4 = v.View(1, numPatches, _numHeads, headDim);
+                using var attn4 = Ops.ScaledDotProductAttention(null, q4, k4, v4, null, 1f);
+                using var flat = attn4.View(numPatches, _hiddenSize);
+                q.Dispose();
+                k.Dispose();
+                v.Dispose();
+                return ClippableLinear(flat, $"{prefix}.attn_out");
+            }
+
             using var qR = q.View(numPatches, _numHeads, headDim);
             using var kR = k.View(numPatches, _numHeads, headDim);
             using var vR = v.View(numPatches, _numHeads, headDim);
@@ -272,19 +284,6 @@ private unsafe Tensor VisionSelfAttention(Tensor input, string prefix, int numPa
             k.Dispose();
             v.Dispose();
 
-            // QK RMSNorm (weighted)
-            ApplyPerHeadRMSNorm(qHeads, _weights[$"{prefix}.attn_q_norm.weight"], _numHeads, numPatches, headDim);
-            ApplyPerHeadRMSNorm(kHeads, _weights[$"{prefix}.attn_k_norm.weight"], _numHeads, numPatches, headDim);
-
-            // V RMSNorm (unweighted)
-            ApplyUnweightedRMSNorm(vHeads, _numHeads * numPatches, headDim);
-
-            // 2D NeoX RoPE: split head dim in half, apply RoPE with X positions to first half,
-            // Y positions to second half
-            Apply2DRoPE(qHeads, posXData, posYData, _numHeads, numPatches, headDim);
-            Apply2DRoPE(kHeads, posXData, posYData, _numHeads, numPatches, headDim);
-
-            // Attention: Q @ K^T (no scaling since QK norms handle it)
             using var kT = kHeads.Transpose(1, 2);
             var scores = new Tensor(_allocator, DType.Float32, _numHeads, numPatches, numPatches);
             Ops.AddmmBatch(scores, 0, scores, 1f, qHeads, kT);
@@ -296,46 +295,38 @@ private unsafe Tensor VisionSelfAttention(Tensor input, string prefix, int numPa
 
             using var transposed = attnOutput.Transpose(0, 1);
             using var contiguous = Ops.NewContiguous(transposed);
-            using var flat = contiguous.View(numPatches, _hiddenSize);
-            using var flatContig = Ops.NewContiguous(flat);
+            using var flatContig = contiguous.View(numPatches, _hiddenSize);
             attnOutput.Dispose();
 
             return ClippableLinear(flatContig, $"{prefix}.attn_out");
         }
 
-        private unsafe void Apply2DRoPE(Tensor heads, int[] posX, int[] posY,
-            int numHeads, int numPatches, int headDim)
+        private unsafe void Apply2DRoPE(Tensor data, Rope2DCache ropeCache, int numPatches, int headDim)
         {
-            float* ptr = GetFloatPtr(heads);
+            float* ptr = GetFloatPtr(data);
             int halfDim = headDim / 2;
             int quarterDim = halfDim / 2;
 
-            for (int h = 0; h < numHeads; h++)
+            for (int p = 0; p < numPatches; p++)
             {
-                for (int p = 0; p < numPatches; p++)
+                int ropeBase = p * quarterDim;
+                for (int h = 0; h < _numHeads; h++)
                 {
-                    float* head = ptr + ((long)h * numPatches + p) * headDim;
-
-                    // First half: apply RoPE with X positions
+                    float* head = ptr + ((long)p * _numHeads + h) * headDim;
                     for (int j = 0; j < quarterDim; j++)
                     {
-                        float freq = (float)(1.0 / Math.Pow(_ropeTheta, 2.0 * j / halfDim));
-                        float angle = posX[p] * freq;
-                        float cos = MathF.Cos(angle);
-                        float sin = MathF.Sin(angle);
+                        float cos = ropeCache.CosX[ropeBase + j];
+                        float sin = ropeCache.SinX[ropeBase + j];
                         float x0 = head[j];
                         float x1 = head[j + quarterDim];
                         head[j] = x0 * cos - x1 * sin;
                         head[j + quarterDim] = x0 * sin + x1 * cos;
                     }
 
-                    // Second half: apply RoPE with Y positions
                     for (int j = 0; j < quarterDim; j++)
                     {
-                        float freq = (float)(1.0 / Math.Pow(_ropeTheta, 2.0 * j / halfDim));
-                        float angle = posY[p] * freq;
-                        float cos = MathF.Cos(angle);
-                        float sin = MathF.Sin(angle);
+                        float cos = ropeCache.CosY[ropeBase + j];
+                        float sin = ropeCache.SinY[ropeBase + j];
                         float x0 = head[halfDim + j];
                         float x1 = head[halfDim + j + quarterDim];
                         head[halfDim + j] = x0 * cos - x1 * sin;
@@ -345,10 +336,9 @@ private unsafe void Apply2DRoPE(Tensor heads, int[] posX, int[] posY,
             }
         }
 
-        private void ApplyPerHeadRMSNorm(Tensor data, Tensor normWeight,
-            int numHeads, int numPatches, int headDim)
+        private void ApplyPerHeadRMSNorm(Tensor data, Tensor normWeight, int numPatches, int headDim) // RMS-normalizes every headDim-wide row of data with normWeight, writing back into data.
         {
-            int total = numHeads * numPatches;
+            int total = _numHeads * numPatches; // one row per (patch, head) pair; the head count now comes from the encoder field
             using var reshaped = data.View(total, headDim);
             Ops.RMSNorm(reshaped, reshaped, normWeight, null, _eps);
         }
@@ -361,7 +351,8 @@ private void ApplyUnweightedRMSNorm(Tensor data, int numVectors, int dim)
                 _onesForNorm = new Tensor(_allocator, DType.Float32, dim);
                 Ops.Fill(_onesForNorm, 1f);
             }
-            Ops.RMSNorm(data, data, _onesForNorm, null, _eps);
+            using var reshaped = data.View(numVectors, dim);
+            Ops.RMSNorm(reshaped, reshaped, _onesForNorm, null, _eps);
         }
 
         private unsafe Tensor VisionMLP(Tensor input, string prefix)
@@ -402,8 +393,7 @@ private unsafe Tensor ClippableLinear(Tensor input, string prefix)
                 Clamp(src, cp.InMin, cp.InMax);
 
             var result = new Tensor(_allocator, DType.Float32, seqLen, outDim);
-            using var wT = weight.Transpose();
-            Ops.Addmm(result, 0, result, 1f, src, wT);
+            Ops.Addmm(result, 0, result, 1f, src, GetOrCreateTransposedWeight(weightName));
 
             contiguousInput?.Dispose();
 
@@ -487,8 +477,7 @@ private Tensor LinearProjection(Tensor input, string weightName)
             int outDim = (int)weight.Sizes[0];
 
             var result = new Tensor(_allocator, DType.Float32, seqLen, outDim);
-            using var wT = weight.Transpose();
-            Ops.Addmm(result, 0, result, 1f, input, wT);
+            Ops.Addmm(result, 0, result, 1f, input, GetOrCreateTransposedWeight(weightName));
             return result;
         }
 
@@ -514,12 +503,79 @@ private Tensor CreateIntTensor(int[] data, params long[] sizes)
             throw new NotSupportedException("Requires GgmlStorage or CpuStorage");
         }
 
+        private Tensor GetOrCreateTransposedWeight(string weightName) // Returns the transpose of _weights[weightName] as a cached tensor, building it on first use.
+        {
+            if (_transposedWeights.TryGetValue(weightName, out var transposed))
+                return transposed; // cache hit: reuse the tensor stored by an earlier call
+
+            using var weightViewT = _weights[weightName].Transpose(); // temporary; released by `using` when the method exits
+            transposed = Ops.NewContiguous(weightViewT); // this is the tensor that gets cached (presumably a contiguous copy - confirm against Ops.NewContiguous)
+            _transposedWeights[weightName] = transposed; // the cache keeps it; Dispose() releases every cached entry, so callers must not dispose the result
+            return transposed;
+        }
+
+        private Rope2DCache GetOrCreateRopeCache(int patchesX, int patchesY, int headDim) // Returns patch positions and RoPE cos/sin tables for a patchesX x patchesY grid, computed once per grid shape and then reused.
+        {
+            long key = ((long)patchesX << 32) | (uint)patchesY; // patchesX in the upper 32 bits, patchesY in the lower 32. NOTE(review): headDim is not part of the key - confirm it is fixed per encoder
+            if (_ropeCache.TryGetValue(key, out var cache))
+                return cache;
+
+            int numPatches = patchesX * patchesY;
+            int halfDim = headDim / 2;
+            int quarterDim = halfDim / 2; // entries stored per patch in each cos/sin table
+            int[] posX = new int[numPatches];
+            int[] posY = new int[numPatches];
+            float[] cosX = new float[numPatches * quarterDim]; // the four tables are flat and indexed [p * quarterDim + j]
+            float[] sinX = new float[numPatches * quarterDim];
+            float[] cosY = new float[numPatches * quarterDim];
+            float[] sinY = new float[numPatches * quarterDim];
+            float[] invFreq = new float[quarterDim];
+
+            for (int j = 0; j < quarterDim; j++)
+                invFreq[j] = (float)(1.0 / Math.Pow(_ropeTheta, 2.0 * j / halfDim)); // _ropeTheta^(-2j/halfDim), evaluated in double and then narrowed to float
+
+            for (int p = 0; p < numPatches; p++)
+            {
+                int x = p % patchesX; // patch index p is treated as row-major: x is the column...
+                int y = p / patchesX; // ...and y is the row
+                posX[p] = x;
+                posY[p] = y;
+
+                int baseIdx = p * quarterDim;
+                for (int j = 0; j < quarterDim; j++)
+                {
+                    float angleX = x * invFreq[j]; // both axes use the same frequency set
+                    float angleY = y * invFreq[j];
+                    cosX[baseIdx + j] = MathF.Cos(angleX);
+                    sinX[baseIdx + j] = MathF.Sin(angleX);
+                    cosY[baseIdx + j] = MathF.Cos(angleY);
+                    sinY[baseIdx + j] = MathF.Sin(angleY);
+                }
+            }
+
+            cache = new Rope2DCache
+            {
+                PosX = posX,
+                PosY = posY,
+                CosX = cosX,
+                SinX = sinX,
+                CosY = cosY,
+                SinY = sinY,
+            };
+            _ropeCache[key] = cache; // kept until Dispose() clears the dictionary
+            return cache;
+        }
+
         public void Dispose()
         {
             _onesForNorm?.Dispose();
+            foreach (var w in _transposedWeights.Values) // release the cached transposes created by GetOrCreateTransposedWeight
+                w.Dispose();
+            _transposedWeights.Clear();
             foreach (var w in _weights.Values)
                 w.Dispose();
             _weights.Clear();
+            _ropeCache.Clear(); // entries are only dropped here, not disposed (GetOrCreateRopeCache fills them with plain arrays)
         }
     }
 }
diff --git a/InferenceEngine/Models/Qwen3/Qwen3Model.cs b/InferenceEngine/Models/Qwen3/Qwen3Model.cs
index 9bc87f9..75bf9a2 100644
--- a/InferenceEngine/Models/Qwen3/Qwen3Model.cs
+++ b/InferenceEngine/Models/Qwen3/Qwen3Model.cs
@@ -151,6 +151,8 @@ public override void ResetKVCache()
             {
                 Ops.Fill(_kvCacheK[l], 0);
                 Ops.Fill(_kvCacheV[l], 0);
+                InvalidateTensorDeviceCache(_kvCacheK[l]);
+                InvalidateTensorDeviceCache(_kvCacheV[l]);
             }
             _cacheSeqLen = 0;
             _linearTicks = _attnTicks = _normTicks = _embTicks = _lmHeadTicks = _logitsCopyTicks = 0;
diff --git a/InferenceEngine/Models/Qwen35/ImageProcessor.cs b/InferenceEngine/Models/Qwen35/ImageProcessor.cs
index af94030..a197845 100644
--- a/InferenceEngine/Models/Qwen35/ImageProcessor.cs
+++ b/InferenceEngine/Models/Qwen35/ImageProcessor.cs
@@ -91,12 +91,10 @@ public int ComputeImageTokenCount(string imagePath)
         {
             byte[] fileBytes = File.ReadAllBytes(imagePath);
             byte[] rgba = Gemma3ImageProcessor.DecodeImageToRGBA(fileBytes, out int origWidth, out int origHeight);
-            byte[] composited = Gemma3ImageProcessor.CompositeOverWhite(rgba, origWidth, origHeight);
 
             var (resizedH, resizedW) = SmartResize(origHeight, origWidth);
-            byte[] resized = Gemma3ImageProcessor.BilinearResize(composited, origWidth, origHeight, resizedW, resizedH);
-
-            float[] pixels = PackChannelFirst(resized, resizedW, resizedH);
+            float[] pixels = Gemma3ImageProcessor.ResizeRgbaToChannelFirstNormalized(
+                rgba, origWidth, origHeight, resizedW, resizedH);
             return (pixels, resizedH, resizedW);
         }
 
diff --git a/InferenceEngine/Models/Qwen35/Qwen35Model.cs b/InferenceEngine/Models/Qwen35/Qwen35Model.cs
index 8a011e6..aaa03a3 100644
--- a/InferenceEngine/Models/Qwen35/Qwen35Model.cs
+++ b/InferenceEngine/Models/Qwen35/Qwen35Model.cs
@@ -197,6 +197,8 @@ public override void ResetKVCache()
                 {
                     Ops.Fill(_kvCacheK[l], 0);
                     Ops.Fill(_kvCacheV[l], 0);
+                    InvalidateTensorDeviceCache(_kvCacheK[l]);
+                    InvalidateTensorDeviceCache(_kvCacheV[l]);
                 }
                 else
                 {
diff --git a/InferenceEngine/Models/Qwen35/Qwen35VisionEncoder.cs b/InferenceEngine/Models/Qwen35/Qwen35VisionEncoder.cs
index b456eac..c4c21dc 100644
--- a/InferenceEngine/Models/Qwen35/Qwen35VisionEncoder.cs
+++ b/InferenceEngine/Models/Qwen35/Qwen35VisionEncoder.cs
@@ -11,13 +11,18 @@
 using System.Collections.Generic;
 using TensorSharp;
 using TensorSharp.Cpu;
+using TensorSharp.GGML;
 
 namespace InferenceEngine
 {
     public class Qwen35VisionEncoder : IDisposable
     {
         private readonly Dictionary<string, Tensor> _weights = new();
+        private readonly Dictionary<string, Tensor> _transposedWeights = new();
+        private readonly Dictionary<long, Tensor> _positionEmbeddingCache = new();
+        private readonly Dictionary<long, RopeCache> _ropeCache = new();
         private readonly IAllocator _allocator;
+        private readonly bool _useNativeAttention;
 
         private readonly int _imageSize;
         private readonly int _patchSize;
@@ -31,6 +36,12 @@ public class Qwen35VisionEncoder : IDisposable
         private readonly int _gridPerSide;
         private readonly float _ropeTheta;
 
+        private sealed class RopeCache // Cached RoPE cos/sin tables for one patch-grid shape; instances are built by GetOrCreateRopeCache.
+        {
+            public required float[] CosTable { get; init; } // flat [numPatches * halfDim]; within a patch, even slots hold the cosine of the Y-position angle and odd slots that of the X-position angle
+            public required float[] SinTable { get; init; } // sines, same layout as CosTable
+        }
+
         public int ProjectionDim => _projectionDim;
         public int PatchSize => _patchSize;
         public int SpatialMergeSize => _spatialMergeSize;
@@ -38,6 +49,7 @@ public class Qwen35VisionEncoder : IDisposable
         public Qwen35VisionEncoder(string mmProjPath, IAllocator allocator)
         {
             _allocator = allocator;
+            _useNativeAttention = allocator is GgmlAllocator;
             var gguf = new GgufFile(mmProjPath);
 
             _imageSize = (int)gguf.GetUint32("clip.vision.image_size", 768);
@@ -128,7 +140,7 @@ public unsafe Tensor Encode(float[] pixelValues, int resizedH, int resizedW)
             if (debug) DumpTensor(hidden, "After PatchEmbed (raster)", numPatches);
 
             // 2. Position embedding (bilinear interpolation, raster order)
-            AddPositionEmbedding(hidden, gridH, gridW, numPatches);
+            AddPositionEmbedding(hidden, gridH, gridW);
             if (debug) DumpTensor(hidden, "After PosEmbed (raster)", numPatches);
 
             // 3. Reorder from raster to block order
@@ -137,19 +149,14 @@ public unsafe Tensor Encode(float[] pixelValues, int resizedH, int resizedW)
             if (debug) DumpTensor(blockOrdered, "After BlockReorder", numPatches);
 
             // 4. Build block-order grid coordinate arrays for RoPE
-            int[] gridY, gridX;
-            BuildBlockOrderCoords(gridH, gridW, out gridY, out gridX);
-
-            // 5. Precompute RoPE cos/sin tables
-            float[] cosTable, sinTable;
-            ComputeRoPETables(gridY, gridX, numPatches, halfDim, out cosTable, out sinTable);
+            RopeCache ropeCache = GetOrCreateRopeCache(gridH, gridW, numPatches, halfDim);
 
             // 6. Encoder blocks
             for (int i = 0; i < _blockCount; i++)
             {
                 Console.Write($"\r  Vision encoder block {i + 1}/{_blockCount}...");
                 blockOrdered = EncoderBlock(blockOrdered, i, numPatches, headDim, halfDim,
-                    cosTable, sinTable);
+                    ropeCache.CosTable, ropeCache.SinTable);
                 if (debug && (i == 0 || i == _blockCount - 1))
                     DumpTensor(blockOrdered, $"After block {i}", numPatches);
             }
@@ -170,7 +177,7 @@ public unsafe Tensor Encode(float[] pixelValues, int resizedH, int resizedW)
 
             using var fc1 = LinearForwardWithBias(mergedContig, "mm.0.weight", "mm.0.bias");
             mergedContig.Dispose();
-            ApplyGELU(fc1);
+            Ops.GELU(fc1, fc1);
 
             var projected = LinearForwardWithBias(fc1, "mm.2.weight", "mm.2.bias");
             if (debug) DumpTensor(projected, "Final projected", mergedPatches);
@@ -235,49 +242,9 @@ private unsafe Tensor PatchEmbed(float[] pixelValues, int imgH, int imgW, int gr
         /// <summary>
         /// Add bilinearly-interpolated position embeddings (computed in raster order).
         /// </summary>
-        private unsafe void AddPositionEmbedding(Tensor hidden, int gridH, int gridW, int numPatches)
+        private void AddPositionEmbedding(Tensor hidden, int gridH, int gridW)
         {
-            var posEmbd = _weights["v.position_embd.weight"];
-            float* posPtr = GetFloatPtr(posEmbd);
-            float* hidPtr = GetFloatPtr(hidden);
-
-            float stepH = gridH > 1 ? (float)(_gridPerSide - 1) / (gridH - 1) : 0f;
-            float stepW = gridW > 1 ? (float)(_gridPerSide - 1) / (gridW - 1) : 0f;
-
-            for (int h = 0; h < gridH; h++)
-            {
-                for (int w = 0; w < gridW; w++)
-                {
-                    float y = h * stepH;
-                    float x = w * stepW;
-
-                    int fy = (int)y, fx = (int)x;
-                    int cy = Math.Min(fy + 1, _gridPerSide - 1);
-                    int cx = Math.Min(fx + 1, _gridPerSide - 1);
-                    float dy = y - fy, dx = x - fx;
-
-                    float w00 = (1 - dy) * (1 - dx);
-                    float w01 = (1 - dy) * dx;
-                    float w10 = dy * (1 - dx);
-                    float w11 = dy * dx;
-
-                    int idx00 = fy * _gridPerSide + fx;
-                    int idx01 = fy * _gridPerSide + cx;
-                    int idx10 = cy * _gridPerSide + fx;
-                    int idx11 = cy * _gridPerSide + cx;
-
-                    int patchIdx = h * gridW + w;
-                    float* hidRow = hidPtr + patchIdx * _hiddenSize;
-
-                    float* p00 = posPtr + idx00 * _hiddenSize;
-                    float* p01 = posPtr + idx01 * _hiddenSize;
-                    float* p10 = posPtr + idx10 * _hiddenSize;
-                    float* p11 = posPtr + idx11 * _hiddenSize;
-
-                    for (int d = 0; d < _hiddenSize; d++)
-                        hidRow[d] += w00 * p00[d] + w01 * p01[d] + w10 * p10[d] + w11 * p11[d];
-                }
-            }
+            Ops.Add(hidden, hidden, GetOrCreatePositionEmbedding(gridH, gridW));
         }
 
         /// <summary>
@@ -315,64 +282,6 @@ private unsafe Tensor ReorderToBlockOrder(Tensor input, int gridH, int gridW)
             return result;
         }
 
-        private void BuildBlockOrderCoords(int gridH, int gridW, out int[] gridY, out int[] gridX)
-        {
-            int numPatches = gridH * gridW;
-            gridY = new int[numPatches];
-            gridX = new int[numPatches];
-            int idx = 0;
-            for (int bh = 0; bh < gridH; bh += _spatialMergeSize)
-            {
-                for (int bw = 0; bw < gridW; bw += _spatialMergeSize)
-                {
-                    for (int mh = 0; mh < _spatialMergeSize; mh++)
-                    {
-                        for (int mw = 0; mw < _spatialMergeSize; mw++)
-                        {
-                            gridY[idx] = bh + mh;
-                            gridX[idx] = bw + mw;
-                            idx++;
-                        }
-                    }
-                }
-            }
-        }
-
-        /// <summary>
-        /// Precompute RoPE cos/sin tables for the vision encoder.
-        /// Interleaved y/x frequency bands matching Ollama's qwen3vl vision RoPE.
-        /// cosTable/sinTable: [numPatches * halfDim], row-major [patch, band].
-        /// </summary>
-        private void ComputeRoPETables(int[] gridY, int[] gridX, int numPatches, int halfDim,
-            out float[] cosTable, out float[] sinTable)
-        {
-            int numBands = halfDim / 2;
-            cosTable = new float[numPatches * halfDim];
-            sinTable = new float[numPatches * halfDim];
-
-            float[] invFreqs = new float[numBands];
-            for (int j = 0; j < numBands; j++)
-                invFreqs[j] = 1f / MathF.Pow(_ropeTheta, (2f * j) / halfDim);
-
-            for (int p = 0; p < numPatches; p++)
-            {
-                int y = gridY[p];
-                int x = gridX[p];
-                int baseIdx = p * halfDim;
-
-                for (int j = 0; j < numBands; j++)
-                {
-                    float angleY = y * invFreqs[j];
-                    float angleX = x * invFreqs[j];
-
-                    cosTable[baseIdx + j * 2] = MathF.Cos(angleY);
-                    sinTable[baseIdx + j * 2] = MathF.Sin(angleY);
-                    cosTable[baseIdx + j * 2 + 1] = MathF.Cos(angleX);
-                    sinTable[baseIdx + j * 2 + 1] = MathF.Sin(angleX);
-                }
-            }
-        }
-
         private Tensor EncoderBlock(Tensor hidden, int blockIdx, int numPatches, int headDim,
             int halfDim, float[] cosTable, float[] sinTable)
         {
@@ -416,7 +325,19 @@ private unsafe Tensor VisionSelfAttention(Tensor input, string prefix, int numPa
 
             float scale = 1f / MathF.Sqrt(headDim);
 
-            // Reshape [numPatches, hiddenSize] -> [numHeads, numPatches, headDim]
+            if (_useNativeAttention)
+            {
+                using var q4 = q.View(1, numPatches, _numHeads, headDim);
+                using var k4 = k.View(1, numPatches, _numHeads, headDim);
+                using var v4 = v.View(1, numPatches, _numHeads, headDim);
+                using var attn4 = Ops.ScaledDotProductAttention(null, q4, k4, v4, null, scale);
+                using var flat = attn4.View(numPatches, _hiddenSize);
+                q.Dispose();
+                k.Dispose();
+                v.Dispose();
+                return LinearForwardWithBias(flat, $"{prefix}.attn_out.weight", $"{prefix}.attn_out.bias");
+            }
+
             using var qR = q.View(numPatches, _numHeads, headDim);
             using var kR = k.View(numPatches, _numHeads, headDim);
             using var vR = v.View(numPatches, _numHeads, headDim);
@@ -431,23 +352,18 @@ private unsafe Tensor VisionSelfAttention(Tensor input, string prefix, int numPa
             k.Dispose();
             v.Dispose();
 
-            // Q @ K^T
             using var kT = kHeads.Transpose(1, 2);
             var scores = new Tensor(_allocator, DType.Float32, _numHeads, numPatches, numPatches);
             Ops.AddmmBatch(scores, 0, scores, scale, qHeads, kT);
-
             Ops.Softmax(scores, scores);
 
-            // scores @ V
             var attnOutput = new Tensor(_allocator, DType.Float32, _numHeads, numPatches, headDim);
             Ops.AddmmBatch(attnOutput, 0, attnOutput, 1.0f, scores, vHeads);
             scores.Dispose();
 
-            // Reshape back to [numPatches, hiddenSize]
             using var transposed = attnOutput.Transpose(0, 1);
             using var contiguous = Ops.NewContiguous(transposed);
-            using var flat = contiguous.View(numPatches, _hiddenSize);
-            using var flatContig = Ops.NewContiguous(flat);
+            using var flatContig = contiguous.View(numPatches, _hiddenSize);
             attnOutput.Dispose();
 
             return LinearForwardWithBias(flatContig, $"{prefix}.attn_out.weight", $"{prefix}.attn_out.bias");
@@ -481,25 +397,13 @@ private unsafe void ApplyVisionRoPE(Tensor data, int numPatches, int headDim, in
             }
         }
 
-        private unsafe Tensor VisionMLP(Tensor input, string prefix)
+        private Tensor VisionMLP(Tensor input, string prefix)
         {
             using var fc1Out = LinearForwardWithBias(input, $"{prefix}.ffn_up.weight", $"{prefix}.ffn_up.bias");
-            ApplyGELU(fc1Out);
+            Ops.GELU(fc1Out, fc1Out);
             return LinearForwardWithBias(fc1Out, $"{prefix}.ffn_down.weight", $"{prefix}.ffn_down.bias");
         }
 
-        private unsafe void ApplyGELU(Tensor t)
-        {
-            float* ptr = GetFloatPtr(t);
-            int count = (int)t.ElementCount();
-            for (int i = 0; i < count; i++)
-            {
-                double x = ptr[i];
-                double cdf = 0.5 * (1.0 + Math.Tanh(Math.Sqrt(2.0 / Math.PI) * (x + 0.044715 * x * x * x)));
-                ptr[i] = (float)(x * cdf);
-            }
-        }
-
         private unsafe Tensor LinearForwardWithBias(Tensor input, string weightName, string biasName)
         {
             var weight = _weights[weightName];
@@ -511,64 +415,20 @@ private unsafe Tensor LinearForwardWithBias(Tensor input, string weightName, str
             Tensor contiguousInput = input.IsContiguous() ? null : Ops.NewContiguous(input);
             Tensor src = contiguousInput ?? input;
 
-            using var wT = weight.Transpose();
-            Ops.Addmm(result, 0, result, 1.0f, src, wT);
+            Ops.Addmm(result, 0, result, 1.0f, src, GetOrCreateTransposedWeight(weightName));
 
             contiguousInput?.Dispose();
 
             if (_weights.TryGetValue(biasName, out var bias))
-            {
-                float* rPtr = GetFloatPtr(result);
-                float* bPtr = GetFloatPtr(bias);
-                for (int s = 0; s < seqLen; s++)
-                {
-                    float* row = rPtr + s * outDim;
-                    for (int d = 0; d < outDim; d++)
-                        row[d] += bPtr[d];
-                }
-            }
+                Ops.Add(result, result, bias);
 
             return result;
         }
 
-        private unsafe Tensor LayerNormOp(Tensor input, string weightName, string biasName)
+        private Tensor LayerNormOp(Tensor input, string weightName, string biasName)
         {
-            int rows = (int)input.Sizes[0];
-            int dim = (int)input.Sizes[1];
-            var result = new Tensor(_allocator, DType.Float32, rows, dim);
-
-            float* src = GetFloatPtr(input);
-            float* dst = GetFloatPtr(result);
-            float* w = GetFloatPtr(_weights[weightName]);
-            float* b = _weights.ContainsKey(biasName) ? GetFloatPtr(_weights[biasName]) : null;
-
-            for (int r = 0; r < rows; r++)
-            {
-                float* srcRow = src + r * dim;
-                float* dstRow = dst + r * dim;
-
-                float mean = 0;
-                for (int i = 0; i < dim; i++)
-                    mean += srcRow[i];
-                mean /= dim;
-
-                float variance = 0;
-                for (int i = 0; i < dim; i++)
-                {
-                    float diff = srcRow[i] - mean;
-                    variance += diff * diff;
-                }
-                variance /= dim;
-
-                float invStd = 1f / MathF.Sqrt(variance + _eps);
-                for (int i = 0; i < dim; i++)
-                {
-                    float normalized = (srcRow[i] - mean) * invStd;
-                    dstRow[i] = w[i] * normalized + (b != null ? b[i] : 0f);
-                }
-            }
-
-            return result;
+            _weights.TryGetValue(biasName, out var bias);
+            return Ops.LayerNorm(null, input, _weights[weightName], bias, _eps);
         }
 
         private unsafe void DumpTensor(Tensor t, string label, int numRows)
@@ -596,11 +456,135 @@ private unsafe void DumpTensor(Tensor t, string label, int numRows)
             throw new NotSupportedException("Requires GgmlStorage or CpuStorage");
         }
 
+        private Tensor GetOrCreateTransposedWeight(string weightName) // Returns the transpose of _weights[weightName] as a cached tensor, building it on first use.
+        {
+            if (_transposedWeights.TryGetValue(weightName, out var transposed))
+                return transposed; // cache hit: reuse the tensor stored by an earlier call
+
+            using var weightViewT = _weights[weightName].Transpose(); // temporary; released by `using` when the method exits
+            transposed = Ops.NewContiguous(weightViewT); // this is the tensor that gets cached (presumably a contiguous copy - confirm against Ops.NewContiguous)
+            _transposedWeights[weightName] = transposed; // the cache keeps it; Dispose() releases every cached entry, so callers must not dispose the result
+            return transposed;
+        }
+
+        private unsafe Tensor GetOrCreatePositionEmbedding(int gridH, int gridW) // Returns a [gridH * gridW, _hiddenSize] tensor of bilinearly interpolated position embeddings in raster order, cached per grid shape.
+        {
+            long key = ((long)gridH << 32) | (uint)gridW; // gridH in the upper 32 bits, gridW in the lower 32
+            if (_positionEmbeddingCache.TryGetValue(key, out var cached))
+                return cached;
+
+            int numPatches = gridH * gridW;
+            cached = new Tensor(_allocator, DType.Float32, numPatches, _hiddenSize); // every element is overwritten below, so no zero-fill is needed
+            float* posPtr = GetFloatPtr(_weights["v.position_embd.weight"]); // read as rows of _hiddenSize floats, one per cell of a _gridPerSide x _gridPerSide grid (see idx00..idx11)
+            float* dstPtr = GetFloatPtr(cached);
+
+            float stepH = gridH > 1 ? (float)(_gridPerSide - 1) / (gridH - 1) : 0f; // first/last output row land on the first/last source row; a one-row grid samples row 0
+            float stepW = gridW > 1 ? (float)(_gridPerSide - 1) / (gridW - 1) : 0f; // same mapping for columns
+
+            for (int h = 0; h < gridH; h++)
+            {
+                for (int w = 0; w < gridW; w++)
+                {
+                    float y = h * stepH; // fractional sample position in the source grid
+                    float x = w * stepW;
+
+                    int fy = (int)y; // y and x are non-negative, so truncation gives the lower neighbour
+                    int fx = (int)x;
+                    int cy = Math.Min(fy + 1, _gridPerSide - 1); // upper neighbour, clamped to the last source row
+                    int cx = Math.Min(fx + 1, _gridPerSide - 1); // upper neighbour, clamped to the last source column
+                    float dy = y - fy;
+                    float dx = x - fx;
+
+                    float w00 = (1 - dy) * (1 - dx); // bilinear weights for the four neighbours
+                    float w01 = (1 - dy) * dx;
+                    float w10 = dy * (1 - dx);
+                    float w11 = dy * dx;
+
+                    int idx00 = fy * _gridPerSide + fx; // row-major cell indices into the source grid
+                    int idx01 = fy * _gridPerSide + cx;
+                    int idx10 = cy * _gridPerSide + fx;
+                    int idx11 = cy * _gridPerSide + cx;
+
+                    int patchIdx = h * gridW + w; // raster-order output row
+                    float* dstRow = dstPtr + patchIdx * _hiddenSize;
+                    float* p00 = posPtr + idx00 * _hiddenSize;
+                    float* p01 = posPtr + idx01 * _hiddenSize;
+                    float* p10 = posPtr + idx10 * _hiddenSize;
+                    float* p11 = posPtr + idx11 * _hiddenSize;
+
+                    for (int d = 0; d < _hiddenSize; d++)
+                        dstRow[d] = w00 * p00[d] + w01 * p01[d] + w10 * p10[d] + w11 * p11[d];
+                }
+            }
+
+            _positionEmbeddingCache[key] = cached; // the cache keeps the tensor; Dispose() releases it, so callers must not dispose the result
+            return cached;
+        }
+
+        private RopeCache GetOrCreateRopeCache(int gridH, int gridW, int numPatches, int halfDim) // Returns the RoPE cos/sin tables for a gridH x gridW patch grid in merge-block order, computed once per grid shape.
+        {
+            long key = ((long)gridH << 32) | (uint)gridW; // gridH in the upper 32 bits, gridW in the lower 32; numPatches and halfDim are not part of the key
+            if (_ropeCache.TryGetValue(key, out var cache))
+                return cache;
+
+            int[] gridY = new int[numPatches]; // row coordinate of each patch, in the visiting order built below
+            int[] gridX = new int[numPatches]; // column coordinate of each patch
+            int idx = 0;
+            for (int bh = 0; bh < gridH; bh += _spatialMergeSize) // walk merge blocks top to bottom, left to right, then the cells inside each block row by row
+            {
+                for (int bw = 0; bw < gridW; bw += _spatialMergeSize)
+                {
+                    for (int mh = 0; mh < _spatialMergeSize; mh++)
+                    {
+                        for (int mw = 0; mw < _spatialMergeSize; mw++)
+                        {
+                            gridY[idx] = bh + mh; // NOTE(review): assumes gridH and gridW are multiples of _spatialMergeSize and numPatches == gridH * gridW; otherwise idx can run past the arrays - confirm with callers
+                            gridX[idx] = bw + mw;
+                            idx++;
+                        }
+                    }
+                }
+            }
+
+            int numBands = halfDim / 2; // each band contributes one Y slot and one X slot, so a patch row has halfDim entries
+            float[] cosTable = new float[numPatches * halfDim]; // flat, indexed [p * halfDim + slot]
+            float[] sinTable = new float[numPatches * halfDim];
+            float[] invFreqs = new float[numBands];
+            for (int j = 0; j < numBands; j++)
+                invFreqs[j] = 1f / MathF.Pow(_ropeTheta, (2f * j) / halfDim); // _ropeTheta^(-2j/halfDim) in single precision
+
+            for (int p = 0; p < numPatches; p++)
+            {
+                int baseIdx = p * halfDim;
+                for (int j = 0; j < numBands; j++)
+                {
+                    float angleY = gridY[p] * invFreqs[j];
+                    float angleX = gridX[p] * invFreqs[j];
+
+                    cosTable[baseIdx + j * 2] = MathF.Cos(angleY); // even slot: Y-position angle
+                    sinTable[baseIdx + j * 2] = MathF.Sin(angleY);
+                    cosTable[baseIdx + j * 2 + 1] = MathF.Cos(angleX); // odd slot: X-position angle
+                    sinTable[baseIdx + j * 2 + 1] = MathF.Sin(angleX);
+                }
+            }
+
+            cache = new RopeCache { CosTable = cosTable, SinTable = sinTable };
+            _ropeCache[key] = cache; // kept until Dispose() clears the dictionary
+            return cache;
+        }
+
         public void Dispose()
         {
+            foreach (var w in _positionEmbeddingCache.Values) // release the interpolated position-embedding tensors cached per grid shape
+                w.Dispose();
+            _positionEmbeddingCache.Clear();
+            foreach (var w in _transposedWeights.Values) // release the cached transposes created by GetOrCreateTransposedWeight
+                w.Dispose();
+            _transposedWeights.Clear();
             foreach (var w in _weights.Values)
                 w.Dispose();
             _weights.Clear();
+            _ropeCache.Clear(); // RopeCache entries hold only float arrays, so dropping the references is all that is done here
         }
     }
 }
diff --git a/InferenceWeb/Program.cs b/InferenceWeb/Program.cs
index 050d2ff..e34be88 100644
--- a/InferenceWeb/Program.cs
+++ b/InferenceWeb/Program.cs
@@ -20,9 +20,13 @@
 using Microsoft.AspNetCore.Hosting;
 using Microsoft.AspNetCore.Http;
 using Microsoft.Extensions.DependencyInjection;
+using Microsoft.Extensions.Logging;
 
 var builder = WebApplication.CreateBuilder(args);
 
+// Keep ASP.NET Core request logs quiet by default while still surfacing warnings and errors.
+builder.Logging.AddFilter("Microsoft.AspNetCore", LogLevel.Warning);
+
 builder.WebHost.ConfigureKestrel(options =>
 {
     options.Limits.MaxRequestBodySize = 500 * 1024 * 1024; // 500 MB
diff --git a/README.md b/README.md
index fc5e98e..7564295 100644
--- a/README.md
+++ b/README.md
@@ -129,7 +129,7 @@ You can also request a CUDA-enabled native build from `dotnet build`:
 TENSORSHARP_GGML_NATIVE_ENABLE_CUDA=ON dotnet build InferenceConsole/InferenceConsole.csproj -c Release
 ```
 
-On macOS this compiles `libGgmlOps.dylib` with Metal GPU support. On Linux, `build-linux.sh` builds `libGgmlOps.so` with the GGML CPU backend by default, and `build-linux.sh --cuda` enables GGML_CUDA support for NVIDIA GPUs. The build output is automatically copied to the application's output directory.
+On macOS this compiles `libGgmlOps.dylib` with Metal GPU support. On Linux, `build-linux.sh` preserves an existing CUDA-enabled build and auto-enables GGML_CUDA when a CUDA toolchain is detected; `build-linux.sh --cuda` and `TENSORSHARP_GGML_NATIVE_ENABLE_CUDA=ON` force CUDA explicitly. The build output is automatically copied to the application's output directory.
 
 ## Usage
 
diff --git a/TensorSharp.GGML.Native/build-linux.sh b/TensorSharp.GGML.Native/build-linux.sh
index d75e0ea..41f6dcb 100644
--- a/TensorSharp.GGML.Native/build-linux.sh
+++ b/TensorSharp.GGML.Native/build-linux.sh
@@ -3,10 +3,50 @@ set -euo pipefail
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 BUILD_DIR="${SCRIPT_DIR}/build"
-ENABLE_CUDA="${TENSORSHARP_GGML_NATIVE_ENABLE_CUDA:-OFF}"
+ENABLE_CUDA="${TENSORSHARP_GGML_NATIVE_ENABLE_CUDA:-}"
 BUILD_TESTS="${TENSORSHARP_GGML_NATIVE_BUILD_TESTS:-OFF}"
 EXTRA_CMAKE_ARGS=()
 
+normalize_bool() { # Print ON or OFF for a recognised boolean spelling of $1; print an empty line for anything else.
+    local value="${1:-}" # a missing argument is treated as the empty string
+    case "${value}" in
+        ON|on|On|TRUE|true|True|YES|yes|Yes|1)
+            echo "ON"
+            ;;
+        OFF|off|Off|FALSE|false|False|NO|no|No|0)
+            echo "OFF"
+            ;;
+        *) # empty or unrecognised input (only the spellings listed above match)
+            echo ""
+            ;;
+    esac
+}
+
+has_cuda_toolkit() { # Succeed (status 0) when an nvcc compiler can be located, fail (status 1) otherwise.
+    if command -v nvcc >/dev/null 2>&1; then # first choice: nvcc reachable through PATH
+        return 0
+    fi
+
+    local cuda_home="${CUDA_HOME:-${CUDA_PATH:-}}" # CUDA_HOME takes precedence over CUDA_PATH; empty when neither is set
+    if [[ -n "${cuda_home}" && -x "${cuda_home}/bin/nvcc" ]]; then # fallback: an executable nvcc under that directory
+        return 0
+    fi
+
+    return 1
+}
+
+read_cached_cuda_setting() { # Print ON/OFF as recorded in ${BUILD_DIR}/CMakeCache.txt, or an empty line when there is no usable entry.
+    local cache_file="${BUILD_DIR}/CMakeCache.txt"
+    if [[ ! -f "${cache_file}" ]]; then # no earlier configure run in this build directory
+        echo ""
+        return
+    fi
+
+    local cached
+    cached="$(awk -F= '/^TENSORSHARP_GGML_NATIVE_ENABLE_CUDA:BOOL=/{print $2; exit}' "${cache_file}")" # value after '=' on the first matching cache line; empty if the entry is absent
+    normalize_bool "${cached}" # reduce the cached spelling to ON, OFF or empty
+}
+
 while (($# > 0)); do
     case "$1" in
         --cuda)
@@ -25,6 +65,19 @@ while (($# > 0)); do
     shift
 done
 
+ENABLE_CUDA="$(normalize_bool "${ENABLE_CUDA}")" # resolve the CUDA switch; an explicit value wins, and an unrecognised one becomes empty and is treated as unset
+if [[ -z "${ENABLE_CUDA}" ]]; then # nothing explicit: reuse the setting of a previous configure, if any
+    ENABLE_CUDA="$(read_cached_cuda_setting)"
+fi
+if [[ -z "${ENABLE_CUDA}" ]] && has_cuda_toolkit; then # still undecided: enable CUDA when nvcc can be found
+    ENABLE_CUDA="ON"
+fi
+if [[ -z "${ENABLE_CUDA}" ]]; then # final fallback
+    ENABLE_CUDA="OFF"
+fi
+
+echo "Configuring TensorSharp.GGML.Native (CUDA=${ENABLE_CUDA}, TESTS=${BUILD_TESTS})"
+
 cmake -S "${SCRIPT_DIR}" -B "${BUILD_DIR}" \
     -DCMAKE_BUILD_TYPE=Release \
     -DTENSORSHARP_GGML_NATIVE_ENABLE_CUDA="${ENABLE_CUDA}" \
diff --git a/TensorSharp.GGML.Native/ggml_ops.cpp b/TensorSharp.GGML.Native/ggml_ops.cpp
index cbd8d36..eb0691a 100644
--- a/TensorSharp.GGML.Native/ggml_ops.cpp
+++ b/TensorSharp.GGML.Native/ggml_ops.cpp
@@ -13,6 +13,7 @@
 #include <cstdint>
 #include <cstdlib>
 #include <cstring>
+#include <limits>
 #include <mutex>
 #include <stdexcept>
 #include <string>
@@ -526,6 +527,11 @@ namespace
         return static_cast<std::size_t>(desc.dim0) * desc.dim1 * sizeof(float);
     }
 
+    std::size_t logical_row_bytes(const TensorView2DDesc& desc) // Byte size of the dim1 float elements of one row; strides are not considered.
+    {
+        return static_cast<std::size_t>(desc.dim1) * sizeof(float);
+    }
+
     std::size_t logical_bytes(const TensorView3DDesc& desc)
     {
         return static_cast<std::size_t>(desc.dim0) * desc.dim1 * desc.dim2 * sizeof(float);
@@ -546,6 +552,40 @@ namespace
         return static_cast<std::size_t>(desc.ne0) * desc.ne1 * desc.ne2 * desc.ne3 * sizeof(float);
     }
 
+    constexpr std::size_t k_ggml_cuda_max_copy_bytes = static_cast<std::size_t>(std::numeric_limits<int>::max());
+
+    std::size_t raw_row_bytes(const TensorView2DDesc& desc)
+    {
+        TensorView2DDesc single_row(desc);      // copy, so the caller's descriptor is untouched
+        single_row.dim0 = 1;                    // narrow the copy to exactly one row
+        return required_raw_bytes(single_row);  // raw byte requirement of that one-row view
+    }
+
+    // Describes rows [row_start, row_start + row_count) of desc: data is advanced by
+    // row_start * stride0 floats and raw_bytes is recomputed for the narrowed view.
+    TensorView2DDesc slice_rows_2d(const TensorView2DDesc& desc, int row_start, int row_count)
+    {
+        const std::size_t byte_offset = static_cast<std::size_t>(row_start) * static_cast<std::size_t>(desc.stride0) * sizeof(float);
+        TensorView2DDesc window(desc);
+        window.data = static_cast<char*>(desc.data) + byte_offset;
+        window.dim0 = row_count;
+        window.raw_bytes = static_cast<std::int64_t>(required_raw_bytes(window));
+        return window;
+    }
+
+    // Caps current_limit so a slice of that many rows of desc stays within k_ggml_cuda_max_copy_bytes; 0 = impossible.
+    int limit_rows_for_cuda_copy(int current_limit, const TensorView2DDesc& desc)
+    {
+        if (current_limit <= 0)
+            return 0;
+        const std::size_t row_bytes = std::max(logical_row_bytes(desc), raw_row_bytes(desc));
+        if (row_bytes == 0 || row_bytes > k_ggml_cuda_max_copy_bytes)
+            return 0;  // empty row, or a single row already exceeds the copy limit
+        // Rows of a padded view are stride0 floats apart, so budget by the row pitch when it is larger.
+        const std::size_t row_step = std::max(row_bytes, desc.stride0 > 0 ? static_cast<std::size_t>(desc.stride0) * sizeof(float) : 0);
+        return std::min(current_limit, std::max(1, static_cast<int>(k_ggml_cuda_max_copy_bytes / row_step)));
+    }
+
     bool validate_desc(const TensorView2DDesc& desc, const char* name)
     {
         if (desc.data == nullptr)
@@ -857,7 +897,8 @@ namespace
         std::size_t bytes,
         ggml_backend_buffer_t& out_buffer,
         void*& out_addr,
-        bool& out_needs_upload)
+        bool& out_needs_upload,
+        enum ggml_backend_buffer_usage usage = GGML_BACKEND_BUFFER_USAGE_WEIGHTS)
     {
         out_buffer = nullptr;
         out_addr = nullptr;
@@ -903,7 +944,7 @@ namespace
             if (out_buffer == nullptr)
                 return false;
 
-            ggml_backend_buffer_set_usage(out_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+            ggml_backend_buffer_set_usage(out_buffer, usage);
             out_addr = ggml_backend_buffer_get_base(out_buffer);
             out_needs_upload = true;
 
@@ -1306,6 +1347,56 @@ namespace
             return 0;
         }
 
+        if (g_backend_type == BACKEND_TYPE_CUDA)
+        {
+            const bool needs_chunking =
+                logical_bytes(result_desc) > k_ggml_cuda_max_copy_bytes ||
+                logical_bytes(m1_desc) > k_ggml_cuda_max_copy_bytes ||
+                static_cast<std::size_t>(result_desc.raw_bytes) > k_ggml_cuda_max_copy_bytes ||
+                static_cast<std::size_t>(m1_desc.raw_bytes) > k_ggml_cuda_max_copy_bytes ||
+                (beta != 0.0f && (
+                    logical_bytes(src_desc) > k_ggml_cuda_max_copy_bytes ||
+                    static_cast<std::size_t>(src_desc.raw_bytes) > k_ggml_cuda_max_copy_bytes));
+
+            if (needs_chunking)
+            {
+                int chunk_rows = rows;
+                chunk_rows = limit_rows_for_cuda_copy(chunk_rows, result_desc);
+                chunk_rows = limit_rows_for_cuda_copy(chunk_rows, m1_desc);
+                if (beta != 0.0f)
+                {
+                    chunk_rows = limit_rows_for_cuda_copy(chunk_rows, src_desc);
+                    if (src_desc.dim0 != rows)
+                        chunk_rows = (chunk_rows / src_desc.dim0) * src_desc.dim0;
+                }
+
+                if (chunk_rows <= 0)
+                {
+                    set_last_error("GGML CUDA addmm received a row slice larger than the backend copy limit.");
+                    return 0;
+                }
+
+                if (chunk_rows < rows)
+                {
+                    for (int row_start = 0; row_start < rows; row_start += chunk_rows)
+                    {
+                        const int row_count = std::min(chunk_rows, rows - row_start);
+                        const TensorView2DDesc result_slice = slice_rows_2d(result_desc, row_start, row_count);
+                        const TensorView2DDesc m1_slice = slice_rows_2d(m1_desc, row_start, row_count);
+                        const TensorView2DDesc src_slice = beta == 0.0f
+                            ? TensorView2DDesc{}
+                            : (src_desc.dim0 == rows ? slice_rows_2d(src_desc, row_start, row_count) : src_desc);
+
+                        if (!addmm_f32_impl(result_slice, src_slice, m1_slice, m2_desc, beta, alpha))
+                            return 0;
+                    }
+
+                    clear_last_error();
+                    return 1;
+                }
+            }
+        }
+
         if (!can_map_standard_view(result_desc))
         {
             set_last_error("Result tensor layout is not supported by the ggml addmm Metal path.");
@@ -1531,6 +1622,44 @@ namespace
             return 0;
         }
 
+        if (g_backend_type == BACKEND_TYPE_CUDA)
+        {
+            const bool needs_chunking =
+                logical_bytes(result_desc) > k_ggml_cuda_max_copy_bytes ||
+                logical_bytes(m1_desc) > k_ggml_cuda_max_copy_bytes ||
+                static_cast<std::size_t>(result_desc.raw_bytes) > k_ggml_cuda_max_copy_bytes ||
+                static_cast<std::size_t>(m1_desc.raw_bytes) > k_ggml_cuda_max_copy_bytes;
+
+            if (needs_chunking)
+            {
+                int chunk_rows = rows;
+                chunk_rows = limit_rows_for_cuda_copy(chunk_rows, result_desc);
+                chunk_rows = limit_rows_for_cuda_copy(chunk_rows, m1_desc);
+
+                if (chunk_rows <= 0)
+                {
+                    set_last_error("GGML CUDA addmm_quant received a row slice larger than the backend copy limit.");
+                    return 0;
+                }
+
+                if (chunk_rows < rows)
+                {
+                    for (int row_start = 0; row_start < rows; row_start += chunk_rows)
+                    {
+                        const int row_count = std::min(chunk_rows, rows - row_start);
+                        const TensorView2DDesc result_slice = slice_rows_2d(result_desc, row_start, row_count);
+                        const TensorView2DDesc m1_slice = slice_rows_2d(m1_desc, row_start, row_count);
+
+                        if (!addmm_quant_f32_impl(result_slice, m1_slice, m2_quant))
+                            return 0;
+                    }
+
+                    clear_last_error();
+                    return 1;
+                }
+            }
+        }
+
         const std::size_t ctx_size = 1024 * 1024;
         PooledContextHandle context;
         if (!context.init(ctx_size))
@@ -6960,6 +7089,11 @@ TSG_EXPORT void TSGgml_ClearHostBufferCache()
     g_host_buffer_cache.clear();
 }
 
+TSG_EXPORT void TSGgml_InvalidateHostBuffer(void* ptr) // exported wrapper; returns nothing to the caller
+{
+    invalidate_cached_buffer(ptr); // forwards ptr unchanged; the helper's behaviour is not shown in this hunk
+}
+
 TSG_EXPORT size_t TSGgml_RowSize(int ggml_type, int64_t ne)
 {
     if (ggml_type < 0 || ggml_type >= GGML_TYPE_COUNT || ne <= 0)
@@ -7014,6 +7148,85 @@ TSG_EXPORT int TSGgml_DequantizeToF32(int ggml_type, const void* src, int64_t nu
 // ============================================================================
 namespace
 {
+    // Byte size of a float buffer holding kv_heads * cache_size * head_dim elements.
+    // Every count is widened to size_t before it takes part in a multiplication.
+    std::size_t kv_cache_bytes(int kv_heads, int cache_size, int head_dim)
+    {
+        const std::size_t per_head_elems = static_cast<std::size_t>(cache_size) * static_cast<std::size_t>(head_dim);
+        return static_cast<std::size_t>(kv_heads) * per_head_elems * sizeof(float);
+    }
+
+    // Returns a [head_dim, length, kv_heads] tensor covering cache positions
+    // start_idx, start_idx + 1, ..., start_idx + length - 1, taken modulo cache_size.
+    // The strides used here treat `cache` as [head_dim, cache_size, kv_heads] floats.
+    //
+    // start_idx may be any integer and is wrapped into [0, cache_size). If the window
+    // runs past the end of the cache, the tail [start_idx, cache_size) and the wrapped
+    // head [0, length - tail_length) are viewed separately and joined with ggml_concat
+    // on dimension 1; otherwise a single ggml_view_3d is returned.
+    //
+    // Returns nullptr when ctx or cache is null, when any size is non-positive, or
+    // when length exceeds cache_size (the wrapped head would then overlap the tail or
+    // run past the cache). Callers are expected to treat nullptr as a failure.
+    ggml_tensor* view_kv_cache_window(
+        ggml_context* ctx,
+        ggml_tensor* cache,
+        int head_dim,
+        int cache_size,
+        int kv_heads,
+        int start_idx,
+        int length)
+    {
+        if (ctx == nullptr || cache == nullptr || head_dim <= 0 || cache_size <= 0 || kv_heads <= 0 || length <= 0)
+            return nullptr;
+        if (length > cache_size)
+            return nullptr;
+
+        // '%' keeps the sign of the dividend, so shift a negative remainder back into range.
+        start_idx %= cache_size;
+        if (start_idx < 0)
+            start_idx += cache_size;
+
+        const std::size_t nb1 = static_cast<std::size_t>(head_dim) * sizeof(float);  // bytes between consecutive positions
+        const std::size_t nb2 = static_cast<std::size_t>(cache_size) * nb1;          // bytes between consecutive heads
+        const std::size_t start_offset = static_cast<std::size_t>(start_idx) * nb1;
+
+        // No wrap-around: one strided view covers the whole window.
+        if (start_idx + length <= cache_size)
+            return ggml_view_3d(ctx, cache, head_dim, length, kv_heads, nb1, nb2, start_offset);
+
+        // Wrap-around: tail first, then head, so positions stay in window order.
+        const int tail_length = cache_size - start_idx;
+        const int head_length = length - tail_length;
+        ggml_tensor* tail = ggml_view_3d(ctx, cache, head_dim, tail_length, kv_heads, nb1, nb2, start_offset);
+        ggml_tensor* head = ggml_view_3d(ctx, cache, head_dim, head_length, kv_heads, nb1, nb2, 0);
+        if (tail == nullptr || head == nullptr)
+            return nullptr;
+
+        return ggml_concat(ctx, tail, head, 1);
+    }
+
+    // Copies one position's K or V data into a host cache indexed as
+    // [kv_heads][cache_size][head_dim]: for each head h, head_dim floats starting at
+    // flat_data[h * head_dim] are written to slot cache_pos. Invalid arguments are a no-op.
+    void write_flat_kv_to_host_cache(
+        float* cache_data,
+        const float* flat_data,
+        int kv_heads,
+        int cache_size,
+        int head_dim,
+        int cache_pos)
+    {
+        if (cache_data == nullptr || flat_data == nullptr || kv_heads <= 0 || cache_size <= 0 || head_dim <= 0)
+            return;
+        if (cache_pos < 0 || cache_pos >= cache_size)
+            return;  // an out-of-range slot would land in another head's rows or past the buffer
+        const std::size_t row = static_cast<std::size_t>(head_dim);
+        const std::size_t head_stride = row * static_cast<std::size_t>(cache_size);
+        for (std::size_t h = 0; h < static_cast<std::size_t>(kv_heads); ++h)
+            std::memcpy(cache_data + h * head_stride + cache_pos * row, flat_data + h * row, row * sizeof(float));
+    }
+
     int transformer_layer_decode_impl(
         float* hidden_data, int hidden_size,
         float* attn_norm_data,
@@ -7037,23 +7250,6 @@ namespace
         const int totalSeqLen = position + 1;
         const float scale = 1.0f / std::sqrt(static_cast<float>(head_dim));
 
-        // Create contiguous copies of cached KV (strided cache → contiguous buffer)
-        std::vector<float> k_cached_buf, v_cached_buf;
-        if (position > 0)
-        {
-            k_cached_buf.resize(static_cast<std::size_t>(position) * kDim);
-            v_cached_buf.resize(static_cast<std::size_t>(position) * kDim);
-            for (int h = 0; h < num_kv_heads; h++)
-            {
-                std::memcpy(k_cached_buf.data() + h * position * head_dim,
-                            k_cache_data + h * max_seq_len * head_dim,
-                            static_cast<std::size_t>(position) * head_dim * sizeof(float));
-                std::memcpy(v_cached_buf.data() + h * position * head_dim,
-                            v_cache_data + h * max_seq_len * head_dim,
-                            static_cast<std::size_t>(position) * head_dim * sizeof(float));
-            }
-        }
-
         PooledContextHandle context;
         if (!context.init(2 * 1024 * 1024))
         {
@@ -7075,14 +7271,8 @@ namespace
         ggml_tensor* down_w  = ggml_new_tensor_2d(ctx, static_cast<ggml_type>(down_type), down_ne0, down_ne1);
 
         ggml_tensor* pos_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
-
-        ggml_tensor* k_cached_t = nullptr;
-        ggml_tensor* v_cached_t = nullptr;
-        if (position > 0)
-        {
-            k_cached_t = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, head_dim, position, num_kv_heads);
-            v_cached_t = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, head_dim, position, num_kv_heads);
-        }
+        ggml_tensor* k_cache_base = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, head_dim, max_seq_len, num_kv_heads);
+        ggml_tensor* v_cache_base = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, head_dim, max_seq_len, num_kv_heads);
 
         // Output download targets
         ggml_tensor* hidden_out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
@@ -7130,18 +7320,24 @@ namespace
         ggml_tensor* k_rope_perm = ggml_permute(ctx, k_rope, 0, 2, 1, 3);
         ggml_tensor* v_3d = ggml_reshape_3d(ctx, v_raw, head_dim, num_kv_heads, 1);
         ggml_tensor* v_perm = ggml_permute(ctx, v_3d, 0, 2, 1, 3);
-
-        ggml_tensor* k_full;
-        ggml_tensor* v_full;
-        if (position > 0)
-        {
-            k_full = ggml_concat(ctx, k_cached_t, ggml_cont(ctx, k_rope_perm), 1);
-            v_full = ggml_concat(ctx, v_cached_t, ggml_cont(ctx, v_perm),       1);
-        }
-        else
+        ggml_tensor* k_write = ggml_cont(ctx, k_rope_perm);
+        ggml_tensor* v_write = ggml_cont(ctx, v_perm);
+        ggml_tensor* k_cache_updated = ggml_set_1d_inplace(
+            ctx,
+            k_cache_base,
+            k_write,
+            static_cast<std::size_t>(position) * static_cast<std::size_t>(head_dim) * sizeof(float));
+        ggml_tensor* v_cache_updated = ggml_set_1d_inplace(
+            ctx,
+            v_cache_base,
+            v_write,
+            static_cast<std::size_t>(position) * static_cast<std::size_t>(head_dim) * sizeof(float));
+        ggml_tensor* k_full = view_kv_cache_window(ctx, k_cache_updated, head_dim, max_seq_len, num_kv_heads, 0, totalSeqLen);
+        ggml_tensor* v_full = view_kv_cache_window(ctx, v_cache_updated, head_dim, max_seq_len, num_kv_heads, 0, totalSeqLen);
+        if (k_full == nullptr || v_full == nullptr)
         {
-            k_full = ggml_cont(ctx, k_rope_perm);
-            v_full = ggml_cont(ctx, v_perm);
+            set_last_error("Failed to create KV cache views for transformer layer decode.");
+            return 0;
         }
 
         // 7. Flash attention (handles GQA broadcasting automatically)
@@ -7200,7 +7396,8 @@ namespace
         std::vector<HostBinding> upload_list;
         std::vector<BufferHandle> ephemeral_bufs;
 
-        auto bind_or_mark = [&](ggml_tensor* t, void* data, std::size_t bytes, bool cacheable) {
+        auto bind_or_mark = [&](ggml_tensor* t, void* data, std::size_t bytes, bool cacheable,
+                                enum ggml_backend_buffer_usage usage = GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
             if (t == nullptr || data == nullptr)
                 return;
 
@@ -7209,7 +7406,7 @@ namespace
                 ggml_backend_buffer_t buf = nullptr;
                 void* addr = nullptr;
                 bool needs_upload = false;
-                if (try_get_cacheable_tensor_buffer(g_backend, dev, t, data, bytes, buf, addr, needs_upload))
+                if (try_get_cacheable_tensor_buffer(g_backend, dev, t, data, bytes, buf, addr, needs_upload, usage))
                 {
                     ggml_status st = ggml_backend_tensor_alloc(buf, t, addr);
                     if (st == GGML_STATUS_SUCCESS)
@@ -7247,12 +7444,8 @@ namespace
         bind_or_mark(ffn_norm_w,  ffn_norm_data,  static_cast<std::size_t>(hidden_size) * sizeof(float), true);
         bind_or_mark(q_norm_w,    q_norm_data,    static_cast<std::size_t>(head_dim) * sizeof(float), true);
         bind_or_mark(k_norm_w,    k_norm_data,    static_cast<std::size_t>(head_dim) * sizeof(float), true);
-
-        if (position > 0)
-        {
-            bind_or_mark(k_cached_t, k_cached_buf.data(), k_cached_buf.size() * sizeof(float), false);
-            bind_or_mark(v_cached_t, v_cached_buf.data(), v_cached_buf.size() * sizeof(float), false);
-        }
+        bind_or_mark(k_cache_base, k_cache_data, kv_cache_bytes(num_kv_heads, max_seq_len, head_dim), true, GGML_BACKEND_BUFFER_USAGE_COMPUTE);
+        bind_or_mark(v_cache_base, v_cache_data, kv_cache_bytes(num_kv_heads, max_seq_len, head_dim), true, GGML_BACKEND_BUFFER_USAGE_COMPUTE);
 
         // Allocate backend buffer for remaining tensors (intermediates + non-host-ptr tensors)
         BufferHandle buffer(ggml_backend_alloc_ctx_tensors(ctx, g_backend));
@@ -7289,15 +7482,8 @@ namespace
         ggml_backend_tensor_get(k_new_out, k_new_buf.data(), 0, static_cast<std::size_t>(kDim) * sizeof(float));
         ggml_backend_tensor_get(v_new_out, v_new_buf.data(), 0, static_cast<std::size_t>(kDim) * sizeof(float));
 
-        for (int h = 0; h < num_kv_heads; h++)
-        {
-            std::memcpy(k_cache_data + h * max_seq_len * head_dim + position * head_dim,
-                        k_new_buf.data() + h * head_dim,
-                        static_cast<std::size_t>(head_dim) * sizeof(float));
-            std::memcpy(v_cache_data + h * max_seq_len * head_dim + position * head_dim,
-                        v_new_buf.data() + h * head_dim,
-                        static_cast<std::size_t>(head_dim) * sizeof(float));
-        }
+        write_flat_kv_to_host_cache(k_cache_data, k_new_buf.data(), num_kv_heads, max_seq_len, head_dim, position);
+        write_flat_kv_to_host_cache(v_cache_data, v_new_buf.data(), num_kv_heads, max_seq_len, head_dim, position);
 
         clear_last_error();
         return 1;
@@ -7377,33 +7563,6 @@ TSG_EXPORT int TSGgml_TransformerModelDecode(
         const int totalSeqLen = position + 1;
         const float scale = 1.0f / std::sqrt(static_cast<float>(head_dim));
 
-        // Pre-copy cached KV for all layers
-        struct LayerKVCache {
-            std::vector<float> k_buf;
-            std::vector<float> v_buf;
-        };
-        std::vector<LayerKVCache> kv_caches(num_layers);
-        if (position > 0)
-        {
-            for (int l = 0; l < num_layers; l++)
-            {
-                auto& cache = kv_caches[l];
-                cache.k_buf.resize(static_cast<std::size_t>(position) * kDim);
-                cache.v_buf.resize(static_cast<std::size_t>(position) * kDim);
-                float* kc = static_cast<float*>(k_cache_arr[l]);
-                float* vc = static_cast<float*>(v_cache_arr[l]);
-                for (int h = 0; h < num_kv_heads; h++)
-                {
-                    std::memcpy(cache.k_buf.data() + h * position * head_dim,
-                                kc + h * max_seq_len * head_dim,
-                                static_cast<std::size_t>(position) * head_dim * sizeof(float));
-                    std::memcpy(cache.v_buf.data() + h * position * head_dim,
-                                vc + h * max_seq_len * head_dim,
-                                static_cast<std::size_t>(position) * head_dim * sizeof(float));
-                }
-            }
-        }
-
         // Large context for all layers
         const std::size_t ctx_size = 16 * 1024 * 1024;
         PooledContextHandle context;
@@ -7428,8 +7587,8 @@ TSG_EXPORT int TSGgml_TransformerModelDecode(
             ggml_tensor* ffn_norm_w;
             ggml_tensor* gu_w;
             ggml_tensor* down_w;
-            ggml_tensor* k_cached_t;
-            ggml_tensor* v_cached_t;
+            ggml_tensor* k_cache_base;
+            ggml_tensor* v_cache_base;
             ggml_tensor* k_new_out;
             ggml_tensor* v_new_out;
             ggml_tensor* out_k_cpy;
@@ -7448,19 +7607,10 @@ TSG_EXPORT int TSGgml_TransformerModelDecode(
             lt.ffn_norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
             lt.gu_w   = ggml_new_tensor_2d(ctx, static_cast<ggml_type>(gu_type), gu_ne0, gu_ne1);
             lt.down_w = ggml_new_tensor_2d(ctx, static_cast<ggml_type>(down_type), down_ne0, down_ne1);
+            lt.k_cache_base = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, head_dim, max_seq_len, num_kv_heads);
+            lt.v_cache_base = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, head_dim, max_seq_len, num_kv_heads);
             lt.k_new_out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, kDim);
             lt.v_new_out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, kDim);
-
-            if (position > 0)
-            {
-                lt.k_cached_t = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, head_dim, position, num_kv_heads);
-                lt.v_cached_t = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, head_dim, position, num_kv_heads);
-            }
-            else
-            {
-                lt.k_cached_t = nullptr;
-                lt.v_cached_t = nullptr;
-            }
         }
 
         // Build computation graph: chain all layers
@@ -7503,18 +7653,24 @@ TSG_EXPORT int TSGgml_TransformerModelDecode(
             ggml_tensor* k_rope_perm = ggml_permute(ctx, k_rope, 0, 2, 1, 3);
             ggml_tensor* v_3d = ggml_reshape_3d(ctx, v_raw, head_dim, num_kv_heads, 1);
             ggml_tensor* v_perm = ggml_permute(ctx, v_3d, 0, 2, 1, 3);
-
-            ggml_tensor* k_full;
-            ggml_tensor* v_full;
-            if (position > 0)
-            {
-                k_full = ggml_concat(ctx, lt.k_cached_t, ggml_cont(ctx, k_rope_perm), 1);
-                v_full = ggml_concat(ctx, lt.v_cached_t, ggml_cont(ctx, v_perm), 1);
-            }
-            else
+            ggml_tensor* k_write = ggml_cont(ctx, k_rope_perm);
+            ggml_tensor* v_write = ggml_cont(ctx, v_perm);
+            ggml_tensor* k_cache_updated = ggml_set_1d_inplace(
+                ctx,
+                lt.k_cache_base,
+                k_write,
+                static_cast<std::size_t>(position) * static_cast<std::size_t>(head_dim) * sizeof(float));
+            ggml_tensor* v_cache_updated = ggml_set_1d_inplace(
+                ctx,
+                lt.v_cache_base,
+                v_write,
+                static_cast<std::size_t>(position) * static_cast<std::size_t>(head_dim) * sizeof(float));
+            ggml_tensor* k_full = view_kv_cache_window(ctx, k_cache_updated, head_dim, max_seq_len, num_kv_heads, 0, totalSeqLen);
+            ggml_tensor* v_full = view_kv_cache_window(ctx, v_cache_updated, head_dim, max_seq_len, num_kv_heads, 0, totalSeqLen);
+            if (k_full == nullptr || v_full == nullptr)
             {
-                k_full = ggml_cont(ctx, k_rope_perm);
-                v_full = ggml_cont(ctx, v_perm);
+                set_last_error("Failed to create KV cache views for transformer model decode.");
+                return 0;
             }
 
             // Flash attention
@@ -7574,7 +7730,8 @@ TSG_EXPORT int TSGgml_TransformerModelDecode(
         std::vector<HostBinding> upload_list;
         std::vector<BufferHandle> ephemeral_bufs;
 
-        auto bind_or_mark = [&](ggml_tensor* t, void* data, std::size_t bytes, bool cacheable) {
+        auto bind_or_mark = [&](ggml_tensor* t, void* data, std::size_t bytes, bool cacheable,
+                                enum ggml_backend_buffer_usage usage = GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
             if (t == nullptr || data == nullptr)
                 return;
 
@@ -7583,7 +7740,7 @@ TSG_EXPORT int TSGgml_TransformerModelDecode(
                 ggml_backend_buffer_t buf = nullptr;
                 void* addr = nullptr;
                 bool needs_upload = false;
-                if (try_get_cacheable_tensor_buffer(g_backend, dev, t, data, bytes, buf, addr, needs_upload))
+                if (try_get_cacheable_tensor_buffer(g_backend, dev, t, data, bytes, buf, addr, needs_upload, usage))
                 {
                     ggml_status st = ggml_backend_tensor_alloc(buf, t, addr);
                     if (st == GGML_STATUS_SUCCESS)
@@ -7624,12 +7781,8 @@ TSG_EXPORT int TSGgml_TransformerModelDecode(
             bind_or_mark(lt.ffn_norm_w,  ffn_norm_arr[l],  static_cast<std::size_t>(hidden_size) * sizeof(float), true);
             bind_or_mark(lt.q_norm_w,    q_norm_arr[l],    static_cast<std::size_t>(head_dim) * sizeof(float), true);
             bind_or_mark(lt.k_norm_w,    k_norm_arr[l],    static_cast<std::size_t>(head_dim) * sizeof(float), true);
-
-            if (position > 0)
-            {
-                bind_or_mark(lt.k_cached_t, kv_caches[l].k_buf.data(), kv_caches[l].k_buf.size() * sizeof(float), false);
-                bind_or_mark(lt.v_cached_t, kv_caches[l].v_buf.data(), kv_caches[l].v_buf.size() * sizeof(float), false);
-            }
+            bind_or_mark(lt.k_cache_base, k_cache_arr[l], kv_cache_bytes(num_kv_heads, max_seq_len, head_dim), true, GGML_BACKEND_BUFFER_USAGE_COMPUTE);
+            bind_or_mark(lt.v_cache_base, v_cache_arr[l], kv_cache_bytes(num_kv_heads, max_seq_len, head_dim), true, GGML_BACKEND_BUFFER_USAGE_COMPUTE);
         }
 
         // Allocate backend buffer for intermediates
@@ -7669,17 +7822,8 @@ TSG_EXPORT int TSGgml_TransformerModelDecode(
             ggml_backend_tensor_get(layers[l].k_new_out, k_new_buf.data(), 0, static_cast<std::size_t>(kDim) * sizeof(float));
             ggml_backend_tensor_get(layers[l].v_new_out, v_new_buf.data(), 0, static_cast<std::size_t>(kDim) * sizeof(float));
 
-            float* kc = static_cast<float*>(k_cache_arr[l]);
-            float* vc = static_cast<float*>(v_cache_arr[l]);
-            for (int h = 0; h < num_kv_heads; h++)
-            {
-                std::memcpy(kc + h * max_seq_len * head_dim + position * head_dim,
-                            k_new_buf.data() + h * head_dim,
-                            static_cast<std::size_t>(head_dim) * sizeof(float));
-                std::memcpy(vc + h * max_seq_len * head_dim + position * head_dim,
-                            v_new_buf.data() + h * head_dim,
-                            static_cast<std::size_t>(head_dim) * sizeof(float));
-            }
+            write_flat_kv_to_host_cache(static_cast<float*>(k_cache_arr[l]), k_new_buf.data(), num_kv_heads, max_seq_len, head_dim, position);
+            write_flat_kv_to_host_cache(static_cast<float*>(v_cache_arr[l]), v_new_buf.data(), num_kv_heads, max_seq_len, head_dim, position);
         }
 
         clear_last_error();
@@ -7753,7 +7897,7 @@ TSG_EXPORT int TSGgml_Gemma4ModelDecode(
         for (int l = 0; l < num_layers; l++)
             if (head_dim_arr[l] > maxHd) maxHd = head_dim_arr[l];
 
-        // Prepare per-layer contiguous KV cache copies
+        // Prepare per-layer KV cache metadata
         struct LayerInfo {
             int hd;
             int kvHeads;
@@ -7764,8 +7908,6 @@ TSG_EXPORT int TSGgml_Gemma4ModelDecode(
             bool isShared;
             int kvSource;
             int attendLen;
-            std::vector<float> k_buf;
-            std::vector<float> v_buf;
         };
         std::vector<LayerInfo> li(num_layers);
 
@@ -7786,55 +7928,6 @@ TSG_EXPORT int TSGgml_Gemma4ModelDecode(
             info.attendLen = info.isLocal ? std::min(totalSeqLen, sliding_window) : totalSeqLen;
         }
 
-        // Extract KV cache data: only for unique KV source layers (avoid duplicate copies)
-        std::unordered_map<int, int> kvSrcDone;
-        for (int l = 0; l < num_layers; l++)
-        {
-            auto& info = li[l];
-            int kvSrc = info.kvSource;
-            if (kvSrcDone.count(kvSrc)) continue;
-            kvSrcDone[kvSrc] = 1;
-
-            int windowLen = info.attendLen - 1;
-            if (windowLen <= 0) continue;
-
-            auto& srcInfo = li[kvSrc];
-            srcInfo.k_buf.resize(static_cast<std::size_t>(windowLen) * info.kDim);
-            srcInfo.v_buf.resize(static_cast<std::size_t>(windowLen) * info.kDim);
-            float* kc = static_cast<float*>(k_cache_arr[kvSrc]);
-            float* vc = static_cast<float*>(v_cache_arr[kvSrc]);
-
-            if (info.isLocal)
-            {
-                int start = (totalSeqLen > sliding_window) ? totalSeqLen - sliding_window : 0;
-                for (int h = 0; h < info.kvHeads; h++)
-                {
-                    float* kHead = kc + h * info.cacheSize * info.hd;
-                    float* vHead = vc + h * info.cacheSize * info.hd;
-                    for (int p = 0; p < windowLen; p++)
-                    {
-                        int cacheIdx = (start + p) % info.cacheSize;
-                        std::memcpy(srcInfo.k_buf.data() + (h * windowLen + p) * info.hd,
-                                   kHead + cacheIdx * info.hd, info.hd * sizeof(float));
-                        std::memcpy(srcInfo.v_buf.data() + (h * windowLen + p) * info.hd,
-                                   vHead + cacheIdx * info.hd, info.hd * sizeof(float));
-                    }
-                }
-            }
-            else
-            {
-                for (int h = 0; h < info.kvHeads; h++)
-                {
-                    std::memcpy(srcInfo.k_buf.data() + h * windowLen * info.hd,
-                               kc + h * info.cacheSize * info.hd,
-                               static_cast<std::size_t>(windowLen) * info.hd * sizeof(float));
-                    std::memcpy(srcInfo.v_buf.data() + h * windowLen * info.hd,
-                               vc + h * info.cacheSize * info.hd,
-                               static_cast<std::size_t>(windowLen) * info.hd * sizeof(float));
-                }
-            }
-        }
-
         // Create GGML context
         const std::size_t ctx_size = 32 * 1024 * 1024;
         PooledContextHandle context;
@@ -7906,12 +7999,10 @@ TSG_EXPORT int TSGgml_Gemma4ModelDecode(
                 lt.v_new_out = nullptr;
             }
 
-            int windowLen = info.attendLen - 1;
-            // For shared layers, reuse donor's cached_t (set below after all layers created)
-            if (!info.isShared && windowLen > 0)
+            if (!info.isShared)
             {
-                lt.k_cached_t = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, info.hd, windowLen, info.kvHeads);
-                lt.v_cached_t = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, info.hd, windowLen, info.kvHeads);
+                lt.k_cached_t = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, info.hd, info.cacheSize, info.kvHeads);
+                lt.v_cached_t = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, info.hd, info.cacheSize, info.kvHeads);
             }
             else
             {
@@ -7946,7 +8037,11 @@ TSG_EXPORT int TSGgml_Gemma4ModelDecode(
         // Build compute graph
         ggml_tensor* hidden = current;
 
-        // Track new K/V tensors produced by non-shared layers for concat with cached
+        // Track the active KV tensors produced by each donor layer.
+        std::vector<ggml_tensor*> layer_k_full(num_layers, nullptr);
+        std::vector<ggml_tensor*> layer_v_full(num_layers, nullptr);
+
+        // Track new K/V tensors produced by non-shared layers for download.
         std::vector<ggml_tensor*> layer_k_new(num_layers, nullptr);
         std::vector<ggml_tensor*> layer_v_new(num_layers, nullptr);
 
@@ -7998,18 +8093,29 @@ TSG_EXPORT int TSGgml_Gemma4ModelDecode(
                 ggml_tensor* k_rope_perm = ggml_permute(ctx, k_rope_t, 0, 2, 1, 3);
                 ggml_tensor* v_3d = ggml_reshape_3d(ctx, v_normed, info.hd, info.kvHeads, 1);
                 ggml_tensor* v_perm = ggml_permute(ctx, v_3d, 0, 2, 1, 3);
-
-                int windowLen = info.attendLen - 1;
-                if (windowLen > 0)
+                ggml_tensor* k_write = ggml_cont(ctx, k_rope_perm);
+                ggml_tensor* v_write = ggml_cont(ctx, v_perm);
+                const int cachePos = info.isLocal ? (position % info.cacheSize) : position;
+                const int activeStart = info.isLocal ? ((totalSeqLen - info.attendLen) % info.cacheSize) : 0;
+                ggml_tensor* k_cache_updated = ggml_set_1d_inplace(
+                    ctx,
+                    lt.k_cached_t,
+                    k_write,
+                    static_cast<std::size_t>(cachePos) * static_cast<std::size_t>(info.hd) * sizeof(float));
+                ggml_tensor* v_cache_updated = ggml_set_1d_inplace(
+                    ctx,
+                    lt.v_cached_t,
+                    v_write,
+                    static_cast<std::size_t>(cachePos) * static_cast<std::size_t>(info.hd) * sizeof(float));
+                k_full = view_kv_cache_window(ctx, k_cache_updated, info.hd, info.cacheSize, info.kvHeads, activeStart, info.attendLen);
+                v_full = view_kv_cache_window(ctx, v_cache_updated, info.hd, info.cacheSize, info.kvHeads, activeStart, info.attendLen);
+                if (k_full == nullptr || v_full == nullptr)
                 {
-                    k_full = ggml_concat(ctx, lt.k_cached_t, ggml_cont(ctx, k_rope_perm), 1);
-                    v_full = ggml_concat(ctx, lt.v_cached_t, ggml_cont(ctx, v_perm), 1);
-                }
-                else
-                {
-                    k_full = ggml_cont(ctx, k_rope_perm);
-                    v_full = ggml_cont(ctx, v_perm);
+                    set_last_error("Failed to create Gemma4 KV cache views.");
+                    return 0;
                 }
+                layer_k_full[l] = k_full;
+                layer_v_full[l] = v_full;
 
                 // Store new K/V refs for KV output
                 layer_k_new[l] = k_rope_t;
@@ -8030,41 +8136,18 @@ TSG_EXPORT int TSGgml_Gemma4ModelDecode(
 
                 // Use the donor layer's K/V (already computed earlier in the graph)
                 int donor = info.kvSource;
-                auto& donorInfo = li[donor];
-                int windowLen = info.attendLen - 1;
-
-                if (layer_k_new[donor] != nullptr && windowLen > 0)
-                {
-                    // Donor's new K/V were produced - concat with cached
-                    ggml_tensor* dk_perm = ggml_permute(ctx, layer_k_new[donor], 0, 2, 1, 3);
-                    ggml_tensor* dv_1d = layer_v_new[donor];
-                    ggml_tensor* dv_3d = ggml_reshape_3d(ctx, dv_1d, donorInfo.hd, donorInfo.kvHeads, 1);
-                    ggml_tensor* dv_perm = ggml_permute(ctx, dv_3d, 0, 2, 1, 3);
-                    k_full = ggml_concat(ctx, lt.k_cached_t, ggml_cont(ctx, dk_perm), 1);
-                    v_full = ggml_concat(ctx, lt.v_cached_t, ggml_cont(ctx, dv_perm), 1);
-                }
-                else if (layer_k_new[donor] != nullptr)
+                k_full = layer_k_full[donor];
+                v_full = layer_v_full[donor];
+                if (k_full == nullptr || v_full == nullptr)
                 {
-                    ggml_tensor* dk_perm = ggml_permute(ctx, layer_k_new[donor], 0, 2, 1, 3);
-                    ggml_tensor* dv_1d = layer_v_new[donor];
-                    ggml_tensor* dv_3d = ggml_reshape_3d(ctx, dv_1d, donorInfo.hd, donorInfo.kvHeads, 1);
-                    ggml_tensor* dv_perm = ggml_permute(ctx, dv_3d, 0, 2, 1, 3);
-                    k_full = ggml_cont(ctx, dk_perm);
-                    v_full = ggml_cont(ctx, dv_perm);
-                }
-                else if (windowLen > 0)
-                {
-                    k_full = lt.k_cached_t;
-                    v_full = lt.v_cached_t;
-                }
-                else
-                {
-                    // No cached data and no new data - should not happen
                     set_last_error("Shared layer has no KV data available.");
                     return 0;
                 }
             }
 
+            layer_k_full[l] = k_full;
+            layer_v_full[l] = v_full;
+
             // Manual attention: scores = softmax(K^T @ Q), output = V_T @ scores
             // Gemma4 uses QK-Norm (per-head RMSNorm on Q/K), so no 1/sqrt(d) scaling
             ggml_tensor* q_attn = ggml_permute(ctx, q_rope, 0, 2, 1, 3);
@@ -8166,7 +8249,8 @@ TSG_EXPORT int TSGgml_Gemma4ModelDecode(
         std::vector<HostBinding> upload_list;
         std::vector<BufferHandle> ephemeral_bufs;
 
-        auto bind_or_mark = [&](ggml_tensor* t, void* data, std::size_t bytes, bool cacheable) {
+        auto bind_or_mark = [&](ggml_tensor* t, void* data, std::size_t bytes, bool cacheable,
+                                enum ggml_backend_buffer_usage usage = GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
             if (t == nullptr || data == nullptr) return;
 
             if (cacheable && bytes >= 4096)
@@ -8174,7 +8258,7 @@ TSG_EXPORT int TSGgml_Gemma4ModelDecode(
                 ggml_backend_buffer_t buf = nullptr;
                 void* addr = nullptr;
                 bool needs_upload = false;
-                if (try_get_cacheable_tensor_buffer(g_backend, dev, t, data, bytes, buf, addr, needs_upload))
+                if (try_get_cacheable_tensor_buffer(g_backend, dev, t, data, bytes, buf, addr, needs_upload, usage))
                 {
                     ggml_status st = ggml_backend_tensor_alloc(buf, t, addr);
                     if (st == GGML_STATUS_SUCCESS)
@@ -8223,13 +8307,8 @@ TSG_EXPORT int TSGgml_Gemma4ModelDecode(
 
             if (!info.isShared)
             {
-                int windowLen = info.attendLen - 1;
-                if (windowLen > 0)
-                {
-                    auto& srcInfo = li[info.kvSource];
-                    bind_or_mark(lt.k_cached_t, srcInfo.k_buf.data(), srcInfo.k_buf.size() * sizeof(float), false);
-                    bind_or_mark(lt.v_cached_t, srcInfo.v_buf.data(), srcInfo.v_buf.size() * sizeof(float), false);
-                }
+                bind_or_mark(lt.k_cached_t, k_cache_arr[l], kv_cache_bytes(info.kvHeads, info.cacheSize, info.hd), true, GGML_BACKEND_BUFFER_USAGE_COMPUTE);
+                bind_or_mark(lt.v_cached_t, v_cache_arr[l], kv_cache_bytes(info.kvHeads, info.cacheSize, info.hd), true, GGML_BACKEND_BUFFER_USAGE_COMPUTE);
             }
 
             if (lt.ple_gate_w != nullptr)
@@ -8291,24 +8370,14 @@ TSG_EXPORT int TSGgml_Gemma4ModelDecode(
             ggml_backend_tensor_get(layers[l].v_new_out, v_new_buf.data(), 0,
                 static_cast<std::size_t>(info.kDim) * sizeof(float));
 
-            float* kc = static_cast<float*>(k_cache_arr[l]);
-            float* vc = static_cast<float*>(v_cache_arr[l]);
-
             int cachePos;
             if (info.isLocal)
                 cachePos = position % info.cacheSize;
             else
                 cachePos = position;
 
-            for (int h = 0; h < info.kvHeads; h++)
-            {
-                std::memcpy(kc + h * info.cacheSize * info.hd + cachePos * info.hd,
-                           k_new_buf.data() + h * info.hd,
-                           static_cast<std::size_t>(info.hd) * sizeof(float));
-                std::memcpy(vc + h * info.cacheSize * info.hd + cachePos * info.hd,
-                           v_new_buf.data() + h * info.hd,
-                           static_cast<std::size_t>(info.hd) * sizeof(float));
-            }
+            write_flat_kv_to_host_cache(static_cast<float*>(k_cache_arr[l]), k_new_buf.data(), info.kvHeads, info.cacheSize, info.hd, cachePos);
+            write_flat_kv_to_host_cache(static_cast<float*>(v_cache_arr[l]), v_new_buf.data(), info.kvHeads, info.cacheSize, info.hd, cachePos);
         }
 
         clear_last_error();
diff --git a/TensorSharp.GGML/GgmlBasicOps.cs b/TensorSharp.GGML/GgmlBasicOps.cs
index 3088f35..24a50c5 100644
--- a/TensorSharp.GGML/GgmlBasicOps.cs
+++ b/TensorSharp.GGML/GgmlBasicOps.cs
@@ -371,6 +371,7 @@ public static void AddmmQuantBatch(Tensor result, Tensor m1, IntPtr weightData,
         public static IntPtr AlignedAlloc(long size) => GgmlNative.AlignedAlloc(size);
         public static void AlignedFree(IntPtr ptr) => GgmlNative.AlignedFree(ptr);
         public static void ClearHostBufferCache() => GgmlNative.ClearHostBufferCache();
+        public static void InvalidateHostBuffer(IntPtr ptr) => GgmlNative.InvalidateHostBuffer(ptr);
         public static void EnsureBackendAvailable(GgmlBackendType backendType) => GgmlNative.EnsureAvailable(backendType);
 
         public static void TransformerModelDecode(
diff --git a/TensorSharp.GGML/GgmlNative.cs b/TensorSharp.GGML/GgmlNative.cs
index 2d9d822..f0bb98d 100644
--- a/TensorSharp.GGML/GgmlNative.cs
+++ b/TensorSharp.GGML/GgmlNative.cs
@@ -415,6 +415,9 @@ private static extern int TSGgml_Gemma4ModelDecode(
         [DllImport(DllName, CallingConvention = CallingConventionType)]
         private static extern void TSGgml_ClearHostBufferCache();
 
+        [DllImport(DllName, CallingConvention = CallingConventionType)]
+        private static extern void TSGgml_InvalidateHostBuffer(IntPtr ptr);
+
         [DllImport(DllName, CallingConvention = CallingConventionType)]
         private static extern UIntPtr TSGgml_RowSize(int ggmlType, long ne);
 
@@ -545,7 +548,7 @@ public static void EnsureAvailable(GgmlBackendType backendType)
                         GgmlBackendType.Cuda => "ggml-cuda",
                         _ => "ggml-cpu",
                     };
-                    throw new InvalidOperationException($"Failed to initialize {backendName}. {GetLastErrorMessage("Build the native GGML bridge and ensure the requested GGML backend is available.")}");
+                    throw new InvalidOperationException($"Failed to initialize {backendName}. {GetBackendAvailabilityHint(backendType)}");
                 }
             }
             catch (DllNotFoundException ex)
@@ -868,6 +871,12 @@ public static void ClearHostBufferCache()
             TSGgml_ClearHostBufferCache();
         }
 
+        public static void InvalidateHostBuffer(IntPtr ptr)
+        {
+            if (ptr != IntPtr.Zero)
+                TSGgml_InvalidateHostBuffer(ptr);
+        }
+
         /// <summary>Bytes for one row along ne[0]; 0 if type/shape invalid.</summary>
         internal static long RowSizeBytesOrZero(int ggmlType, long ne0)
         {
@@ -1008,5 +1017,24 @@ private static string GetLastErrorMessage(string fallback)
             string message = errPtr == IntPtr.Zero ? null : Marshal.PtrToStringAnsi(errPtr);
             return string.IsNullOrWhiteSpace(message) ? fallback : message;
         }
+
+        /// <summary>
+        /// Builds the failure hint for a backend that could not be initialized: the native
+        /// error text (or a generic build hint), plus CUDA rebuild steps on Linux when needed.
+        /// </summary>
+        private static string GetBackendAvailabilityHint(GgmlBackendType backendType)
+        {
+            // GetLastErrorMessage substitutes the fallback for blank text, so this is never blank.
+            string backendMessage = GetLastErrorMessage("Build the native GGML bridge and ensure the requested GGML backend is available.");
+
+            if (backendType == GgmlBackendType.Cuda && OperatingSystem.IsLinux() &&
+                backendMessage.Contains("not available in this build", StringComparison.OrdinalIgnoreCase))
+            {
+                const string rebuildHint = "Rebuild the native GGML bridge with CUDA enabled, for example: `bash TensorSharp.GGML.Native/build-linux.sh --cuda` or `TENSORSHARP_GGML_NATIVE_ENABLE_CUDA=ON dotnet build`.";
+                return $"{backendMessage} {rebuildHint}";
+            }
+
+            return backendMessage;
+        }
     }
 }
diff --git a/TensorSharp.GGML/TensorSharp.GGML.csproj b/TensorSharp.GGML/TensorSharp.GGML.csproj
index e6157cd..2d93ad3 100644
--- a/TensorSharp.GGML/TensorSharp.GGML.csproj
+++ b/TensorSharp.GGML/TensorSharp.GGML.csproj
@@ -34,7 +34,7 @@
     <ProjectReference Include="..\AdvUtils\AdvUtils.csproj" />
     <ProjectReference Include="..\TensorSharp\TensorSharp.csproj" />
   </ItemGroup>
-  <Target Name="BuildGgmlNative" BeforeTargets="BeforeBuild" Condition="'$(GgmlNativeBuildScript)' != '' And ( !Exists('$(GgmlNativeBuildDir)/$(GgmlNativeBinaryName)') Or '$(GgmlNativeForceBuild)' == 'true' )">
+  <Target Name="BuildGgmlNative" BeforeTargets="BeforeBuild" Condition="'$(GgmlNativeBuildScript)' != ''">
     <Exec Command="bash &quot;$(MSBuildProjectDirectory)/../TensorSharp.GGML.Native/$(GgmlNativeBuildScript)&quot; $(GgmlNativeBuildArgs)" />
   </Target>
   <Target Name="CopyGgmlNativeBinary" AfterTargets="Build" Condition="'$(GgmlNativeBinaryName)' != ''">
diff --git a/readme_cn.md b/readme_cn.md
index 64a472b..c2cab4c 100644
--- a/readme_cn.md
+++ b/readme_cn.md
@@ -129,7 +129,7 @@ bash build-linux.sh --cuda
 TENSORSHARP_GGML_NATIVE_ENABLE_CUDA=ON dotnet build InferenceConsole/InferenceConsole.csproj -c Release
 ```
 
-在 macOS 上会生成带 Metal GPU 支持的 `libGgmlOps.dylib`。在 Linux 上，`build-linux.sh` 默认生成带 GGML CPU 后端的 `libGgmlOps.so`，而 `build-linux.sh --cuda` 会启用面向 NVIDIA GPU 的 GGML_CUDA 支持。构建产物会自动复制到应用输出目录。
+在 macOS 上会生成带 Metal GPU 支持的 `libGgmlOps.dylib`。在 Linux 上，`build-linux.sh` 会保留已有的 CUDA 构建，并在检测到 CUDA 工具链时自动启用 GGML_CUDA；也可以通过 `build-linux.sh --cuda` 或 `TENSORSHARP_GGML_NATIVE_ENABLE_CUDA=ON` 显式启用。构建产物会自动复制到应用输出目录。
 
 ## 使用方法
 

From 19572eaedccc56691b7856fe85d3e925400c6536 Mon Sep 17 00:00:00 2001
From: Zhongkai Fu <fuzhongkai@gmail.com>
Date: Thu, 9 Apr 2026 08:28:55 -0700
Subject: [PATCH 3/3] Make the number of evenly sampled video frames configurable

---
 InferenceEngine/MediaHelper.cs         | 66 +++++++++++++++++++++--
 InferenceWeb.Tests/MediaHelperTests.cs | 73 ++++++++++++++++++++++++++
 InferenceWeb/ModelService.cs           | 57 +++++++++++++++++---
 InferenceWeb/Program.cs                |  1 +
 README.md                              |  1 +
 readme_cn.md                           |  1 +
 6 files changed, 189 insertions(+), 10 deletions(-)
 create mode 100644 InferenceWeb.Tests/MediaHelperTests.cs

diff --git a/InferenceEngine/MediaHelper.cs b/InferenceEngine/MediaHelper.cs
index 2d9d106..34e899a 100644
--- a/InferenceEngine/MediaHelper.cs
+++ b/InferenceEngine/MediaHelper.cs
@@ -18,8 +18,26 @@ namespace InferenceEngine
 {
     public static class MediaHelper
     {
-        public static List<string> ExtractVideoFrames(string videoPath, int maxFrames = 8, double fps = 1.0)
+        public const int DefaultVideoMaxFrames = 4;
+
+        /// <summary>Resolves the video frame cap: a positive VIDEO_MAX_FRAMES value wins, then a positive <paramref name="fallback"/>, then <see cref="DefaultVideoMaxFrames"/>.</summary>
+        public static int GetConfiguredMaxVideoFrames(int fallback = DefaultVideoMaxFrames)
+        {
+            string raw = Environment.GetEnvironmentVariable("VIDEO_MAX_FRAMES");
+
+            // Parse with the invariant culture so the setting reads the same on every host locale.
+            if (int.TryParse(raw, System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out int parsed) && parsed > 0)
+                return parsed;
+
+            // Unset, blank, non-numeric and non-positive values all fall through to here.
+            return fallback > 0 ? fallback : DefaultVideoMaxFrames;
+        }
+
+        public static List<string> ExtractVideoFrames(string videoPath, int maxFrames = 0, double fps = 1.0)
         {
+            if (maxFrames <= 0)
+                maxFrames = GetConfiguredMaxVideoFrames();
+
             string tempDir = Path.Combine(Path.GetTempPath(), $"frames_{Guid.NewGuid():N}");
             Directory.CreateDirectory(tempDir);
 
@@ -33,14 +51,18 @@ public static List<string> ExtractVideoFrames(string videoPath, int maxFrames =
                 throw new Exception($"Invalid video: fps={videoFps}, frames={totalFrames}");
 
             int frameInterval = Math.Max(1, (int)Math.Round(videoFps / fps));
+            var candidateFrames = new List<int>();
+            for (int frameIdx = 0; frameIdx < totalFrames; frameIdx += frameInterval)
+                candidateFrames.Add(frameIdx);
+
+            var selectedPositions = SelectEvenlySpacedIndices(candidateFrames.Count, maxFrames);
 
             var frames = new List<string>();
             using var mat = new Mat();
 
-            for (int frameIdx = 0; frames.Count < maxFrames; frameIdx += frameInterval)
+            foreach (int pos in selectedPositions)
             {
-                if (frameIdx >= totalFrames)
-                    break;
+                int frameIdx = candidateFrames[pos];
 
                 capture.Set(VideoCaptureProperties.PosFrames, frameIdx);
                 if (!capture.Read(mat) || mat.Empty())
@@ -54,6 +76,42 @@ public static List<string> ExtractVideoFrames(string videoPath, int maxFrames =
             return frames;
         }
 
+        /// <summary>
+        /// Chooses up to <paramref name="maxCount"/> ascending positions out of 0..count-1. When downsampling
+        /// to two or more picks they run from the first position to the last; a single pick is the middle one.
+        /// </summary>
+        public static List<int> SelectEvenlySpacedIndices(int count, int maxCount)
+        {
+            var picked = new List<int>();
+            if (count <= 0 || maxCount <= 0)
+                return picked;
+
+            if (count <= maxCount)
+            {
+                // Nothing has to be dropped, so every position is kept.
+                while (picked.Count < count)
+                    picked.Add(picked.Count);
+                return picked;
+            }
+
+            if (maxCount == 1)
+            {
+                // A single pick lands on the middle position.
+                picked.Add(count / 2);
+                return picked;
+            }
+
+            double stride = (double)(count - 1) / (maxCount - 1);
+            for (int slot = 0, floor = 0; slot < maxCount; slot++)
+            {
+                // Nearest position to the ideal spot, never below the previous pick + 1 nor past the end.
+                int pos = Math.Min(Math.Max((int)Math.Round(slot * stride), floor), count - 1);
+                picked.Add(pos);
+                floor = pos + 1;
+            }
+            return picked;
+        }
+
         private static void SaveMatAsPng(Mat mat, string path)
         {
             int width = mat.Cols;
diff --git a/InferenceWeb.Tests/MediaHelperTests.cs b/InferenceWeb.Tests/MediaHelperTests.cs
new file mode 100644
index 0000000..4be19b3
--- /dev/null
+++ b/InferenceWeb.Tests/MediaHelperTests.cs
@@ -0,0 +1,73 @@
+using InferenceEngine;
+
+namespace InferenceWeb.Tests;
+
+/// <summary>Tests for <see cref="MediaHelper"/> frame-index sampling and VIDEO_MAX_FRAMES resolution.</summary>
+public class MediaHelperTests
+{
+    // Serializes the tests that temporarily override the process-wide VIDEO_MAX_FRAMES variable.
+    private static readonly object EnvLock = new();
+
+    [Fact]
+    public void SelectEvenlySpacedIndicesReturnsAllIndicesWhenAlreadyUnderLimit()
+    {
+        var indices = MediaHelper.SelectEvenlySpacedIndices(count: 3, maxCount: 4);
+
+        Assert.Equal(new[] { 0, 1, 2 }, indices);
+    }
+
+    [Fact]
+    public void SelectEvenlySpacedIndicesIncludesEndpointsWhenDownsampling()
+    {
+        var indices = MediaHelper.SelectEvenlySpacedIndices(count: 8, maxCount: 4);
+
+        Assert.Equal(new[] { 0, 2, 5, 7 }, indices);
+    }
+
+    [Fact]
+    public void SelectEvenlySpacedIndicesUsesMiddleFrameWhenOnlyOneIsRequested()
+    {
+        var indices = MediaHelper.SelectEvenlySpacedIndices(count: 9, maxCount: 1);
+
+        Assert.Equal(new[] { 4 }, indices);
+    }
+
+    [Fact]
+    public void GetConfiguredMaxVideoFramesFallsBackToDefaultWhenUnset()
+    {
+        Assert.Equal(MediaHelper.DefaultVideoMaxFrames, ReadMaxFramesWithEnvironment(null));
+    }
+
+    [Theory]
+    [InlineData("0")]
+    [InlineData("-3")]
+    [InlineData("not-a-number")]
+    public void GetConfiguredMaxVideoFramesIgnoresInvalidEnvironmentValues(string value)
+    {
+        Assert.Equal(MediaHelper.DefaultVideoMaxFrames, ReadMaxFramesWithEnvironment(value));
+    }
+
+    [Fact]
+    public void GetConfiguredMaxVideoFramesUsesPositiveEnvironmentOverride()
+    {
+        Assert.Equal(6, ReadMaxFramesWithEnvironment("6"));
+    }
+
+    // Evaluates GetConfiguredMaxVideoFrames with VIDEO_MAX_FRAMES set to value (null clears it), then restores the old value.
+    private static int ReadMaxFramesWithEnvironment(string? value)
+    {
+        lock (EnvLock)
+        {
+            string? oldValue = Environment.GetEnvironmentVariable("VIDEO_MAX_FRAMES");
+            try
+            {
+                Environment.SetEnvironmentVariable("VIDEO_MAX_FRAMES", value);
+                return MediaHelper.GetConfiguredMaxVideoFrames();
+            }
+            finally
+            {
+                Environment.SetEnvironmentVariable("VIDEO_MAX_FRAMES", oldValue);
+            }
+        }
+    }
+}
diff --git a/InferenceWeb/ModelService.cs b/InferenceWeb/ModelService.cs
index 9996425..81ffdcf 100644
--- a/InferenceWeb/ModelService.cs
+++ b/InferenceWeb/ModelService.cs
@@ -141,11 +141,12 @@ public async IAsyncEnumerable<string> ChatStreamAsync(
             List<ToolFunction> tools = null, bool enableThinking = false)
         {
             string arch = _model.Config.Architecture;
+            var preparedHistory = PrepareHistoryForInference(history, arch);
             string rendered = ChatTemplate.RenderFromGgufTemplate(
-                _model.Config.ChatTemplate, history, addGenerationPrompt: true,
+                _model.Config.ChatTemplate, preparedHistory, addGenerationPrompt: true,
                 architecture: arch, tools: tools, enableThinking: enableThinking);
 
-            var lastMsg = history.LastOrDefault(m => m.Role == "user");
+            var lastMsg = preparedHistory.LastOrDefault(m => m.Role == "user");
             bool hasMultimodal = HasMultimodalContent(lastMsg);
 
             float[] logits;
@@ -399,12 +400,13 @@ public bool EnsureModelLoaded(string modelName, string modelDir, string defaultB
                 new ChatMessage { Role = "user", Content = prompt, ImagePaths = imagePaths }
             };
 
+            var preparedMessages = PrepareHistoryForInference(messages, arch);
             string rendered = ChatTemplate.RenderFromGgufTemplate(
-                _model.Config.ChatTemplate, messages, addGenerationPrompt: true,
+                _model.Config.ChatTemplate, preparedMessages, addGenerationPrompt: true,
                 architecture: arch);
 
             var inputTokens = _model.Tokenizer.Encode(rendered, addSpecial: true);
-            var lastMsg = messages[0];
+            var lastMsg = preparedMessages[0];
             if (lastMsg.ImagePaths != null && lastMsg.ImagePaths.Count > 0)
                 inputTokens = ProcessMultimodal(lastMsg, inputTokens, arch);
 
@@ -465,11 +467,12 @@ public bool EnsureModelLoaded(string modelName, string modelDir, string defaultB
                 List<ToolFunction> tools = null, bool enableThinking = false)
         {
             string arch = _model.Config.Architecture;
+            var preparedHistory = PrepareHistoryForInference(history, arch);
             string rendered = ChatTemplate.RenderFromGgufTemplate(
-                _model.Config.ChatTemplate, history, addGenerationPrompt: true,
+                _model.Config.ChatTemplate, preparedHistory, addGenerationPrompt: true,
                 architecture: arch, tools: tools, enableThinking: enableThinking);
 
-            var lastMsg = history.LastOrDefault(m => m.Role == "user");
+            var lastMsg = preparedHistory.LastOrDefault(m => m.Role == "user");
             bool hasMultimodal = HasMultimodalContent(lastMsg);
 
             int promptTokenCount;
@@ -559,6 +562,48 @@ private bool TryGetCacheSuffix(string rendered, out string suffixText)
             return false;
         }
 
+        /// <summary>
+        /// Returns <paramref name="history"/> itself when its last user message needs no normalization,
+        /// otherwise a shallow copy of the list with only that message swapped for the normalized one.
+        /// </summary>
+        private static List<ChatMessage> PrepareHistoryForInference(List<ChatMessage> history, string arch)
+        {
+            int userPos = (history == null || history.Count == 0) ? -1 : history.FindLastIndex(m => m.Role == "user");
+            if (userPos < 0)
+                return history;
+
+            ChatMessage current = history[userPos];
+            ChatMessage replacement = NormalizeMessageForInference(current, arch);
+            if (ReferenceEquals(replacement, current))
+                return history;
+
+            return new List<ChatMessage>(history) { [userPos] = replacement };
+        }
+
+        // For a Gemma4 video message holding more frames than the configured limit, builds a copy that keeps
+        // only evenly spaced frames; every other message is handed back as the same instance.
+        private static ChatMessage NormalizeMessageForInference(ChatMessage msg, string arch)
+        {
+            int frameLimit = MediaHelper.GetConfiguredMaxVideoFrames();
+            bool overLimit = arch == "gemma4" && msg.IsVideo && msg.ImagePaths != null && msg.ImagePaths.Count > frameLimit;
+            if (!overLimit)
+                return msg;
+
+            var keptFrames = MediaHelper.SelectEvenlySpacedIndices(msg.ImagePaths.Count, frameLimit).ConvertAll(i => msg.ImagePaths[i]);
+            Console.WriteLine($"[video] Downsampled {msg.ImagePaths.Count} frames to {keptFrames.Count} evenly spaced frames for Gemma4 prefill stability.");
+
+            return new ChatMessage
+            {
+                Role = msg.Role,
+                Content = msg.Content,
+                ImagePaths = keptFrames,
+                AudioPaths = msg.AudioPaths == null ? null : new List<string>(msg.AudioPaths),
+                IsVideo = msg.IsVideo,
+                ToolCalls = msg.ToolCalls,
+                Thinking = msg.Thinking
+            };
+        }
+
         private static bool HasMultimodalContent(ChatMessage msg)
         {
             if (msg == null) return false;
diff --git a/InferenceWeb/Program.cs b/InferenceWeb/Program.cs
index e34be88..162efa7 100644
--- a/InferenceWeb/Program.cs
+++ b/InferenceWeb/Program.cs
@@ -1471,6 +1471,7 @@ static string ResolveModelPath(string modelName, string modelDir)
 }
 
 Console.WriteLine($"Model directory: {modelDir}");
+Console.WriteLine($"Video max frames: {MediaHelper.GetConfiguredMaxVideoFrames()}");
 Console.WriteLine("Starting InferenceWeb on http://localhost:5000");
 Console.WriteLine("API endpoints:");
 Console.WriteLine("  GET  /                         - Health check");
diff --git a/README.md b/README.md
index 7564295..b2c2e9e 100644
--- a/README.md
+++ b/README.md
@@ -237,6 +237,7 @@ Open `http://localhost:5000` in your browser. The web interface supports:
 |---|---|
 | `MODEL_DIR` | Directory containing GGUF model files |
 | `BACKEND` | Compute backend: `cpu`, `ggml_cpu`, `ggml_metal`, or `ggml_cuda` (default: `ggml_metal` on macOS, `ggml_cpu` elsewhere) |
+| `VIDEO_MAX_FRAMES` | Maximum evenly spaced video frames extracted for video prompts (default: `4`) |
 | `PORT` | HTTP port (default: `5000`) |
 
 ### HTTP APIs
diff --git a/readme_cn.md b/readme_cn.md
index c2cab4c..4d7c362 100644
--- a/readme_cn.md
+++ b/readme_cn.md
@@ -237,6 +237,7 @@ MODEL_DIR=./models BACKEND=ggml_cuda ./InferenceWeb
 |---|---|
 | `MODEL_DIR` | GGUF 模型文件所在目录 |
 | `BACKEND` | 计算后端：`cpu`、`ggml_cpu`、`ggml_metal` 或 `ggml_cuda`（默认：macOS 为 `ggml_metal`，其他平台为 `ggml_cpu`） |
+| `VIDEO_MAX_FRAMES` | 视频提示词中均匀抽取的视频帧上限（默认：`4`） |
 | `PORT` | HTTP 端口（默认：`5000`） |
 
 ### HTTP API