Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion InferenceConsole/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -628,7 +628,7 @@ static string RunInference(ModelBase model, string rawText, List<string> imagePa
var tokenCounts = new int[imagePaths.Count];
for (int i = 0; i < imagePaths.Count; i++)
{
var (width, height) = Qwen35ImageProcessor.ReadPngDimensions(imagePaths[i]);
var (width, height) = Qwen35ImageProcessor.ReadImageDimensions(imagePaths[i]);
tokenCounts[i] = processor.ComputeImageTokenCount(height, width);
var (gridH, gridW) = processor.GetPatchGrid(height, width);
var (resizedH, resizedW) = processor.SmartResize(height, width);
Expand Down
4 changes: 4 additions & 0 deletions InferenceEngine/InferenceEngine.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,11 @@
<ItemGroup>
<PackageReference Include="NLayer" Version="1.16.0" />
<PackageReference Include="NVorbis" Version="0.10.5" />
<PackageReference Include="OneWare.OpenCvSharp4.runtime.ubuntu.24.04-x64" Version="4.13.0.18" />
<PackageReference Include="OneWare.OpenCvSharp4.runtime.osx-arm64" Version="4.13.0.18" />
<PackageReference Include="OneWare.OpenCvSharp4.runtime.osx-x64" Version="4.13.0.18" />
<PackageReference Include="OneWare.OpenCvSharp4.runtime.win-x64" Version="4.13.0.18" />
<PackageReference Include="OpenCvSharp4" Version="4.13.0.20260330" />
<PackageReference Include="StbImageSharp" Version="2.30.15" />
</ItemGroup>
</Project>
66 changes: 62 additions & 4 deletions InferenceEngine/MediaHelper.cs
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,26 @@ namespace InferenceEngine
{
public static class MediaHelper
{
public static List<string> ExtractVideoFrames(string videoPath, int maxFrames = 8, double fps = 1.0)
public const int DefaultVideoMaxFrames = 4;

/// <summary>
/// Resolves the maximum number of frames to pull from a video. A positive
/// integer in the VIDEO_MAX_FRAMES environment variable takes precedence;
/// otherwise the supplied <paramref name="fallback"/> is used, and when
/// that is not positive either, <see cref="DefaultVideoMaxFrames"/> wins.
/// </summary>
/// <param name="fallback">Value to use when no valid override is set.</param>
/// <returns>A strictly positive frame budget.</returns>
public static int GetConfiguredMaxVideoFrames(int fallback = DefaultVideoMaxFrames)
{
    string raw = Environment.GetEnvironmentVariable("VIDEO_MAX_FRAMES");

    // TryParse rejects null/empty/garbage, so no separate whitespace check
    // is needed; only a positive override is honored.
    if (int.TryParse(raw, out int parsed) && parsed > 0)
        return parsed;

    return fallback > 0 ? fallback : DefaultVideoMaxFrames;
}

public static List<string> ExtractVideoFrames(string videoPath, int maxFrames = 0, double fps = 1.0)
{
if (maxFrames <= 0)
maxFrames = GetConfiguredMaxVideoFrames();

string tempDir = Path.Combine(Path.GetTempPath(), $"frames_{Guid.NewGuid():N}");
Directory.CreateDirectory(tempDir);

Expand All @@ -33,14 +51,18 @@ public static List<string> ExtractVideoFrames(string videoPath, int maxFrames =
throw new Exception($"Invalid video: fps={videoFps}, frames={totalFrames}");

int frameInterval = Math.Max(1, (int)Math.Round(videoFps / fps));
var candidateFrames = new List<int>();
for (int frameIdx = 0; frameIdx < totalFrames; frameIdx += frameInterval)
candidateFrames.Add(frameIdx);

var selectedPositions = SelectEvenlySpacedIndices(candidateFrames.Count, maxFrames);

var frames = new List<string>();
using var mat = new Mat();

for (int frameIdx = 0; frames.Count < maxFrames; frameIdx += frameInterval)
foreach (int pos in selectedPositions)
{
if (frameIdx >= totalFrames)
break;
int frameIdx = candidateFrames[pos];

capture.Set(VideoCaptureProperties.PosFrames, frameIdx);
if (!capture.Read(mat) || mat.Empty())
Expand All @@ -54,6 +76,42 @@ public static List<string> ExtractVideoFrames(string videoPath, int maxFrames =
return frames;
}

/// <summary>
/// Picks up to <paramref name="maxCount"/> indices out of the range
/// [0, <paramref name="count"/>) so the selection is spread as evenly as
/// possible. Returns every index when they all fit, the single middle
/// index when only one is allowed, and an empty list when either argument
/// is non-positive. When sampling, both endpoints (0 and count-1) are
/// included and the result is strictly increasing.
/// </summary>
/// <param name="count">Size of the index range to sample from.</param>
/// <param name="maxCount">Maximum number of indices to return.</param>
/// <returns>A sorted list of distinct indices, possibly empty.</returns>
public static List<int> SelectEvenlySpacedIndices(int count, int maxCount)
{
    var result = new List<int>();
    if (count <= 0 || maxCount <= 0)
        return result;

    // Everything fits: return the identity selection.
    if (count <= maxCount)
    {
        for (int i = 0; i < count; i++)
            result.Add(i);
        return result;
    }

    // A single slot gets the middle of the range.
    if (maxCount == 1)
    {
        result.Add(count / 2);
        return result;
    }

    // Stride chosen so slot 0 maps to 0 and the last slot maps to count-1.
    double stride = (count - 1) / (double)(maxCount - 1);
    int last = -1;
    for (int slot = 0; slot < maxCount; slot++)
    {
        int candidate = (int)Math.Round(slot * stride);
        // Keep the sequence strictly increasing and clamped to the range.
        candidate = Math.Max(candidate, last + 1);
        candidate = Math.Min(candidate, count - 1);

        result.Add(candidate);
        last = candidate;
    }

    return result;
}

private static void SaveMatAsPng(Mat mat, string path)
{
int width = mat.Cols;
Expand Down
12 changes: 12 additions & 0 deletions InferenceEngine/ModelBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -626,6 +626,7 @@ protected void CopyToCache(Tensor cache, Tensor src, int startPos, int seqLen)
{
using var cacheSlice = cache.Narrow(1, startPos, seqLen);
Ops.Copy(cacheSlice, src);
InvalidateTensorDeviceCache(cache);
}

protected Tensor ExpandKVHeads(Tensor cache, int groupSize, int totalSeqLen)
Expand Down Expand Up @@ -653,6 +654,9 @@ protected unsafe void CopyToCacheDecode(Tensor kCache, Tensor kTensor,
Buffer.MemoryCopy(kSrc + srcOffset, kCachePtr + cacheOffset, headBytes, headBytes);
Buffer.MemoryCopy(vSrc + srcOffset, vCachePtr + cacheOffset, headBytes, headBytes);
}

InvalidateTensorDeviceCache(kCache);
InvalidateTensorDeviceCache(vCache);
}

protected unsafe void AttentionDecodePureCS(Tensor q, Tensor kCache, Tensor vCache,
Expand Down Expand Up @@ -718,6 +722,14 @@ private static IntPtr GetStoragePtr(Tensor t)
throw new NotSupportedException("Requires GgmlStorage or CpuStorage");
}

/// <summary>
/// Invalidates the cached host buffer behind <paramref name="tensor"/>'s
/// storage after its contents were written in place, so the GGML backend
/// does not keep serving stale data. Safe no-op when the GGML backend is
/// inactive or the tensor is null.
/// </summary>
/// <param name="tensor">Tensor whose storage was just mutated; may be null.</param>
protected void InvalidateTensorDeviceCache(Tensor tensor)
{
    if (IsGgmlBackend && tensor != null)
        GgmlBasicOps.InvalidateHostBuffer(GetStoragePtr(tensor));
}

public abstract float[] Forward(int[] tokens);
public abstract void ResetKVCache();

Expand Down
12 changes: 10 additions & 2 deletions InferenceEngine/Models/Gemma3/Gemma3Model.cs
Original file line number Diff line number Diff line change
Expand Up @@ -127,8 +127,16 @@ public override void ResetKVCache()
_cacheSeqLen = 0;
if (_kvCacheK != null)
{
foreach (var k in _kvCacheK) Ops.Fill(k, 0f);
foreach (var v in _kvCacheV) Ops.Fill(v, 0f);
foreach (var k in _kvCacheK)
{
Ops.Fill(k, 0f);
InvalidateTensorDeviceCache(k);
}
foreach (var v in _kvCacheV)
{
Ops.Fill(v, 0f);
InvalidateTensorDeviceCache(v);
}
}
}

Expand Down
134 changes: 38 additions & 96 deletions InferenceEngine/Models/Gemma3/Gemma3VisionEncoder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,16 @@
using System.Collections.Generic;
using TensorSharp;
using TensorSharp.Cpu;
using TensorSharp.GGML;

namespace InferenceEngine
{
public class Gemma3VisionEncoder : IDisposable
{
private readonly Dictionary<string, Tensor> _weights = new();
private readonly Dictionary<string, Tensor> _transposedWeights = new();
private readonly IAllocator _allocator;
private readonly bool _useNativeAttention;

private readonly int _imageSize;
private readonly int _patchSize;
Expand All @@ -35,6 +38,7 @@ public class Gemma3VisionEncoder : IDisposable
public Gemma3VisionEncoder(string mmProjPath, IAllocator allocator)
{
_allocator = allocator;
_useNativeAttention = allocator is GgmlAllocator;
var gguf = new GgufFile(mmProjPath);

_imageSize = (int)gguf.GetUint32("clip.vision.image_size", 896);
Expand Down Expand Up @@ -219,7 +223,16 @@ private Tensor VisionSelfAttention(Tensor input, string prefix, int numPatches,

float scale = 1f / MathF.Sqrt(headDim);

// Reshape [numPatches, hiddenSize] -> [numHeads, numPatches, headDim]
if (_useNativeAttention)
{
using var q4 = q.View(1, numPatches, _numHeads, headDim);
using var k4 = k.View(1, numPatches, _numHeads, headDim);
using var v4 = v.View(1, numPatches, _numHeads, headDim);
using var attn4 = Ops.ScaledDotProductAttention(null, q4, k4, v4, null, scale);
using var flat = attn4.View(numPatches, _hiddenSize);
return LinearForwardWithBias(flat, $"{prefix}.attn_out.weight", $"{prefix}.attn_out.bias");
}

using var qReshaped = q.View(numPatches, _numHeads, headDim);
using var kReshaped = k.View(numPatches, _numHeads, headDim);
using var vReshaped = v.View(numPatches, _numHeads, headDim);
Expand All @@ -231,49 +244,30 @@ private Tensor VisionSelfAttention(Tensor input, string prefix, int numPatches,
using var kHeads = Ops.NewContiguous(kT0);
using var vHeads = Ops.NewContiguous(vT0);

// Batched Q @ K^T -> [numHeads, numPatches, numPatches]
using var kT = kHeads.Transpose(1, 2);
var scores = new Tensor(_allocator, DType.Float32, _numHeads, numPatches, numPatches);
Ops.AddmmBatch(scores, 0, scores, scale, qHeads, kT);

Ops.Softmax(scores, scores);

// Batched softmax @ V -> [numHeads, numPatches, headDim]
var attnOutput = new Tensor(_allocator, DType.Float32, _numHeads, numPatches, headDim);
Ops.AddmmBatch(attnOutput, 0, attnOutput, 1.0f, scores, vHeads);
scores.Dispose();

// Reshape back: [numHeads, numPatches, headDim] -> [numPatches, hiddenSize]
using var transposed = attnOutput.Transpose(0, 1);
using var contiguous = Ops.NewContiguous(transposed);
using var flat = contiguous.View(numPatches, _hiddenSize);
using var flatContig = Ops.NewContiguous(flat);
using var flatContig = contiguous.View(numPatches, _hiddenSize);
attnOutput.Dispose();

return LinearForwardWithBias(flatContig, $"{prefix}.attn_out.weight", $"{prefix}.attn_out.bias");
}

private unsafe Tensor VisionMLP(Tensor input, string prefix)
private Tensor VisionMLP(Tensor input, string prefix)
{
using var fc1Out = LinearForwardWithBias(input, $"{prefix}.ffn_down.weight", $"{prefix}.ffn_down.bias");

ApplyGELU(fc1Out);

Ops.GELU(fc1Out, fc1Out);
return LinearForwardWithBias(fc1Out, $"{prefix}.ffn_up.weight", $"{prefix}.ffn_up.bias");
}

private unsafe void ApplyGELU(Tensor t)
{
float* ptr = GetFloatPtr(t);
int count = (int)t.ElementCount();
for (int i = 0; i < count; i++)
{
double x = ptr[i];
double cdf = 0.5 * (1.0 + Math.Tanh(Math.Sqrt(2.0 / Math.PI) * (x + 0.044715 * x * x * x)));
ptr[i] = (float)(x * cdf);
}
}

/// <summary>
/// Multi-modal projector: vision output → text space.
/// Steps: reshape to 2D grid → average pool → RMSNorm → linear projection.
Expand Down Expand Up @@ -354,91 +348,25 @@ private unsafe Tensor LinearForwardWithBias(Tensor input, string weightName, str

Tensor contiguousInput = input.IsContiguous() ? null : Ops.NewContiguous(input);
Tensor src = contiguousInput ?? input;

using var wT = weight.Transpose();
Ops.Addmm(result, 0, result, 1.0f, src, wT);
Ops.Addmm(result, 0, result, 1.0f, src, GetOrCreateTransposedWeight(weightName));

contiguousInput?.Dispose();

if (_weights.TryGetValue(biasName, out var bias))
{
float* rPtr = GetFloatPtr(result);
float* bPtr = GetFloatPtr(bias);
for (int s = 0; s < seqLen; s++)
{
float* row = rPtr + s * outDim;
for (int d = 0; d < outDim; d++)
row[d] += bPtr[d];
}
}
Ops.Add(result, result, bias);

return result;
}

private unsafe Tensor LayerNormOp(Tensor input, string weightName, string biasName)
private Tensor LayerNormOp(Tensor input, string weightName, string biasName)
{
int rows = (int)input.Sizes[0];
int dim = (int)input.Sizes[1];
var result = new Tensor(_allocator, DType.Float32, rows, dim);

float* src = GetFloatPtr(input);
float* dst = GetFloatPtr(result);
float* w = GetFloatPtr(_weights[weightName]);
float* b = _weights.ContainsKey(biasName) ? GetFloatPtr(_weights[biasName]) : null;

for (int r = 0; r < rows; r++)
{
float* srcRow = src + r * dim;
float* dstRow = dst + r * dim;

float mean = 0;
for (int i = 0; i < dim; i++)
mean += srcRow[i];
mean /= dim;

float variance = 0;
for (int i = 0; i < dim; i++)
{
float diff = srcRow[i] - mean;
variance += diff * diff;
}
variance /= dim;

float invStd = 1f / MathF.Sqrt(variance + _eps);
for (int i = 0; i < dim; i++)
{
float normalized = (srcRow[i] - mean) * invStd;
dstRow[i] = w[i] * normalized + (b != null ? b[i] : 0f);
}
}

return result;
_weights.TryGetValue(biasName, out var bias);
return Ops.LayerNorm(null, input, _weights[weightName], bias, _eps);
}

private unsafe Tensor RMSNormOp(Tensor input, string weightName)
private Tensor RMSNormOp(Tensor input, string weightName)
{
int rows = (int)input.Sizes[0];
int dim = (int)input.Sizes[1];
var result = new Tensor(_allocator, DType.Float32, rows, dim);

float* src = GetFloatPtr(input);
float* dst = GetFloatPtr(result);
float* w = GetFloatPtr(_weights[weightName]);

for (int r = 0; r < rows; r++)
{
float* srcRow = src + r * dim;
float* dstRow = dst + r * dim;

float sumSq = 0;
for (int i = 0; i < dim; i++)
sumSq += srcRow[i] * srcRow[i];
float rms = 1f / MathF.Sqrt(sumSq / dim + _eps);
for (int i = 0; i < dim; i++)
dstRow[i] = w[i] * srcRow[i] * rms;
}

return result;
return Ops.RMSNorm(null, input, _weights[weightName], null, _eps);
}

private unsafe void DumpTensor(Tensor t, string label, int numRows)
Expand Down Expand Up @@ -466,8 +394,22 @@ private unsafe void DumpTensor(Tensor t, string label, int numRows)
throw new NotSupportedException("Requires GgmlStorage or CpuStorage");
}

/// <summary>
/// Returns a contiguous, transposed copy of the named weight matrix,
/// building and memoizing it on first request so repeated linear layers
/// do not re-materialize the same transpose. Cached tensors are owned by
/// this encoder and released in <see cref="Dispose"/>.
/// </summary>
/// <param name="weightName">Key of the weight in the loaded weight table.</param>
/// <returns>The cached transposed weight tensor; do not dispose it at the call site.</returns>
private Tensor GetOrCreateTransposedWeight(string weightName)
{
    if (!_transposedWeights.TryGetValue(weightName, out var cached))
    {
        // Transpose() yields a non-contiguous view; materialize it once.
        using var view = _weights[weightName].Transpose();
        cached = Ops.NewContiguous(view);
        _transposedWeights[weightName] = cached;
    }

    return cached;
}

public void Dispose()
{
foreach (var w in _transposedWeights.Values)
w.Dispose();
_transposedWeights.Clear();
foreach (var w in _weights.Values)
w.Dispose();
_weights.Clear();
Expand Down
Loading