Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion InferenceConsole/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -628,7 +628,7 @@ static string RunInference(ModelBase model, string rawText, List<string> imagePa
var tokenCounts = new int[imagePaths.Count];
for (int i = 0; i < imagePaths.Count; i++)
{
var (width, height) = Qwen35ImageProcessor.ReadPngDimensions(imagePaths[i]);
var (width, height) = Qwen35ImageProcessor.ReadImageDimensions(imagePaths[i]);
tokenCounts[i] = processor.ComputeImageTokenCount(height, width);
var (gridH, gridW) = processor.GetPatchGrid(height, width);
var (resizedH, resizedW) = processor.SmartResize(height, width);
Expand Down
4 changes: 4 additions & 0 deletions InferenceEngine/InferenceEngine.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,11 @@
<ItemGroup>
<PackageReference Include="NLayer" Version="1.16.0" />
<PackageReference Include="NVorbis" Version="0.10.5" />
<PackageReference Include="OneWare.OpenCvSharp4.runtime.ubuntu.24.04-x64" Version="4.13.0.18" />
<PackageReference Include="OneWare.OpenCvSharp4.runtime.osx-arm64" Version="4.13.0.18" />
<PackageReference Include="OneWare.OpenCvSharp4.runtime.osx-x64" Version="4.13.0.18" />
<PackageReference Include="OneWare.OpenCvSharp4.runtime.win-x64" Version="4.13.0.18" />
<PackageReference Include="OpenCvSharp4" Version="4.13.0.20260330" />
<PackageReference Include="StbImageSharp" Version="2.30.15" />
</ItemGroup>
</Project>
66 changes: 62 additions & 4 deletions InferenceEngine/MediaHelper.cs
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,26 @@ namespace InferenceEngine
{
public static class MediaHelper
{
public static List<string> ExtractVideoFrames(string videoPath, int maxFrames = 8, double fps = 1.0)
public const int DefaultVideoMaxFrames = 4;

/// <summary>
/// Resolves the maximum number of frames to pull from a video. A positive
/// integer in the VIDEO_MAX_FRAMES environment variable takes precedence;
/// otherwise the supplied <paramref name="fallback"/> is used, and when
/// that is not positive either, <see cref="DefaultVideoMaxFrames"/> wins.
/// </summary>
/// <param name="fallback">Value to use when no valid override is set.</param>
/// <returns>A strictly positive frame budget.</returns>
public static int GetConfiguredMaxVideoFrames(int fallback = DefaultVideoMaxFrames)
{
    string raw = Environment.GetEnvironmentVariable("VIDEO_MAX_FRAMES");

    // TryParse rejects null/empty/garbage, so no separate whitespace check
    // is needed; only a positive override is honored.
    if (int.TryParse(raw, out int parsed) && parsed > 0)
        return parsed;

    return fallback > 0 ? fallback : DefaultVideoMaxFrames;
}

public static List<string> ExtractVideoFrames(string videoPath, int maxFrames = 0, double fps = 1.0)
{
if (maxFrames <= 0)
maxFrames = GetConfiguredMaxVideoFrames();

string tempDir = Path.Combine(Path.GetTempPath(), $"frames_{Guid.NewGuid():N}");
Directory.CreateDirectory(tempDir);

Expand All @@ -33,14 +51,18 @@ public static List<string> ExtractVideoFrames(string videoPath, int maxFrames =
throw new Exception($"Invalid video: fps={videoFps}, frames={totalFrames}");

int frameInterval = Math.Max(1, (int)Math.Round(videoFps / fps));
var candidateFrames = new List<int>();
for (int frameIdx = 0; frameIdx < totalFrames; frameIdx += frameInterval)
candidateFrames.Add(frameIdx);

var selectedPositions = SelectEvenlySpacedIndices(candidateFrames.Count, maxFrames);

var frames = new List<string>();
using var mat = new Mat();

for (int frameIdx = 0; frames.Count < maxFrames; frameIdx += frameInterval)
foreach (int pos in selectedPositions)
{
if (frameIdx >= totalFrames)
break;
int frameIdx = candidateFrames[pos];

capture.Set(VideoCaptureProperties.PosFrames, frameIdx);
if (!capture.Read(mat) || mat.Empty())
Expand All @@ -54,6 +76,42 @@ public static List<string> ExtractVideoFrames(string videoPath, int maxFrames =
return frames;
}

/// <summary>
/// Picks up to <paramref name="maxCount"/> indices out of the range
/// [0, <paramref name="count"/>) so the selection is spread as evenly as
/// possible. Returns every index when they all fit, the single middle
/// index when only one is allowed, and an empty list when either argument
/// is non-positive. When sampling, both endpoints (0 and count-1) are
/// included and the result is strictly increasing.
/// </summary>
/// <param name="count">Size of the index range to sample from.</param>
/// <param name="maxCount">Maximum number of indices to return.</param>
/// <returns>A sorted list of distinct indices, possibly empty.</returns>
public static List<int> SelectEvenlySpacedIndices(int count, int maxCount)
{
    var result = new List<int>();
    if (count <= 0 || maxCount <= 0)
        return result;

    // Everything fits: return the identity selection.
    if (count <= maxCount)
    {
        for (int i = 0; i < count; i++)
            result.Add(i);
        return result;
    }

    // A single slot gets the middle of the range.
    if (maxCount == 1)
    {
        result.Add(count / 2);
        return result;
    }

    // Stride chosen so slot 0 maps to 0 and the last slot maps to count-1.
    double stride = (count - 1) / (double)(maxCount - 1);
    int last = -1;
    for (int slot = 0; slot < maxCount; slot++)
    {
        int candidate = (int)Math.Round(slot * stride);
        // Keep the sequence strictly increasing and clamped to the range.
        candidate = Math.Max(candidate, last + 1);
        candidate = Math.Min(candidate, count - 1);

        result.Add(candidate);
        last = candidate;
    }

    return result;
}

private static void SaveMatAsPng(Mat mat, string path)
{
int width = mat.Cols;
Expand Down
12 changes: 12 additions & 0 deletions InferenceEngine/ModelBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -626,6 +626,7 @@ protected void CopyToCache(Tensor cache, Tensor src, int startPos, int seqLen)
{
using var cacheSlice = cache.Narrow(1, startPos, seqLen);
Ops.Copy(cacheSlice, src);
InvalidateTensorDeviceCache(cache);
}

protected Tensor ExpandKVHeads(Tensor cache, int groupSize, int totalSeqLen)
Expand Down Expand Up @@ -653,6 +654,9 @@ protected unsafe void CopyToCacheDecode(Tensor kCache, Tensor kTensor,
Buffer.MemoryCopy(kSrc + srcOffset, kCachePtr + cacheOffset, headBytes, headBytes);
Buffer.MemoryCopy(vSrc + srcOffset, vCachePtr + cacheOffset, headBytes, headBytes);
}

InvalidateTensorDeviceCache(kCache);
InvalidateTensorDeviceCache(vCache);
}

protected unsafe void AttentionDecodePureCS(Tensor q, Tensor kCache, Tensor vCache,
Expand Down Expand Up @@ -718,6 +722,14 @@ private static IntPtr GetStoragePtr(Tensor t)
throw new NotSupportedException("Requires GgmlStorage or CpuStorage");
}

/// <summary>
/// Invalidates the cached host buffer behind <paramref name="tensor"/>'s
/// storage after its contents were written in place, so the GGML backend
/// does not keep serving stale data. Safe no-op when the GGML backend is
/// inactive or the tensor is null.
/// </summary>
/// <param name="tensor">Tensor whose storage was just mutated; may be null.</param>
protected void InvalidateTensorDeviceCache(Tensor tensor)
{
    if (IsGgmlBackend && tensor != null)
        GgmlBasicOps.InvalidateHostBuffer(GetStoragePtr(tensor));
}

public abstract float[] Forward(int[] tokens);
public abstract void ResetKVCache();

Expand Down
12 changes: 10 additions & 2 deletions InferenceEngine/Models/Gemma3/Gemma3Model.cs
Original file line number Diff line number Diff line change
Expand Up @@ -127,8 +127,16 @@ public override void ResetKVCache()
_cacheSeqLen = 0;
if (_kvCacheK != null)
{
foreach (var k in _kvCacheK) Ops.Fill(k, 0f);
foreach (var v in _kvCacheV) Ops.Fill(v, 0f);
foreach (var k in _kvCacheK)
{
Ops.Fill(k, 0f);
InvalidateTensorDeviceCache(k);
}
foreach (var v in _kvCacheV)
{
Ops.Fill(v, 0f);
InvalidateTensorDeviceCache(v);
}
}
}

Expand Down
134 changes: 38 additions & 96 deletions InferenceEngine/Models/Gemma3/Gemma3VisionEncoder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,16 @@
using System.Collections.Generic;
using TensorSharp;
using TensorSharp.Cpu;
using TensorSharp.GGML;

namespace InferenceEngine
{
public class Gemma3VisionEncoder : IDisposable
{
private readonly Dictionary<string, Tensor> _weights = new();
private readonly Dictionary<string, Tensor> _transposedWeights = new();
private readonly IAllocator _allocator;
private readonly bool _useNativeAttention;

private readonly int _imageSize;
private readonly int _patchSize;
Expand All @@ -35,6 +38,7 @@ public class Gemma3VisionEncoder : IDisposable
public Gemma3VisionEncoder(string mmProjPath, IAllocator allocator)
{
_allocator = allocator;
_useNativeAttention = allocator is GgmlAllocator;
var gguf = new GgufFile(mmProjPath);

_imageSize = (int)gguf.GetUint32("clip.vision.image_size", 896);
Expand Down Expand Up @@ -219,7 +223,16 @@ private Tensor VisionSelfAttention(Tensor input, string prefix, int numPatches,

float scale = 1f / MathF.Sqrt(headDim);

// Reshape [numPatches, hiddenSize] -> [numHeads, numPatches, headDim]
if (_useNativeAttention)
{
using var q4 = q.View(1, numPatches, _numHeads, headDim);
using var k4 = k.View(1, numPatches, _numHeads, headDim);
using var v4 = v.View(1, numPatches, _numHeads, headDim);
using var attn4 = Ops.ScaledDotProductAttention(null, q4, k4, v4, null, scale);
using var flat = attn4.View(numPatches, _hiddenSize);
return LinearForwardWithBias(flat, $"{prefix}.attn_out.weight", $"{prefix}.attn_out.bias");
}

using var qReshaped = q.View(numPatches, _numHeads, headDim);
using var kReshaped = k.View(numPatches, _numHeads, headDim);
using var vReshaped = v.View(numPatches, _numHeads, headDim);
Expand All @@ -231,49 +244,30 @@ private Tensor VisionSelfAttention(Tensor input, string prefix, int numPatches,
using var kHeads = Ops.NewContiguous(kT0);
using var vHeads = Ops.NewContiguous(vT0);

// Batched Q @ K^T -> [numHeads, numPatches, numPatches]
using var kT = kHeads.Transpose(1, 2);
var scores = new Tensor(_allocator, DType.Float32, _numHeads, numPatches, numPatches);
Ops.AddmmBatch(scores, 0, scores, scale, qHeads, kT);

Ops.Softmax(scores, scores);

// Batched softmax @ V -> [numHeads, numPatches, headDim]
var attnOutput = new Tensor(_allocator, DType.Float32, _numHeads, numPatches, headDim);
Ops.AddmmBatch(attnOutput, 0, attnOutput, 1.0f, scores, vHeads);
scores.Dispose();

// Reshape back: [numHeads, numPatches, headDim] -> [numPatches, hiddenSize]
using var transposed = attnOutput.Transpose(0, 1);
using var contiguous = Ops.NewContiguous(transposed);
using var flat = contiguous.View(numPatches, _hiddenSize);
using var flatContig = Ops.NewContiguous(flat);
using var flatContig = contiguous.View(numPatches, _hiddenSize);
attnOutput.Dispose();

return LinearForwardWithBias(flatContig, $"{prefix}.attn_out.weight", $"{prefix}.attn_out.bias");
}

private unsafe Tensor VisionMLP(Tensor input, string prefix)
private Tensor VisionMLP(Tensor input, string prefix)
{
using var fc1Out = LinearForwardWithBias(input, $"{prefix}.ffn_down.weight", $"{prefix}.ffn_down.bias");

ApplyGELU(fc1Out);

Ops.GELU(fc1Out, fc1Out);
return LinearForwardWithBias(fc1Out, $"{prefix}.ffn_up.weight", $"{prefix}.ffn_up.bias");
}

private unsafe void ApplyGELU(Tensor t)
{
float* ptr = GetFloatPtr(t);
int count = (int)t.ElementCount();
for (int i = 0; i < count; i++)
{
double x = ptr[i];
double cdf = 0.5 * (1.0 + Math.Tanh(Math.Sqrt(2.0 / Math.PI) * (x + 0.044715 * x * x * x)));
ptr[i] = (float)(x * cdf);
}
}

/// <summary>
/// Multi-modal projector: vision output → text space.
/// Steps: reshape to 2D grid → average pool → RMSNorm → linear projection.
Expand Down Expand Up @@ -354,91 +348,25 @@ private unsafe Tensor LinearForwardWithBias(Tensor input, string weightName, str

Tensor contiguousInput = input.IsContiguous() ? null : Ops.NewContiguous(input);
Tensor src = contiguousInput ?? input;

using var wT = weight.Transpose();
Ops.Addmm(result, 0, result, 1.0f, src, wT);
Ops.Addmm(result, 0, result, 1.0f, src, GetOrCreateTransposedWeight(weightName));

contiguousInput?.Dispose();

if (_weights.TryGetValue(biasName, out var bias))
{
float* rPtr = GetFloatPtr(result);
float* bPtr = GetFloatPtr(bias);
for (int s = 0; s < seqLen; s++)
{
float* row = rPtr + s * outDim;
for (int d = 0; d < outDim; d++)
row[d] += bPtr[d];
}
}
Ops.Add(result, result, bias);

return result;
}

private unsafe Tensor LayerNormOp(Tensor input, string weightName, string biasName)
private Tensor LayerNormOp(Tensor input, string weightName, string biasName)
{
int rows = (int)input.Sizes[0];
int dim = (int)input.Sizes[1];
var result = new Tensor(_allocator, DType.Float32, rows, dim);

float* src = GetFloatPtr(input);
float* dst = GetFloatPtr(result);
float* w = GetFloatPtr(_weights[weightName]);
float* b = _weights.ContainsKey(biasName) ? GetFloatPtr(_weights[biasName]) : null;

for (int r = 0; r < rows; r++)
{
float* srcRow = src + r * dim;
float* dstRow = dst + r * dim;

float mean = 0;
for (int i = 0; i < dim; i++)
mean += srcRow[i];
mean /= dim;

float variance = 0;
for (int i = 0; i < dim; i++)
{
float diff = srcRow[i] - mean;
variance += diff * diff;
}
variance /= dim;

float invStd = 1f / MathF.Sqrt(variance + _eps);
for (int i = 0; i < dim; i++)
{
float normalized = (srcRow[i] - mean) * invStd;
dstRow[i] = w[i] * normalized + (b != null ? b[i] : 0f);
}
}

return result;
_weights.TryGetValue(biasName, out var bias);
return Ops.LayerNorm(null, input, _weights[weightName], bias, _eps);
}

private unsafe Tensor RMSNormOp(Tensor input, string weightName)
private Tensor RMSNormOp(Tensor input, string weightName)
{
int rows = (int)input.Sizes[0];
int dim = (int)input.Sizes[1];
var result = new Tensor(_allocator, DType.Float32, rows, dim);

float* src = GetFloatPtr(input);
float* dst = GetFloatPtr(result);
float* w = GetFloatPtr(_weights[weightName]);

for (int r = 0; r < rows; r++)
{
float* srcRow = src + r * dim;
float* dstRow = dst + r * dim;

float sumSq = 0;
for (int i = 0; i < dim; i++)
sumSq += srcRow[i] * srcRow[i];
float rms = 1f / MathF.Sqrt(sumSq / dim + _eps);
for (int i = 0; i < dim; i++)
dstRow[i] = w[i] * srcRow[i] * rms;
}

return result;
return Ops.RMSNorm(null, input, _weights[weightName], null, _eps);
}

private unsafe void DumpTensor(Tensor t, string label, int numRows)
Expand Down Expand Up @@ -466,8 +394,22 @@ private unsafe void DumpTensor(Tensor t, string label, int numRows)
throw new NotSupportedException("Requires GgmlStorage or CpuStorage");
}

/// <summary>
/// Returns a contiguous, transposed copy of the named weight matrix,
/// building and memoizing it on first request so repeated linear layers
/// do not re-materialize the same transpose. Cached tensors are owned by
/// this encoder and released in <see cref="Dispose"/>.
/// </summary>
/// <param name="weightName">Key of the weight in the loaded weight table.</param>
/// <returns>The cached transposed weight tensor; do not dispose it at the call site.</returns>
private Tensor GetOrCreateTransposedWeight(string weightName)
{
    if (!_transposedWeights.TryGetValue(weightName, out var cached))
    {
        // Transpose() yields a non-contiguous view; materialize it once.
        using var view = _weights[weightName].Transpose();
        cached = Ops.NewContiguous(view);
        _transposedWeights[weightName] = cached;
    }

    return cached;
}

public void Dispose()
{
foreach (var w in _transposedWeights.Values)
w.Dispose();
_transposedWeights.Clear();
foreach (var w in _weights.Values)
w.Dispose();
_weights.Clear();
Expand Down
Loading