zhongkaifu · zhongkaifu · Apr 5, 2026 · chatgpt-codex-connector · Apr 5, 2026
diff --git a/InferenceConsole/Program.cs b/InferenceConsole/Program.cs
@@ -93,7 +93,7 @@ static void Main(string[] args)
                 Console.Error.WriteLine($"Model file not found: {modelPath ?? "(none)"}");
                 Console.Error.WriteLine("Usage: InferenceConsole --model <path.gguf> [--input <input.txt>] " +
                     "[--input-jsonl <requests.jsonl>] [--image <image.png>] [--output <output.txt>] " +
-                    "[--max-tokens N] [--test] [--backend cpu|ggml_cpu|ggml_metal]");
+                    "[--max-tokens N] [--test] [--backend cpu|ggml_cpu|ggml_metal|ggml_cuda]");
                 return;
             }
 
@@ -102,7 +102,8 @@ static void Main(string[] args)
                 "cpu" => BackendType.Cpu,
                 "ggml_cpu" => BackendType.GgmlCpu,
                 "ggml_metal" => BackendType.GgmlMetal,
-                _ => throw new ArgumentException($"Unknown backend '{backendStr}'. Use: cpu, ggml_cpu, ggml_metal"),
+                "ggml_cuda" => BackendType.GgmlCuda,
+                _ => throw new ArgumentException($"Unknown backend '{backendStr}'. Use: cpu, ggml_cpu, ggml_metal, ggml_cuda"), // unrecognised names fail fast and list the accepted values
             };
 
             using var model = ModelBase.Create(modelPath, backend);

diff --git a/InferenceEngine/ModelBase.cs b/InferenceEngine/ModelBase.cs
@@ -24,6 +24,7 @@ public enum BackendType
         Cpu,
         GgmlCpu,
         GgmlMetal,
+        GgmlCuda, // selects GgmlBackendType.Cuda when the model's GGML context is created
     }
 
     public class ModelConfig
@@ -124,6 +125,10 @@ protected ModelBase(string ggufPath, BackendType backend)
                     _ggmlContext = new GgmlContext(new[] { 0 }, GgmlBackendType.Metal);
                     _allocator = new GgmlAllocator(_ggmlContext, 0);
                     break;
+                case BackendType.GgmlCuda: // mirrors the GgmlBackendType.Metal setup above, but selects the CUDA GGML backend
+                    _ggmlContext = new GgmlContext(new[] { 0 }, GgmlBackendType.Cuda); // presumably a device-id list holding only device 0 — TODO confirm
+                    _allocator = new GgmlAllocator(_ggmlContext, 0); // second argument is the allocator's deviceId (0)
+                    break;
                 case BackendType.Cpu:
                     _allocator = new CpuAllocator(BlasEnum.DotNet);
                     break;
@@ -135,7 +140,7 @@ protected ModelBase(string ggufPath, BackendType backend)
             _gguf = new GgufFile(ggufPath);
         }
 
-        protected bool IsGgmlBackend => _backend == BackendType.GgmlCpu || _backend == BackendType.GgmlMetal;
+        protected bool IsGgmlBackend => _backend == BackendType.GgmlCpu || _backend == BackendType.GgmlMetal || _backend == BackendType.GgmlCuda; // true for each of the three Ggml* backend values
 
         protected void ParseBaseConfig()
         {

diff --git a/InferenceWeb/ModelService.cs b/InferenceWeb/ModelService.cs
@@ -48,6 +48,7 @@ public void LoadModel(string modelPath, string mmProjPath, string backendStr)
                 _backend = backendStr switch
                 {
                     "ggml_metal" => BackendType.GgmlMetal,
+                    "ggml_cuda" => BackendType.GgmlCuda, // NOTE(review): unrecognised strings still fall through to GgmlCpu in this switch, whereas InferenceConsole throws — confirm the silent fallback is intended
                     "ggml_cpu" => BackendType.GgmlCpu,
                     "cpu" => BackendType.Cpu,
                     _ => BackendType.GgmlCpu

diff --git a/InferenceWeb/wwwroot/index.html b/InferenceWeb/wwwroot/index.html
@@ -473,6 +473,7 @@ <h2>Load Model</h2>
       <label>Backend</label>
       <select id="backend-select">
         <option value="ggml_metal">GGML Metal (GPU)</option>
+        <option value="ggml_cuda">GGML CUDA (GPU)</option>
         <option value="ggml_cpu">GGML CPU</option>
         <option value="cpu">CPU</option>
       </select>

diff --git a/TensorSharp.GGML.Native/CMakeLists.txt b/TensorSharp.GGML.Native/CMakeLists.txt
@@ -10,6 +10,7 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
 set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
 
 find_package(Threads REQUIRED)
+option(TENSORSHARP_ENABLE_CUDA "Build TensorSharp GGML bridge with CUDA backend support." OFF) # opt-in switch; its value is forwarded into GGML_CUDA further down
 
 set(GGML_VERSION "0.0.0" CACHE STRING "" FORCE)
 set(GGML_VERSION_MAJOR 0 CACHE STRING "" FORCE)
@@ -22,7 +23,7 @@ set(GGML_CPU ON CACHE BOOL "" FORCE)
 set(GGML_METAL ON CACHE BOOL "" FORCE)
 set(GGML_METAL_NDEBUG ON CACHE BOOL "" FORCE)
 set(GGML_METAL_EMBED_LIBRARY ON CACHE BOOL "" FORCE)
-set(GGML_CUDA OFF CACHE BOOL "" FORCE)
+set(GGML_CUDA ${TENSORSHARP_ENABLE_CUDA} CACHE BOOL "" FORCE) # NOTE(review): GGML_METAL is still forced ON just above — confirm a CUDA build on a non-Apple host configures cleanly
 set(GGML_HIP OFF CACHE BOOL "" FORCE)
 set(GGML_VULKAN OFF CACHE BOOL "" FORCE)
 set(GGML_OPENCL OFF CACHE BOOL "" FORCE)

diff --git a/TensorSharp.GGML.Native/ggml_ops.cpp b/TensorSharp.GGML.Native/ggml_ops.cpp
@@ -23,6 +23,9 @@
 #include "ggml-backend.h"
 #include "ggml-metal.h"
 #include "ggml-cpu.h"
+#if defined(GGML_USE_CUDA) // CUDA header is pulled in only when the bridge is compiled with CUDA support
+#include "ggml-cuda.h"
+#endif // GGML_USE_CUDA
 #include "ggml-quants.h"
 
 // GGML context memory pool: reuse mem_buffers to avoid per-op allocation overhead
@@ -265,6 +268,7 @@ namespace
 
     constexpr int BACKEND_TYPE_METAL = 1;
     constexpr int BACKEND_TYPE_CPU = 2;
+    constexpr int BACKEND_TYPE_CUDA = 3; // keep in sync with GgmlBackendType.Cuda = 3 on the managed side
 
     void initialize_backend()
     {
@@ -288,6 +292,20 @@ namespace
                 return;
             }
         }
+        else if (g_backend_type == BACKEND_TYPE_CUDA)
+        {
+#if defined(GGML_USE_CUDA) // CUDA-enabled build: try to bring up the ggml CUDA backend
+            g_backend = ggml_backend_cuda_init(0); // argument 0 is presumably the CUDA device index — TODO confirm
+            if (g_backend == nullptr)
+            {
+                set_last_error("ggml-cuda backend initialization failed.");
+                return; // g_backend stays null; the failure is reported through set_last_error
+            }
+#else // built without CUDA: report a descriptive error instead of initialising anything
+            set_last_error("ggml-cuda backend requested, but this native bridge was built without CUDA support.");
+            return;
+#endif
+        }
         else
         {
             set_last_error("Unknown GGML backend type requested.");
@@ -299,7 +317,7 @@ namespace
 
     bool ensure_backend(int backend_type)
     {
-        if (backend_type != BACKEND_TYPE_METAL && backend_type != BACKEND_TYPE_CPU)
+        if (backend_type != BACKEND_TYPE_METAL && backend_type != BACKEND_TYPE_CPU && backend_type != BACKEND_TYPE_CUDA) // reject anything other than the three known backend ids
         {
             set_last_error("Invalid GGML backend type.");
             return false;

diff --git a/TensorSharp.GGML/GgmlAllocator.cs b/TensorSharp.GGML/GgmlAllocator.cs
@@ -23,7 +23,12 @@ public GgmlAllocator(GgmlContext context, int deviceId)
             this.deviceId = deviceId;
         }
 
-        public BlasEnum BlasEnum => context.BackendType == GgmlBackendType.Metal ? BlasEnum.GGML_METAL : BlasEnum.GGML_CPU;
+        public BlasEnum BlasEnum => context.BackendType switch // maps the context's GGML backend type to a BlasEnum value
+        {
+            GgmlBackendType.Metal => BlasEnum.GGML_METAL,
+            GgmlBackendType.Cuda => BlasEnum.CUDA, // NOTE(review): the other arms use GGML_*-prefixed members; confirm BlasEnum.CUDA is the intended value for the GGML CUDA path
+            _ => BlasEnum.GGML_CPU, // every other backend type (including Cpu) maps to GGML_CPU
+        };
 
         public int DeviceId => deviceId;
 
@@ -33,7 +38,7 @@ public Storage Allocate(DType elementType, long elementCount)
         {
             if (elementType == DType.Float16)
             {
-                throw new NotSupportedException("The GGML Metal backend currently supports Float32 tensors only. Disable AMP to use this backend.");
+                throw new NotSupportedException("The GGML backend currently supports Float32 tensors only. Disable AMP to use this backend."); // Float16 allocations are rejected regardless of which GGML backend type the context uses
             }
 
             return new GgmlStorage(this, context, elementType, elementCount);

diff --git a/TensorSharp.GGML/GgmlBasicOps.cs b/TensorSharp.GGML/GgmlBasicOps.cs
@@ -1171,13 +1171,13 @@ public static Tensor RoPEEx(
         [RegisterOpStorageType("float2half", typeof(GgmlStorage))]
         public Tensor Float2Half(Tensor result, Tensor src)
         {
-            throw new NotSupportedException("The GGML Metal backend currently supports Float32 tensors only. Disable AMP to use this backend.");
+            throw new NotSupportedException("The GGML backend currently supports Float32 tensors only. Disable AMP to use this backend."); // unconditional: the float-to-half op registered for GgmlStorage only throws
         }
 
         [RegisterOpStorageType("half2float", typeof(GgmlStorage))]
         public Tensor Half2Float(Tensor result, Tensor src)
         {
-            throw new NotSupportedException("The GGML Metal backend currently supports Float32 tensors only. Disable AMP to use this backend.");
+            throw new NotSupportedException("The GGML backend currently supports Float32 tensors only. Disable AMP to use this backend."); // unconditional: the half-to-float op registered for GgmlStorage only throws
         }
 
         private static Tensor ExecuteUnary(Tensor result, Tensor src, GgmlUnaryOp op, string opName)

diff --git a/TensorSharp.GGML/GgmlNative.cs b/TensorSharp.GGML/GgmlNative.cs
@@ -20,6 +20,7 @@ public enum GgmlBackendType
 {
     Metal = 1,
     Cpu = 2,
+    Cuda = 3, // keep in sync with BACKEND_TYPE_CUDA in the native bridge (ggml_ops.cpp)
 }
 
     [StructLayout(LayoutKind.Sequential)]
@@ -532,7 +533,12 @@ public static void EnsureAvailable(GgmlBackendType backendType)
             {
                 if (TSGgml_IsBackendAvailable((int)backendType) == 0)
                 {
-                    string backendName = backendType == GgmlBackendType.Metal ? "ggml-metal" : "ggml-cpu";
+                    string backendName = backendType switch // human-readable backend name used in the exception message below
+                    {
+                        GgmlBackendType.Metal => "ggml-metal",
+                        GgmlBackendType.Cuda => "ggml-cuda",
+                        _ => "ggml-cpu", // any other value is reported as ggml-cpu
+                    };
                     throw new InvalidOperationException($"Failed to initialize {backendName}. {GetLastErrorMessage("Build the native GGML bridge and ensure the requested GGML backend is available.")}");
                 }
             }

diff --git a/TensorSharp.GGML/GgmlStorage.cs b/TensorSharp.GGML/GgmlStorage.cs
@@ -179,7 +179,7 @@ public override void SetElementsAsFloat(long index, float[] value)
 
         public override void SetElementsAsHalf(long index, half[] value)
         {
-            throw new NotSupportedException("The GGML Metal backend currently supports Float32 tensors only. Disable AMP to use this backend.");
+            throw new NotSupportedException("The GGML backend currently supports Float32 tensors only. Disable AMP to use this backend."); // unconditional: this override only throws, so half-precision writes are unsupported here
         }
 
         public override void CopyToStorage(long storageIndex, IntPtr src, long byteCount)