diff --git a/InferenceConsole/Program.cs b/InferenceConsole/Program.cs index 70561c0..8056525 100644 --- a/InferenceConsole/Program.cs +++ b/InferenceConsole/Program.cs @@ -93,7 +93,7 @@ static void Main(string[] args) Console.Error.WriteLine($"Model file not found: {modelPath ?? "(none)"}"); Console.Error.WriteLine("Usage: InferenceConsole --model [--input ] " + "[--input-jsonl ] [--image ] [--output ] " + - "[--max-tokens N] [--test] [--backend cpu|ggml_cpu|ggml_metal]"); + "[--max-tokens N] [--test] [--backend cpu|ggml_cpu|ggml_metal|ggml_cuda]"); return; } @@ -102,7 +102,8 @@ static void Main(string[] args) "cpu" => BackendType.Cpu, "ggml_cpu" => BackendType.GgmlCpu, "ggml_metal" => BackendType.GgmlMetal, - _ => throw new ArgumentException($"Unknown backend '{backendStr}'. Use: cpu, ggml_cpu, ggml_metal"), + "ggml_cuda" => BackendType.GgmlCuda, + _ => throw new ArgumentException($"Unknown backend '{backendStr}'. Use: cpu, ggml_cpu, ggml_metal, ggml_cuda"), }; using var model = ModelBase.Create(modelPath, backend);
diff --git a/InferenceEngine/ModelBase.cs b/InferenceEngine/ModelBase.cs index 4e96d7e..20e23c0 100644 --- a/InferenceEngine/ModelBase.cs +++ b/InferenceEngine/ModelBase.cs @@ -24,6 +24,7 @@ public enum BackendType Cpu, GgmlCpu, GgmlMetal, + GgmlCuda, } public class ModelConfig @@ -124,6 +125,10 @@ protected ModelBase(string ggufPath, BackendType backend) _ggmlContext = new GgmlContext(new[] { 0 }, GgmlBackendType.Metal); _allocator = new GgmlAllocator(_ggmlContext, 0); break; + case BackendType.GgmlCuda: + _ggmlContext = new GgmlContext(new[] { 0 }, GgmlBackendType.Cuda); + _allocator = new GgmlAllocator(_ggmlContext, 0); + break; case BackendType.Cpu: _allocator = new CpuAllocator(BlasEnum.DotNet); break; @@ -135,7 +140,7 @@ protected ModelBase(string ggufPath, BackendType backend) _gguf = new GgufFile(ggufPath); } - protected bool IsGgmlBackend => _backend == BackendType.GgmlCpu || _backend == BackendType.GgmlMetal; + protected bool IsGgmlBackend => _backend == BackendType.GgmlCpu || _backend == BackendType.GgmlMetal || _backend == BackendType.GgmlCuda; protected void ParseBaseConfig() {
diff --git a/InferenceWeb/ModelService.cs b/InferenceWeb/ModelService.cs index 0a95301..a2169f9 100644 --- a/InferenceWeb/ModelService.cs +++ b/InferenceWeb/ModelService.cs @@ -48,6 +48,7 @@ public void LoadModel(string modelPath, string mmProjPath, string backendStr) _backend = backendStr switch { "ggml_metal" => BackendType.GgmlMetal, + "ggml_cuda" => BackendType.GgmlCuda, "ggml_cpu" => BackendType.GgmlCpu, "cpu" => BackendType.Cpu, _ => BackendType.GgmlCpu
diff --git a/InferenceWeb/wwwroot/index.html b/InferenceWeb/wwwroot/index.html index 5686f59..16c1015 100644 --- a/InferenceWeb/wwwroot/index.html +++ b/InferenceWeb/wwwroot/index.html @@ -473,6 +473,7 @@

Load Model

diff --git a/TensorSharp.GGML.Native/CMakeLists.txt b/TensorSharp.GGML.Native/CMakeLists.txt index 04d51db..9e1d05b 100644 --- a/TensorSharp.GGML.Native/CMakeLists.txt +++ b/TensorSharp.GGML.Native/CMakeLists.txt @@ -10,6 +10,7 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}") set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}") find_package(Threads REQUIRED) +option(TENSORSHARP_ENABLE_CUDA "Build TensorSharp GGML bridge with CUDA backend support." OFF) set(GGML_VERSION "0.0.0" CACHE STRING "" FORCE) set(GGML_VERSION_MAJOR 0 CACHE STRING "" FORCE) @@ -22,7 +23,7 @@ set(GGML_CPU ON CACHE BOOL "" FORCE) set(GGML_METAL ON CACHE BOOL "" FORCE) set(GGML_METAL_NDEBUG ON CACHE BOOL "" FORCE) set(GGML_METAL_EMBED_LIBRARY ON CACHE BOOL "" FORCE) -set(GGML_CUDA OFF CACHE BOOL "" FORCE) +set(GGML_CUDA ${TENSORSHARP_ENABLE_CUDA} CACHE BOOL "" FORCE) set(GGML_HIP OFF CACHE BOOL "" FORCE) set(GGML_VULKAN OFF CACHE BOOL "" FORCE) set(GGML_OPENCL OFF CACHE BOOL "" FORCE)
diff --git a/TensorSharp.GGML.Native/ggml_ops.cpp b/TensorSharp.GGML.Native/ggml_ops.cpp index 13fb21a..0a17642 100644 --- a/TensorSharp.GGML.Native/ggml_ops.cpp +++ b/TensorSharp.GGML.Native/ggml_ops.cpp @@ -23,6 +23,9 @@ #include "ggml-backend.h" #include "ggml-metal.h" #include "ggml-cpu.h" +#if defined(GGML_USE_CUDA) +#include "ggml-cuda.h" +#endif #include "ggml-quants.h" // GGML context memory pool: reuse mem_buffers to avoid per-op allocation overhead @@ -265,6 +268,7 @@ namespace constexpr int BACKEND_TYPE_METAL = 1; constexpr int BACKEND_TYPE_CPU = 2; + constexpr int BACKEND_TYPE_CUDA = 3; void initialize_backend() { @@ -288,6 +292,20 @@ namespace return; } } + else if (g_backend_type == BACKEND_TYPE_CUDA) + { +#if defined(GGML_USE_CUDA) + g_backend = ggml_backend_cuda_init(0); + if (g_backend == nullptr) + { + set_last_error("ggml-cuda backend initialization failed."); + return; + } +#else + set_last_error("ggml-cuda backend requested, but this native bridge was built without CUDA support."); + return; +#endif + } else { set_last_error("Unknown GGML backend type requested."); @@ -299,7 +317,7 @@ namespace bool ensure_backend(int backend_type) { - if (backend_type != BACKEND_TYPE_METAL && backend_type != BACKEND_TYPE_CPU) + if (backend_type != BACKEND_TYPE_METAL && backend_type != BACKEND_TYPE_CPU && backend_type != BACKEND_TYPE_CUDA) { set_last_error("Invalid GGML backend type."); return false;
diff --git a/TensorSharp.GGML/GgmlAllocator.cs b/TensorSharp.GGML/GgmlAllocator.cs index 5169dd7..78c74cc 100644 --- a/TensorSharp.GGML/GgmlAllocator.cs +++ b/TensorSharp.GGML/GgmlAllocator.cs @@ -23,7 +23,12 @@ public GgmlAllocator(GgmlContext context, int deviceId) this.deviceId = deviceId; } - public BlasEnum BlasEnum => context.BackendType == GgmlBackendType.Metal ? BlasEnum.GGML_METAL : BlasEnum.GGML_CPU; + public BlasEnum BlasEnum => context.BackendType switch + { + GgmlBackendType.Metal => BlasEnum.GGML_METAL, + GgmlBackendType.Cuda => BlasEnum.CUDA, + _ => BlasEnum.GGML_CPU, + }; public int DeviceId => deviceId; @@ -33,7 +38,7 @@ public Storage Allocate(DType elementType, long elementCount) { if (elementType == DType.Float16) { - throw new NotSupportedException("The GGML Metal backend currently supports Float32 tensors only. Disable AMP to use this backend."); + throw new NotSupportedException("The GGML backend currently supports Float32 tensors only. Disable AMP to use this backend."); } return new GgmlStorage(this, context, elementType, elementCount);
diff --git a/TensorSharp.GGML/GgmlBasicOps.cs b/TensorSharp.GGML/GgmlBasicOps.cs index 7a944af..16a4fdd 100644 --- a/TensorSharp.GGML/GgmlBasicOps.cs +++ b/TensorSharp.GGML/GgmlBasicOps.cs @@ -1171,13 +1171,13 @@ public static Tensor RoPEEx( [RegisterOpStorageType("float2half", typeof(GgmlStorage))] public Tensor Float2Half(Tensor result, Tensor src) { - throw new NotSupportedException("The GGML Metal backend currently supports Float32 tensors only. Disable AMP to use this backend."); + throw new NotSupportedException("The GGML backend currently supports Float32 tensors only. Disable AMP to use this backend."); } [RegisterOpStorageType("half2float", typeof(GgmlStorage))] public Tensor Half2Float(Tensor result, Tensor src) { - throw new NotSupportedException("The GGML Metal backend currently supports Float32 tensors only. Disable AMP to use this backend."); + throw new NotSupportedException("The GGML backend currently supports Float32 tensors only. Disable AMP to use this backend."); } private static Tensor ExecuteUnary(Tensor result, Tensor src, GgmlUnaryOp op, string opName)
diff --git a/TensorSharp.GGML/GgmlNative.cs b/TensorSharp.GGML/GgmlNative.cs index c53b76c..bd6bcd5 100644 --- a/TensorSharp.GGML/GgmlNative.cs +++ b/TensorSharp.GGML/GgmlNative.cs @@ -20,6 +20,7 @@ public enum GgmlBackendType { Metal = 1, Cpu = 2, + Cuda = 3, } [StructLayout(LayoutKind.Sequential)] @@ -532,7 +533,12 @@ public static void EnsureAvailable(GgmlBackendType backendType) { if (TSGgml_IsBackendAvailable((int)backendType) == 0) { - string backendName = backendType == GgmlBackendType.Metal ? "ggml-metal" : "ggml-cpu"; + string backendName = backendType switch + { + GgmlBackendType.Metal => "ggml-metal", + GgmlBackendType.Cuda => "ggml-cuda", + _ => "ggml-cpu", + }; throw new InvalidOperationException($"Failed to initialize {backendName}. {GetLastErrorMessage("Build the native GGML bridge and ensure the requested GGML backend is available.")}"); } }
diff --git a/TensorSharp.GGML/GgmlStorage.cs b/TensorSharp.GGML/GgmlStorage.cs index bb63cd2..f1ec10e 100644 --- a/TensorSharp.GGML/GgmlStorage.cs +++ b/TensorSharp.GGML/GgmlStorage.cs @@ -179,7 +179,7 @@ public override void SetElementsAsFloat(long index, float[] value) public override void SetElementsAsHalf(long index, half[] value) { - throw new NotSupportedException("The GGML Metal backend currently supports Float32 tensors only. Disable AMP to use this backend."); + throw new NotSupportedException("The GGML backend currently supports Float32 tensors only. Disable AMP to use this backend."); } public override void CopyToStorage(long storageIndex, IntPtr src, long byteCount)