Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions InferenceConsole/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ static void Main(string[] args)
Console.Error.WriteLine($"Model file not found: {modelPath ?? "(none)"}");
Console.Error.WriteLine("Usage: InferenceConsole --model <path.gguf> [--input <input.txt>] " +
"[--input-jsonl <requests.jsonl>] [--image <image.png>] [--output <output.txt>] " +
"[--max-tokens N] [--test] [--backend cpu|ggml_cpu|ggml_metal]");
"[--max-tokens N] [--test] [--backend cpu|ggml_cpu|ggml_metal|ggml_cuda]");
return;
}

Expand All @@ -102,7 +102,8 @@ static void Main(string[] args)
"cpu" => BackendType.Cpu,
"ggml_cpu" => BackendType.GgmlCpu,
"ggml_metal" => BackendType.GgmlMetal,
_ => throw new ArgumentException($"Unknown backend '{backendStr}'. Use: cpu, ggml_cpu, ggml_metal"),
"ggml_cuda" => BackendType.GgmlCuda,
_ => throw new ArgumentException($"Unknown backend '{backendStr}'. Use: cpu, ggml_cpu, ggml_metal, ggml_cuda"),
};

using var model = ModelBase.Create(modelPath, backend);
Expand Down
7 changes: 6 additions & 1 deletion InferenceEngine/ModelBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ public enum BackendType
Cpu,
GgmlCpu,
GgmlMetal,
GgmlCuda,
}

public class ModelConfig
Expand Down Expand Up @@ -124,6 +125,10 @@ protected ModelBase(string ggufPath, BackendType backend)
_ggmlContext = new GgmlContext(new[] { 0 }, GgmlBackendType.Metal);
_allocator = new GgmlAllocator(_ggmlContext, 0);
break;
case BackendType.GgmlCuda:
_ggmlContext = new GgmlContext(new[] { 0 }, GgmlBackendType.Cuda);
_allocator = new GgmlAllocator(_ggmlContext, 0);
break;
case BackendType.Cpu:
_allocator = new CpuAllocator(BlasEnum.DotNet);
break;
Expand All @@ -135,7 +140,7 @@ protected ModelBase(string ggufPath, BackendType backend)
_gguf = new GgufFile(ggufPath);
}

protected bool IsGgmlBackend => _backend == BackendType.GgmlCpu || _backend == BackendType.GgmlMetal;
protected bool IsGgmlBackend => _backend == BackendType.GgmlCpu || _backend == BackendType.GgmlMetal || _backend == BackendType.GgmlCuda;

protected void ParseBaseConfig()
{
Expand Down
1 change: 1 addition & 0 deletions InferenceWeb/ModelService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ public void LoadModel(string modelPath, string mmProjPath, string backendStr)
_backend = backendStr switch
{
"ggml_metal" => BackendType.GgmlMetal,
"ggml_cuda" => BackendType.GgmlCuda,
"ggml_cpu" => BackendType.GgmlCpu,
"cpu" => BackendType.Cpu,
_ => BackendType.GgmlCpu
Expand Down
1 change: 1 addition & 0 deletions InferenceWeb/wwwroot/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -473,6 +473,7 @@ <h2>Load Model</h2>
<label>Backend</label>
<select id="backend-select">
<option value="ggml_metal">GGML Metal (GPU)</option>
<option value="ggml_cuda">GGML CUDA (GPU)</option>
<option value="ggml_cpu">GGML CPU</option>
<option value="cpu">CPU</option>
</select>
Expand Down
3 changes: 2 additions & 1 deletion TensorSharp.GGML.Native/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")

find_package(Threads REQUIRED)
option(TENSORSHARP_ENABLE_CUDA "Build TensorSharp GGML bridge with CUDA backend support." OFF)

set(GGML_VERSION "0.0.0" CACHE STRING "" FORCE)
set(GGML_VERSION_MAJOR 0 CACHE STRING "" FORCE)
Expand All @@ -22,7 +23,7 @@ set(GGML_CPU ON CACHE BOOL "" FORCE)
set(GGML_METAL ON CACHE BOOL "" FORCE)
set(GGML_METAL_NDEBUG ON CACHE BOOL "" FORCE)
set(GGML_METAL_EMBED_LIBRARY ON CACHE BOOL "" FORCE)
set(GGML_CUDA OFF CACHE BOOL "" FORCE)
set(GGML_CUDA ${TENSORSHARP_ENABLE_CUDA} CACHE BOOL "" FORCE)
set(GGML_HIP OFF CACHE BOOL "" FORCE)
set(GGML_VULKAN OFF CACHE BOOL "" FORCE)
set(GGML_OPENCL OFF CACHE BOOL "" FORCE)
Expand Down
20 changes: 19 additions & 1 deletion TensorSharp.GGML.Native/ggml_ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@
#include "ggml-backend.h"
#include "ggml-metal.h"
#include "ggml-cpu.h"
#if defined(GGML_USE_CUDA)
#include "ggml-cuda.h"
#endif
#include "ggml-quants.h"

// GGML context memory pool: reuse mem_buffers to avoid per-op allocation overhead
Expand Down Expand Up @@ -265,6 +268,7 @@ namespace

constexpr int BACKEND_TYPE_METAL = 1;
constexpr int BACKEND_TYPE_CPU = 2;
constexpr int BACKEND_TYPE_CUDA = 3;

void initialize_backend()
{
Expand All @@ -288,6 +292,20 @@ namespace
return;
}
}
else if (g_backend_type == BACKEND_TYPE_CUDA)
{
#if defined(GGML_USE_CUDA)
g_backend = ggml_backend_cuda_init(0);
if (g_backend == nullptr)
{
set_last_error("ggml-cuda backend initialization failed.");
return;
}
#else
set_last_error("ggml-cuda backend requested, but this native bridge was built without CUDA support.");
return;
Comment on lines +305 to +306
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[P1] Reset backend state when CUDA support is unavailable

When ggml_cuda is requested on a build compiled without GGML_USE_CUDA, this branch returns after setting an error, but ensure_backend has already latched g_backend_type to CUDA and std::call_once prevents re-initialization. That leaves the process stuck so later attempts to load ggml_cpu/ggml_metal fail with “A different GGML backend was already initialized,” meaning a single failed CUDA attempt can permanently break backend selection until restart.

Useful? React with 👍 / 👎.

#endif
}
else
{
set_last_error("Unknown GGML backend type requested.");
Expand All @@ -299,7 +317,7 @@ namespace

bool ensure_backend(int backend_type)
{
if (backend_type != BACKEND_TYPE_METAL && backend_type != BACKEND_TYPE_CPU)
if (backend_type != BACKEND_TYPE_METAL && backend_type != BACKEND_TYPE_CPU && backend_type != BACKEND_TYPE_CUDA)
{
set_last_error("Invalid GGML backend type.");
return false;
Expand Down
9 changes: 7 additions & 2 deletions TensorSharp.GGML/GgmlAllocator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,12 @@ public GgmlAllocator(GgmlContext context, int deviceId)
this.deviceId = deviceId;
}

public BlasEnum BlasEnum => context.BackendType == GgmlBackendType.Metal ? BlasEnum.GGML_METAL : BlasEnum.GGML_CPU;
public BlasEnum BlasEnum => context.BackendType switch
{
GgmlBackendType.Metal => BlasEnum.GGML_METAL,
GgmlBackendType.Cuda => BlasEnum.CUDA,
_ => BlasEnum.GGML_CPU,
};

public int DeviceId => deviceId;

Expand All @@ -33,7 +38,7 @@ public Storage Allocate(DType elementType, long elementCount)
{
if (elementType == DType.Float16)
{
throw new NotSupportedException("The GGML Metal backend currently supports Float32 tensors only. Disable AMP to use this backend.");
throw new NotSupportedException("The GGML backend currently supports Float32 tensors only. Disable AMP to use this backend.");
}

return new GgmlStorage(this, context, elementType, elementCount);
Expand Down
4 changes: 2 additions & 2 deletions TensorSharp.GGML/GgmlBasicOps.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1171,13 +1171,13 @@ public static Tensor RoPEEx(
[RegisterOpStorageType("float2half", typeof(GgmlStorage))]
public Tensor Float2Half(Tensor result, Tensor src)
{
throw new NotSupportedException("The GGML Metal backend currently supports Float32 tensors only. Disable AMP to use this backend.");
throw new NotSupportedException("The GGML backend currently supports Float32 tensors only. Disable AMP to use this backend.");
}

[RegisterOpStorageType("half2float", typeof(GgmlStorage))]
public Tensor Half2Float(Tensor result, Tensor src)
{
throw new NotSupportedException("The GGML Metal backend currently supports Float32 tensors only. Disable AMP to use this backend.");
throw new NotSupportedException("The GGML backend currently supports Float32 tensors only. Disable AMP to use this backend.");
}

private static Tensor ExecuteUnary(Tensor result, Tensor src, GgmlUnaryOp op, string opName)
Expand Down
8 changes: 7 additions & 1 deletion TensorSharp.GGML/GgmlNative.cs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ public enum GgmlBackendType
{
Metal = 1,
Cpu = 2,
Cuda = 3,
}

[StructLayout(LayoutKind.Sequential)]
Expand Down Expand Up @@ -532,7 +533,12 @@ public static void EnsureAvailable(GgmlBackendType backendType)
{
if (TSGgml_IsBackendAvailable((int)backendType) == 0)
{
string backendName = backendType == GgmlBackendType.Metal ? "ggml-metal" : "ggml-cpu";
string backendName = backendType switch
{
GgmlBackendType.Metal => "ggml-metal",
GgmlBackendType.Cuda => "ggml-cuda",
_ => "ggml-cpu",
};
throw new InvalidOperationException($"Failed to initialize {backendName}. {GetLastErrorMessage("Build the native GGML bridge and ensure the requested GGML backend is available.")}");
}
}
Expand Down
2 changes: 1 addition & 1 deletion TensorSharp.GGML/GgmlStorage.cs
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ public override void SetElementsAsFloat(long index, float[] value)

public override void SetElementsAsHalf(long index, half[] value)
{
throw new NotSupportedException("The GGML Metal backend currently supports Float32 tensors only. Disable AMP to use this backend.");
throw new NotSupportedException("The GGML backend currently supports Float32 tensors only. Disable AMP to use this backend.");
}

public override void CopyToStorage(long storageIndex, IntPtr src, long byteCount)
Expand Down