Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -100,16 +100,15 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
ninja-build \
curl \
ca-certificates \
wget \
# Core libs for Python packages
libssl3 \
libssl-dev \
libffi8 \
libcurl4 \
libcurl4-openssl-dev \
libopenblas0 \
# GPU acceleration support
libvulkan1 \
vulkan-tools \
mesa-vulkan-drivers \
# GPU acceleration support (CUDA only)
ocl-icd-libopencl1 \
libnuma1 \
pciutils \
Expand Down
2 changes: 0 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@ It is designed for **power users running models on a single machine or small ser
- **CPU-only** inference (OpenBLAS)
- **NVIDIA CUDA GPUs** (via the NVIDIA Container Toolkit)

There is **no built-in support for Vulkan/ROCm/Metal backends** and **no Smart Auto feature** – configuration is explicit and predictable.

### Key capabilities

- **HuggingFace search (GGUF + safetensors)**: Search the Hub, inspect metadata, and plan downloads by quantization or safetensors bundle.
Expand Down
113 changes: 8 additions & 105 deletions backend/gpu_detector.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
"""
GPU Detection and Capability Discovery

This module provides comprehensive GPU detection across multiple vendors:
This module provides GPU detection for:
- NVIDIA GPUs (CUDA support)
- AMD GPUs (ROCm and Vulkan support)
- GPU acceleration backends (CUDA, Vulkan, Metal, OpenBLAS)
- CPU acceleration (OpenBLAS)
"""

import subprocess
Expand Down Expand Up @@ -61,20 +60,6 @@ def _cpu_only_response() -> Dict:
# ============================================================================


def _check_vulkan_drivers() -> bool:
"""Check if Vulkan drivers are installed"""
try:
result = subprocess.run(
["vulkaninfo", "--summary"], capture_output=True, text=True, timeout=5
)
return result.returncode == 0
except (subprocess.CalledProcessError, FileNotFoundError):
# Check if vulkan libraries exist
return os.path.exists("/usr/share/vulkan") or os.path.exists(
"/usr/lib/x86_64-linux-gnu/libvulkan.so"
)


def _check_openblas() -> bool:
"""Check if OpenBLAS is available"""
try:
Expand All @@ -92,20 +77,6 @@ def _check_openblas() -> bool:
) or os.path.exists("/usr/local/lib/libopenblas.so")


def _check_metal() -> bool:
"""Check if Metal is available (macOS only)"""
try:
if os.uname().sysname == "Darwin":
return os.path.exists(
"/System/Library/Extensions/GeForceMTLDriver.bundle"
) or os.path.exists(
"/Library/Apple/System/Library/CoreServices/GPUWrangler.app"
)
except:
pass
return False


def _resolve_nvidia_smi() -> Optional[str]:
"""Resolve the nvidia-smi binary path across environments."""
candidates = []
Expand Down Expand Up @@ -549,16 +520,6 @@ async def detect_build_capabilities() -> Dict[str, Dict[str, any]]:
"recommended": False,
"reason": _gpu_disable_reason or "GPU detection disabled",
},
"vulkan": {
"available": False,
"recommended": False,
"reason": "CPU-only mode",
},
"metal": {
"available": False,
"recommended": False,
"reason": "CPU-only mode",
},
"openblas": {
"available": openblas_available,
"recommended": openblas_available,
Expand All @@ -578,63 +539,21 @@ async def detect_build_capabilities() -> Dict[str, Dict[str, any]]:
if gpu_info.get("device_count", 0) > 0:
cuda_available = vendor == "nvidia"

# Check other backends
metal_available = _check_metal()
openblas_available = _check_openblas()

# Vulkan is only available if:
# 1. An AMD GPU is detected AND Vulkan drivers are installed, OR
# 2. A GPU device directory exists (indicating GPU passthrough in a container)
vulkan_available = False
if vendor == "amd":
# For AMD GPUs, check if Vulkan drivers are available
vulkan_available = _check_vulkan_drivers()
elif vendor is None:
# No specific GPU detected, but check if we have GPU access in a container
if os.path.exists("/dev/dri"):
vulkan_available = _check_vulkan_drivers()

# Build capabilities response
capabilities = {
"cuda": {
"available": cuda_available,
"recommended": cuda_available
and not vulkan_available
and not openblas_available,
"recommended": cuda_available and not openblas_available,
"reason": (
f"{gpu_info.get('device_count', 0)} NVIDIA GPU(s) detected"
if cuda_available
else "No NVIDIA GPU detected"
),
},
"vulkan": {
"available": vulkan_available,
"recommended": (vulkan_available and not cuda_available)
or (gpu_info.get("vendor") == "amd"),
"reason": (
"Vulkan drivers available"
if vulkan_available
else (
"Available for AMD GPUs in containers"
if gpu_info.get("vendor") == "amd"
else "Vulkan drivers not detected"
)
),
},
"metal": {
"available": metal_available,
"recommended": metal_available
and not cuda_available
and not vulkan_available,
"reason": (
"Metal available (macOS)" if metal_available else "Not running on macOS"
),
},
"openblas": {
"available": openblas_available,
"recommended": openblas_available
and not cuda_available
and not vulkan_available,
"recommended": openblas_available and not cuda_available,
"reason": (
"OpenBLAS library available"
if openblas_available
Expand All @@ -643,22 +562,11 @@ async def detect_build_capabilities() -> Dict[str, Dict[str, any]]:
},
}

# Special handling for AMD GPUs
if gpu_info.get("vendor") == "amd":
capabilities["cuda"]["reason"] = "AMD GPU detected - use Vulkan instead"
capabilities["cuda"]["available"] = False # Explicitly disable CUDA for AMD
capabilities["vulkan"]["recommended"] = True
capabilities["vulkan"][
"reason"
] = f"AMD GPU detected ({gpu_info.get('device_count', 0)} device(s)) - Vulkan recommended"

# If no GPU available, recommend OpenBLAS for CPU acceleration
if (
not cuda_available
and not vulkan_available
and not metal_available
and openblas_available
):
capabilities["cuda"]["reason"] = "AMD GPU detected - CUDA not supported"
capabilities["cuda"]["available"] = False

if not cuda_available and openblas_available:
capabilities["openblas"]["recommended"] = True

return capabilities
Expand All @@ -669,11 +577,6 @@ async def detect_build_capabilities() -> Dict[str, Dict[str, any]]:
# ============================================================================


async def check_vulkan() -> bool:
    """Deprecated async wrapper around the Vulkan driver probe.

    Kept only for backward compatibility with older callers; delegates
    directly to the synchronous detection helper.
    """
    drivers_present = _check_vulkan_drivers()
    return drivers_present


async def detect_gpu_capabilities() -> Dict[str, bool]:
"""Legacy function for GPU capabilities (for backward compatibility)"""
try:
Expand Down
14 changes: 8 additions & 6 deletions backend/llama_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@ class BuildConfig:

# GPU backends
enable_cuda: bool = False
enable_vulkan: bool = False
enable_metal: bool = False
enable_openblas: bool = False
enable_flash_attention: bool = False # Enables -DGGML_CUDA_FA_ALL_QUANTS=ON

Expand Down Expand Up @@ -84,6 +82,12 @@ class LlamaManager:
"ik_llama.cpp": IK_LLAMA_CPP_REPO,
}

# Build options: llama.cpp vs ik_llama.cpp
# - Both use the same GGML_* / LLAMA_* CMake options (GGML_CUDA, GGML_NATIVE, LLAMA_BUILD_*, etc.).
# - ik_llama.cpp is a fork with IQK quantization and optimizations; IQK is built-in (no extra CMake flag).
# - ik_llama.cpp puts the server binary under examples/, so LLAMA_BUILD_EXAMPLES must be ON for
# the server to be built. We enforce build_examples=True when repository_source == "ik_llama.cpp".

def __init__(self):
# Use absolute path so clone/build work regardless of process cwd (e.g. --app-dir backend)
if os.path.exists("/app/data"):
Expand Down Expand Up @@ -492,8 +496,6 @@ def _extract_asset_features(self, asset_name: str) -> List[str]:

feature_map = {
"cuda": "CUDA",
"vulkan": "Vulkan",
"metal": "Metal",
"opencl": "OpenCL",
"hip": "HIP/ROCm",
"rocm": "HIP/ROCm",
Expand Down Expand Up @@ -1436,8 +1438,6 @@ def set_flag(flag: str, value: bool):
logger.info(
f"CUDA configuration: compiler={nvcc_path}, toolkit={validated_cuda_root}"
)
set_flag("GGML_VULKAN", build_config.enable_vulkan)
set_flag("GGML_METAL", build_config.enable_metal)
set_flag("GGML_BLAS", build_config.enable_openblas)
if build_config.enable_openblas:
cmake_args.append("-DGGML_BLAS_VENDOR=OpenBLAS")
Expand All @@ -1461,6 +1461,8 @@ def set_flag(flag: str, value: bool):
set_flag("LLAMA_BUILD_EXAMPLES", build_config.build_examples)
set_flag("LLAMA_BUILD_SERVER", build_config.build_server)
set_flag("LLAMA_TOOLS_INSTALL", build_config.install_tools)
# HTTPS support (required for model URLs, etc.)
set_flag("LLAMA_OPENSSL", True)

# Advanced GGML options
set_flag("GGML_BACKEND_DL", build_config.enable_backend_dl)
Expand Down
Loading
Loading