Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -100,16 +100,15 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
ninja-build \
curl \
ca-certificates \
wget \
# Core libs for Python packages
libssl3 \
libssl-dev \
libffi8 \
libcurl4 \
libcurl4-openssl-dev \
libopenblas0 \
# GPU acceleration support
libvulkan1 \
vulkan-tools \
mesa-vulkan-drivers \
# GPU acceleration support (CUDA only)
ocl-icd-libopencl1 \
libnuma1 \
pciutils \
Expand Down
2 changes: 0 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@ It is designed for **power users running models on a single machine or small ser
- **CPU-only** inference (OpenBLAS)
- **NVIDIA CUDA GPUs** (via the NVIDIA Container Toolkit)

There is **no built-in support for Vulkan/ROCm/Metal backends** and **no Smart Auto feature** – configuration is explicit and predictable.

### Key capabilities

- **HuggingFace search (GGUF + safetensors)**: Search the Hub, inspect metadata, and plan downloads by quantization or safetensors bundle.
Expand Down
113 changes: 8 additions & 105 deletions backend/gpu_detector.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
"""
GPU Detection and Capability Discovery

This module provides comprehensive GPU detection across multiple vendors:
This module provides GPU detection for:
- NVIDIA GPUs (CUDA support)
- AMD GPUs (ROCm and Vulkan support)
- GPU acceleration backends (CUDA, Vulkan, Metal, OpenBLAS)
- CPU acceleration (OpenBLAS)
"""

import subprocess
Expand Down Expand Up @@ -61,20 +60,6 @@ def _cpu_only_response() -> Dict:
# ============================================================================


def _check_vulkan_drivers() -> bool:
"""Check if Vulkan drivers are installed"""
try:
result = subprocess.run(
["vulkaninfo", "--summary"], capture_output=True, text=True, timeout=5
)
return result.returncode == 0
except (subprocess.CalledProcessError, FileNotFoundError):
# Check if vulkan libraries exist
return os.path.exists("/usr/share/vulkan") or os.path.exists(
"/usr/lib/x86_64-linux-gnu/libvulkan.so"
)


def _check_openblas() -> bool:
"""Check if OpenBLAS is available"""
try:
Expand All @@ -92,20 +77,6 @@ def _check_openblas() -> bool:
) or os.path.exists("/usr/local/lib/libopenblas.so")


def _check_metal() -> bool:
"""Check if Metal is available (macOS only)"""
try:
if os.uname().sysname == "Darwin":
return os.path.exists(
"/System/Library/Extensions/GeForceMTLDriver.bundle"
) or os.path.exists(
"/Library/Apple/System/Library/CoreServices/GPUWrangler.app"
)
except:
pass
return False


def _resolve_nvidia_smi() -> Optional[str]:
"""Resolve the nvidia-smi binary path across environments."""
candidates = []
Expand Down Expand Up @@ -549,16 +520,6 @@ async def detect_build_capabilities() -> Dict[str, Dict[str, any]]:
"recommended": False,
"reason": _gpu_disable_reason or "GPU detection disabled",
},
"vulkan": {
"available": False,
"recommended": False,
"reason": "CPU-only mode",
},
"metal": {
"available": False,
"recommended": False,
"reason": "CPU-only mode",
},
"openblas": {
"available": openblas_available,
"recommended": openblas_available,
Expand All @@ -578,63 +539,21 @@ async def detect_build_capabilities() -> Dict[str, Dict[str, any]]:
if gpu_info.get("device_count", 0) > 0:
cuda_available = vendor == "nvidia"

# Check other backends
metal_available = _check_metal()
openblas_available = _check_openblas()

# Vulkan is only available if:
# 1. An AMD GPU is detected AND Vulkan drivers are installed, OR
# 2. A GPU device directory exists (indicating GPU passthrough in a container)
vulkan_available = False
if vendor == "amd":
# For AMD GPUs, check if Vulkan drivers are available
vulkan_available = _check_vulkan_drivers()
elif vendor is None:
# No specific GPU detected, but check if we have GPU access in a container
if os.path.exists("/dev/dri"):
vulkan_available = _check_vulkan_drivers()

# Build capabilities response
capabilities = {
"cuda": {
"available": cuda_available,
"recommended": cuda_available
and not vulkan_available
and not openblas_available,
"recommended": cuda_available and not openblas_available,
"reason": (
f"{gpu_info.get('device_count', 0)} NVIDIA GPU(s) detected"
if cuda_available
else "No NVIDIA GPU detected"
),
},
"vulkan": {
"available": vulkan_available,
"recommended": (vulkan_available and not cuda_available)
or (gpu_info.get("vendor") == "amd"),
"reason": (
"Vulkan drivers available"
if vulkan_available
else (
"Available for AMD GPUs in containers"
if gpu_info.get("vendor") == "amd"
else "Vulkan drivers not detected"
)
),
},
"metal": {
"available": metal_available,
"recommended": metal_available
and not cuda_available
and not vulkan_available,
"reason": (
"Metal available (macOS)" if metal_available else "Not running on macOS"
),
},
"openblas": {
"available": openblas_available,
"recommended": openblas_available
and not cuda_available
and not vulkan_available,
"recommended": openblas_available and not cuda_available,
"reason": (
"OpenBLAS library available"
if openblas_available
Expand All @@ -643,22 +562,11 @@ async def detect_build_capabilities() -> Dict[str, Dict[str, any]]:
},
}

# Special handling for AMD GPUs
if gpu_info.get("vendor") == "amd":
capabilities["cuda"]["reason"] = "AMD GPU detected - use Vulkan instead"
capabilities["cuda"]["available"] = False # Explicitly disable CUDA for AMD
capabilities["vulkan"]["recommended"] = True
capabilities["vulkan"][
"reason"
] = f"AMD GPU detected ({gpu_info.get('device_count', 0)} device(s)) - Vulkan recommended"

# If no GPU available, recommend OpenBLAS for CPU acceleration
if (
not cuda_available
and not vulkan_available
and not metal_available
and openblas_available
):
capabilities["cuda"]["reason"] = "AMD GPU detected - CUDA not supported"
capabilities["cuda"]["available"] = False

if not cuda_available and openblas_available:
capabilities["openblas"]["recommended"] = True

return capabilities
Expand All @@ -669,11 +577,6 @@ async def detect_build_capabilities() -> Dict[str, Dict[str, any]]:
# ============================================================================


async def check_vulkan() -> bool:
    """Deprecated async wrapper around the Vulkan driver probe.

    Kept only for backward compatibility with older callers; delegates
    directly to the synchronous detection helper.
    """
    drivers_present = _check_vulkan_drivers()
    return drivers_present


async def detect_gpu_capabilities() -> Dict[str, bool]:
"""Legacy function for GPU capabilities (for backward compatibility)"""
try:
Expand Down
14 changes: 8 additions & 6 deletions backend/llama_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@ class BuildConfig:

# GPU backends
enable_cuda: bool = False
enable_vulkan: bool = False
enable_metal: bool = False
enable_openblas: bool = False
enable_flash_attention: bool = False # Enables -DGGML_CUDA_FA_ALL_QUANTS=ON

Expand Down Expand Up @@ -84,6 +82,12 @@ class LlamaManager:
"ik_llama.cpp": IK_LLAMA_CPP_REPO,
}

# Build options: llama.cpp vs ik_llama.cpp
# - Both use the same GGML_* / LLAMA_* CMake options (GGML_CUDA, GGML_NATIVE, LLAMA_BUILD_*, etc.).
# - ik_llama.cpp is a fork with IQK quantization and optimizations; IQK is built-in (no extra CMake flag).
# - ik_llama.cpp puts the server binary under examples/, so LLAMA_BUILD_EXAMPLES must be ON for
# the server to be built. We enforce build_examples=True when repository_source == "ik_llama.cpp".

def __init__(self):
# Use absolute path so clone/build work regardless of process cwd (e.g. --app-dir backend)
if os.path.exists("/app/data"):
Expand Down Expand Up @@ -492,8 +496,6 @@ def _extract_asset_features(self, asset_name: str) -> List[str]:

feature_map = {
"cuda": "CUDA",
"vulkan": "Vulkan",
"metal": "Metal",
"opencl": "OpenCL",
"hip": "HIP/ROCm",
"rocm": "HIP/ROCm",
Expand Down Expand Up @@ -1436,8 +1438,6 @@ def set_flag(flag: str, value: bool):
logger.info(
f"CUDA configuration: compiler={nvcc_path}, toolkit={validated_cuda_root}"
)
set_flag("GGML_VULKAN", build_config.enable_vulkan)
set_flag("GGML_METAL", build_config.enable_metal)
set_flag("GGML_BLAS", build_config.enable_openblas)
if build_config.enable_openblas:
cmake_args.append("-DGGML_BLAS_VENDOR=OpenBLAS")
Expand All @@ -1461,6 +1461,8 @@ def set_flag(flag: str, value: bool):
set_flag("LLAMA_BUILD_EXAMPLES", build_config.build_examples)
set_flag("LLAMA_BUILD_SERVER", build_config.build_server)
set_flag("LLAMA_TOOLS_INSTALL", build_config.install_tools)
# HTTPS support (required for model URLs, etc.)
set_flag("LLAMA_OPENSSL", True)

# Advanced GGML options
set_flag("GGML_BACKEND_DL", build_config.enable_backend_dl)
Expand Down
Loading
Loading