diff --git a/Dockerfile b/Dockerfile index dcc4173..59d6bea 100644 --- a/Dockerfile +++ b/Dockerfile @@ -100,16 +100,15 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ ninja-build \ curl \ ca-certificates \ + wget \ # Core libs for Python packages libssl3 \ + libssl-dev \ libffi8 \ libcurl4 \ libcurl4-openssl-dev \ libopenblas0 \ - # GPU acceleration support - libvulkan1 \ - vulkan-tools \ - mesa-vulkan-drivers \ + # GPU acceleration support (CUDA only) ocl-icd-libopencl1 \ libnuma1 \ pciutils \ diff --git a/README.md b/README.md index cb1ecef..b8db9e3 100644 --- a/README.md +++ b/README.md @@ -7,8 +7,6 @@ It is designed for **power users running models on a single machine or small ser - **CPU-only** inference (OpenBLAS) - **NVIDIA CUDA GPUs** (via the NVIDIA Container Toolkit) -There is **no built-in support for Vulkan/ROCm/Metal backends** and **no Smart Auto feature** – configuration is explicit and predictable. - ### Key capabilities - **HuggingFace search (GGUF + safetensors)**: Search the Hub, inspect metadata, and plan downloads by quantization or safetensors bundle. 
diff --git a/backend/gpu_detector.py b/backend/gpu_detector.py index 9591feb..b1fa038 100644 --- a/backend/gpu_detector.py +++ b/backend/gpu_detector.py @@ -1,10 +1,9 @@ """ GPU Detection and Capability Discovery -This module provides comprehensive GPU detection across multiple vendors: +This module provides GPU detection for: - NVIDIA GPUs (CUDA support) -- AMD GPUs (ROCm and Vulkan support) -- GPU acceleration backends (CUDA, Vulkan, Metal, OpenBLAS) +- CPU acceleration (OpenBLAS) """ import subprocess @@ -61,20 +60,6 @@ def _cpu_only_response() -> Dict: # ============================================================================ -def _check_vulkan_drivers() -> bool: - """Check if Vulkan drivers are installed""" - try: - result = subprocess.run( - ["vulkaninfo", "--summary"], capture_output=True, text=True, timeout=5 - ) - return result.returncode == 0 - except (subprocess.CalledProcessError, FileNotFoundError): - # Check if vulkan libraries exist - return os.path.exists("/usr/share/vulkan") or os.path.exists( - "/usr/lib/x86_64-linux-gnu/libvulkan.so" - ) - - def _check_openblas() -> bool: """Check if OpenBLAS is available""" try: @@ -92,20 +77,6 @@ def _check_openblas() -> bool: ) or os.path.exists("/usr/local/lib/libopenblas.so") -def _check_metal() -> bool: - """Check if Metal is available (macOS only)""" - try: - if os.uname().sysname == "Darwin": - return os.path.exists( - "/System/Library/Extensions/GeForceMTLDriver.bundle" - ) or os.path.exists( - "/Library/Apple/System/Library/CoreServices/GPUWrangler.app" - ) - except: - pass - return False - - def _resolve_nvidia_smi() -> Optional[str]: """Resolve the nvidia-smi binary path across environments.""" candidates = [] @@ -549,16 +520,6 @@ async def detect_build_capabilities() -> Dict[str, Dict[str, any]]: "recommended": False, "reason": _gpu_disable_reason or "GPU detection disabled", }, - "vulkan": { - "available": False, - "recommended": False, - "reason": "CPU-only mode", - }, - "metal": { - 
"available": False, - "recommended": False, - "reason": "CPU-only mode", - }, "openblas": { "available": openblas_available, "recommended": openblas_available, @@ -578,63 +539,21 @@ async def detect_build_capabilities() -> Dict[str, Dict[str, any]]: if gpu_info.get("device_count", 0) > 0: cuda_available = vendor == "nvidia" - # Check other backends - metal_available = _check_metal() openblas_available = _check_openblas() - # Vulkan is only available if: - # 1. An AMD GPU is detected AND Vulkan drivers are installed, OR - # 2. A GPU device directory exists (indicating GPU passthrough in a container) - vulkan_available = False - if vendor == "amd": - # For AMD GPUs, check if Vulkan drivers are available - vulkan_available = _check_vulkan_drivers() - elif vendor is None: - # No specific GPU detected, but check if we have GPU access in a container - if os.path.exists("/dev/dri"): - vulkan_available = _check_vulkan_drivers() - - # Build capabilities response capabilities = { "cuda": { "available": cuda_available, - "recommended": cuda_available - and not vulkan_available - and not openblas_available, + "recommended": cuda_available and not openblas_available, "reason": ( f"{gpu_info.get('device_count', 0)} NVIDIA GPU(s) detected" if cuda_available else "No NVIDIA GPU detected" ), }, - "vulkan": { - "available": vulkan_available, - "recommended": (vulkan_available and not cuda_available) - or (gpu_info.get("vendor") == "amd"), - "reason": ( - "Vulkan drivers available" - if vulkan_available - else ( - "Available for AMD GPUs in containers" - if gpu_info.get("vendor") == "amd" - else "Vulkan drivers not detected" - ) - ), - }, - "metal": { - "available": metal_available, - "recommended": metal_available - and not cuda_available - and not vulkan_available, - "reason": ( - "Metal available (macOS)" if metal_available else "Not running on macOS" - ), - }, "openblas": { "available": openblas_available, - "recommended": openblas_available - and not cuda_available - and not 
vulkan_available, + "recommended": openblas_available and not cuda_available, "reason": ( "OpenBLAS library available" if openblas_available @@ -643,22 +562,11 @@ async def detect_build_capabilities() -> Dict[str, Dict[str, any]]: }, } - # Special handling for AMD GPUs if gpu_info.get("vendor") == "amd": - capabilities["cuda"]["reason"] = "AMD GPU detected - use Vulkan instead" - capabilities["cuda"]["available"] = False # Explicitly disable CUDA for AMD - capabilities["vulkan"]["recommended"] = True - capabilities["vulkan"][ - "reason" - ] = f"AMD GPU detected ({gpu_info.get('device_count', 0)} device(s)) - Vulkan recommended" - - # If no GPU available, recommend OpenBLAS for CPU acceleration - if ( - not cuda_available - and not vulkan_available - and not metal_available - and openblas_available - ): + capabilities["cuda"]["reason"] = "AMD GPU detected - CUDA not supported" + capabilities["cuda"]["available"] = False + + if not cuda_available and openblas_available: capabilities["openblas"]["recommended"] = True return capabilities @@ -669,11 +577,6 @@ async def detect_build_capabilities() -> Dict[str, Dict[str, any]]: # ============================================================================ -async def check_vulkan() -> bool: - """Legacy function for Vulkan check (for backward compatibility)""" - return _check_vulkan_drivers() - - async def detect_gpu_capabilities() -> Dict[str, bool]: """Legacy function for GPU capabilities (for backward compatibility)""" try: diff --git a/backend/llama_manager.py b/backend/llama_manager.py index 55f5d2e..4659f8e 100644 --- a/backend/llama_manager.py +++ b/backend/llama_manager.py @@ -23,8 +23,6 @@ class BuildConfig: # GPU backends enable_cuda: bool = False - enable_vulkan: bool = False - enable_metal: bool = False enable_openblas: bool = False enable_flash_attention: bool = False # Enables -DGGML_CUDA_FA_ALL_QUANTS=ON @@ -84,6 +82,12 @@ class LlamaManager: "ik_llama.cpp": IK_LLAMA_CPP_REPO, } + # Build options: llama.cpp 
vs ik_llama.cpp + # - Both use the same GGML_* / LLAMA_* CMake options (GGML_CUDA, GGML_NATIVE, LLAMA_BUILD_*, etc.). + # - ik_llama.cpp is a fork with IQK quantization and optimizations; IQK is built-in (no extra CMake flag). + # - ik_llama.cpp puts the server binary under examples/, so LLAMA_BUILD_EXAMPLES must be ON for + # the server to be built. We enforce build_examples=True when repository_source == "ik_llama.cpp". + def __init__(self): # Use absolute path so clone/build work regardless of process cwd (e.g. --app-dir backend) if os.path.exists("/app/data"): @@ -492,8 +496,6 @@ def _extract_asset_features(self, asset_name: str) -> List[str]: feature_map = { "cuda": "CUDA", - "vulkan": "Vulkan", - "metal": "Metal", "opencl": "OpenCL", "hip": "HIP/ROCm", "rocm": "HIP/ROCm", @@ -1436,8 +1438,6 @@ def set_flag(flag: str, value: bool): logger.info( f"CUDA configuration: compiler={nvcc_path}, toolkit={validated_cuda_root}" ) - set_flag("GGML_VULKAN", build_config.enable_vulkan) - set_flag("GGML_METAL", build_config.enable_metal) set_flag("GGML_BLAS", build_config.enable_openblas) if build_config.enable_openblas: cmake_args.append("-DGGML_BLAS_VENDOR=OpenBLAS") @@ -1461,6 +1461,8 @@ def set_flag(flag: str, value: bool): set_flag("LLAMA_BUILD_EXAMPLES", build_config.build_examples) set_flag("LLAMA_BUILD_SERVER", build_config.build_server) set_flag("LLAMA_TOOLS_INSTALL", build_config.install_tools) + # HTTPS support (required for model URLs, etc.) 
+ set_flag("LLAMA_OPENSSL", True) # Advanced GGML options set_flag("GGML_BACKEND_DL", build_config.enable_backend_dl) diff --git a/backend/routes/llama_versions.py b/backend/routes/llama_versions.py index deb69ac..de6ab8f 100644 --- a/backend/routes/llama_versions.py +++ b/backend/routes/llama_versions.py @@ -97,14 +97,28 @@ async def list_llama_versions(): def _default_build_settings() -> dict: - """Default build-settings payload for engines when nothing is saved yet.""" + """Default build-settings payload for engines when nothing is saved yet. + Covers all BuildConfig fields so backend and frontend stay in sync. + """ return { + "build_type": "Release", "cuda": False, + "openblas": False, "flash_attention": False, - "native": True, + "build_common": True, + "build_tests": True, + "build_tools": True, + "build_examples": True, + "build_server": True, + "install_tools": True, "backend_dl": False, "cpu_all_variants": False, + "lto": False, + "native": True, + "custom_cmake_args": "", "cuda_architectures": "", + "cflags": "", + "cxxflags": "", } @@ -120,25 +134,56 @@ def _bool(v): return v.strip().lower() in ("1", "true", "yes", "on") return bool(v) + def _str(v, default=""): + return str(v).strip() if v is not None else default + + build_type = _str(settings.get("build_type"), base["build_type"]) + if build_type not in ("Debug", "Release", "RelWithDebInfo", "MinSizeRel"): + build_type = base["build_type"] + return { + "build_type": build_type, "cuda": _bool(settings.get("cuda", base["cuda"])), + "openblas": _bool(settings.get("openblas", base["openblas"])), "flash_attention": _bool(settings.get("flash_attention", base["flash_attention"])), - "native": _bool(settings.get("native", base["native"])), + "build_common": _bool(settings.get("build_common", base["build_common"])), + "build_tests": _bool(settings.get("build_tests", base["build_tests"])), + "build_tools": _bool(settings.get("build_tools", base["build_tools"])), + "build_examples": 
_bool(settings.get("build_examples", base["build_examples"])), + "build_server": _bool(settings.get("build_server", base["build_server"])), + "install_tools": _bool(settings.get("install_tools", base["install_tools"])), "backend_dl": _bool(settings.get("backend_dl", base["backend_dl"])), "cpu_all_variants": _bool(settings.get("cpu_all_variants", base["cpu_all_variants"])), - "cuda_architectures": str(settings.get("cuda_architectures") or ""), + "lto": _bool(settings.get("lto", base["lto"])), + "native": _bool(settings.get("native", base["native"])), + "custom_cmake_args": _str(settings.get("custom_cmake_args"), base["custom_cmake_args"]), + "cuda_architectures": _str(settings.get("cuda_architectures"), base["cuda_architectures"]), + "cflags": _str(settings.get("cflags"), base["cflags"]), + "cxxflags": _str(settings.get("cxxflags"), base["cxxflags"]), } def _build_config_from_settings(settings: Optional[dict]) -> BuildConfig: normalized = _coerce_build_settings(settings) return BuildConfig( + build_type=normalized["build_type"], enable_cuda=normalized["cuda"], + enable_openblas=normalized["openblas"], enable_flash_attention=normalized["flash_attention"], - enable_native=normalized["native"], + build_common=normalized["build_common"], + build_tests=normalized["build_tests"], + build_tools=normalized["build_tools"], + build_examples=normalized["build_examples"], + build_server=normalized["build_server"], + install_tools=normalized["install_tools"], enable_backend_dl=normalized["backend_dl"], enable_cpu_all_variants=normalized["cpu_all_variants"], + enable_lto=normalized["lto"], + enable_native=normalized["native"], + custom_cmake_args=normalized["custom_cmake_args"], cuda_architectures=normalized["cuda_architectures"], + cflags=normalized["cflags"], + cxxflags=normalized["cxxflags"], ) @@ -187,6 +232,14 @@ def _fetch_latest_release(repository_source: str) -> Optional[dict]: return None +def _apply_engine_specific_build_defaults(engine: str, settings: dict) -> dict: + 
"""Apply engine-specific build defaults. ik_llama.cpp requires LLAMA_BUILD_EXAMPLES=ON (server in examples).""" + out = dict(settings) + if engine == "ik_llama": + out["build_examples"] = True + return out + + @router.get("/build-settings") async def get_build_settings(engine: str = "llama_cpp"): """Get persisted build settings for an engine ('llama_cpp' or 'ik_llama').""" @@ -197,7 +250,7 @@ async def get_build_settings(engine: str = "llama_cpp"): # Always return a full shape so the frontend can rely on defaults. base = _default_build_settings() base.update({k: v for k, v in settings.items() if k in base}) - return base + return _apply_engine_specific_build_defaults(engine, base) @router.put("/build-settings") @@ -211,10 +264,11 @@ async def update_build_settings(engine: str = "llama_cpp", settings: dict = Body # Only persist known build keys; ignore extras. allowed = _default_build_settings().keys() filtered = {k: v for k, v in settings.items() if k in allowed} + filtered = _apply_engine_specific_build_defaults(engine, filtered) stored = store.update_engine_build_settings(engine, filtered) base = _default_build_settings() base.update({k: v for k, v in stored.items() if k in base}) - return base + return _apply_engine_specific_build_defaults(engine, base) @router.post("/update") @@ -320,7 +374,7 @@ async def get_release_assets(tag_name: str): @router.get("/build-capabilities") async def get_build_capabilities_endpoint(): - """Get build capabilities based on detected hardware""" + """Get build capabilities (CUDA, OpenBLAS).""" try: return await detect_build_capabilities() except Exception as e: @@ -332,16 +386,6 @@ async def get_build_capabilities_endpoint(): "recommended": False, "reason": f"Error: {str(e)}", }, - "vulkan": { - "available": False, - "recommended": False, - "reason": f"Error: {str(e)}", - }, - "metal": { - "available": False, - "recommended": False, - "reason": f"Error: {str(e)}", - }, "openblas": { "available": False, "recommended": False, @@ 
-459,14 +503,32 @@ def _bool(v): return v.strip().lower() in ("1", "true", "yes", "on") return bool(v) - # Frontend sends cuda, flash_attention, native, backend_dl, cpu_all_variants + def _str(v, default=""): + return str(v).strip() if v is not None else default + + bt = _str(build_config_dict.get("build_type"), "Release") + if bt not in ("Debug", "Release", "RelWithDebInfo", "MinSizeRel"): + bt = "Release" + mapped = { + "build_type": bt, "enable_cuda": _bool(build_config_dict.get("cuda", False)), + "enable_openblas": _bool(build_config_dict.get("openblas", False)), "enable_flash_attention": _bool(build_config_dict.get("flash_attention", False)), - "enable_native": _bool(build_config_dict.get("native", True)), + "build_common": _bool(build_config_dict.get("build_common", True)), + "build_tests": _bool(build_config_dict.get("build_tests", True)), + "build_tools": _bool(build_config_dict.get("build_tools", True)), + "build_examples": _bool(build_config_dict.get("build_examples", True)), + "build_server": _bool(build_config_dict.get("build_server", True)), + "install_tools": _bool(build_config_dict.get("install_tools", True)), "enable_backend_dl": _bool(build_config_dict.get("backend_dl", False)), "enable_cpu_all_variants": _bool(build_config_dict.get("cpu_all_variants", False)), - "cuda_architectures": str(build_config_dict.get("cuda_architectures") or ""), + "enable_lto": _bool(build_config_dict.get("lto", False)), + "enable_native": _bool(build_config_dict.get("native", True)), + "custom_cmake_args": _str(build_config_dict.get("custom_cmake_args")), + "cuda_architectures": _str(build_config_dict.get("cuda_architectures")), + "cflags": _str(build_config_dict.get("cflags")), + "cxxflags": _str(build_config_dict.get("cxxflags")), } try: build_config = BuildConfig(**mapped) @@ -794,14 +856,55 @@ async def delete_version(version_id: str): if active and str(active.get("version")) == version_str: raise HTTPException(status_code=400, detail="Cannot delete active version") 
try: - binary_path = version_entry.get("binary_path") - if binary_path: - if not os.path.isabs(binary_path): - binary_path = os.path.join("/app", binary_path) - if os.path.exists(binary_path): - version_dir = os.path.dirname(os.path.dirname(binary_path)) - if os.path.exists(version_dir): - _robust_rmtree(version_dir) + binary_path = _resolve_binary_path(version_entry.get("binary_path") or "") + if binary_path and os.path.exists(binary_path): + # Safely resolve the on-disk version directory without ever deleting the + # entire llama-cpp root. Versions are stored as subdirectories of + # llama_manager.llama_dir (e.g. {llama_dir}/{version}/.../llama-server). + try: + llama_root = os.path.realpath(llama_manager.llama_dir) + binary_real = os.path.realpath(binary_path) + except Exception: + llama_root = llama_manager.llama_dir + binary_real = binary_path + + version_dir = None + + # If the binary lives under the llama root, treat the first path + # component under that root as the version directory. + try: + if os.path.commonpath([binary_real, llama_root]) == llama_root: + rel = os.path.relpath(binary_real, llama_root) + first_component = rel.split(os.sep)[0] + if first_component and first_component not in (".", ""): + candidate = os.path.join(llama_root, first_component) + if os.path.isdir(candidate): + version_dir = candidate + except Exception: + # Fall back to parent-directory logic below if commonpath/relpath fail + version_dir = None + + # Fallback: use the binary's parent directory, but never delete the + # llama root itself. + if not version_dir: + candidate = os.path.dirname(binary_real) + if ( + candidate + and os.path.isdir(candidate) + and os.path.commonpath([candidate, llama_root]) == llama_root + and os.path.abspath(candidate) != os.path.abspath(llama_root) + ): + version_dir = candidate + + if version_dir and os.path.exists(version_dir): + _robust_rmtree(version_dir) + else: + # As a last resort, remove just the binary to avoid leaving a + # completely broken entry on disk. 
+ try: + os.remove(binary_real) + except OSError: + pass store.delete_engine_version(engine, version_str) logger.info(f"Deleted version: {version_str}") return {"message": f"Deleted version {version_str}"} diff --git a/frontend/src/views/EnginesView.vue b/frontend/src/views/EnginesView.vue index 363e291..aa578b7 100644 --- a/frontend/src/views/EnginesView.vue +++ b/frontend/src/views/EnginesView.vue @@ -363,7 +363,7 @@ + modal :style="{ width: '620px' }" class="build-settings-dialog">
@@ -378,9 +378,48 @@ Appended to version name. Defaults to timestamp if empty.
- + + +
+
+
-
+
+ +
+ {{ opt.label }} + {{ opt.desc }} +
+
+
+
+
+ +
+ For ik_llama.cpp, Examples is required (server binary lives in examples). +
+
+
+ +
+ {{ opt.label }} + {{ opt.desc }} +
+
+
+
+
+ +
+
{{ opt.label }} @@ -394,6 +433,18 @@
+
+ + +
+
+ +
+ + +
+