From 21bb3406b69b35de33c6bf4c89b5af4f70a62737 Mon Sep 17 00:00:00 2001
From: lapy <vin.lapenta@gmail.com>
Date: Sun, 8 Mar 2026 21:10:12 +0000
Subject: [PATCH] Refactoring

---
 Dockerfile                                    |   18 +-
 README.md                                     | 1050 ++--
 backend/architecture_profiles.py              |  265 -
 backend/cuda_installer.py                     | 4864 ++++++++---------
 backend/data_store.py                         |  216 +
 backend/database.py                           |  364 --
 backend/gguf_reader.py                        |   20 +-
 backend/huggingface.py                        |  655 ++-
 backend/llama_manager.py                      |  435 +-
 backend/llama_swap_config.py                  |  213 +-
 backend/llama_swap_manager.py                 |  159 +-
 backend/lmdeploy_installer.py                 |  778 +--
 backend/lmdeploy_manager.py                   | 1718 +++---
 backend/main.py                               |  306 +-
 backend/param_registry.py                     |  117 +
 backend/presets.py                            |  170 -
 backend/progress_manager.py                   |  236 +
 backend/routes/llama_version_manager.py       |  244 +-
 backend/routes/llama_versions.py              |  486 +-
 backend/routes/lmdeploy.py                    |  133 +-
 backend/routes/models.py                      | 1795 +++---
 backend/routes/status.py                      |   61 +-
 backend/routes/unified_monitoring.py          |   75 -
 backend/smart_auto/__init__.py                |  480 --
 backend/smart_auto/architecture_config.py     |  103 -
 backend/smart_auto/calculators.py             |  402 --
 backend/smart_auto/config_builder.py          |   79 -
 backend/smart_auto/constants.py               |  184 -
 backend/smart_auto/cpu_config.py              |  200 -
 backend/smart_auto/generation_params.py       |   87 -
 backend/smart_auto/gpu_config.py              |  469 --
 backend/smart_auto/kv_cache.py                |   30 -
 backend/smart_auto/memory_estimator.py        |  478 --
 backend/smart_auto/model_metadata.py          |  106 -
 backend/smart_auto/models.py                  |  216 -
 backend/smart_auto/moe_handler.py             |   98 -
 backend/smart_auto/optimizer.py               |  199 -
 backend/smart_auto/recommendations.py         |  372 --
 backend/tests/conftest.py                     |    8 +
 backend/tests/test_app_smoke.py               |   58 +
 backend/tests/test_architecture_profiles.py   |   70 -
 backend/unified_monitor.py                    |  716 ---
 backend/websocket_manager.py                  |  192 -
 docker-compose.cuda.yml                       |    9 +-
 docker-compose.rocm.yml                       |   40 -
 docker-compose.vulkan.yml                     |   37 -
 docker-entrypoint.sh                          |   11 +-
 frontend/src/App.vue                          |   22 +-
 frontend/src/components/BuildProgress.vue     |  375 --
 frontend/src/components/DownloadProgress.vue  |  217 -
 frontend/src/components/GgufModelList.vue     |  455 --
 .../src/components/SafetensorsModelList.vue   | 1917 -------
 frontend/src/components/SliderInput.vue       |  477 --
 frontend/src/components/common/BaseCard.vue   |   35 -
 frontend/src/components/common/BaseDialog.vue |   70 -
 .../src/components/common/BaseFormField.vue   |   62 -
 frontend/src/components/common/LogViewer.vue  |  451 --
 .../src/components/common/ProgressTracker.vue |  124 +
 .../src/components/common/StatusBadge.vue     |   99 -
 .../src/components/config/AdvancedSection.vue |   81 -
 .../config/AdvancedSettingsSection.vue        |   83 -
 .../components/config/ConfigChangePreview.vue |  329 --
 .../src/components/config/ConfigField.vue     |   91 -
 .../src/components/config/ConfigSection.vue   |  145 -
 .../src/components/config/ConfigWarnings.vue  |   90 -
 .../src/components/config/ConfigWizard.vue    |  764 ---
 .../config/ContextParamsSection.vue           |  163 -
 .../components/config/CustomArgsSection.vue   |   32 -
 frontend/src/components/config/EmptyState.vue |  145 -
 .../config/EssentialSettingsSection.vue       |  182 -
 .../config/GenerationParamsSection.vue        |  244 -
 .../src/components/config/MemoryMonitor.vue   |  439 --
 .../components/config/MemoryParamsSection.vue |  148 -
 .../components/config/ModelInfoSection.vue    |  177 -
 .../src/components/config/OnboardingTour.vue  |  397 --
 .../components/config/PerformanceSection.vue  |  393 --
 .../src/components/config/QuickStartModal.vue |  352 --
 .../src/components/config/SettingsTooltip.vue |  129 -
 frontend/src/components/layout/AppFooter.vue  |   81 +-
 .../src/components/layout/AppNavigation.vue   |  146 +-
 .../src/components/system/CudaInstaller.vue   |  534 --
 .../src/components/system/LMDeployTab.vue     |  347 --
 .../system/LlamaCppManager/BuildDialog.vue    |  715 ---
 .../LlamaCppManager/CudaInstallDialog.vue     |  274 -
 .../system/LlamaCppManager/ReleaseDialog.vue  |  419 --
 .../system/LlamaCppManager/UpdateInfo.vue     |  135 -
 .../system/LlamaCppManager/VersionCard.vue    |  250 -
 .../system/LlamaCppManager/VersionList.vue    |   52 -
 .../src/components/system/LlamaCppTab.vue     |    7 -
 frontend/src/components/system/SystemTab.vue  |  748 ---
 .../src/components/system/VersionTable.vue    |  131 +
 frontend/src/main.js                          |    7 -
 frontend/src/router/index.js                  |   10 +-
 frontend/src/stores/engines.js                |  189 +
 frontend/src/stores/lmdeploy.js               |  172 -
 frontend/src/stores/models.js                 |  559 +-
 frontend/src/stores/progress.js               |  151 +
 frontend/src/stores/system.js                 |  178 -
 frontend/src/stores/websocket.js              |  322 --
 frontend/src/styles/_base.css                 |    3 +-
 frontend/src/styles/_components.css           |   61 -
 frontend/src/styles/_variables.css            |    2 +
 frontend/src/utils/formatting.js              |   24 +-
 frontend/src/views/EnginesView.vue            | 1059 ++++
 frontend/src/views/LMDeploy.vue               |  298 -
 frontend/src/views/LlamaCppManager.vue        |  339 --
 frontend/src/views/ModelConfig.vue            | 4131 ++------------
 frontend/src/views/ModelLibrary.vue           | 1329 ++---
 frontend/src/views/ModelSearch.vue            | 2055 ++-----
 frontend/src/views/System.vue                 |   74 -
 frontend/src/views/SystemStatus.vue           |  723 ---
 frontend/vite.config.js                       |   87 +-
 migrate_db.py                                 |  404 --
 migrate_gguf_storage.py                       |   96 -
 package-lock.json                             |  184 +
 package.json                                  |   14 +-
 requirements.txt                              |   10 +-
 117 files changed, 10642 insertions(+), 33388 deletions(-)
 delete mode 100644 backend/architecture_profiles.py
 create mode 100644 backend/data_store.py
 delete mode 100644 backend/database.py
 create mode 100644 backend/param_registry.py
 delete mode 100644 backend/presets.py
 create mode 100644 backend/progress_manager.py
 delete mode 100644 backend/routes/unified_monitoring.py
 delete mode 100644 backend/smart_auto/__init__.py
 delete mode 100644 backend/smart_auto/architecture_config.py
 delete mode 100644 backend/smart_auto/calculators.py
 delete mode 100644 backend/smart_auto/config_builder.py
 delete mode 100644 backend/smart_auto/constants.py
 delete mode 100644 backend/smart_auto/cpu_config.py
 delete mode 100644 backend/smart_auto/generation_params.py
 delete mode 100644 backend/smart_auto/gpu_config.py
 delete mode 100644 backend/smart_auto/kv_cache.py
 delete mode 100644 backend/smart_auto/memory_estimator.py
 delete mode 100644 backend/smart_auto/model_metadata.py
 delete mode 100644 backend/smart_auto/models.py
 delete mode 100644 backend/smart_auto/moe_handler.py
 delete mode 100644 backend/smart_auto/optimizer.py
 delete mode 100644 backend/smart_auto/recommendations.py
 create mode 100644 backend/tests/conftest.py
 create mode 100644 backend/tests/test_app_smoke.py
 delete mode 100644 backend/tests/test_architecture_profiles.py
 delete mode 100644 backend/unified_monitor.py
 delete mode 100644 backend/websocket_manager.py
 delete mode 100644 docker-compose.rocm.yml
 delete mode 100644 docker-compose.vulkan.yml
 delete mode 100644 frontend/src/components/BuildProgress.vue
 delete mode 100644 frontend/src/components/DownloadProgress.vue
 delete mode 100644 frontend/src/components/GgufModelList.vue
 delete mode 100644 frontend/src/components/SafetensorsModelList.vue
 delete mode 100644 frontend/src/components/SliderInput.vue
 delete mode 100644 frontend/src/components/common/BaseCard.vue
 delete mode 100644 frontend/src/components/common/BaseDialog.vue
 delete mode 100644 frontend/src/components/common/BaseFormField.vue
 delete mode 100644 frontend/src/components/common/LogViewer.vue
 create mode 100644 frontend/src/components/common/ProgressTracker.vue
 delete mode 100644 frontend/src/components/common/StatusBadge.vue
 delete mode 100644 frontend/src/components/config/AdvancedSection.vue
 delete mode 100644 frontend/src/components/config/AdvancedSettingsSection.vue
 delete mode 100644 frontend/src/components/config/ConfigChangePreview.vue
 delete mode 100644 frontend/src/components/config/ConfigField.vue
 delete mode 100644 frontend/src/components/config/ConfigSection.vue
 delete mode 100644 frontend/src/components/config/ConfigWarnings.vue
 delete mode 100644 frontend/src/components/config/ConfigWizard.vue
 delete mode 100644 frontend/src/components/config/ContextParamsSection.vue
 delete mode 100644 frontend/src/components/config/CustomArgsSection.vue
 delete mode 100644 frontend/src/components/config/EmptyState.vue
 delete mode 100644 frontend/src/components/config/EssentialSettingsSection.vue
 delete mode 100644 frontend/src/components/config/GenerationParamsSection.vue
 delete mode 100644 frontend/src/components/config/MemoryMonitor.vue
 delete mode 100644 frontend/src/components/config/MemoryParamsSection.vue
 delete mode 100644 frontend/src/components/config/ModelInfoSection.vue
 delete mode 100644 frontend/src/components/config/OnboardingTour.vue
 delete mode 100644 frontend/src/components/config/PerformanceSection.vue
 delete mode 100644 frontend/src/components/config/QuickStartModal.vue
 delete mode 100644 frontend/src/components/config/SettingsTooltip.vue
 delete mode 100644 frontend/src/components/system/CudaInstaller.vue
 delete mode 100644 frontend/src/components/system/LMDeployTab.vue
 delete mode 100644 frontend/src/components/system/LlamaCppManager/BuildDialog.vue
 delete mode 100644 frontend/src/components/system/LlamaCppManager/CudaInstallDialog.vue
 delete mode 100644 frontend/src/components/system/LlamaCppManager/ReleaseDialog.vue
 delete mode 100644 frontend/src/components/system/LlamaCppManager/UpdateInfo.vue
 delete mode 100644 frontend/src/components/system/LlamaCppManager/VersionCard.vue
 delete mode 100644 frontend/src/components/system/LlamaCppManager/VersionList.vue
 delete mode 100644 frontend/src/components/system/LlamaCppTab.vue
 delete mode 100644 frontend/src/components/system/SystemTab.vue
 create mode 100644 frontend/src/components/system/VersionTable.vue
 create mode 100644 frontend/src/stores/engines.js
 delete mode 100644 frontend/src/stores/lmdeploy.js
 create mode 100644 frontend/src/stores/progress.js
 delete mode 100644 frontend/src/stores/system.js
 delete mode 100644 frontend/src/stores/websocket.js
 create mode 100644 frontend/src/views/EnginesView.vue
 delete mode 100644 frontend/src/views/LMDeploy.vue
 delete mode 100644 frontend/src/views/LlamaCppManager.vue
 delete mode 100644 frontend/src/views/System.vue
 delete mode 100644 frontend/src/views/SystemStatus.vue
 delete mode 100644 migrate_db.py
 delete mode 100644 migrate_gguf_storage.py

diff --git a/Dockerfile b/Dockerfile
index 13ee749..0b8bd9b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -19,8 +19,8 @@ RUN if [ -f package-lock.json ] || [ -f npm-shrinkwrap.json ]; then \
         npm install; \
     fi
 
-# Copy frontend source (vite.config.js expects files at /build root, not /build/frontend)
-COPY frontend/ ./
+# Copy frontend source using the same layout as the repo root scripts expect
+COPY frontend/ ./frontend/
 RUN npm run build
 
 ################################################################################
@@ -81,6 +81,8 @@ ENV DEBIAN_FRONTEND=noninteractive \
     CUDA_VISIBLE_DEVICES=all \
     NVIDIA_VISIBLE_DEVICES=all \
     NVIDIA_DRIVER_CAPABILITIES=compute,utility \
+    HF_HOME=/app/data/temp/.cache/huggingface \
+    HUGGINGFACE_HUB_CACHE=/app/data/temp/.cache/huggingface/hub \
     VENV_PATH=/opt/venv \
     PYTHONPATH=/app \
     PATH="/app/data/cuda/current/bin:${PATH}" \
@@ -97,7 +99,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     pkg-config \
     ninja-build \
     curl \
-    wget \
     ca-certificates \
     # Core libs for Python packages
     libssl3 \
@@ -112,8 +113,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     ocl-icd-libopencl1 \
     libnuma1 \
     pciutils \
-    usbutils \
-    lshw \
     # Optional: ROCm (fails gracefully if unavailable)
     && (apt-get install -y --no-install-recommends rocminfo rocm-smi || echo "ROCm unavailable") \
     && rm -rf /var/lib/apt/lists/* \
@@ -127,7 +126,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
 # Ubuntu 24.04 may have a newer cmake, but we install a specific version for consistency
 # Placed here to avoid re-downloading when application code changes
 ARG CMAKE_VERSION=3.31.3
-RUN wget -qO /tmp/cmake.sh "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.sh" \
+RUN curl -fsSL "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.sh" -o /tmp/cmake.sh \
     && chmod +x /tmp/cmake.sh \
     && /tmp/cmake.sh --skip-license --prefix=/usr/local \
     && rm /tmp/cmake.sh \
@@ -135,7 +134,7 @@ RUN wget -qO /tmp/cmake.sh "https://github.com/Kitware/CMake/releases/download/v
 
 # Install llama-swap binary
 ARG LLAMA_SWAP_VERSION=179
-RUN wget -q https://github.com/mostlygeek/llama-swap/releases/download/v${LLAMA_SWAP_VERSION}/llama-swap_${LLAMA_SWAP_VERSION}_linux_amd64.tar.gz -O /tmp/llama-swap.tar.gz && \
+RUN curl -fsSL "https://github.com/mostlygeek/llama-swap/releases/download/v${LLAMA_SWAP_VERSION}/llama-swap_${LLAMA_SWAP_VERSION}_linux_amd64.tar.gz" -o /tmp/llama-swap.tar.gz && \
     tar -xzf /tmp/llama-swap.tar.gz -C /tmp && \
     mv /tmp/llama-swap /usr/local/bin/llama-swap && \
     chmod +x /usr/local/bin/llama-swap && \
@@ -151,8 +150,7 @@ WORKDIR /app
 
 # Copy application code (excluding data via .dockerignore)
 COPY backend/ ./backend/
-COPY migrate_db.py ./
-COPY --from=frontend-builder /build/dist ./frontend/dist
+COPY --from=frontend-builder /build/frontend/dist ./frontend/dist
 COPY frontend/public ./frontend/public
 
 # Copy and setup entrypoint script and CUDA environment helper
@@ -170,7 +168,7 @@ RUN ln -sf /usr/bin/python3 /usr/bin/python
 
 # Create non-root user and data directory structure
 RUN useradd -m -s /bin/bash appuser && \
-    mkdir -p /app/data/models /app/data/configs /app/data/logs /app/data/llama-cpp /app/data/temp && \
+    mkdir -p /app/data/models /app/data/config /app/data/configs /app/data/logs /app/data/llama-cpp /app/data/temp/.cache/huggingface/hub && \
     chown -R appuser:appuser /app && \
     # Ensure entrypoint script is accessible to appuser
     chmod 755 /usr/local/bin/docker-entrypoint.sh
diff --git a/README.md b/README.md
index fed2b77..89567a4 100644
--- a/README.md
+++ b/README.md
@@ -1,525 +1,525 @@
-# llama.cpp Studio
-
-A professional AI model management platform for llama.cpp models and versions, designed for modern AI workflows with comprehensive GPU support (NVIDIA CUDA, AMD Vulkan/ROCm, Metal, OpenBLAS).
-
-## Features
-
-### Model Management
-- **Search & Download**: Search HuggingFace for GGUF models with comprehensive metadata and size information for each quantization
-- **Multi-Quantization Support**: Download and manage multiple quantizations of the same model
-- **Model Library**: Manage downloaded models with start/stop/delete functionality
-- **Smart Configuration**: Auto-generate optimal llama.cpp parameters based on GPU capabilities
-- **VRAM Estimation**: Real-time VRAM usage estimation with warnings for memory constraints
-- **Metadata Extraction**: Rich model information including parameters, architecture, license, tags, and more
-- **Safetensors Runner**: Configure and run safetensors checkpoints via LMDeploy TurboMind with an OpenAI-compatible endpoint on port 2001
-
-### llama.cpp Version Management
-- **Release Installation**: Download and install pre-built binaries from GitHub releases
-- **Source Building**: Build from source with optional patches from GitHub PRs
-- **Custom Build Configuration**: Customize GPU backends (CUDA, Vulkan, Metal, OpenBLAS), build type, and compiler flags
-- **Update Checking**: Check for updates to both releases and source code
-- **Version Management**: Install, update, and delete multiple llama.cpp versions
-- **Build Validation**: Automatic validation of built binaries to ensure they work correctly
-
-### GPU Support
-- **Multi-GPU Support**: Automatic detection and configuration for NVIDIA, AMD, and other GPUs
-- **NVIDIA CUDA**: Full support for CUDA compute capabilities, flash attention, and multi-GPU
-- **AMD GPU Support**: Vulkan and ROCm support for AMD GPUs
-- **Apple Metal**: Support for Apple Silicon GPUs
-- **OpenBLAS**: CPU acceleration with optimized BLAS routines
-- **VRAM Monitoring**: Real-time GPU memory usage and temperature monitoring
-- **NVLink Detection**: Automatic detection of NVLink connections and topology analysis
-
-### Multi-Model Serving
-- **Concurrent Execution**: Run multiple models simultaneously via llama-swap proxy
-- **OpenAI-Compatible API**: Standard API format for easy integration
-- **Port 2000**: All models served through a single unified endpoint
-- **Automatic Lifecycle Management**: Seamless starting/stopping of models
-
-### Web Interface
-- **Modern UI**: Vue.js 3 with PrimeVue components
-- **Real-time Updates**: WebSocket-based progress tracking and system monitoring
-- **Responsive Design**: Works on desktop and mobile devices
-- **System Status**: CPU, memory, disk, and GPU monitoring
-- **LMDeploy Installer**: Dedicated UI to install/remove LMDeploy at runtime with live logs
-- **Dark Mode**: Built-in theme support
-
-## Quick Start
-
-### Using Docker Compose
-
-1. Clone the repository:
-```bash
-git clone <repository-url>
-cd llama-cpp-studio
-```
-
-2. Start the application:
-```bash
-# CPU-only mode
-docker-compose -f docker-compose.cpu.yml up -d
-
-# GPU mode (NVIDIA CUDA)
-docker-compose -f docker-compose.cuda.yml up -d
-
-# Vulkan/AMD GPU mode
-docker-compose -f docker-compose.vulkan.yml up -d
-
-# ROCm mode
-docker-compose -f docker-compose.rocm.yml up -d
-```
-
-3. Access the web interface at `http://localhost:8080`
-
-### Published Container Images
-
-Prebuilt images are pushed to GitHub Container Registry whenever the `publish-docker` workflow runs.
-
-- `ghcr.io/<org-or-user>/llama-cpp-studio:latest` – standard image based on `ubuntu:22.04` with GPU tooling installed at runtime
-
-Pull the image from GHCR:
-
-```bash
-docker pull ghcr.io/<org-or-user>/llama-cpp-studio:latest
-```
-
-### Manual Docker Build
-
-1. Build the image:
-```bash
-docker build -t llama-cpp-studio .
-```
-
-2. Run the container:
-```bash
-# With GPU support
-docker run -d \
-  --name llama-cpp-studio \
-  --gpus all \
-  -p 8080:8080 \
-  -v ./data:/app/data \
-  llama-cpp-studio
-
-# CPU-only
-docker run -d \
-  --name llama-cpp-studio \
-  -p 8080:8080 \
-  -v ./data:/app/data \
-  llama-cpp-studio
-```
-
-## Configuration
-
-### Environment Variables
-- `CUDA_VISIBLE_DEVICES`: GPU device selection (default: all, set to "" for CPU-only)
-- `PORT`: Web server port (default: 8080)
-- `HUGGINGFACE_API_KEY`: HuggingFace API token for model search and download (optional)
-- `LMDEPLOY_BIN`: Override path to the `lmdeploy` CLI (default: `lmdeploy` on PATH)
-- `LMDEPLOY_PORT`: Override the LMDeploy OpenAI port (default: 2001)
-
-### Volume Mounts
-- `/app/data`: Persistent storage for models, configurations, and database
-
-### HuggingFace API Key
-
-To enable model search and download functionality, you need to set your HuggingFace API key. You can do this in several ways:
-
-#### Option 1: Docker Compose Environment Variable
-Uncomment and set the token in your `docker-compose.yml`:
-```yaml
-environment:
-  - CUDA_VISIBLE_DEVICES=all
-  - HUGGINGFACE_API_KEY=your_huggingface_token_here
-```
-
-#### Option 2: .env File
-Create a `.env` file in your project root:
-```bash
-HUGGINGFACE_API_KEY=your_huggingface_token_here
-```
-
-Then uncomment the `env_file` section in `docker-compose.yml`:
-```yaml
-env_file:
-  - .env
-```
-
-#### Option 3: System Environment Variable
-Set the environment variable before running Docker Compose:
-```bash
-export HUGGINGFACE_API_KEY=your_huggingface_token_here
-docker-compose up -d
-```
-
-#### Getting Your HuggingFace Token
-1. Go to [HuggingFace Settings](https://huggingface.co/settings/tokens)
-2. Create a new token with "Read" permissions
-3. Copy the token and use it in one of the methods above
-
-**Note**: When the API key is set via environment variable, it cannot be modified through the web UI for security reasons.
-
-### GPU Requirements
-- **NVIDIA**: NVIDIA GPU with CUDA support, NVIDIA Container Toolkit installed
-- **AMD**: AMD GPU with Vulkan/ROCm drivers
-- **Apple**: Apple Silicon with Metal support
-- **CPU**: OpenBLAS for CPU acceleration (included in Docker image)
-- Minimum 8GB VRAM recommended for most models
-
-### LMDeploy Requirement
-
-Safetensors execution relies on [LMDeploy](https://github.com/InternLM/lmdeploy), but the base image intentionally omits it to keep Docker builds lightweight (critical for GitHub Actions). Use the **LMDeploy** page in the UI to install or remove LMDeploy inside the running container—installs happen via `pip` at runtime and logs are streamed live. The installer creates a dedicated virtual environment under `/app/data/lmdeploy/venv`, so the package lives on the writable volume and can be removed by deleting that folder. If you are running outside the container, you can still `pip install lmdeploy` manually or point `LMDEPLOY_BIN` to a custom binary. The runtime uses `lmdeploy serve turbomind` to expose an OpenAI-compatible server on port `2001`.
-
-## Usage
-
-### 1. Model Management
-
-#### Search Models
-- Use the search bar to find GGUF models on HuggingFace
-- Filter by tags, parameters, or model name
-- View comprehensive metadata including downloads, likes, tags, and file sizes
-
-#### Download Models
-- Click download on any quantization to start downloading
-- Multiple quantizations of the same model are automatically grouped
-- Progress tracking with real-time updates via WebSocket
-
-#### Configure Models
-- Set llama.cpp parameters or use Smart Auto for optimal settings
-- View VRAM estimation before starting
-- Configure context size, batch sizes, temperature, and more
-
-#### Run Models
-- Start/stop models with one click
-- Multiple models can run simultaneously
-- View running instances and resource usage
-
-### 2. llama.cpp Versions
-
-#### Check Updates
-- View available releases and source updates
-- See commit history and release notes
-
-#### Install Release
-- Download pre-built binaries from GitHub
-- Automatic verification and installation
-
-#### Build from Source
-- Compile from source with custom configuration
-- Select GPU backends (CUDA, Vulkan, Metal, OpenBLAS)
-- Configure build type (Release, Debug, RelWithDebInfo)
-- Add custom CMake flags and compiler options
-- Apply patches from GitHub PRs
-- Automatic validation of built binaries
-
-#### Manage Versions
-- Delete old versions to free up space
-- View installation details and build configuration
-
-### 3. System Monitoring
-- **Overview**: CPU, memory, disk, and GPU usage
-- **GPU Details**: Individual GPU information and utilization
-- **Running Instances**: Active model instances with resource usage
-- **WebSocket**: Real-time updates for all metrics
-
-## Multi-Model Serving
-
-llama-cpp-studio uses llama-swap to serve multiple models simultaneously on port 2000.
-
-### Starting Models
-
-Simply start any model from the Model Library. All models run on port 2000 simultaneously.
-
-### OpenAI-Compatible API
-
-```bash
-curl http://localhost:2000/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -d '{
-    "model": "llama-3-2-1b-instruct-iq2-xs",
-    "messages": [{"role": "user", "content": "Hello!"}]
-  }'
-```
-
-Model names are shown in System Status after starting a model.
-
-### Features
-
-- Multiple models run concurrently
-- No loading time - instant switching between models
-- Standard OpenAI API format
-- Automatic lifecycle management
-- Single unified endpoint
-
-### Troubleshooting
-
-- Check available models: `http://localhost:2000/v1/models`
-- Check proxy health: `http://localhost:2000/health`
-- View logs: `docker logs llama-cpp-studio`
-
-### LMDeploy TurboMind (Safetensors)
-
-- Run exactly one safetensors checkpoint at a time via LMDeploy
-- Configure tensor/pipeline parallelism, context length, temperature, and other runtime flags from the Model Library
-- Serves an OpenAI-compatible endpoint at `http://localhost:2001/v1/chat/completions`
-- Install LMDeploy on demand from the LMDeploy page (or manually via `pip`) before starting safetensors runtimes
-- Start/stop directly from the Safetensors panel; status is reported in System Status and the LMDeploy status chip
-
-## Build Customization
-
-### GPU Backends
-
-Enable specific GPU backends during source builds:
-
-- **CUDA**: NVIDIA GPU acceleration with cuBLAS
-- **Vulkan**: AMD/Intel GPU acceleration with Vulkan compute
-- **Metal**: Apple Silicon GPU acceleration
-- **OpenBLAS**: CPU optimization with OpenBLAS routines
-
-### Build Configuration
-
-Customize your build with:
-
-- **Build Type**: Release (optimal), Debug (development), RelWithDebInfo
-- **Custom CMake Flags**: Additional CMake configuration
-- **Compiler Flags**: CFLAGS and CXXFLAGS for optimization
-- **Git Patches**: Apply patches from GitHub PRs
-
-### Example Build Configuration
-
-```json
-{
-  "commit_sha": "master",
-  "patches": [
-    "https://github.com/ggerganov/llama.cpp/pull/1234.patch"
-  ],
-  "build_config": {
-    "build_type": "Release",
-    "enable_cuda": true,
-    "enable_vulkan": false,
-    "enable_metal": false,
-    "enable_openblas": true,
-    "custom_cmake_args": "-DGGML_CUDA_CUBLAS=ON",
-    "cflags": "-O3 -march=native",
-    "cxxflags": "-O3 -march=native"
-  }
-}
-```
-
-## Smart Auto Configuration
-
-The Smart Auto feature automatically generates optimal llama.cpp parameters based on:
-
-- **GPU Capabilities**: VRAM, compute capability, multi-GPU support
-- **NVLink Topology**: Automatic detection and optimization for NVLink clusters
-- **Model Architecture**: Detected from model name (Llama, Mistral, etc.)
-- **Available Resources**: CPU cores, memory, disk space
-- **Performance Optimization**: Flash attention, tensor parallelism, batch sizing
-
-### NVLink Optimization Strategies
-
-The system automatically detects NVLink topology and applies appropriate strategies:
-
-- **Unified NVLink**: All GPUs connected via NVLink - uses aggressive tensor splitting and higher parallelism
-- **Clustered NVLink**: Multiple NVLink clusters - optimizes for the largest cluster
-- **Partial NVLink**: Some GPUs connected via NVLink - uses hybrid approach
-- **PCIe Only**: No NVLink detected - uses conservative PCIe-based configuration
-
-### Supported Parameters
-- Context size, batch sizes, GPU layers
-- Temperature, top-k, top-p, repeat penalty
-- CPU threads, parallel sequences
-- RoPE scaling, YaRN factors
-- Multi-GPU tensor splitting
-- Custom arguments via YAML config
-
-## API Endpoints
-
-### Models
-- `GET /api/models` - List all models
-- `POST /api/models/search` - Search HuggingFace
-- `POST /api/models/download` - Download model
-- `GET /api/models/{id}/config` - Get model configuration
-- `PUT /api/models/{id}/config` - Update configuration
-- `POST /api/models/{id}/auto-config` - Generate smart configuration
-- `POST /api/models/{id}/start` - Start model
-- `POST /api/models/{id}/stop` - Stop model
-- `DELETE /api/models/{id}` - Delete model
-- `GET /api/models/safetensors/{model_id}/lmdeploy/config` - Get LMDeploy config for a safetensors download
-- `PUT /api/models/safetensors/{model_id}/lmdeploy/config` - Update LMDeploy config
-- `POST /api/models/safetensors/{model_id}/lmdeploy/start` - Start LMDeploy runtime
-- `POST /api/models/safetensors/{model_id}/lmdeploy/stop` - Stop LMDeploy runtime
-- `GET /api/models/safetensors/lmdeploy/status` - LMDeploy manager status
-
-### LMDeploy Installer
-- `GET /api/lmdeploy/status` - Installer status (version, binary path, current operation)
-- `POST /api/lmdeploy/install` - Install LMDeploy via pip at runtime
-- `POST /api/lmdeploy/remove` - Remove LMDeploy from the runtime environment
-- `GET /api/lmdeploy/logs` - Tail the LMDeploy installer log
-
-### llama.cpp Versions
-- `GET /api/llama-versions` - List installed versions
-- `GET /api/llama-versions/check-updates` - Check for updates
-- `GET /api/llama-versions/build-capabilities` - Get build capabilities
-- `POST /api/llama-versions/install-release` - Install release
-- `POST /api/llama-versions/build-source` - Build from source
-- `DELETE /api/llama-versions/{id}` - Delete version
-
-### System
-- `GET /api/status` - System status
-- `GET /api/gpu-info` - GPU information
-- `WebSocket /ws` - Real-time updates
-
-## Database Migration
-
-If upgrading from an older version, you may need to migrate your database:
-
-```bash
-# Run migration to support multi-quantization
-python migrate_db.py
-```
-
-## Troubleshooting
-
-### Common Issues
-
-1. **GPU Not Detected**
-   - Ensure NVIDIA Container Toolkit is installed (for NVIDIA)
-   - Check `nvidia-smi` output
-   - Verify `--gpus all` flag in docker run
-   - For AMD: Check Vulkan/ROCm drivers
-
-2. **Build Failures**
-   - Check CUDA version compatibility (for NVIDIA)
-   - Ensure sufficient disk space (at least 10GB free)
-   - Verify internet connectivity for downloads
-   - For Vulkan builds: Ensure `glslang-tools` is installed
-   - Check build logs for specific errors
-
-3. **Memory Issues**
-   - Use Smart Auto configuration
-   - Reduce context size or batch size
-   - Enable memory mapping
-   - Check available system RAM and VRAM
-
-4. **Model Download Failures**
-   - Check HuggingFace connectivity
-   - Verify model exists and is public
-   - Ensure sufficient disk space
-   - Set HUGGINGFACE_API_KEY if using private models
-
-5. **Validation Failed**
-   - Binary exists and is executable
-   - Binary runs `--version` successfully
-   - Output contains "llama" or "version:" string
-
-### Logs
-- Application logs: `docker logs llama-cpp-studio`
-- Model logs: Available in the web interface
-- Build logs: Shown during source compilation
-- WebSocket logs: DEBUG level for detailed connection info
-
-## Development
-
-### Backend
-- FastAPI with async support
-- SQLAlchemy for database management
-- WebSocket for real-time updates
-- Background tasks for long operations
-- Llama-swap integration for multi-model serving
-
-### Frontend
-- Vue.js 3 with Composition API
-- PrimeVue component library
-- Pinia for state management
-- Vite for build tooling
-- Dark mode support
-
-### Database
-- SQLite for simplicity
-- Models, versions, and instances tracking
-- Configuration storage
-- Multi-quantization support
-
-## Memory Estimation Model
-
-The studio’s capacity planning tooling is grounded in a three-component model for llama.cpp that provides a conservative upper bound on peak memory usage.
-
-- **Formula**: `M_total = M_weights + M_kv + M_compute`
-- **Model weights (`M_weights`)**: Treat the GGUF file size as the ground truth. When `--no-mmap` is disabled (default), the file is memory-mapped so only referenced pages touch physical RAM, but the virtual footprint still equals the file size.
-- **KV cache (`M_kv`)**: Uses the GQA-aware formula `n_ctx × N_layers × N_head_kv × (N_embd / N_head) × (p_a_k + p_a_v)`, where `p_a_*` are the bytes-per-value chosen via `--cache-type-k` / `--cache-type-v`.
-- **Compute buffers (`M_compute`)**: Approximate as a fixed CUDA overhead (~550 MB) plus a scratch buffer that scales with micro-batch size (`n_ubatch × 0.5 MB` by default).
-
-### RAM vs VRAM Allocation
-
-- `-ngl 0` (CPU-only): All components stay in RAM.
-- `-ngl > 0` (hybrid/full GPU): Model weights split by layer between RAM and VRAM, while **both `M_kv` and `M_compute` move entirely to VRAM**—the “VRAM trap”.
-- Full offload avoids PCIe contention; hybrid splits suffer a “performance cliff” because activations bounce between CPU and GPU.
-
-### Optimization Strategy
-
-1. Attempt full offload first (best throughput). If weights + compute fit, deduce `n_ctx_max` from remaining VRAM budget.
-2. When full offload fails, search decreasing `n_ngl` values that satisfy RAM limits while maximizing context length, accepting the hybrid performance penalty.
-3. Iterate quantization choices to find the smallest model that still enables full offload on the target hardware profile.
-
-## Smart Auto Module Report
-
-The Smart Auto subsystem applies the model above to recommend llama.cpp launch parameters. Priority 1 fixes are complete, eliminating prior memory underestimation bugs.
-
-- **Resolutions**:
-  - Corrected KV cache math to respect grouped-query attention head counts.
-  - Removed the dangerous 0.30 multiplier on cache size; estimates now use real memory.
-  - Ensured KV cache/compute buffers migrate to VRAM whenever GPU layers are in play.
-  - Modeled compute overhead as `550 MB + 0.5 MB × n_ubatch`.
-  - Improved GPU layer estimation using GGUF file size with a 20 % safety buffer.
-- **Open improvements**:
-  - Reorder calculations so KV cache quantization feeds batch/context sizing directly.
-  - Replace remaining heuristics with joint optimization across `n_ctx`, `n_ngl`, and `n_ubatch`.
-
-### Recommended Validation
-
-- Benchmark against known examples (e.g., 13B @ 2 048 tokens → ~1.6 GB KV cache, 7B @ 4 096 tokens → ~6 GB total).
-- Stress-test large contexts, tight VRAM scenarios, MoE models, and hybrid modes.
-- Expand automated regression coverage around the estimator and Smart Auto flows.
-
-## Memory Estimation Test Results
-
-Empirical testing with `Llama-3.2-1B-Instruct.IQ1_M` demonstrates that the estimator acts as a safe upper bound.
-
-- **Setup**: `n_ctx ≈ 35 K`, batch 32, CPU-only run.
-- **Estimated peak**: 4.99 GB (weights 394 MB, KV cache 4.34 GB, batch 12 MB, llama.cpp overhead 256 MB).
-- **Observed deltas**:
-  - With mmap enabled: ~608 MB (11.9 % of estimate). Lower usage is expected because the KV cache grows as context fills and weights are paged on demand.
-  - With `--no-mmap`: ~1.16 GB (23 % of estimate). Weights load fully, but KV cache still expands progressively.
-- **Takeaways**:
-  - Estimates intentionally err on the high side to prevent OOM once the context window reaches capacity.
-  - Divergence between virtual and physical usage stems from memory mapping and lazy KV cache allocation.
-  - Additional GPU-focused measurements and long session traces are encouraged to correlate VRAM predictions with reality.
-
-## License
-
-This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
-
-Copyright (c) 2024 llama.cpp Studio
-
-## Contributing
-
-1. Fork the repository
-2. Create a feature branch
-3. Make your changes
-4. Add tests if applicable
-5. Submit a pull request
-
-## Support
-
-For issues and questions:
-- Create an issue on GitHub
-- Check the troubleshooting section
-- Review the API documentation
-
-## Acknowledgments
-
-- [llama.cpp](https://github.com/ggerganov/llama.cpp) - The core inference engine
-- [llama-swap](https://github.com/mostlygeek/llama-swap) - Multi-model serving proxy
-- [HuggingFace](https://huggingface.co) - Model hosting and search
-- [Vue.js](https://vuejs.org) - Frontend framework
-- [FastAPI](https://fastapi.tiangolo.com) - Backend framework
+# llama.cpp Studio
+
+A professional AI model management platform for llama.cpp models and versions, designed for modern AI workflows with comprehensive GPU support (NVIDIA CUDA, AMD Vulkan/ROCm, Metal, OpenBLAS).
+
+## Features
+
+### Model Management
+- **Search & Download**: Search HuggingFace for GGUF models with comprehensive metadata and size information for each quantization
+- **Multi-Quantization Support**: Download and manage multiple quantizations of the same model
+- **Model Library**: Manage downloaded models with start/stop/delete functionality
+- **Smart Configuration**: Auto-generate optimal llama.cpp parameters based on GPU capabilities
+- **VRAM Estimation**: Real-time VRAM usage estimation with warnings for memory constraints
+- **Metadata Extraction**: Rich model information including parameters, architecture, license, tags, and more
+- **Safetensors Runner**: Configure and run safetensors checkpoints via LMDeploy TurboMind with an OpenAI-compatible endpoint on port 2001
+
+### llama.cpp Version Management
+- **Release Installation**: Download and install pre-built binaries from GitHub releases
+- **Source Building**: Build from source with optional patches from GitHub PRs
+- **Custom Build Configuration**: Customize GPU backends (CUDA, Vulkan, Metal, OpenBLAS), build type, and compiler flags
+- **Update Checking**: Check for updates to both releases and source code
+- **Version Management**: Install, update, and delete multiple llama.cpp versions
+- **Build Validation**: Automatic validation of built binaries to ensure they work correctly
+
+### GPU Support
+- **Multi-GPU Support**: Automatic detection and configuration for NVIDIA, AMD, and other GPUs
+- **NVIDIA CUDA**: Full support for CUDA compute capabilities, flash attention, and multi-GPU
+- **AMD GPU Support**: Vulkan and ROCm support for AMD GPUs
+- **Apple Metal**: Support for Apple Silicon GPUs
+- **OpenBLAS**: CPU acceleration with optimized BLAS routines
+- **VRAM Monitoring**: Real-time GPU memory usage and temperature monitoring
+- **NVLink Detection**: Automatic detection of NVLink connections and topology analysis
+
+### Multi-Model Serving
+- **Concurrent Execution**: Run multiple models simultaneously via llama-swap proxy
+- **OpenAI-Compatible API**: Standard API format for easy integration
+- **Port 2000**: All models served through a single unified endpoint
+- **Automatic Lifecycle Management**: Seamless starting/stopping of models
+
+### Web Interface
+- **Modern UI**: Vue.js 3 with PrimeVue components
+- **Real-time Updates**: SSE-based progress tracking and system monitoring
+- **Responsive Design**: Works on desktop and mobile devices
+- **System Status**: CPU, memory, disk, and GPU monitoring
+- **LMDeploy Installer**: Dedicated UI to install/remove LMDeploy at runtime with live logs
+- **Dark Mode**: Built-in theme support
+
+## Quick Start
+
+### Using Docker Compose
+
+1. Clone the repository:
+```bash
+git clone <repository-url>
+cd llama-cpp-studio
+```
+
+2. Start the application:
+```bash
+# CPU-only mode
+docker-compose -f docker-compose.cpu.yml up -d
+
+# GPU mode (NVIDIA CUDA)
+docker-compose -f docker-compose.cuda.yml up -d
+
+# Vulkan/AMD GPU mode
+docker-compose -f docker-compose.vulkan.yml up -d
+
+# ROCm mode
+docker-compose -f docker-compose.rocm.yml up -d
+```
+
+3. Access the web interface at `http://localhost:8080`
+
+### Published Container Images
+
+Prebuilt images are pushed to GitHub Container Registry whenever the `publish-docker` workflow runs.
+
+- `ghcr.io/<org-or-user>/llama-cpp-studio:latest` – standard image based on `ubuntu:22.04` with GPU tooling installed at runtime
+
+Pull the image from GHCR:
+
+```bash
+docker pull ghcr.io/<org-or-user>/llama-cpp-studio:latest
+```
+
+### Manual Docker Build
+
+1. Build the image:
+```bash
+docker build -t llama-cpp-studio .
+```
+
+2. Run the container:
+```bash
+# With GPU support
+docker run -d \
+  --name llama-cpp-studio \
+  --gpus all \
+  -p 8080:8080 \
+  -v ./data:/app/data \
+  llama-cpp-studio
+
+# CPU-only
+docker run -d \
+  --name llama-cpp-studio \
+  -p 8080:8080 \
+  -v ./data:/app/data \
+  llama-cpp-studio
+```
+
+## Configuration
+
+### Environment Variables
+- `CUDA_VISIBLE_DEVICES`: GPU device selection (default: all, set to "" for CPU-only)
+- `PORT`: Web server port (default: 8080)
+- `HUGGINGFACE_API_KEY`: HuggingFace API token for model search and download (optional)
+- `LMDEPLOY_BIN`: Override path to the `lmdeploy` CLI (default: `lmdeploy` on PATH)
+- `LMDEPLOY_PORT`: Override the LMDeploy OpenAI port (default: 2001)
+
+### Volume Mounts
+- `/app/data`: Persistent storage for models, configurations, and database
+
+### HuggingFace API Key
+
+To enable model search and download functionality, you need to set your HuggingFace API key. You can do this in several ways:
+
+#### Option 1: Docker Compose Environment Variable
+Uncomment and set the token in your `docker-compose.yml`:
+```yaml
+environment:
+  - CUDA_VISIBLE_DEVICES=all
+  - HUGGINGFACE_API_KEY=your_huggingface_token_here
+```
+
+#### Option 2: .env File
+Create a `.env` file in your project root:
+```bash
+HUGGINGFACE_API_KEY=your_huggingface_token_here
+```
+
+Then uncomment the `env_file` section in `docker-compose.yml`:
+```yaml
+env_file:
+  - .env
+```
+
+#### Option 3: System Environment Variable
+Set the environment variable before running Docker Compose:
+```bash
+export HUGGINGFACE_API_KEY=your_huggingface_token_here
+docker-compose up -d
+```
+
+#### Getting Your HuggingFace Token
+1. Go to [HuggingFace Settings](https://huggingface.co/settings/tokens)
+2. Create a new token with "Read" permissions
+3. Copy the token and use it in one of the methods above
+
+**Note**: When the API key is set via environment variable, it cannot be modified through the web UI for security reasons.
+
+### GPU Requirements
+- **NVIDIA**: NVIDIA GPU with CUDA support, NVIDIA Container Toolkit installed
+- **AMD**: AMD GPU with Vulkan/ROCm drivers
+- **Apple**: Apple Silicon with Metal support
+- **CPU**: OpenBLAS for CPU acceleration (included in Docker image)
+- Minimum 8GB VRAM recommended for most models
+
+### LMDeploy Requirement
+
+Safetensors execution relies on [LMDeploy](https://github.com/InternLM/lmdeploy), but the base image intentionally omits it to keep Docker builds lightweight (critical for GitHub Actions). Use the **LMDeploy** page in the UI to install or remove LMDeploy inside the running container—installs happen via `pip` at runtime and logs are streamed live. The installer creates a dedicated virtual environment under `/app/data/lmdeploy/venv`, so the package lives on the writable volume and can be removed by deleting that folder. If you are running outside the container, you can still `pip install lmdeploy` manually or point `LMDEPLOY_BIN` to a custom binary. The runtime uses `lmdeploy serve turbomind` to expose an OpenAI-compatible server on port `2001`.
+
+## Usage
+
+### 1. Model Management
+
+#### Search Models
+- Use the search bar to find GGUF models on HuggingFace
+- Filter by tags, parameters, or model name
+- View comprehensive metadata including downloads, likes, tags, and file sizes
+
+#### Download Models
+- Click download on any quantization to start downloading
+- Multiple quantizations of the same model are automatically grouped
+- Progress tracking with real-time updates via SSE
+
+#### Configure Models
+- Set llama.cpp parameters or use Smart Auto for optimal settings
+- View VRAM estimation before starting
+- Configure context size, batch sizes, temperature, and more
+
+#### Run Models
+- Start/stop models with one click
+- Multiple models can run simultaneously
+- View running instances and resource usage
+
+### 2. llama.cpp Versions
+
+#### Check Updates
+- View available releases and source updates
+- See commit history and release notes
+
+#### Install Release
+- Download pre-built binaries from GitHub
+- Automatic verification and installation
+
+#### Build from Source
+- Compile from source with custom configuration
+- Select GPU backends (CUDA, Vulkan, Metal, OpenBLAS)
+- Configure build type (Release, Debug, RelWithDebInfo)
+- Add custom CMake flags and compiler options
+- Apply patches from GitHub PRs
+- Automatic validation of built binaries
+
+#### Manage Versions
+- Delete old versions to free up space
+- View installation details and build configuration
+
+### 3. System Monitoring
+- **Overview**: CPU, memory, disk, and GPU usage
+- **GPU Details**: Individual GPU information and utilization
+- **Running Instances**: Active model instances with resource usage
+- **SSE**: Real-time updates for all metrics
+
+## Multi-Model Serving
+
+llama-cpp-studio uses llama-swap to serve multiple models simultaneously on port 2000.
+
+### Starting Models
+
+Simply start any model from the Model Library. All models run on port 2000 simultaneously.
+
+### OpenAI-Compatible API
+
+```bash
+curl http://localhost:2000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "llama-3-2-1b-instruct-iq2-xs",
+    "messages": [{"role": "user", "content": "Hello!"}]
+  }'
+```
+
+Model names are shown in System Status after starting a model.
+
+### Features
+
+- Multiple models run concurrently
+- No loading time - instant switching between models
+- Standard OpenAI API format
+- Automatic lifecycle management
+- Single unified endpoint
+
+### Troubleshooting
+
+- Check available models: `http://localhost:2000/v1/models`
+- Check proxy health: `http://localhost:2000/health`
+- View logs: `docker logs llama-cpp-studio`
+
+### LMDeploy TurboMind (Safetensors)
+
+- Run exactly one safetensors checkpoint at a time via LMDeploy
+- Configure tensor/pipeline parallelism, context length, temperature, and other runtime flags from the Model Library
+- Serves an OpenAI-compatible endpoint at `http://localhost:2001/v1/chat/completions`
+- Install LMDeploy on demand from the LMDeploy page (or manually via `pip`) before starting safetensors runtimes
+- Start/stop directly from the Safetensors panel; status is reported in System Status and the LMDeploy status chip
+
+## Build Customization
+
+### GPU Backends
+
+Enable specific GPU backends during source builds:
+
+- **CUDA**: NVIDIA GPU acceleration with cuBLAS
+- **Vulkan**: AMD/Intel GPU acceleration with Vulkan compute
+- **Metal**: Apple Silicon GPU acceleration
+- **OpenBLAS**: CPU optimization with OpenBLAS routines
+
+### Build Configuration
+
+Customize your build with:
+
+- **Build Type**: Release (optimal), Debug (development), RelWithDebInfo
+- **Custom CMake Flags**: Additional CMake configuration
+- **Compiler Flags**: CFLAGS and CXXFLAGS for optimization
+- **Git Patches**: Apply patches from GitHub PRs
+
+### Example Build Configuration
+
+```json
+{
+  "commit_sha": "master",
+  "patches": [
+    "https://github.com/ggerganov/llama.cpp/pull/1234.patch"
+  ],
+  "build_config": {
+    "build_type": "Release",
+    "enable_cuda": true,
+    "enable_vulkan": false,
+    "enable_metal": false,
+    "enable_openblas": true,
+    "custom_cmake_args": "-DGGML_CUDA_CUBLAS=ON",
+    "cflags": "-O3 -march=native",
+    "cxxflags": "-O3 -march=native"
+  }
+}
+```
+
+## Smart Auto Configuration
+
+The Smart Auto feature automatically generates optimal llama.cpp parameters based on:
+
+- **GPU Capabilities**: VRAM, compute capability, multi-GPU support
+- **NVLink Topology**: Automatic detection and optimization for NVLink clusters
+- **Model Architecture**: Detected from model name (Llama, Mistral, etc.)
+- **Available Resources**: CPU cores, memory, disk space
+- **Performance Optimization**: Flash attention, tensor parallelism, batch sizing
+
+### NVLink Optimization Strategies
+
+The system automatically detects NVLink topology and applies appropriate strategies:
+
+- **Unified NVLink**: All GPUs connected via NVLink - uses aggressive tensor splitting and higher parallelism
+- **Clustered NVLink**: Multiple NVLink clusters - optimizes for the largest cluster
+- **Partial NVLink**: Some GPUs connected via NVLink - uses hybrid approach
+- **PCIe Only**: No NVLink detected - uses conservative PCIe-based configuration
+
+### Supported Parameters
+- Context size, batch sizes, GPU layers
+- Temperature, top-k, top-p, repeat penalty
+- CPU threads, parallel sequences
+- RoPE scaling, YaRN factors
+- Multi-GPU tensor splitting
+- Custom arguments via YAML config
+
+## API Endpoints
+
+### Models
+- `GET /api/models` - List all models
+- `POST /api/models/search` - Search HuggingFace
+- `POST /api/models/download` - Download model
+- `GET /api/models/{id}/config` - Get model configuration
+- `PUT /api/models/{id}/config` - Update configuration
+- `POST /api/models/{id}/auto-config` - Generate smart configuration
+- `POST /api/models/{id}/start` - Start model
+- `POST /api/models/{id}/stop` - Stop model
+- `DELETE /api/models/{id}` - Delete model
+- `GET /api/models/safetensors/{model_id}/lmdeploy/config` - Get LMDeploy config for a safetensors download
+- `PUT /api/models/safetensors/{model_id}/lmdeploy/config` - Update LMDeploy config
+- `POST /api/models/safetensors/{model_id}/lmdeploy/start` - Start LMDeploy runtime
+- `POST /api/models/safetensors/{model_id}/lmdeploy/stop` - Stop LMDeploy runtime
+- `GET /api/models/safetensors/lmdeploy/status` - LMDeploy manager status
+
+### LMDeploy Installer
+- `GET /api/lmdeploy/status` - Installer status (version, binary path, current operation)
+- `POST /api/lmdeploy/install` - Install LMDeploy via pip at runtime
+- `POST /api/lmdeploy/remove` - Remove LMDeploy from the runtime environment
+- `GET /api/lmdeploy/logs` - Tail the LMDeploy installer log
+
+### llama.cpp Versions
+- `GET /api/llama-versions` - List installed versions
+- `GET /api/llama-versions/check-updates` - Check for updates
+- `GET /api/llama-versions/build-capabilities` - Get build capabilities
+- `POST /api/llama-versions/install-release` - Install release
+- `POST /api/llama-versions/build-source` - Build from source
+- `DELETE /api/llama-versions/{id}` - Delete version
+
+### System
+- `GET /api/status` - System status
+- `GET /api/gpu-info` - GPU information
+- `GET /api/events` - Server-Sent Events for real-time updates
+
+## Database Migration
+
+If upgrading from an older version, you may need to migrate your database:
+
+```bash
+# Run migration to support multi-quantization
+python migrate_db.py
+```
+
+## Troubleshooting
+
+### Common Issues
+
+1. **GPU Not Detected**
+   - Ensure NVIDIA Container Toolkit is installed (for NVIDIA)
+   - Check `nvidia-smi` output
+   - Verify `--gpus all` flag in docker run
+   - For AMD: Check Vulkan/ROCm drivers
+
+2. **Build Failures**
+   - Check CUDA version compatibility (for NVIDIA)
+   - Ensure sufficient disk space (at least 10GB free)
+   - Verify internet connectivity for downloads
+   - For Vulkan builds: Ensure `glslang-tools` is installed
+   - Check build logs for specific errors
+
+3. **Memory Issues**
+   - Use Smart Auto configuration
+   - Reduce context size or batch size
+   - Enable memory mapping
+   - Check available system RAM and VRAM
+
+4. **Model Download Failures**
+   - Check HuggingFace connectivity
+   - Verify model exists and is public
+   - Ensure sufficient disk space
+   - Set HUGGINGFACE_API_KEY if using private models
+
+5. **Validation Failed**
+   - Binary exists and is executable
+   - Binary runs `--version` successfully
+   - Output contains "llama" or "version:" string
+
+### Logs
+- Application logs: `docker logs llama-cpp-studio`
+- Model logs: Available in the web interface
+- Build logs: Shown during source compilation
+- SSE event stream: GET /api/events for real-time progress and status
+
+## Development
+
+### Backend
+- FastAPI with async support
+- YAML-backed data store (models, engines, settings)
+- SSE (GET /api/events) for real-time updates
+- Background tasks for long operations
+- Llama-swap integration for multi-model serving
+
+### Frontend
+- Vue.js 3 with Composition API
+- PrimeVue component library
+- Pinia for state management
+- Vite for build tooling
+- Dark mode support
+
+### Testing
+- Backend tests: `pytest` (install deps first: `pip install -r requirements.txt pytest pytest-asyncio`)
+- Run from repo root: `PYTHONPATH=. pytest backend/tests/ -v`
+- Smoke tests in `backend/tests/test_app_smoke.py` verify the app starts and key API routes respond (`/api/status`, `/api/models/param-registry`, `/api/models/`, `/api/events`)
+- LMDeploy installer and config validation tests in `backend/tests/test_lmdeploy_*.py`
+
+## Memory Estimation Model
+
+The studio’s capacity planning tooling is grounded in a three-component model for llama.cpp that provides a conservative upper bound on peak memory usage.
+
+- **Formula**: `M_total = M_weights + M_kv + M_compute`
+- **Model weights (`M_weights`)**: Treat the GGUF file size as the ground truth. When `--no-mmap` is disabled (default), the file is memory-mapped so only referenced pages touch physical RAM, but the virtual footprint still equals the file size.
+- **KV cache (`M_kv`)**: Uses the GQA-aware formula `n_ctx × N_layers × N_head_kv × (N_embd / N_head) × (p_a_k + p_a_v)`, where `p_a_*` are the bytes-per-value chosen via `--cache-type-k` / `--cache-type-v`.
+- **Compute buffers (`M_compute`)**: Approximate as a fixed CUDA overhead (~550 MB) plus a scratch buffer that scales with micro-batch size (`n_ubatch × 0.5 MB` by default).
+
+### RAM vs VRAM Allocation
+
+- `-ngl 0` (CPU-only): All components stay in RAM.
+- `-ngl > 0` (hybrid/full GPU): Model weights split by layer between RAM and VRAM, while **both `M_kv` and `M_compute` move entirely to VRAM**—the “VRAM trap”.
+- Full offload avoids PCIe contention; hybrid splits suffer a “performance cliff” because activations bounce between CPU and GPU.
+
+### Optimization Strategy
+
+1. Attempt full offload first (best throughput). If weights + compute fit, deduce `n_ctx_max` from remaining VRAM budget.
+2. When full offload fails, search decreasing `n_ngl` values that satisfy RAM limits while maximizing context length, accepting the hybrid performance penalty.
+3. Iterate quantization choices to find the smallest model that still enables full offload on the target hardware profile.
+
+## Smart Auto Module Report
+
+The Smart Auto subsystem applies the model above to recommend llama.cpp launch parameters. Priority 1 fixes are complete, eliminating prior memory underestimation bugs.
+
+- **Resolutions**:
+  - Corrected KV cache math to respect grouped-query attention head counts.
+  - Removed the dangerous 0.30 multiplier on cache size; estimates now use real memory.
+  - Ensured KV cache/compute buffers migrate to VRAM whenever GPU layers are in play.
+  - Modeled compute overhead as `550 MB + 0.5 MB × n_ubatch`.
+  - Improved GPU layer estimation using GGUF file size with a 20 % safety buffer.
+- **Open improvements**:
+  - Reorder calculations so KV cache quantization feeds batch/context sizing directly.
+  - Replace remaining heuristics with joint optimization across `n_ctx`, `n_ngl`, and `n_ubatch`.
+
+### Recommended Validation
+
+- Benchmark against known examples (e.g., 13B @ 2 048 tokens → ~1.6 GB KV cache, 7B @ 4 096 tokens → ~6 GB total).
+- Stress-test large contexts, tight VRAM scenarios, MoE models, and hybrid modes.
+- Expand automated regression coverage around the estimator and Smart Auto flows.
+
+## Memory Estimation Test Results
+
+Empirical testing with `Llama-3.2-1B-Instruct.IQ1_M` demonstrates that the estimator acts as a safe upper bound.
+
+- **Setup**: `n_ctx ≈ 35 K`, batch 32, CPU-only run.
+- **Estimated peak**: 4.99 GB (weights 394 MB, KV cache 4.34 GB, batch 12 MB, llama.cpp overhead 256 MB).
+- **Observed deltas**:
+  - With mmap enabled: ~608 MB (11.9 % of estimate). Lower usage is expected because the KV cache grows as context fills and weights are paged on demand.
+  - With `--no-mmap`: ~1.16 GB (23 % of estimate). Weights load fully, but KV cache still expands progressively.
+- **Takeaways**:
+  - Estimates intentionally err on the high side to prevent OOM once the context window reaches capacity.
+  - Divergence between virtual and physical usage stems from memory mapping and lazy KV cache allocation.
+  - Additional GPU-focused measurements and long session traces are encouraged to correlate VRAM predictions with reality.
+
+## License
+
+This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+Copyright (c) 2024 llama.cpp Studio
+
+## Contributing
+
+1. Fork the repository
+2. Create a feature branch
+3. Make your changes
+4. Add tests if applicable
+5. Submit a pull request
+
+## Support
+
+For issues and questions:
+- Create an issue on GitHub
+- Check the troubleshooting section
+- Review the API documentation
+
+## Acknowledgments
+
+- [llama.cpp](https://github.com/ggerganov/llama.cpp) - The core inference engine
+- [llama-swap](https://github.com/mostlygeek/llama-swap) - Multi-model serving proxy
+- [HuggingFace](https://huggingface.co) - Model hosting and search
+- [Vue.js](https://vuejs.org) - Frontend framework
+- [FastAPI](https://fastapi.tiangolo.com) - Backend framework
diff --git a/backend/architecture_profiles.py b/backend/architecture_profiles.py
deleted file mode 100644
index 319e0f1..0000000
--- a/backend/architecture_profiles.py
+++ /dev/null
@@ -1,265 +0,0 @@
-"""
-Architecture-aware profiles for interpreting GGUF metadata.
-
-Each profile is responsible for turning raw GGUF metadata into:
-- block_count: architectural depth (number of transformer blocks)
-- effective_layer_count: layers llama.cpp can offload (including output layer)
-"""
-
-from abc import ABC, abstractmethod
-from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Tuple, Type
-
-from backend.logging_config import get_logger
-
-logger = get_logger(__name__)
-
-
-@dataclass(frozen=True)
-class LayerConfig:
-    """Standardized output for layer calculations."""
-
-    block_count: int
-    effective_layer_count: int
-
-
-# --- Helper Utilities ---
-
-
-def _get_first_valid_int(
-    metadata: Dict[str, Any], keys: List[str], default: Optional[int] = None
-) -> Optional[int]:
-    """
-    Scans metadata for the first key that contains a valid >0 number.
-    """
-    for key in keys:
-        val = metadata.get(key)
-        # GGUF metadata values can be various numeric types
-        if isinstance(val, (int, float)) and val > 0:
-            return int(val)
-    return default
-
-
-# --- Registry System ---
-
-_PROFILE_REGISTRY: List["ArchitectureProfile"] = []
-
-
-def register_profile(cls: Type["ArchitectureProfile"]) -> Type["ArchitectureProfile"]:
-    """
-    Registers a profile class.
-    Profiles are stored and later sorted by specificity (longest name match first).
-    """
-    _PROFILE_REGISTRY.append(cls())
-    return cls
-
-
-# --- Base Class ---
-
-
-class ArchitectureProfile(ABC):
-    """Base class for architecture-specific GGUF metadata interpretation."""
-
-    def __init__(self, names: Tuple[str, ...]):
-        self.names = names
-
-    def matches(self, architecture: str) -> bool:
-        """
-        Checks if the architecture string matches this profile.
-        """
-        arch = architecture.lower()
-        # "llama" should match "llama", "llama-2", etc.
-        return any(arch == n or arch.startswith(n) for n in self.names)
-
-    def compute(
-        self,
-        metadata: Dict[str, Any],
-        base_block_count: int,
-    ) -> LayerConfig:
-        """
-        Public interface that wraps the calculation with standardized logging.
-        """
-        result = self._calculate_layers(metadata, base_block_count)
-
-        logger.debug(
-            "%s: matched. block_count=%s, effective_layer_count=%s (base=%s)",
-            self.__class__.__name__,
-            result.block_count,
-            result.effective_layer_count,
-            base_block_count,
-        )
-        return result
-
-    @abstractmethod
-    def _calculate_layers(
-        self,
-        metadata: Dict[str, Any],
-        base_block_count: int,
-    ) -> LayerConfig:
-        """Implementation specific logic."""
-        raise NotImplementedError
-
-
-# --- Standard Profile (Handles 95% of cases) ---
-
-
-class StandardDecoderProfile(ArchitectureProfile):
-    """
-    Generic profile for standard decoder-only models (Llama, Qwen, DeepSeek, etc.).
-
-    Logic:
-    1. Look for specific keys (names.block_count, names.n_layer).
-    2. Fallback to base_block_count.
-    3. Effective layers = block_count + 1 (for the output head).
-
-    Note on MoEs: Even for MoE models (Qwen2-MoE, DeepSeek-V2), llama.cpp
-    counts the 'offloadable layers' as the number of transformer blocks.
-    Expert offloading is managed internally within those blocks.
-    """
-
-    def _calculate_layers(
-        self, metadata: Dict[str, Any], base_block_count: int
-    ) -> LayerConfig:
-        # Generate candidate keys based on architecture names
-        # e.g., ["llama.block_count", "llama.n_layer", ...]
-        candidate_keys = []
-        for name in self.names:
-            candidate_keys.extend(
-                [
-                    f"{name}.block_count",
-                    f"{name}.n_layer",
-                    f"{name}.n_layers",  # Some older models use plural
-                    f"{name}.num_hidden_layers",  # Some models use num_hidden_layers (e.g., Seed OSS)
-                ]
-            )
-
-        block_count = (
-            _get_first_valid_int(metadata, candidate_keys, default=base_block_count)
-            or 0
-        )
-
-        # Standard decoder: blocks + output head
-        effective = (block_count + 1) if block_count > 0 else 0
-
-        return LayerConfig(block_count=block_count, effective_layer_count=effective)
-
-
-# --- Concrete Profiles ---
-
-
-@register_profile
-class GlmProfile(StandardDecoderProfile):
-    """
-    Profile for GLM family (GLM-4, GLM-4-MoE, etc.).
-    """
-
-    def __init__(self) -> None:
-        super().__init__(names=("glm", "glm4", "glm4moe"))
-
-
-@register_profile
-class DeepseekProfile(StandardDecoderProfile):
-    """
-    DeepSeek decoder LMs and MoE variants.
-    Crucial: Must check 'deepseek2' for V2/V3 models.
-    """
-
-    def __init__(self) -> None:
-        super().__init__(names=("deepseek", "deepseek2"))
-
-
-@register_profile
-class QwenFamilyProfile(StandardDecoderProfile):
-    """Qwen / Qwen2 / Qwen2.5 / Qwen2-MoE."""
-
-    def __init__(self) -> None:
-        super().__init__(names=("qwen", "qwen2", "qwen3", "qwen2moe", "qwen3moe"))
-
-
-@register_profile
-class LlamaLikeProfile(StandardDecoderProfile):
-    """LLaMA, Mistral, Mixtral, Gemma, Phi, etc."""
-
-    def __init__(self) -> None:
-        # "phi" added as it follows the same decoder structure in GGUF
-        super().__init__(
-            names=(
-                "llama",
-                "mistral",
-                "mixtral",
-                "gemma",
-                "phi",
-                "seed",
-                "seed-oss",
-                "seedoss",
-                "seed_oss",
-            )
-        )
-
-
-@register_profile
-class MiniMaxProfile(StandardDecoderProfile):
-    """MiniMax models (MiniMax-M2.1 and variants)."""
-
-    def __init__(self) -> None:
-        super().__init__(names=("minimax", "minimax-m2", "minimax_m2", "m2"))
-
-
-# --- Main Accessor ---
-
-
-def get_sorted_profiles() -> List[ArchitectureProfile]:
-    """
-    Returns profiles sorted by specificity (longest name match first).
-    Example: 'glm4moe' (len 7) is checked before 'glm' (len 3).
-    """
-    return sorted(
-        _PROFILE_REGISTRY, key=lambda p: max(len(n) for n in p.names), reverse=True
-    )
-
-
-def compute_layers_for_architecture(
-    architecture: str,
-    metadata: Dict[str, Any],
-    base_block_count: int,
-) -> Dict[str, int]:
-    """
-    Compute block_count and effective_layer_count.
-    """
-    arch = architecture.lower()
-
-    # Iterate through automatically sorted profiles
-    for profile in get_sorted_profiles():
-        if profile.matches(arch):
-            result = profile.compute(metadata, base_block_count)
-            return {
-                "block_count": result.block_count,
-                "effective_layer_count": result.effective_layer_count,
-            }
-
-    # --- Generic Fallback ---
-    # Even if unknown architecture, if we have a base_block_count,
-    # it's safe to assume it's a decoder stack + 1 output head.
-    block_count = base_block_count or 0
-
-    if block_count > 0:
-        effective_layer_count = block_count + 1
-        logger.info(
-            "Generic profile: architecture=%s, block_count=%s, "
-            "effective_layer_count=%s",
-            arch,
-            block_count,
-            effective_layer_count,
-        )
-        return {
-            "block_count": block_count,
-            "effective_layer_count": effective_layer_count,
-        }
-
-    # Complete fallback
-    logger.warning(
-        "Could not determine block_count for architecture=%s; "
-        "using default effective_layer_count=32",
-        arch,
-    )
-    return {"block_count": 0, "effective_layer_count": 32}
diff --git a/backend/cuda_installer.py b/backend/cuda_installer.py
index e407c6c..12b307b 100644
--- a/backend/cuda_installer.py
+++ b/backend/cuda_installer.py
@@ -1,2432 +1,2432 @@
-"""
-CUDA Toolkit Installer
-
-Handles downloading and installing CUDA Toolkit on Linux systems.
-"""
-
-import asyncio
-import json
-import os
-import platform
-import re
-import shutil
-import subprocess
-import sys
-import tempfile
-import time
-import gzip
-from datetime import datetime, timezone
-from typing import Any, Awaitable, Dict, Optional, Tuple
-import aiohttp
-import aiofiles
-
-from backend.logging_config import get_logger
-from backend.websocket_manager import websocket_manager
-
-logger = get_logger(__name__)
-
-_installer_instance: Optional["CUDAInstaller"] = None
-
-
-def get_cuda_installer() -> "CUDAInstaller":
-    global _installer_instance
-    if _installer_instance is None:
-        _installer_instance = CUDAInstaller()
-    return _installer_instance
-
-
-def _utcnow() -> str:
-    return datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
-
-
-class CUDAInstaller:
-    """Install CUDA Toolkit on Linux systems."""
-
-    # Supported CUDA versions - URLs are fetched dynamically from NVIDIA's archive
-    # Format: version -> platform -> architecture (URLs fetched on demand)
-    SUPPORTED_VERSIONS = [
-        "13.0",
-        "12.9",
-        "12.8",
-        "12.7",
-        "12.6",
-        "12.5",
-        "12.4",
-        "12.3",
-        "12.2",
-        "12.1",
-        "12.0",
-        "11.9",
-        "11.8",
-    ]
-
-    # cuDNN version mappings by CUDA major version
-    CUDNN_VERSIONS = {
-        "13": "9.5.1",  # cuDNN 9.x for CUDA 13.x
-        "12": "9.5.1",  # cuDNN 9.x for CUDA 12.x
-        "11": "8.9.7",  # cuDNN 8.x for CUDA 11.x
-    }
-
-    # TensorRT version mappings by CUDA major version
-    TENSORRT_VERSIONS = {
-        "13": "10.7.0",  # TensorRT 10.x for CUDA 13.x
-        "12": "10.7.0",  # TensorRT 10.x for CUDA 12.x
-        "11": "8.6.1",   # TensorRT 8.x for CUDA 11.x
-    }
-
-    def __init__(
-        self,
-        *,
-        log_path: Optional[str] = None,
-        state_path: Optional[str] = None,
-        download_dir: Optional[str] = None,
-    ) -> None:
-        self._lock = asyncio.Lock()
-        self._operation: Optional[str] = None
-        self._operation_started_at: Optional[str] = None
-        self._current_task: Optional[asyncio.Task] = None
-        self._last_error: Optional[str] = None
-        self._download_progress: Dict[str, Any] = {}
-        self._last_logged_percentage: int = -1
-        self._last_progress_broadcast_time: float = 0.0
-        self._pending_progress: Optional[Dict[str, Any]] = None
-        self._progress_broadcast_count: int = 0
-
-        # Determine data root - check Docker path first, then fallback to local
-        if os.path.exists("/app/data"):
-            data_root = "/app/data"
-        else:
-            data_root = os.path.abspath("data")
-
-        log_path = log_path or os.path.join(data_root, "logs", "cuda_install.log")
-        state_path = state_path or os.path.join(
-            data_root, "configs", "cuda_installer.json"
-        )
-        download_dir = download_dir or os.path.join(
-            data_root, "temp", "cuda_installers"
-        )
-        self._cuda_install_dir = os.path.join(data_root, "cuda")
-
-        self._log_path = os.path.abspath(log_path)
-        self._state_path = os.path.abspath(state_path)
-        self._download_dir = os.path.abspath(download_dir)
-        self._url_cache: Dict[str, str] = {}  # Cache for dynamically fetched URLs
-        self._repo_cache: Dict[str, list] = {}  # Cache for NVIDIA repo packages
-        self._ensure_directories()
-
-    def _ensure_directories(self) -> None:
-        os.makedirs(self._download_dir, exist_ok=True)
-        os.makedirs(os.path.dirname(self._log_path), exist_ok=True)
-        os.makedirs(os.path.dirname(self._state_path), exist_ok=True)
-        os.makedirs(self._cuda_install_dir, exist_ok=True)
-
-    def _update_current_symlink(self, install_path: str) -> None:
-        """Create or update the /app/data/cuda/current symlink to point to the active CUDA installation."""
-        current_symlink = os.path.join(self._cuda_install_dir, "current")
-        try:
-            # Remove existing symlink if it exists
-            if os.path.islink(current_symlink):
-                os.remove(current_symlink)
-            elif os.path.exists(current_symlink):
-                # If it's not a symlink but exists, remove it (shouldn't happen, but be safe)
-                os.remove(current_symlink)
-            
-            # Create new symlink pointing to the installation
-            os.symlink(install_path, current_symlink)
-            logger.info(f"Updated CUDA current symlink: {current_symlink} -> {install_path}")
-        except OSError as e:
-            logger.warning(f"Failed to update CUDA current symlink: {e}")
-
-    def _remove_current_symlink(self) -> None:
-        """Remove the current symlink and optionally re-point it to another installed version."""
-        current_symlink = os.path.join(self._cuda_install_dir, "current")
-        try:
-            if os.path.islink(current_symlink) or os.path.exists(current_symlink):
-                os.remove(current_symlink)
-            
-            # Try to find another installed version to point to
-            state = self._load_state()
-            installations = state.get("installations", {})
-            
-            # Find the most recently installed version that still exists
-            latest_version = None
-            latest_time = None
-            for v, info in installations.items():
-                install_path = info.get("path")
-                if install_path and os.path.exists(install_path):
-                    installed_at = info.get("installed_at", "")
-                    if not latest_time or installed_at > latest_time:
-                        latest_time = installed_at
-                        latest_version = v
-            
-            # Re-point to the latest remaining installation
-            if latest_version:
-                install_path = installations[latest_version].get("path")
-                if install_path and os.path.exists(install_path):
-                    os.symlink(install_path, current_symlink)
-                    logger.info(f"Re-pointed CUDA current symlink to: {install_path}")
-        except OSError as e:
-            logger.warning(f"Failed to update CUDA current symlink: {e}")
-
-    def _get_platform(self) -> Tuple[str, str]:
-        """Get platform (os, arch) tuple."""
-        system = platform.system().lower()
-        machine = platform.machine().lower()
-
-        if machine in ("x86_64", "amd64"):
-            arch = "x86_64"
-        else:
-            arch = machine
-
-        return system, arch
-
-    def _get_ubuntu_version(self) -> str:
-        """Get Ubuntu version for NVIDIA repository URLs."""
-        # Try to detect Ubuntu version from /etc/os-release
-        try:
-            if os.path.exists("/etc/os-release"):
-                with open("/etc/os-release", "r") as f:
-                    for line in f:
-                        if line.startswith("VERSION_ID="):
-                            version = line.split("=")[1].strip().strip('"')
-                            # Extract major.minor (e.g., "24.04" from "24.04.1")
-                            parts = version.split(".")
-                            if len(parts) >= 2:
-                                major_minor = f"{parts[0]}{parts[1]}"
-                                # Check if it's 24.04 or newer
-                                if major_minor >= "2404":
-                                    return "ubuntu2404"
-                                else:
-                                    return "ubuntu2204"
-        except Exception:
-            pass
-        
-        # Default to ubuntu2404 for Ubuntu 24.04 base image
-        return "ubuntu2404"
-
-    def _get_archive_target_version(self) -> str:
-        """Get archive target version for CUDA runfile lookups."""
-        ubuntu_version = self._get_ubuntu_version()
-        if ubuntu_version == "ubuntu2404":
-            return "24.04"
-        return "22.04"
-
-    async def _get_repo_packages(self, ubuntu_version: str) -> list:
-        """Fetch and cache NVIDIA CUDA repo package metadata."""
-        if ubuntu_version in self._repo_cache:
-            return self._repo_cache[ubuntu_version]
-
-        base_url = (
-            f"https://developer.download.nvidia.com/compute/cuda/repos/{ubuntu_version}/x86_64"
-        )
-        packages_url = f"{base_url}/Packages.gz"
-        packages_plain_url = f"{base_url}/Packages"
-        packages: list = []
-
-        async with aiohttp.ClientSession() as session:
-            data = None
-            try:
-                async with session.get(packages_url) as response:
-                    if response.status == 200:
-                        compressed = await response.read()
-                        data = gzip.decompress(compressed)
-            except Exception:
-                data = None
-
-            if data is None:
-                try:
-                    async with session.get(packages_plain_url) as response:
-                        if response.status == 200:
-                            data = await response.read()
-                except Exception:
-                    data = None
-
-        if not data:
-            self._repo_cache[ubuntu_version] = []
-            return []
-
-        text = data.decode("utf-8", errors="replace")
-        current = {}
-        for line in text.splitlines():
-            if not line.strip():
-                if current:
-                    packages.append(current)
-                    current = {}
-                continue
-            if line.startswith("Package:"):
-                current["Package"] = line.split(":", 1)[1].strip()
-            elif line.startswith("Version:"):
-                current["Version"] = line.split(":", 1)[1].strip()
-            elif line.startswith("Filename:"):
-                current["Filename"] = line.split(":", 1)[1].strip()
-
-        if current:
-            packages.append(current)
-
-        self._repo_cache[ubuntu_version] = packages
-        return packages
-
-    def _version_key(self, version: str) -> tuple:
-        """Create a sortable key for package version strings."""
-        tokens = re.split(r"[^\w]+", version)
-        key = []
-        for token in tokens:
-            if token.isdigit():
-                key.append(int(token))
-            elif token:
-                key.append(token)
-        return tuple(key)
-
-    def _select_repo_package(
-        self,
-        packages: list,
-        package_name: str,
-        version_prefix: Optional[str] = None,
-        version_contains: Optional[str] = None,
-    ) -> Optional[Dict[str, str]]:
-        """Select the best matching package from repo metadata."""
-        candidates = [
-            pkg for pkg in packages if pkg.get("Package") == package_name
-        ]
-        if version_prefix:
-            candidates = [
-                pkg
-                for pkg in candidates
-                if pkg.get("Version", "").startswith(version_prefix)
-            ]
-        if version_contains:
-            candidates = [
-                pkg
-                for pkg in candidates
-                if version_contains in pkg.get("Version", "")
-            ]
-        if not candidates:
-            return None
-        return max(candidates, key=lambda pkg: self._version_key(pkg.get("Version", "")))
-
-    def _load_state(self) -> Dict[str, Any]:
-        if not os.path.exists(self._state_path):
-            return {}
-        try:
-            with open(self._state_path, "r", encoding="utf-8") as f:
-                data = json.load(f)
-                return data if isinstance(data, dict) else {}
-        except Exception as exc:
-            logger.warning(f"Failed to load CUDA installer state: {exc}")
-            return {}
-
-    def _save_state(self, state: Dict[str, Any]) -> None:
-        tmp_path = f"{self._state_path}.tmp"
-        with open(tmp_path, "w", encoding="utf-8") as f:
-            json.dump(state, f, indent=2)
-        os.replace(tmp_path, self._state_path)
-
-    def _detect_installed_version(self) -> Optional[str]:
-        """Detect installed CUDA version by checking nvcc or state."""
-        # First check state for installed versions
-        state = self._load_state()
-        installations = state.get("installations", {})
-        if installations:
-            # Return the most recently installed version
-            latest_version = None
-            latest_time = None
-            for v, info in installations.items():
-                installed_at = info.get("installed_at", "")
-                if not latest_time or installed_at > latest_time:
-                    latest_time = installed_at
-                    latest_version = v
-            if latest_version:
-                install_path = installations[latest_version].get("path")
-                if install_path and os.path.exists(install_path):
-                    return latest_version
-
-        # Fallback: try to detect via nvcc command
-        try:
-            # Get CUDA environment to find nvcc
-            cuda_env = self.get_cuda_env()
-            env = os.environ.copy()
-            env.update(cuda_env)
-
-            nvcc_path = shutil.which("nvcc", path=env.get("PATH", ""))
-            if not nvcc_path:
-                return None
-
-            result = subprocess.run(
-                [nvcc_path, "--version"],
-                capture_output=True,
-                text=True,
-                timeout=5,
-                env=env,
-            )
-            if result.returncode == 0:
-                # Parse version from output
-                for line in result.stdout.split("\n"):
-                    if "release" in line.lower():
-                        parts = line.split()
-                        for i, part in enumerate(parts):
-                            if "release" in part.lower() and i + 1 < len(parts):
-                                version_str = parts[i + 1].rstrip(",")
-                                # Extract major.minor
-                                version_parts = version_str.split(".")
-                                if len(version_parts) >= 2:
-                                    return f"{version_parts[0]}.{version_parts[1]}"
-        except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
-            pass
-        return None
-
-    def _get_cuda_path(self, version: Optional[str] = None) -> Optional[str]:
-        """Get CUDA installation path."""
-        # First, check the current symlink (most reliable for active installation)
-        current_symlink = os.path.join(self._cuda_install_dir, "current")
-        if os.path.islink(current_symlink) or os.path.exists(current_symlink):
-            try:
-                resolved_path = os.path.realpath(current_symlink)
-                if os.path.exists(resolved_path):
-                    nvcc_path = os.path.join(resolved_path, "bin", "nvcc")
-                    if os.path.exists(nvcc_path):
-                        return resolved_path
-            except (OSError, ValueError):
-                pass
-
-        # Check state for installed versions
-        state = self._load_state()
-        installations = state.get("installations", {})
-
-        # If version specified, return that installation path
-        if version and version in installations:
-            install_path = installations[version].get("path")
-            if install_path and os.path.exists(install_path):
-                return install_path
-
-        # Check for latest installed version in state
-        if installations:
-            # Get the most recently installed version
-            latest_version = None
-            latest_time = None
-            for v, info in installations.items():
-                installed_at = info.get("installed_at", "")
-                if not latest_time or installed_at > latest_time:
-                    latest_time = installed_at
-                    latest_version = v
-
-            if latest_version:
-                install_path = installations[latest_version].get("path")
-                if install_path and os.path.exists(install_path):
-                    return install_path
-
-        # Check environment variables (only accept paths under data directory)
-        env_path = os.environ.get("CUDA_PATH") or os.environ.get("CUDA_HOME")
-        if (
-            env_path
-            and os.path.exists(env_path)
-            and os.path.abspath(env_path).startswith(self._cuda_install_dir)
-        ):
-            return env_path
-
-        # Scan the data directory for CUDA installs as fallback
-        try:
-            if os.path.exists(self._cuda_install_dir):
-                for item in sorted(os.listdir(self._cuda_install_dir), reverse=True):
-                    # Skip the current symlink
-                    if item == "current":
-                        continue
-                    full_path = os.path.join(self._cuda_install_dir, item)
-                    if os.path.isdir(full_path):
-                        nvcc_path = os.path.join(full_path, "bin", "nvcc")
-                        if os.path.exists(nvcc_path):
-                            return full_path
-        except OSError:
-            pass
-
-        return None
-
-    def get_cuda_env(self, version: Optional[str] = None) -> Dict[str, str]:
-        """Get environment variables for CUDA installation."""
-        cuda_path = self._get_cuda_path(version)
-        if not cuda_path:
-            return {}
-
-        cuda_bin = os.path.join(cuda_path, "bin")
-        cuda_lib = os.path.join(cuda_path, "lib64")
-
-        env = {
-            "CUDA_HOME": cuda_path,
-            "CUDA_PATH": cuda_path,
-        }
-
-        # Add to PATH if bin directory exists
-        if os.path.exists(cuda_bin):
-            current_path = os.environ.get("PATH", "")
-            if cuda_bin not in current_path:
-                env["PATH"] = f"{cuda_bin}:{current_path}" if current_path else cuda_bin
-
-        # Add to LD_LIBRARY_PATH if lib64 directory exists
-        if os.path.exists(cuda_lib):
-            current_ld_path = os.environ.get("LD_LIBRARY_PATH", "")
-            if cuda_lib not in current_ld_path:
-                env["LD_LIBRARY_PATH"] = (
-                    f"{cuda_lib}:{current_ld_path}" if current_ld_path else cuda_lib
-                )
-
-        # Add TensorRT path if TensorRT is installed
-        tensorrt_version = self._detect_tensorrt_version(cuda_path)
-        if tensorrt_version:
-            env["TENSORRT_PATH"] = cuda_path
-            env["TENSORRT_ROOT"] = cuda_path
-
-        return env
-
-    def _get_archive_url(self, version: str) -> str:
-        """Get NVIDIA download archive URL for a CUDA version."""
-        # Convert version like "12.8" to "12-8-0" for URL
-        version_parts = version.split(".")
-        major = version_parts[0]
-        minor = version_parts[1] if len(version_parts) > 1 else "0"
-        patch = version_parts[2] if len(version_parts) > 2 else "0"
-        version_slug = f"{major}-{minor}-{patch}"
-        target_version = self._get_archive_target_version()
-
-        return (
-            f"https://developer.nvidia.com/cuda-{version_slug}-download-archive"
-            f"?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version={target_version}&target_type=runfile_local"
-        )
-
-    async def _fetch_download_url(self, version: str) -> str:
-        """Fetch the actual download URL from NVIDIA's archive page."""
-        # Check cache first
-        cache_key = f"{version}_linux_x86_64"
-        if cache_key in self._url_cache:
-            return self._url_cache[cache_key]
-
-        archive_url = self._get_archive_url(version)
-        logger.info(f"Fetching CUDA {version} download URL from {archive_url}")
-
-        async with aiohttp.ClientSession() as session:
-            try:
-                async with session.get(
-                    archive_url, timeout=aiohttp.ClientTimeout(total=30)
-                ) as response:
-                    if response.status != 200:
-                        raise RuntimeError(
-                            f"Failed to fetch archive page: HTTP {response.status}"
-                        )
-
-                    html = await response.text()
-
-                    # The page contains JSON data with download URLs
-                    # The JSON structure has keys like "Linux/x86_64/Ubuntu/24.04/runfile_local"
-                    # The URL is in the "details" field which contains HTML with href attributes
-                    target_version = self._get_archive_target_version()
-                    json_key = f"Linux/x86_64/Ubuntu/{target_version}/runfile_local"
-
-                    # Pattern 1: Look for href in the details field (HTML may be escaped)
-                    # Match: "Linux/x86_64/Ubuntu/<version>/runfile_local":{..."details":"...href=\"URL\"..."}
-                    pattern1 = rf'"{re.escape(json_key)}"[^}}]*"details"[^"]*href[=:][\\"]*([^"\\s<>]+cuda_\d+\.\d+\.\d+_[^"\\s<>]+_linux\.run)'
-                    matches = re.findall(pattern1, html, re.IGNORECASE | re.DOTALL)
-
-                    if not matches:
-                        # Pattern 2: Look for href with escaped quotes (\u0022 or \")
-                        pattern2 = rf'"{re.escape(json_key)}"[^}}]*href[\\u0022=:]*([^"\\s<>]+cuda_\d+\.\d+\.\d+_[^"\\s<>]+_linux\.run)'
-                        matches = re.findall(pattern2, html, re.IGNORECASE | re.DOTALL)
-
-                    if not matches:
-                        # Pattern 3: Look for the filename field and construct URL
-                        pattern3 = rf'"{re.escape(json_key)}"[^}}]*"filename"[^"]*"([^"]+_linux\.run)"'
-                        filename_matches = re.findall(pattern3, html, re.IGNORECASE)
-                        if filename_matches:
-                            filename = filename_matches[0]
-                            version_full = f"{version}.0"
-                            url = f"https://developer.download.nvidia.com/compute/cuda/{version_full}/local_installers/{filename}"
-                            matches = [url]
-
-                    if not matches:
-                        # Pattern 4: Fallback - look for any URL matching the pattern
-                        version_escaped = version.replace(".", r"\.")
-                        pattern4 = rf'https://developer\.download\.nvidia\.com/compute/cuda/{version_escaped}\.0/local_installers/cuda_{version_escaped}\.0_[^"\'\s<>]+_linux\.run'
-                        matches = re.findall(pattern4, html, re.IGNORECASE)
-
-                    if matches:
-                        url = matches[0]
-                        # Cache it
-                        self._url_cache[cache_key] = url
-                        logger.info(f"Found CUDA {version} download URL: {url}")
-                        return url
-                    else:
-                        raise RuntimeError(
-                            f"Could not find download URL for CUDA {version} on archive page"
-                        )
-
-            except aiohttp.ClientError as e:
-                raise RuntimeError(f"Failed to fetch archive page: {e}")
-
-    async def _broadcast_log_line(self, line: str) -> None:
-        try:
-            await websocket_manager.broadcast(
-                {
-                    "type": "cuda_install_log",
-                    "line": line,
-                    "timestamp": _utcnow(),
-                }
-            )
-        except Exception as exc:
-            logger.debug(f"Failed to broadcast CUDA log line: {exc}")
-
-    async def _broadcast_progress(self, progress: Dict[str, Any]) -> None:
-        """Broadcast progress updates, throttled to 1 second intervals."""
-        try:
-            current_time = time.time()
-            progress_value = progress.get("progress", 0)
-            is_complete = progress_value >= 100
-            
-            # Always send completion updates immediately
-            if is_complete:
-                await websocket_manager.broadcast(
-                    {
-                        "type": "cuda_install_progress",
-                        **progress,
-                        "timestamp": _utcnow(),
-                    }
-                )
-                self._last_progress_broadcast_time = current_time
-                self._pending_progress = None
-                return
-            
-            # Always send the first few updates immediately (first 3 updates)
-            # then throttle to 1 second intervals
-            is_first_update = self._last_progress_broadcast_time == 0.0
-            time_since_last_broadcast = current_time - self._last_progress_broadcast_time
-            is_early_update = self._progress_broadcast_count < 3
-            should_send = is_first_update or is_early_update or time_since_last_broadcast >= 1.0
-            
-            if should_send:
-                await websocket_manager.broadcast(
-                    {
-                        "type": "cuda_install_progress",
-                        **progress,
-                        "timestamp": _utcnow(),
-                    }
-                )
-                self._last_progress_broadcast_time = current_time
-                self._pending_progress = None
-                self._progress_broadcast_count += 1
-            else:
-                # Store the latest progress data for next send
-                self._pending_progress = progress
-        except Exception as exc:
-            logger.exception(f"Failed to broadcast CUDA progress: {exc}")
-
-    async def _set_operation(self, operation: str) -> None:
-        self._operation = operation
-        self._operation_started_at = _utcnow()
-        self._last_error = None
-        await websocket_manager.broadcast(
-            {
-                "type": "cuda_install_status",
-                "status": operation,
-                "started_at": self._operation_started_at,
-            }
-        )
-
-    async def _finish_operation(self, success: bool, message: str = "") -> None:
-        payload = {
-            "type": "cuda_install_status",
-            "status": "completed" if success else "failed",
-            "operation": self._operation,
-            "message": message,
-            "ended_at": _utcnow(),
-        }
-        await websocket_manager.broadcast(payload)
-        self._operation = None
-        self._operation_started_at = None
-
-    def _create_task(self, coro: Awaitable[Any]) -> None:
-        loop = asyncio.get_running_loop()
-        task = loop.create_task(coro)
-        self._current_task = task
-
-        def _cleanup(fut: asyncio.Future) -> None:
-            try:
-                fut.result()
-            except Exception as exc:
-                logger.exception("CUDA installer task error")
-            finally:
-                self._current_task = None
-
-        task.add_done_callback(_cleanup)
-
-    async def _download_installer(
-        self, version: str, url: str, installer_path: str
-    ) -> None:
-        """Download CUDA installer with progress tracking."""
-        # Check if installer already exists
-        if os.path.exists(installer_path):
-            file_size = os.path.getsize(installer_path)
-            file_size_mb = file_size / (1024 * 1024)
-            
-            # Verify existing file is not corrupted (should be at least 100MB for CUDA installers)
-            if file_size < 100 * 1024 * 1024:
-                await self._broadcast_log_line(
-                    f"Existing installer file appears corrupted (too small: {file_size_mb:.1f} MB), re-downloading..."
-                )
-                try:
-                    os.remove(installer_path)
-                except OSError:
-                    pass
-            else:
-                # Verify the file is actually valid and matches expected size from server
-                try:
-                    # First, check if it's a valid shell script
-                    with open(installer_path, "rb") as f:
-                        header = f.read(100)
-                        if not header.startswith(b"#!/"):
-                            await self._broadcast_log_line(
-                                f"Existing installer file is not a valid shell script, re-downloading..."
-                            )
-                            try:
-                                os.remove(installer_path)
-                            except OSError:
-                                pass
-                        else:
-                            # File appears valid, now verify size matches server expectation
-                            # Fetch the expected file size from the server
-                            try:
-                                async with aiohttp.ClientSession() as session:
-                                    async with session.head(url, allow_redirects=True) as head_response:
-                                        expected_size = int(head_response.headers.get("Content-Length", 0))
-                                        
-                                        if expected_size > 0:
-                                            # Verify file size matches (with small tolerance)
-                                            size_diff = abs(file_size - expected_size)
-                                            if size_diff > 1024:  # Allow 1KB tolerance
-                                                await self._broadcast_log_line(
-                                                    f"Existing installer file size mismatch: expected {expected_size / (1024*1024):.1f} MB, "
-                                                    f"got {file_size_mb:.1f} MB (difference: {size_diff} bytes). Re-downloading..."
-                                                )
-                                                try:
-                                                    os.remove(installer_path)
-                                                except OSError:
-                                                    pass
-                                            else:
-                                                # File size matches, verify it's stable (not currently being written)
-                                                await asyncio.sleep(0.2)  # Brief pause to ensure file is fully written if being written
-                                                new_size = os.path.getsize(installer_path)
-                                                if new_size != file_size:
-                                                    await self._broadcast_log_line(
-                                                        f"File size changed during verification (was {file_size_mb:.1f} MB, now {new_size / (1024*1024):.1f} MB), "
-                                                        f"file may still be downloading. Re-downloading..."
-                                                    )
-                                                    try:
-                                                        os.remove(installer_path)
-                                                    except OSError:
-                                                        pass
-                                                else:
-                                                    await self._broadcast_log_line(
-                                                        f"Installer file already exists and verified: {installer_path} ({file_size_mb:.1f} MB)"
-                                                    )
-                                                    await self._broadcast_progress(
-                                                        {
-                                                            "stage": "download",
-                                                            "progress": 100,
-                                                            "message": f"Using existing installer file ({file_size_mb:.1f} MB)",
-                                                        }
-                                                    )
-                                                    return
-                                        else:
-                                            # Couldn't get expected size, but file looks valid - use it
-                                            await self._broadcast_log_line(
-                                                f"Installer file already exists: {installer_path} ({file_size_mb:.1f} MB). "
-                                                f"Could not verify size from server, but file appears valid."
-                                            )
-                                            await self._broadcast_progress(
-                                                {
-                                                    "stage": "download",
-                                                    "progress": 100,
-                                                    "message": f"Using existing installer file ({file_size_mb:.1f} MB)",
-                                                }
-                                            )
-                                            return
-                            except Exception as size_check_error:
-                                # If we can't verify size from server, but file looks valid, use it
-                                await self._broadcast_log_line(
-                                    f"Could not verify file size from server: {size_check_error}. "
-                                    f"File appears valid, using existing file: {installer_path} ({file_size_mb:.1f} MB)"
-                                )
-                                await self._broadcast_progress(
-                                    {
-                                        "stage": "download",
-                                        "progress": 100,
-                                        "message": f"Using existing installer file ({file_size_mb:.1f} MB)",
-                                    }
-                                )
-                                return
-                except (OSError, IOError) as e:
-                    await self._broadcast_log_line(
-                        f"Failed to verify existing installer file: {e}, re-downloading..."
-                    )
-                    try:
-                        os.remove(installer_path)
-                    except OSError:
-                        pass
-
-        # Reset logging state for new download
-        self._last_logged_percentage = -1
-        self._last_progress_broadcast_time = 0.0
-        self._pending_progress = None
-        self._progress_broadcast_count = 0
-
-        log_header = f"[{_utcnow()}] Downloading CUDA {version} installer from {url}\n"
-        with open(self._log_path, "w", encoding="utf-8") as log_file:
-            log_file.write(log_header)
-
-        await self._broadcast_log_line(
-            f"Starting download of CUDA {version} installer..."
-        )
-        await self._broadcast_progress(
-            {
-                "stage": "download",
-                "progress": 0,
-                "message": f"Downloading CUDA {version} installer...",
-            }
-        )
-
-        # Configure timeout for large file downloads:
-        # - total: 1 hour (3600s) for very large files and slow connections
-        # - connect: 30s to establish connection
-        # - sock_read: 5 minutes (300s) to allow for slow network during chunk reads
-        timeout = aiohttp.ClientTimeout(
-            total=3600,  # 1 hour total timeout
-            connect=30,  # 30 seconds to connect
-            sock_read=300,  # 5 minutes per read operation
-        )
-
-        downloaded = 0
-        total_size = 0
-        try:
-            async with aiohttp.ClientSession(timeout=timeout) as session:
-                async with session.get(url) as response:
-                    response.raise_for_status()
-                    total_size = int(response.headers.get("Content-Length", 0))
-
-                    async with aiofiles.open(installer_path, "wb") as f:
-                        async for chunk in response.content.iter_chunked(8192):
-                            await f.write(chunk)
-                            downloaded += len(chunk)
-
-                        if total_size > 0:
-                            progress = int((downloaded / total_size) * 100)
-                            # Format sizes in MB
-                            downloaded_mb = downloaded / (1024 * 1024)
-                            total_mb = total_size / (1024 * 1024)
-                            await self._broadcast_progress(
-                                {
-                                    "stage": "download",
-                                    "progress": progress,
-                                    "message": f"Downloading CUDA {version} installer... ({downloaded_mb:.1f}/{total_mb:.1f} MB)",
-                                    "bytes_downloaded": downloaded,
-                                    "total_bytes": total_size,
-                                }
-                            )
-
-                        # Log progress only at key percentage milestones (10%, 25%, 50%, 75%, 90%, 100%)
-                        # Only log when we cross a milestone, not when we're within it
-                        should_log = False
-
-                        # Check if we've crossed a key percentage milestone
-                        if total_size > 0:
-                            progress = int((downloaded / total_size) * 100)
-                            if progress != self._last_logged_percentage and progress in [
-                                10,
-                                25,
-                                50,
-                                75,
-                                90,
-                                100,
-                            ]:
-                                should_log = True
-                                self._last_logged_percentage = progress
-
-                        if should_log:
-                            downloaded_mb = downloaded / (1024 * 1024)
-                            total_mb = total_size / (1024 * 1024)
-                            log_line = f"Downloaded {downloaded_mb:.1f}/{total_mb:.1f} MB ({progress}%)\n"
-                            with open(
-                                self._log_path, "a", encoding="utf-8"
-                            ) as log_file:
-                                log_file.write(log_line)
-                            await self._broadcast_log_line(
-                                f"Downloaded {downloaded_mb:.1f} MB / {total_mb:.1f} MB ({progress}%)"
-                            )
-                
-                # File is automatically flushed when the context manager exits
-        except asyncio.TimeoutError as e:
-            # Clean up partial download on timeout
-            if os.path.exists(installer_path):
-                try:
-                    os.remove(installer_path)
-                except OSError:
-                    pass
-            downloaded_mb = downloaded / (1024 * 1024) if downloaded > 0 else 0
-            total_mb = total_size / (1024 * 1024) if total_size > 0 else 0
-            error_msg = (
-                f"Download timeout: Failed to download CUDA {version} installer. "
-                f"Downloaded {downloaded_mb:.1f} MB of {total_mb:.1f} MB. "
-                f"This may be due to a slow network connection. Please try again."
-            )
-            await self._broadcast_log_line(error_msg)
-            raise RuntimeError(error_msg) from e
-        except aiohttp.ClientError as e:
-            # Clean up partial download on client error
-            if os.path.exists(installer_path):
-                try:
-                    os.remove(installer_path)
-                except OSError:
-                    pass
-            error_msg = (
-                f"Network error while downloading CUDA {version} installer: {e}. "
-                f"Please check your network connection and try again."
-            )
-            await self._broadcast_log_line(error_msg)
-            raise RuntimeError(error_msg) from e
-
-        # Wait a brief moment to ensure file system has fully written the file
-        # This helps ensure the file is completely written to disk before verification
-        await asyncio.sleep(0.5)
-        
-        # Verify downloaded file exists and is complete
-        if not os.path.exists(installer_path):
-            raise RuntimeError(f"Downloaded file not found: {installer_path}")
-        
-        # Verify file size matches expected size (with a small tolerance for filesystem differences)
-        actual_size = os.path.getsize(installer_path)
-        if total_size > 0:
-            size_diff = abs(actual_size - total_size)
-            if size_diff > 1024:  # Allow 1KB tolerance for filesystem differences
-                raise RuntimeError(
-                    f"Downloaded file size mismatch: expected {total_size} bytes, "
-                    f"got {actual_size} bytes (difference: {size_diff} bytes). File may be corrupted or incomplete."
-                )
-        
-        if actual_size < 100 * 1024 * 1024:  # Less than 100MB is suspicious
-            raise RuntimeError(
-                f"Downloaded file appears to be corrupted or incomplete: "
-                f"{installer_path} (size: {actual_size} bytes)"
-            )
-        
-        # Verify the file is a valid shell script (CUDA .run files are self-extracting)
-        try:
-            with open(installer_path, "rb") as verify_file:
-                header = verify_file.read(100)
-                if not header.startswith(b"#!/"):
-                    raise RuntimeError(
-                        f"Downloaded file does not appear to be a valid shell script: {installer_path}"
-                    )
-        except Exception as e:
-            raise RuntimeError(
-                f"Failed to verify downloaded file integrity: {installer_path}, error: {e}"
-            )
-        
-        await self._broadcast_log_line(
-            f"Download completed and verified: {installer_path} ({actual_size / (1024*1024):.1f} MB)"
-        )
-        await self._broadcast_progress(
-            {
-                "stage": "download",
-                "progress": 100,
-                "message": "Download completed and verified",
-            }
-        )
-
-    def _is_docker_container(self) -> bool:
-        """Check if running inside a Docker container."""
-        # Check for Docker-specific files
-        docker_indicators = [
-            "/.dockerenv",
-            "/proc/self/cgroup",
-        ]
-
-        # Check /.dockerenv
-        if os.path.exists("/.dockerenv"):
-            return True
-
-        # Check /proc/self/cgroup for Docker
-        try:
-            if os.path.exists("/proc/self/cgroup"):
-                with open("/proc/self/cgroup", "r") as f:
-                    content = f.read()
-                    if "docker" in content or "containerd" in content:
-                        return True
-        except (OSError, IOError):
-            pass
-
-        return False
-
-    async def _install_linux(
-        self,
-        installer_path: str,
-        version: str,
-        install_cudnn: bool = False,
-        install_tensorrt: bool = False,
-    ) -> str:
-        """
-        Install CUDA on Linux using runfile installer.
-        
-        Uses optimized installer options for custom location installation:
-        - Silent installation with EULA acceptance
-        - Toolkit-only installation (no driver)
-        - Override installation checks for custom paths
-        - Skip OpenGL libraries (not needed in Docker/headless environments)
-        - Skip man pages to reduce installation size
-        
-        Args:
-            installer_path: Path to the CUDA installer runfile
-            version: CUDA version being installed
-            install_cudnn: Whether to install cuDNN
-            install_tensorrt: Whether to install TensorRT
-        """
-        await self._broadcast_log_line("Starting CUDA installation on Linux...")
-        await self._broadcast_progress(
-            {
-                "stage": "install",
-                "progress": 0,
-                "message": "Installing CUDA Toolkit...",
-            }
-        )
-
-        # Verify installer file exists and is not corrupted
-        if not os.path.exists(installer_path):
-            raise RuntimeError(f"Installer file not found: {installer_path}")
-        
-        file_size = os.path.getsize(installer_path)
-        if file_size < 100 * 1024 * 1024:  # Less than 100MB is suspicious for CUDA installers
-            raise RuntimeError(
-                f"Installer file appears to be corrupted or incomplete: {installer_path} "
-                f"(size: {file_size / (1024*1024):.1f} MB, expected > 100 MB)"
-            )
-        
-        # Verify the file starts with a shell script header (CUDA .run files are self-extracting)
-        try:
-            with open(installer_path, "rb") as f:
-                header = f.read(100)
-                if not header.startswith(b"#!/"):
-                    raise RuntimeError(
-                        f"Installer file does not appear to be a valid shell script: {installer_path}"
-                    )
-        except Exception as e:
-            raise RuntimeError(
-                f"Failed to verify installer file: {installer_path}, error: {e}"
-            )
-        
-        await self._broadcast_log_line(
-            f"Verifying installer file: {installer_path} ({file_size / (1024*1024):.1f} MB)"
-        )
-
-        # Make installer executable
-        os.chmod(installer_path, 0o755)
-
-        # Always install to the data directory for persistence
-        install_path = os.path.join(self._cuda_install_dir, f"cuda-{version}")
-        await self._broadcast_log_line(f"Installing to data directory: {install_path}")
-        os.makedirs(install_path, exist_ok=True)
-
-        # Build installer arguments with optimized options for custom location installation
-        # 
-        # Selected options based on NVIDIA CUDA installer documentation:
-        # - --silent: Required for silent installation, implies EULA acceptance
-        # - --toolkit: Install toolkit only (not driver) - required for non-root installations
-        # - --override: Override compiler, third-party library, and toolkit detection checks
-        #   (essential for custom installation paths)
-        # - --toolkitpath: Install to custom data directory path
-        # - --no-opengl-libs: Skip OpenGL libraries (not needed in Docker/headless environments)
-        # - --no-man-page: Skip man pages to reduce installation size
-        #
-        install_args = [
-            "bash",
-            installer_path,
-            "--silent",                    # Silent installation with EULA acceptance
-            "--toolkit",                   # Install toolkit only (not driver)
-            "--override",                  # Override installation checks for custom paths
-            f"--toolkitpath={install_path}", # Install to custom data directory
-            "--no-opengl-libs",            # Skip OpenGL libraries (not needed in Docker)
-            "--no-man-page",               # Skip man pages to reduce size
-        ]
-        
-        await self._broadcast_log_line(f"Installer arguments: {' '.join(install_args[2:])}")  # Skip 'bash' and installer_path
-
-        # Set up environment to prevent /dev/tty access issues in Docker
-        env = os.environ.copy()
-        env["DEBIAN_FRONTEND"] = "noninteractive"
-        # Disable interactive prompts
-        env["PERL_BADLANG"] = "0"
-        # Ensure we're in a non-interactive environment
-        env["TERM"] = "dumb"
-        # Prevent installer from trying to access /dev/tty
-        env["CI"] = "true"  # Indicate we're in a CI/non-interactive environment
-
-        process = await asyncio.create_subprocess_exec(
-            *install_args,
-            stdout=asyncio.subprocess.PIPE,
-            stderr=asyncio.subprocess.STDOUT,
-            stdin=asyncio.subprocess.DEVNULL,  # Redirect stdin to prevent /dev/tty access
-            env=env,
-        )
-
-        # Collect output for error analysis
-        output_lines = []
-        
-        async def _stream_output():
-            if process.stdout is None:
-                return
-            with open(self._log_path, "a", encoding="utf-8", buffering=1) as log_file:
-                while True:
-                    chunk = await process.stdout.readline()
-                    if not chunk:
-                        break
-                    text = chunk.decode("utf-8", errors="replace")
-                    output_lines.append(text)
-                    log_file.write(text)
-                    await self._broadcast_log_line(text.rstrip("\n"))
-
-        await asyncio.gather(process.wait(), _stream_output())
-
-        if process.returncode != 0:
-            # Check for specific error patterns
-            output_text = "".join(output_lines)
-            
-            # Check for /dev/tty errors
-            if "/dev/tty" in output_text.lower() or "cannot create /dev/tty" in output_text.lower():
-                error_msg = (
-                    f"CUDA installer failed due to /dev/tty access issue (common in Docker). "
-                    f"This may indicate the installer file is corrupted or the environment is not properly configured. "
-                    f"Exit code: {process.returncode}. "
-                    f"Please check the installation logs for details. "
-                    f"If the file appears corrupted, try deleting it and re-downloading."
-                )
-            # Check for gzip/corruption errors
-            elif "gzip" in output_text.lower() and ("unexpected end" in output_text.lower() or "corrupt" in output_text.lower()):
-                error_msg = (
-                    f"CUDA installer file appears to be corrupted (gzip error detected). "
-                    f"Please delete the installer file at {installer_path} and try again. "
-                    f"Exit code: {process.returncode}."
-                )
-            else:
-                error_msg = (
-                    f"CUDA installer exited with code {process.returncode}. "
-                    "Please check the installation logs for details."
-                )
-            
-            raise RuntimeError(error_msg)
-
-        # Verify installation and set up environment
-        cuda_home = install_path
-        cuda_bin = os.path.join(cuda_home, "bin")
-        cuda_lib = os.path.join(cuda_home, "lib64")
-
-        # Verify key directories exist
-        if not os.path.exists(cuda_bin) or not os.path.exists(cuda_lib):
-            raise RuntimeError(
-                f"CUDA installation completed but expected directories not found. "
-                f"Expected: {cuda_bin}, {cuda_lib}"
-            )
-
-        await self._broadcast_log_line(
-            f"CUDA installed successfully to: {install_path}"
-        )
-        await self._broadcast_log_line(f"CUDA_HOME={cuda_home}")
-        await self._broadcast_log_line(f"Adding to PATH: {cuda_bin}")
-        await self._broadcast_log_line(f"Adding to LD_LIBRARY_PATH: {cuda_lib}")
-
-        # Install NCCL (required for multi-GPU and llama.cpp CUDA builds)
-        await self._install_nccl_linux(version, install_path)
-
-        # Install nvidia-smi (required for GPU monitoring)
-        await self._install_nvidia_smi_linux(install_path)
-
-        # Install cuDNN if requested
-        if install_cudnn:
-            await self._install_cudnn_linux(version, install_path)
-
-        # Install TensorRT if requested
-        if install_tensorrt:
-            await self._install_tensorrt_linux(version, install_path)
-
-        # Save installation path to state
-        state = self._load_state()
-        if "installations" not in state:
-            state["installations"] = {}
-        state["installations"][version] = {
-            "path": install_path,
-            "installed_at": _utcnow(),
-            "is_system_install": False,
-            "cudnn_installed": install_cudnn,
-            "tensorrt_installed": install_tensorrt,
-        }
-        self._save_state(state)
-
-        # Update the current symlink to point to this installation
-        self._update_current_symlink(install_path)
-        await self._broadcast_log_line(
-            f"Updated CUDA current symlink: /app/data/cuda/current -> {install_path}"
-        )
-
-        components = ["CUDA", "NCCL", "nvidia-smi"]
-        if install_cudnn:
-            components.append("cuDNN")
-        if install_tensorrt:
-            components.append("TensorRT")
-
-        await self._broadcast_progress(
-            {
-                "stage": "install",
-                "progress": 100,
-                "message": f"{', '.join(components)} installation completed",
-            }
-        )
-
-        return install_path
-
-    async def _install_nccl_linux(self, cuda_version: str, cuda_path: str) -> None:
-        """Install NCCL library for multi-GPU support."""
-        await self._broadcast_log_line(
-            "Installing NCCL (NVIDIA Collective Communications Library)..."
-        )
-        await self._broadcast_progress(
-            {
-                "stage": "nccl",
-                "progress": 0,
-                "message": "Installing NCCL...",
-            }
-        )
-
-        ubuntu_version = self._get_ubuntu_version()
-
-        # Download NCCL from NVIDIA's repo package index
-        await self._broadcast_log_line("Attempting manual NCCL installation...")
-
-        try:
-            cuda_major = cuda_version.split(".")[0]
-            packages = await self._get_repo_packages(ubuntu_version)
-            nccl_pkg = self._select_repo_package(
-                packages,
-                "libnccl2",
-                version_prefix="2.",
-                version_contains=f"+cuda{cuda_major}",
-            )
-            nccl_dev_pkg = self._select_repo_package(
-                packages,
-                "libnccl-dev",
-                version_prefix="2.",
-                version_contains=f"+cuda{cuda_major}",
-            )
-
-            if not nccl_pkg or not nccl_dev_pkg:
-                await self._broadcast_log_line(
-                    "NCCL packages not found in repository, skipping NCCL installation"
-                )
-                await self._broadcast_progress(
-                    {
-                        "stage": "nccl",
-                        "progress": 100,
-                        "message": "NCCL installation skipped (optional)",
-                    }
-                )
-                return
-
-            base_url = (
-                f"https://developer.download.nvidia.com/compute/cuda/repos/{ubuntu_version}/x86_64/"
-            )
-            nccl_url = base_url + nccl_pkg.get("Filename", "").lstrip("./")
-            nccl_dev_url = base_url + nccl_dev_pkg.get("Filename", "").lstrip("./")
-
-            nccl_path = os.path.join(self._download_dir, "libnccl2.deb")
-            nccl_dev_path = os.path.join(self._download_dir, "libnccl-dev.deb")
-
-            await self._broadcast_progress(
-                {
-                    "stage": "nccl",
-                    "progress": 25,
-                    "message": "Downloading NCCL packages...",
-                }
-            )
-
-            # Download NCCL packages
-            async with aiohttp.ClientSession() as session:
-                for url, path, name in [
-                    (nccl_url, nccl_path, "libnccl2"),
-                    (nccl_dev_url, nccl_dev_path, "libnccl-dev"),
-                ]:
-                    try:
-                        await self._broadcast_log_line(f"Downloading {name}...")
-                        async with session.get(url) as response:
-                            if response.status == 200:
-                                async with aiofiles.open(path, "wb") as f:
-                                    await f.write(await response.read())
-                                await self._broadcast_log_line(f"Downloaded {name}")
-                            else:
-                                await self._broadcast_log_line(
-                                    f"Failed to download {name}: HTTP {response.status}"
-                                )
-                                # Try alternative URL with different NCCL version
-                                continue
-                    except Exception as download_err:
-                        await self._broadcast_log_line(
-                            f"Download error for {name}: {download_err}"
-                        )
-                        continue
-
-            await self._broadcast_progress(
-                {
-                    "stage": "nccl",
-                    "progress": 50,
-                    "message": "Installing NCCL packages...",
-                }
-            )
-
-            if os.path.exists(nccl_path):
-                await self._broadcast_log_line(
-                    "Extracting NCCL to CUDA directory..."
-                )
-
-                # Extract .deb file (it's an ar archive containing data.tar)
-                extract_dir = os.path.join(self._download_dir, "nccl_extract")
-                os.makedirs(extract_dir, exist_ok=True)
-
-                for deb_path in [nccl_path, nccl_dev_path]:
-                    if os.path.exists(deb_path):
-                        # Extract using ar and tar
-                        extract_process = await asyncio.create_subprocess_exec(
-                            "bash",
-                            "-c",
-                            f"cd {extract_dir} && ar x {deb_path} && tar xf data.tar.* 2>/dev/null || tar xf data.tar 2>/dev/null",
-                            stdout=asyncio.subprocess.PIPE,
-                            stderr=asyncio.subprocess.STDOUT,
-                        )
-                        await extract_process.wait()
-
-                # Copy NCCL files to CUDA installation
-                nccl_lib_src = os.path.join(
-                    extract_dir, "usr", "lib", "x86_64-linux-gnu"
-                )
-                nccl_include_src = os.path.join(extract_dir, "usr", "include")
-
-                cuda_lib_dst = os.path.join(cuda_path, "lib64")
-                cuda_include_dst = os.path.join(cuda_path, "include")
-
-                if os.path.exists(nccl_lib_src):
-                    # First pass: collect files and symlinks, copy actual files first
-                    files_to_copy = []
-                    symlinks_to_create = []
-                    
-                    for f in os.listdir(nccl_lib_src):
-                        if "nccl" in f.lower():
-                            src = os.path.join(nccl_lib_src, f)
-                            dst = os.path.join(cuda_lib_dst, f)
-                            
-                            if os.path.islink(src):
-                                # Resolve symlink to find actual target
-                                link_target = os.readlink(src)
-                                # If relative symlink, resolve relative to source directory
-                                if not os.path.isabs(link_target):
-                                    link_target = os.path.normpath(
-                                        os.path.join(os.path.dirname(src), link_target)
-                                    )
-                                # Find the actual target file name
-                                actual_target = os.path.basename(link_target)
-                                symlinks_to_create.append((f, actual_target, dst))
-                            else:
-                                files_to_copy.append((f, src, dst))
-                    
-                    # Copy all actual files first
-                    for f, src, dst in files_to_copy:
-                        try:
-                            shutil.copy2(src, dst)
-                            await self._broadcast_log_line(
-                                f"Copied {f} to CUDA lib directory"
-                            )
-                        except Exception as copy_err:
-                            await self._broadcast_log_line(
-                                f"Failed to copy {f}: {copy_err}"
-                            )
-                    
-                    # Then create symlinks pointing to the copied files
-                    for link_name, target_name, dst in symlinks_to_create:
-                        try:
-                            if os.path.exists(dst):
-                                os.remove(dst)
-                            # Create symlink pointing to target in same directory
-                            os.symlink(target_name, dst)
-                            await self._broadcast_log_line(
-                                f"Created symlink {link_name} -> {target_name} in CUDA lib directory"
-                            )
-                        except Exception as link_err:
-                            await self._broadcast_log_line(
-                                f"Failed to create symlink {link_name}: {link_err}"
-                            )
-
-                if os.path.exists(nccl_include_src):
-                    for f in os.listdir(nccl_include_src):
-                        if "nccl" in f.lower():
-                            src = os.path.join(nccl_include_src, f)
-                            dst = os.path.join(cuda_include_dst, f)
-                            try:
-                                if os.path.isdir(src):
-                                    # Handle directories by copying recursively
-                                    if os.path.exists(dst):
-                                        shutil.rmtree(dst)
-                                    shutil.copytree(src, dst)
-                                    await self._broadcast_log_line(
-                                        f"Copied directory {f} to CUDA include directory"
-                                    )
-                                else:
-                                    # Handle regular files
-                                    shutil.copy2(src, dst)
-                                    await self._broadcast_log_line(
-                                        f"Copied {f} to CUDA include directory"
-                                    )
-                            except Exception as copy_err:
-                                await self._broadcast_log_line(
-                                    f"Failed to copy {f}: {copy_err}"
-                                )
-
-                # Cleanup temporary extract directory only (keep .deb files)
-                shutil.rmtree(extract_dir, ignore_errors=True)
-                # Keep .deb files for future use
-                logger.info(f"NCCL packages kept at: {nccl_path}, {nccl_dev_path}")
-
-                await self._broadcast_log_line("NCCL extracted to CUDA directory")
-                await self._broadcast_progress(
-                    {
-                        "stage": "nccl",
-                        "progress": 100,
-                        "message": "NCCL installed successfully",
-                    }
-                )
-            else:
-                await self._broadcast_log_line(
-                    "NCCL packages not available, skipping NCCL installation"
-                )
-                await self._broadcast_log_line(
-                    "Note: NCCL is optional but recommended for multi-GPU builds"
-                )
-                await self._broadcast_progress(
-                    {
-                        "stage": "nccl",
-                        "progress": 100,
-                        "message": "NCCL installation skipped (optional)",
-                    }
-                )
-
-        except Exception as e:
-            await self._broadcast_log_line(f"NCCL installation error: {e}")
-            await self._broadcast_log_line(
-                "Note: NCCL is optional. The build will continue without multi-GPU support."
-            )
-            await self._broadcast_progress(
-                {
-                    "stage": "nccl",
-                    "progress": 100,
-                    "message": "NCCL installation skipped (optional)",
-                }
-            )
-
-    async def _install_nvidia_smi_linux(self, cuda_path: str) -> None:
-        """Install nvidia-smi binary for GPU monitoring."""
-        await self._broadcast_log_line(
-            "Installing nvidia-smi (NVIDIA System Management Interface)..."
-        )
-        await self._broadcast_progress(
-            {
-                "stage": "nvidia-smi",
-                "progress": 0,
-                "message": "Installing nvidia-smi...",
-            }
-        )
-
-        # Check if nvidia-smi already exists in CUDA installation
-        cuda_bin = os.path.join(cuda_path, "bin")
-        nvidia_smi_dst = os.path.join(cuda_bin, "nvidia-smi")
-        if os.path.exists(nvidia_smi_dst):
-            await self._broadcast_log_line(
-                "nvidia-smi already exists in CUDA installation, skipping"
-            )
-            await self._broadcast_progress(
-                {
-                    "stage": "nvidia-smi",
-                    "progress": 100,
-                    "message": "nvidia-smi already installed",
-                }
-            )
-            return
-
-        ubuntu_version = self._get_ubuntu_version()
-
-        try:
-            # Try to find nvidia-utils package which contains nvidia-smi
-            packages = await self._get_repo_packages(ubuntu_version)
-            nvidia_utils_pkg = None
-            
-            # Try multiple package name patterns
-            for pkg_name in ["nvidia-utils", "nvidia-driver-utils", "nvidia-utils-"]:
-                nvidia_utils_pkg = self._select_repo_package(
-                    packages,
-                    pkg_name,
-                )
-                if nvidia_utils_pkg:
-                    break
-
-            if not nvidia_utils_pkg:
-                await self._broadcast_log_line(
-                    "nvidia-utils package not found in repository, skipping nvidia-smi installation"
-                )
-                await self._broadcast_log_line(
-                    "Note: nvidia-smi will not be available. GPU monitoring may be limited."
-                )
-                await self._broadcast_progress(
-                    {
-                        "stage": "nvidia-smi",
-                        "progress": 100,
-                        "message": "nvidia-smi installation skipped (package not available)",
-                    }
-                )
-                return
-
-            base_url = (
-                f"https://developer.download.nvidia.com/compute/cuda/repos/{ubuntu_version}/x86_64/"
-            )
-            nvidia_utils_url = base_url + nvidia_utils_pkg.get("Filename", "").lstrip("./")
-
-            nvidia_utils_path = os.path.join(self._download_dir, "nvidia-utils.deb")
-
-            await self._broadcast_progress(
-                {
-                    "stage": "nvidia-smi",
-                    "progress": 25,
-                    "message": "Downloading nvidia-utils package...",
-                }
-            )
-
-            # Download nvidia-utils package
-            async with aiohttp.ClientSession() as session:
-                try:
-                    await self._broadcast_log_line("Downloading nvidia-utils...")
-                    async with session.get(nvidia_utils_url) as response:
-                        if response.status == 200:
-                            async with aiofiles.open(nvidia_utils_path, "wb") as f:
-                                await f.write(await response.read())
-                            await self._broadcast_log_line("Downloaded nvidia-utils")
-                        else:
-                            await self._broadcast_log_line(
-                                f"Failed to download nvidia-utils: HTTP {response.status}"
-                            )
-                            raise RuntimeError(f"Failed to download nvidia-utils: HTTP {response.status}")
-                except Exception as download_err:
-                    await self._broadcast_log_line(
-                        f"Download error for nvidia-utils: {download_err}"
-                    )
-                    raise
-
-            await self._broadcast_progress(
-                {
-                    "stage": "nvidia-smi",
-                    "progress": 50,
-                    "message": "Extracting nvidia-smi...",
-                }
-            )
-
-            if os.path.exists(nvidia_utils_path):
-                await self._broadcast_log_line(
-                    "Extracting nvidia-smi to CUDA directory..."
-                )
-
-                # Extract .deb file
-                extract_dir = os.path.join(self._download_dir, "nvidia_utils_extract")
-                os.makedirs(extract_dir, exist_ok=True)
-
-                # Extract using ar and tar
-                extract_process = await asyncio.create_subprocess_exec(
-                    "bash",
-                    "-c",
-                    f"cd {extract_dir} && ar x {nvidia_utils_path} && tar xf data.tar.* 2>/dev/null || tar xf data.tar 2>/dev/null",
-                    stdout=asyncio.subprocess.PIPE,
-                    stderr=asyncio.subprocess.STDOUT,
-                )
-                await extract_process.wait()
-
-                # Copy nvidia-smi binary to CUDA installation
-                nvidia_smi_src = os.path.join(extract_dir, "usr", "bin", "nvidia-smi")
-                cuda_bin_dst = os.path.join(cuda_path, "bin")
-                nvidia_smi_dst = os.path.join(cuda_bin_dst, "nvidia-smi")
-
-                if os.path.exists(nvidia_smi_src):
-                    os.makedirs(cuda_bin_dst, exist_ok=True)
-                    try:
-                        shutil.copy2(nvidia_smi_src, nvidia_smi_dst)
-                        os.chmod(nvidia_smi_dst, 0o755)
-                        await self._broadcast_log_line(
-                            "Copied nvidia-smi to CUDA bin directory"
-                        )
-                        await self._broadcast_progress(
-                            {
-                                "stage": "nvidia-smi",
-                                "progress": 100,
-                                "message": "nvidia-smi installed successfully",
-                            }
-                        )
-                    except Exception as copy_err:
-                        await self._broadcast_log_line(
-                            f"Failed to copy nvidia-smi: {copy_err}"
-                        )
-                        raise
-                else:
-                    await self._broadcast_log_line(
-                        "nvidia-smi not found in extracted package"
-                    )
-                    await self._broadcast_progress(
-                        {
-                            "stage": "nvidia-smi",
-                            "progress": 100,
-                            "message": "nvidia-smi installation skipped (not in package)",
-                        }
-                    )
-
-                # Cleanup temporary extract directory only (keep .deb file)
-                shutil.rmtree(extract_dir, ignore_errors=True)
-                # Keep .deb file for future use
-                if os.path.exists(nvidia_utils_path):
-                    logger.info(f"nvidia-utils package kept at: {nvidia_utils_path}")
-
-            else:
-                await self._broadcast_log_line(
-                    "nvidia-utils package not available, skipping nvidia-smi installation"
-                )
-                await self._broadcast_progress(
-                    {
-                        "stage": "nvidia-smi",
-                        "progress": 100,
-                        "message": "nvidia-smi installation skipped (package not available)",
-                    }
-                )
-
-        except Exception as e:
-            await self._broadcast_log_line(f"nvidia-smi installation error: {e}")
-            await self._broadcast_log_line(
-                "Note: nvidia-smi installation failed. GPU monitoring may be limited."
-            )
-            await self._broadcast_progress(
-                {
-                    "stage": "nvidia-smi",
-                    "progress": 100,
-                    "message": "nvidia-smi installation skipped (error occurred)",
-                }
-            )
-
-    async def _install_cudnn_linux(self, cuda_version: str, cuda_path: str) -> None:
-        """Install cuDNN library for deep learning primitives."""
-        await self._broadcast_log_line(
-            "Installing cuDNN (CUDA Deep Neural Network library)..."
-        )
-        await self._broadcast_progress(
-            {
-                "stage": "cudnn",
-                "progress": 0,
-                "message": "Installing cuDNN...",
-            }
-        )
-
-        try:
-            # Determine CUDA major version for cuDNN compatibility
-            cuda_major = cuda_version.split(".")[0]
-            cudnn_version = self.CUDNN_VERSIONS.get(cuda_major)
-            
-            if not cudnn_version:
-                await self._broadcast_log_line(
-                    f"cuDNN version not available for CUDA {cuda_version}, skipping"
-                )
-                await self._broadcast_progress(
-                    {
-                        "stage": "cudnn",
-                        "progress": 100,
-                        "message": "cuDNN installation skipped (version not available)",
-                    }
-                )
-                return
-
-            ubuntu_version = self._get_ubuntu_version()
-            
-            # cuDNN package names vary by CUDA version
-            # For CUDA 12.x: libcudnn9-cuda-12, libcudnn9-dev-cuda-12
-            # For CUDA 11.x: libcudnn8-cuda-11, libcudnn8-dev-cuda-11
-            if cuda_major == "12" or cuda_major == "13":
-                cudnn_pkg = "libcudnn9"
-                cudnn_cuda_suffix = f"cuda-{cuda_major}"
-            else:
-                cudnn_pkg = "libcudnn8"
-                cudnn_cuda_suffix = f"cuda-{cuda_major}"
-
-            # Manual cuDNN installation
-            await self._broadcast_log_line("Installing cuDNN packages...")
-
-            cudnn_package_name = f"{cudnn_pkg}-{cudnn_cuda_suffix}"
-            cudnn_dev_package_name = f"{cudnn_pkg}-dev-{cudnn_cuda_suffix}"
-            packages = await self._get_repo_packages(ubuntu_version)
-            cudnn_pkg_entry = self._select_repo_package(
-                packages, cudnn_package_name, version_prefix=cudnn_version
-            )
-            cudnn_dev_pkg_entry = self._select_repo_package(
-                packages, cudnn_dev_package_name, version_prefix=cudnn_version
-            )
-
-            if not cudnn_pkg_entry or not cudnn_dev_pkg_entry:
-                await self._broadcast_log_line(
-                    "cuDNN packages not found in repository, skipping cuDNN installation"
-                )
-                await self._broadcast_progress(
-                    {
-                        "stage": "cudnn",
-                        "progress": 100,
-                        "message": "cuDNN installation skipped (optional)",
-                    }
-                )
-                return
-
-            base_url = (
-                f"https://developer.download.nvidia.com/compute/cuda/repos/{ubuntu_version}/x86_64/"
-            )
-            cudnn_url = base_url + cudnn_pkg_entry.get("Filename", "").lstrip("./")
-            cudnn_dev_url = base_url + cudnn_dev_pkg_entry.get("Filename", "").lstrip("./")
-
-            cudnn_path = os.path.join(self._download_dir, f"{cudnn_pkg}.deb")
-            cudnn_dev_path = os.path.join(self._download_dir, f"{cudnn_pkg}-dev.deb")
-
-            await self._broadcast_progress(
-                {
-                    "stage": "cudnn",
-                    "progress": 25,
-                    "message": "Downloading cuDNN packages...",
-                }
-            )
-
-            # Download cuDNN packages
-            async with aiohttp.ClientSession() as session:
-                for url, path, name in [
-                    (cudnn_url, cudnn_path, cudnn_pkg),
-                    (cudnn_dev_url, cudnn_dev_path, f"{cudnn_pkg}-dev"),
-                ]:
-                    try:
-                        await self._broadcast_log_line(f"Downloading {name}...")
-                        async with session.get(url) as response:
-                            if response.status == 200:
-                                async with aiofiles.open(path, "wb") as f:
-                                    await f.write(await response.read())
-                                await self._broadcast_log_line(f"Downloaded {name}")
-                            else:
-                                await self._broadcast_log_line(
-                                    f"Failed to download {name}: HTTP {response.status}"
-                                )
-                                # Try alternative URL pattern
-                                continue
-                    except Exception as download_err:
-                        await self._broadcast_log_line(
-                            f"Download error for {name}: {download_err}"
-                        )
-                        continue
-
-            await self._broadcast_progress(
-                {
-                    "stage": "cudnn",
-                    "progress": 50,
-                    "message": "Installing cuDNN packages...",
-                }
-            )
-
-            if os.path.exists(cudnn_path):
-                await self._broadcast_log_line(
-                    "Extracting cuDNN to CUDA directory..."
-                )
-
-                # Extract .deb file
-                extract_dir = os.path.join(self._download_dir, "cudnn_extract")
-                os.makedirs(extract_dir, exist_ok=True)
-
-                for deb_path in [cudnn_path, cudnn_dev_path]:
-                    if os.path.exists(deb_path):
-                        # Extract using ar and tar
-                        extract_process = await asyncio.create_subprocess_exec(
-                            "bash",
-                            "-c",
-                            f"cd {extract_dir} && ar x {deb_path} && tar xf data.tar.* 2>/dev/null || tar xf data.tar 2>/dev/null",
-                            stdout=asyncio.subprocess.PIPE,
-                            stderr=asyncio.subprocess.STDOUT,
-                        )
-                        await extract_process.wait()
-
-                # Copy cuDNN files to CUDA installation
-                cudnn_lib_src = os.path.join(
-                    extract_dir, "usr", "lib", "x86_64-linux-gnu"
-                )
-                cudnn_include_src = os.path.join(extract_dir, "usr", "include")
-
-                cuda_lib_dst = os.path.join(cuda_path, "lib64")
-                cuda_include_dst = os.path.join(cuda_path, "include")
-
-                if os.path.exists(cudnn_lib_src):
-                    for f in os.listdir(cudnn_lib_src):
-                        if "cudnn" in f.lower():
-                            src = os.path.join(cudnn_lib_src, f)
-                            dst = os.path.join(cuda_lib_dst, f)
-                            try:
-                                if os.path.islink(src):
-                                    linkto = os.readlink(src)
-                                    if os.path.exists(dst):
-                                        os.remove(dst)
-                                    os.symlink(linkto, dst)
-                                else:
-                                    shutil.copy2(src, dst)
-                                await self._broadcast_log_line(
-                                    f"Copied {f} to CUDA lib directory"
-                                )
-                            except Exception as copy_err:
-                                await self._broadcast_log_line(
-                                    f"Failed to copy {f}: {copy_err}"
-                                )
-
-                if os.path.exists(cudnn_include_src):
-                    for f in os.listdir(cudnn_include_src):
-                        if "cudnn" in f.lower():
-                            src = os.path.join(cudnn_include_src, f)
-                            dst = os.path.join(cuda_include_dst, f)
-                            try:
-                                shutil.copy2(src, dst)
-                                await self._broadcast_log_line(
-                                    f"Copied {f} to CUDA include directory"
-                                )
-                            except Exception as copy_err:
-                                await self._broadcast_log_line(
-                                    f"Failed to copy {f}: {copy_err}"
-                                )
-
-                # Cleanup temporary extract directory only (keep .deb files)
-                shutil.rmtree(extract_dir, ignore_errors=True)
-                # Keep .deb files for future use
-                logger.info(f"cuDNN packages kept at: {cudnn_path}, {cudnn_dev_path}")
-
-                await self._broadcast_log_line("cuDNN extracted to CUDA directory")
-                await self._broadcast_progress(
-                    {
-                        "stage": "cudnn",
-                        "progress": 100,
-                        "message": "cuDNN installed successfully",
-                    }
-                )
-            else:
-                await self._broadcast_log_line(
-                    "cuDNN packages not available, skipping cuDNN installation"
-                )
-                await self._broadcast_progress(
-                    {
-                        "stage": "cudnn",
-                        "progress": 100,
-                        "message": "cuDNN installation skipped (optional)",
-                    }
-                )
-
-        except Exception as e:
-            await self._broadcast_log_line(f"cuDNN installation error: {e}")
-            await self._broadcast_log_line(
-                "Note: cuDNN is optional. The build will continue without cuDNN support."
-            )
-            await self._broadcast_progress(
-                {
-                    "stage": "cudnn",
-                    "progress": 100,
-                    "message": "cuDNN installation skipped (optional)",
-                }
-            )
-
-    async def _install_tensorrt_linux(self, cuda_version: str, cuda_path: str) -> None:
-        """Install TensorRT library for inference optimization."""
-        await self._broadcast_log_line(
-            "Installing TensorRT (NVIDIA TensorRT inference library)..."
-        )
-        await self._broadcast_progress(
-            {
-                "stage": "tensorrt",
-                "progress": 0,
-                "message": "Installing TensorRT...",
-            }
-        )
-
-        try:
-            # Determine CUDA major version for TensorRT compatibility
-            cuda_major = cuda_version.split(".")[0]
-            tensorrt_version = self.TENSORRT_VERSIONS.get(cuda_major)
-            
-            if not tensorrt_version:
-                await self._broadcast_log_line(
-                    f"TensorRT version not available for CUDA {cuda_version}, skipping"
-                )
-                await self._broadcast_progress(
-                    {
-                        "stage": "tensorrt",
-                        "progress": 100,
-                        "message": "TensorRT installation skipped (version not available)",
-                    }
-                )
-                return
-
-            ubuntu_version = self._get_ubuntu_version()
-            
-            # TensorRT package names
-            # For CUDA 12.x/13.x: libnvinfer10, libnvinfer-dev, libnvinfer-plugin10, libnvinfer-plugin-dev
-            # For CUDA 11.x: libnvinfer8, libnvinfer-dev, libnvinfer-plugin8, libnvinfer-plugin-dev
-            if cuda_major == "12" or cuda_major == "13":
-                tensorrt_pkg = "libnvinfer10"
-                tensorrt_plugin_pkg = "libnvinfer-plugin10"
-            else:
-                tensorrt_pkg = "libnvinfer8"
-                tensorrt_plugin_pkg = "libnvinfer-plugin8"
-
-            # Manual TensorRT installation
-            await self._broadcast_log_line("Installing TensorRT packages...")
-
-            packages = await self._get_repo_packages(ubuntu_version)
-            tensorrt_pkg_entry = self._select_repo_package(
-                packages, tensorrt_pkg, version_prefix=tensorrt_version
-            )
-            tensorrt_dev_pkg_entry = self._select_repo_package(
-                packages, f"{tensorrt_pkg}-dev", version_prefix=tensorrt_version
-            )
-            tensorrt_plugin_entry = self._select_repo_package(
-                packages, tensorrt_plugin_pkg, version_prefix=tensorrt_version
-            )
-            tensorrt_plugin_dev_entry = self._select_repo_package(
-                packages, f"{tensorrt_plugin_pkg}-dev", version_prefix=tensorrt_version
-            )
-
-            if not all(
-                [
-                    tensorrt_pkg_entry,
-                    tensorrt_dev_pkg_entry,
-                    tensorrt_plugin_entry,
-                    tensorrt_plugin_dev_entry,
-                ]
-            ):
-                await self._broadcast_log_line(
-                    "TensorRT packages not found in repository, skipping TensorRT installation"
-                )
-                await self._broadcast_progress(
-                    {
-                        "stage": "tensorrt",
-                        "progress": 100,
-                        "message": "TensorRT installation skipped (optional)",
-                    }
-                )
-                return
-
-            base_url = (
-                f"https://developer.download.nvidia.com/compute/cuda/repos/{ubuntu_version}/x86_64/"
-            )
-            tensorrt_url = base_url + tensorrt_pkg_entry.get("Filename", "").lstrip("./")
-            tensorrt_dev_url = base_url + tensorrt_dev_pkg_entry.get("Filename", "").lstrip("./")
-            tensorrt_plugin_url = base_url + tensorrt_plugin_entry.get("Filename", "").lstrip("./")
-            tensorrt_plugin_dev_url = base_url + tensorrt_plugin_dev_entry.get("Filename", "").lstrip("./")
-
-            tensorrt_path = os.path.join(self._download_dir, f"{tensorrt_pkg}.deb")
-            tensorrt_dev_path = os.path.join(self._download_dir, f"{tensorrt_pkg}-dev.deb")
-            tensorrt_plugin_path = os.path.join(self._download_dir, f"{tensorrt_plugin_pkg}.deb")
-            tensorrt_plugin_dev_path = os.path.join(self._download_dir, f"{tensorrt_plugin_pkg}-dev.deb")
-
-            await self._broadcast_progress(
-                {
-                    "stage": "tensorrt",
-                    "progress": 25,
-                    "message": "Downloading TensorRT packages...",
-                }
-            )
-
-            # Download TensorRT packages
-            async with aiohttp.ClientSession() as session:
-                for url, path, name in [
-                    (tensorrt_url, tensorrt_path, tensorrt_pkg),
-                    (tensorrt_dev_url, tensorrt_dev_path, f"{tensorrt_pkg}-dev"),
-                    (tensorrt_plugin_url, tensorrt_plugin_path, tensorrt_plugin_pkg),
-                    (tensorrt_plugin_dev_url, tensorrt_plugin_dev_path, f"{tensorrt_plugin_pkg}-dev"),
-                ]:
-                    try:
-                        await self._broadcast_log_line(f"Downloading {name}...")
-                        async with session.get(url) as response:
-                            if response.status == 200:
-                                async with aiofiles.open(path, "wb") as f:
-                                    await f.write(await response.read())
-                                await self._broadcast_log_line(f"Downloaded {name}")
-                            else:
-                                await self._broadcast_log_line(
-                                    f"Failed to download {name}: HTTP {response.status}"
-                                )
-                                continue
-                    except Exception as download_err:
-                        await self._broadcast_log_line(
-                            f"Download error for {name}: {download_err}"
-                        )
-                        continue
-
-            await self._broadcast_progress(
-                {
-                    "stage": "tensorrt",
-                    "progress": 50,
-                    "message": "Installing TensorRT packages...",
-                }
-            )
-
-            if os.path.exists(tensorrt_path):
-                await self._broadcast_log_line(
-                    "Extracting TensorRT to CUDA directory..."
-                )
-
-                # Extract .deb file
-                extract_dir = os.path.join(self._download_dir, "tensorrt_extract")
-                os.makedirs(extract_dir, exist_ok=True)
-
-                for deb_path in [
-                    tensorrt_path,
-                    tensorrt_dev_path,
-                    tensorrt_plugin_path,
-                    tensorrt_plugin_dev_path,
-                ]:
-                    if os.path.exists(deb_path):
-                        # Extract using ar and tar
-                        extract_process = await asyncio.create_subprocess_exec(
-                            "bash",
-                            "-c",
-                            f"cd {extract_dir} && ar x {deb_path} && tar xf data.tar.* 2>/dev/null || tar xf data.tar 2>/dev/null",
-                            stdout=asyncio.subprocess.PIPE,
-                            stderr=asyncio.subprocess.STDOUT,
-                        )
-                        await extract_process.wait()
-
-                # Copy TensorRT files to CUDA installation
-                tensorrt_lib_src = os.path.join(
-                    extract_dir, "usr", "lib", "x86_64-linux-gnu"
-                )
-                tensorrt_include_src = os.path.join(extract_dir, "usr", "include")
-                tensorrt_bin_src = os.path.join(extract_dir, "usr", "bin")
-
-                cuda_lib_dst = os.path.join(cuda_path, "lib64")
-                cuda_include_dst = os.path.join(cuda_path, "include")
-                cuda_bin_dst = os.path.join(cuda_path, "bin")
-
-                # Copy libraries
-                if os.path.exists(tensorrt_lib_src):
-                    for f in os.listdir(tensorrt_lib_src):
-                        if "nvinfer" in f.lower() or "tensorrt" in f.lower():
-                            src = os.path.join(tensorrt_lib_src, f)
-                            dst = os.path.join(cuda_lib_dst, f)
-                            try:
-                                if os.path.islink(src):
-                                    linkto = os.readlink(src)
-                                    if os.path.exists(dst):
-                                        os.remove(dst)
-                                    os.symlink(linkto, dst)
-                                else:
-                                    shutil.copy2(src, dst)
-                                await self._broadcast_log_line(
-                                    f"Copied {f} to CUDA lib directory"
-                                )
-                            except Exception as copy_err:
-                                await self._broadcast_log_line(
-                                    f"Failed to copy {f}: {copy_err}"
-                                )
-
-                # Copy headers
-                if os.path.exists(tensorrt_include_src):
-                    for f in os.listdir(tensorrt_include_src):
-                        if "nvinfer" in f.lower() or "tensorrt" in f.lower():
-                            src = os.path.join(tensorrt_include_src, f)
-                            dst = os.path.join(cuda_include_dst, f)
-                            try:
-                                if os.path.isdir(src):
-                                    shutil.copytree(src, dst, dirs_exist_ok=True)
-                                else:
-                                    shutil.copy2(src, dst)
-                                await self._broadcast_log_line(
-                                    f"Copied {f} to CUDA include directory"
-                                )
-                            except Exception as copy_err:
-                                await self._broadcast_log_line(
-                                    f"Failed to copy {f}: {copy_err}"
-                                )
-
-                # Copy binaries (like trtexec)
-                if os.path.exists(tensorrt_bin_src):
-                    for f in os.listdir(tensorrt_bin_src):
-                        if "trt" in f.lower() or "nvinfer" in f.lower():
-                            src = os.path.join(tensorrt_bin_src, f)
-                            dst = os.path.join(cuda_bin_dst, f)
-                            try:
-                                shutil.copy2(src, dst)
-                                os.chmod(dst, 0o755)
-                                await self._broadcast_log_line(
-                                    f"Copied {f} to CUDA bin directory"
-                                )
-                            except Exception as copy_err:
-                                await self._broadcast_log_line(
-                                    f"Failed to copy {f}: {copy_err}"
-                                )
-
-                # Cleanup temporary extract directory only (keep .deb files)
-                shutil.rmtree(extract_dir, ignore_errors=True)
-                # Keep .deb files for future use
-                logger.info(
-                    f"TensorRT packages kept at: {tensorrt_path}, {tensorrt_dev_path}, "
-                    f"{tensorrt_plugin_path}, {tensorrt_plugin_dev_path}"
-                )
-
-                await self._broadcast_log_line("TensorRT extracted to CUDA directory")
-                await self._broadcast_progress(
-                    {
-                        "stage": "tensorrt",
-                        "progress": 100,
-                        "message": "TensorRT installed successfully",
-                    }
-                )
-            else:
-                await self._broadcast_log_line(
-                    "TensorRT packages not available, skipping TensorRT installation"
-                )
-                await self._broadcast_progress(
-                    {
-                        "stage": "tensorrt",
-                        "progress": 100,
-                        "message": "TensorRT installation skipped (optional)",
-                    }
-                )
-
-        except Exception as e:
-            await self._broadcast_log_line(f"TensorRT installation error: {e}")
-            await self._broadcast_log_line(
-                "Note: TensorRT is optional. The build will continue without TensorRT support."
-            )
-            await self._broadcast_progress(
-                {
-                    "stage": "tensorrt",
-                    "progress": 100,
-                    "message": "TensorRT installation skipped (optional)",
-                }
-            )
-
-    async def install(
-        self,
-        version: str = "12.6",
-        install_cudnn: bool = False,
-        install_tensorrt: bool = False,
-    ) -> Dict[str, Any]:
-        """Install CUDA Toolkit with optional cuDNN and TensorRT."""
-        async with self._lock:
-            if self._operation:
-                raise RuntimeError(
-                    "Another CUDA installer operation is already running"
-                )
-
-            system, arch = self._get_platform()
-
-            if system != "linux":
-                raise RuntimeError(
-                    f"CUDA installation is only supported on Linux, not {system}"
-                )
-
-            if version not in self.SUPPORTED_VERSIONS:
-                raise ValueError(
-                    f"Unsupported CUDA version: {version}. Supported versions: {', '.join(self.SUPPORTED_VERSIONS)}"
-                )
-
-            # Fetch the download URL dynamically
-            await self._broadcast_log_line(
-                f"Fetching download URL for CUDA {version}..."
-            )
-            url = await self._fetch_download_url(version)
-            installer_filename = os.path.basename(url)
-            installer_path = os.path.join(self._download_dir, installer_filename)
-
-            await self._set_operation("install")
-
-            async def _runner():
-                try:
-                    # Download installer
-                    await self._download_installer(version, url, installer_path)
-
-                    # Install (Linux only) - returns the installation path
-                    install_path = await self._install_linux(
-                        installer_path, version, install_cudnn, install_tensorrt
-                    )
-
-                    # Update state (already saved in _install_linux, but update main fields)
-                    state = self._load_state()
-                    state["installed_version"] = version
-                    state["installed_at"] = _utcnow()
-                    state["cuda_path"] = install_path
-                    if install_cudnn:
-                        state["cudnn_installed"] = True
-                    if install_tensorrt:
-                        state["tensorrt_installed"] = True
-                    self._save_state(state)
-
-                    components = ["CUDA Toolkit"]
-                    if install_cudnn:
-                        components.append("cuDNN")
-                    if install_tensorrt:
-                        components.append("TensorRT")
-                    
-                    await self._finish_operation(
-                        True, f"{', '.join(components)} installed successfully"
-                    )
-
-                    # Update current process environment with CUDA paths
-                    # This ensures the running application can use CUDA immediately
-                    cuda_env = self.get_cuda_env(version)
-                    if cuda_env:
-                        os.environ.update(cuda_env)
-                        logger.info(
-                            f"Updated process environment with CUDA {version} paths"
-                        )
-
-                    # Restart llama-swap to pick up new CUDA environment variables
-                    # llama-swap needs to be restarted because subprocess environment
-                    # variables are set at process creation time and can't be changed
-                    try:
-                        from backend.llama_swap_manager import get_llama_swap_manager
-                        llama_swap_manager = get_llama_swap_manager()
-                        await llama_swap_manager.restart_proxy()
-                        logger.info("Restarted llama-swap to pick up new CUDA environment")
-                    except Exception as restart_error:
-                        # Don't fail the installation if restart fails
-                        logger.warning(
-                            f"Failed to restart llama-swap after CUDA installation: {restart_error}. "
-                            f"You may need to manually restart llama-swap to use the new CUDA version."
-                        )
-
-                    # Keep installer file for future use (not deleting)
-                    logger.info(f"Installer file kept at: {installer_path}")
-
-                except Exception as exc:
-                    self._last_error = str(exc)
-                    await self._finish_operation(False, str(exc))
-                    raise
-
-            self._create_task(_runner())
-            return {"message": f"CUDA {version} installation started"}
-
-    def _detect_cudnn_version(self, cuda_path: Optional[str]) -> Optional[str]:
-        """Detect installed cuDNN version by checking library files."""
-        if not cuda_path:
-            return None
-        
-        lib_path = os.path.join(cuda_path, "lib64")
-        if not os.path.exists(lib_path):
-            return None
-        
-        try:
-            for f in os.listdir(lib_path):
-                if "libcudnn" in f and ".so" in f:
-                    match = re.search(r"\.so(?:\.(\d+(?:\.\d+){0,2}))?", f)
-                    if match and match.group(1):
-                        return match.group(1)
-        except Exception:
-            pass
-        
-        return None
-
-    def _detect_tensorrt_version(self, cuda_path: Optional[str]) -> Optional[str]:
-        """Detect installed TensorRT version by checking library files."""
-        if not cuda_path:
-            return None
-        
-        lib_path = os.path.join(cuda_path, "lib64")
-        if not os.path.exists(lib_path):
-            return None
-        
-        try:
-            for f in os.listdir(lib_path):
-                if "libnvinfer" in f and ".so" in f and "plugin" not in f:
-                    match = re.search(r"\.so(?:\.(\d+(?:\.\d+){0,2}))?", f)
-                    if match and match.group(1):
-                        return match.group(1)
-        except Exception:
-            pass
-        
-        return None
-
-    def status(self) -> Dict[str, Any]:
-        """Get CUDA installation status."""
-        version = self._detect_installed_version()
-        cuda_path = self._get_cuda_path()
-        installed = version is not None and cuda_path is not None
-        state = self._load_state()
-        installations = state.get("installations", {})
-
-        # Detect cuDNN and TensorRT
-        cudnn_version = None
-        tensorrt_version = None
-        if cuda_path:
-            cudnn_version = self._detect_cudnn_version(cuda_path)
-            tensorrt_version = self._detect_tensorrt_version(cuda_path)
-
-        # Get all installed versions with their details
-        installed_versions = []
-        for v, info in installations.items():
-            install_path = info.get("path")
-            if install_path and os.path.exists(install_path):
-                installed_versions.append(
-                    {
-                        "version": v,
-                        "path": install_path,
-                        "installed_at": info.get("installed_at"),
-                        "is_system_install": info.get("is_system_install", False),
-                        "is_current": v == version,
-                        "cudnn_installed": info.get("cudnn_installed", False),
-                        "tensorrt_installed": info.get("tensorrt_installed", False),
-                    }
-                )
-
-        return {
-            "installed": installed,
-            "version": version,
-            "cuda_path": cuda_path,
-            "installed_at": state.get("installed_at"),
-            "installed_versions": installed_versions,
-            "operation": self._operation,
-            "operation_started_at": self._operation_started_at,
-            "last_error": self._last_error,
-            "log_path": self._log_path,
-            "available_versions": self.SUPPORTED_VERSIONS,
-            "platform": self._get_platform(),
-            "cudnn": {
-                "installed": cudnn_version is not None,
-                "version": cudnn_version,
-            },
-            "tensorrt": {
-                "installed": tensorrt_version is not None,
-                "version": tensorrt_version,
-            },
-        }
-
-    def is_operation_running(self) -> bool:
-        return self._operation is not None
-
-    def read_log_tail(self, max_bytes: int = 8192) -> str:
-        if not os.path.exists(self._log_path):
-            return ""
-        with open(self._log_path, "rb") as log_file:
-            log_file.seek(0, os.SEEK_END)
-            size = log_file.tell()
-            log_file.seek(max(0, size - max_bytes))
-            data = log_file.read().decode("utf-8", errors="replace")
-            if size > max_bytes:
-                data = data.split("\n", 1)[-1]
-            return data.strip()
-
-    async def uninstall(self, version: Optional[str] = None) -> Dict[str, Any]:
-        """Uninstall CUDA Toolkit."""
-        async with self._lock:
-            if self._operation:
-                raise RuntimeError(
-                    "Another CUDA installer operation is already running"
-                )
-
-            # Determine which version to uninstall
-            if not version:
-                # Uninstall the currently detected version
-                version = self._detect_installed_version()
-                if not version:
-                    raise RuntimeError("No CUDA installation found to uninstall")
-
-            state = self._load_state()
-            installations = state.get("installations", {})
-
-            if version not in installations:
-                raise RuntimeError(f"CUDA {version} installation not found in state")
-
-            install_info = installations[version]
-            install_path = install_info.get("path")
-
-            if not install_path or not os.path.exists(install_path):
-                # Path doesn't exist, just remove from state
-                logger.warning(
-                    f"CUDA installation path {install_path} does not exist, removing from state only"
-                )
-                installations.pop(version, None)
-                if state.get("installed_version") == version:
-                    state["installed_version"] = None
-                    state["installed_at"] = None
-                    state["cuda_path"] = None
-                self._save_state(state)
-                return {
-                    "message": f"CUDA {version} removed from state (installation path not found)"
-                }
-
-            await self._set_operation("uninstall")
-
-            async def _runner():
-                try:
-                    await self._broadcast_log_line(
-                        f"Starting uninstallation of CUDA {version}..."
-                    )
-                    await self._broadcast_progress(
-                        {
-                            "stage": "uninstall",
-                            "progress": 0,
-                            "message": f"Uninstalling CUDA {version}...",
-                        }
-                    )
-
-                    # Remove the installation directory
-                    if os.path.exists(install_path):
-                        await self._broadcast_log_line(
-                            f"Removing installation directory: {install_path}"
-                        )
-                        try:
-                            shutil.rmtree(install_path)
-                            await self._broadcast_log_line(
-                                f"Successfully removed {install_path}"
-                            )
-                        except Exception as e:
-                            logger.error(
-                                f"Failed to remove CUDA installation directory: {e}"
-                            )
-                            raise RuntimeError(
-                                f"Failed to remove installation directory: {e}"
-                            )
-
-                    # Update state
-                    installations.pop(version, None)
-                    if state.get("installed_version") == version:
-                        state["installed_version"] = None
-                        state["installed_at"] = None
-                        state["cuda_path"] = None
-                    self._save_state(state)
-
-                    # Update or remove the current symlink
-                    self._remove_current_symlink()
-                    await self._broadcast_log_line(
-                        "Updated CUDA current symlink (removed or re-pointed to another version)"
-                    )
-
-                    await self._broadcast_progress(
-                        {
-                            "stage": "uninstall",
-                            "progress": 100,
-                            "message": "CUDA uninstallation completed",
-                        }
-                    )
-                    await self._broadcast_log_line(
-                        f"CUDA {version} uninstalled successfully"
-                    )
-                    await self._finish_operation(
-                        True, f"CUDA {version} uninstalled successfully"
-                    )
-
-                except Exception as exc:
-                    self._last_error = str(exc)
-                    await self._finish_operation(False, str(exc))
-                    raise
-
-            self._create_task(_runner())
-            return {"message": f"CUDA {version} uninstallation started"}
+"""
+CUDA Toolkit Installer
+
+Handles downloading and installing CUDA Toolkit on Linux systems.
+"""
+
+import asyncio
+import json
+import os
+import platform
+import re
+import shutil
+import subprocess
+import sys
+import tempfile
+import time
+import gzip
+from datetime import datetime, timezone
+from typing import Any, Awaitable, Dict, Optional, Tuple
+import aiohttp
+import aiofiles
+
+from backend.logging_config import get_logger
+from backend.progress_manager import get_progress_manager
+
+logger = get_logger(__name__)
+
+_installer_instance: Optional["CUDAInstaller"] = None
+
+
+def get_cuda_installer() -> "CUDAInstaller":
+    global _installer_instance
+    if _installer_instance is None:
+        _installer_instance = CUDAInstaller()
+    return _installer_instance
+
+
+def _utcnow() -> str:
+    return datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
+
+
+class CUDAInstaller:
+    """Install CUDA Toolkit on Linux systems."""
+
+    # Supported CUDA versions - URLs are fetched dynamically from NVIDIA's archive
+    # Format: version -> platform -> architecture (URLs fetched on demand)
+    SUPPORTED_VERSIONS = [
+        "13.0",
+        "12.9",
+        "12.8",
+        "12.7",
+        "12.6",
+        "12.5",
+        "12.4",
+        "12.3",
+        "12.2",
+        "12.1",
+        "12.0",
+        "11.9",
+        "11.8",
+    ]
+
+    # cuDNN version mappings by CUDA major version
+    CUDNN_VERSIONS = {
+        "13": "9.5.1",  # cuDNN 9.x for CUDA 13.x
+        "12": "9.5.1",  # cuDNN 9.x for CUDA 12.x
+        "11": "8.9.7",  # cuDNN 8.x for CUDA 11.x
+    }
+
+    # TensorRT version mappings by CUDA major version
+    TENSORRT_VERSIONS = {
+        "13": "10.7.0",  # TensorRT 10.x for CUDA 13.x
+        "12": "10.7.0",  # TensorRT 10.x for CUDA 12.x
+        "11": "8.6.1",   # TensorRT 8.x for CUDA 11.x
+    }
+
+    def __init__(
+        self,
+        *,
+        log_path: Optional[str] = None,
+        state_path: Optional[str] = None,
+        download_dir: Optional[str] = None,
+    ) -> None:
+        self._lock = asyncio.Lock()
+        self._operation: Optional[str] = None
+        self._operation_started_at: Optional[str] = None
+        self._current_task: Optional[asyncio.Task] = None
+        self._last_error: Optional[str] = None
+        self._download_progress: Dict[str, Any] = {}
+        self._last_logged_percentage: int = -1
+        self._last_progress_broadcast_time: float = 0.0
+        self._pending_progress: Optional[Dict[str, Any]] = None
+        self._progress_broadcast_count: int = 0
+
+        # Determine data root - check Docker path first, then fallback to local
+        if os.path.exists("/app/data"):
+            data_root = "/app/data"
+        else:
+            data_root = os.path.abspath("data")
+
+        log_path = log_path or os.path.join(data_root, "logs", "cuda_install.log")
+        state_path = state_path or os.path.join(
+            data_root, "configs", "cuda_installer.json"
+        )
+        download_dir = download_dir or os.path.join(
+            data_root, "temp", "cuda_installers"
+        )
+        self._cuda_install_dir = os.path.join(data_root, "cuda")
+
+        self._log_path = os.path.abspath(log_path)
+        self._state_path = os.path.abspath(state_path)
+        self._download_dir = os.path.abspath(download_dir)
+        self._url_cache: Dict[str, str] = {}  # Cache for dynamically fetched URLs
+        self._repo_cache: Dict[str, list] = {}  # Cache for NVIDIA repo packages
+        self._ensure_directories()
+
+    def _ensure_directories(self) -> None:
+        os.makedirs(self._download_dir, exist_ok=True)
+        os.makedirs(os.path.dirname(self._log_path), exist_ok=True)
+        os.makedirs(os.path.dirname(self._state_path), exist_ok=True)
+        os.makedirs(self._cuda_install_dir, exist_ok=True)
+
+    def _update_current_symlink(self, install_path: str) -> None:
+        """Create or update the /app/data/cuda/current symlink to point to the active CUDA installation."""
+        current_symlink = os.path.join(self._cuda_install_dir, "current")
+        try:
+            # Remove existing symlink if it exists
+            if os.path.islink(current_symlink):
+                os.remove(current_symlink)
+            elif os.path.exists(current_symlink):
+                # If it's not a symlink but exists, remove it (shouldn't happen, but be safe)
+                os.remove(current_symlink)
+            
+            # Create new symlink pointing to the installation
+            os.symlink(install_path, current_symlink)
+            logger.info(f"Updated CUDA current symlink: {current_symlink} -> {install_path}")
+        except OSError as e:
+            logger.warning(f"Failed to update CUDA current symlink: {e}")
+
+    def _remove_current_symlink(self) -> None:
+        """Remove the current symlink and optionally re-point it to another installed version."""
+        current_symlink = os.path.join(self._cuda_install_dir, "current")
+        try:
+            if os.path.islink(current_symlink) or os.path.exists(current_symlink):
+                os.remove(current_symlink)
+            
+            # Try to find another installed version to point to
+            state = self._load_state()
+            installations = state.get("installations", {})
+            
+            # Find the most recently installed version that still exists
+            latest_version = None
+            latest_time = None
+            for v, info in installations.items():
+                install_path = info.get("path")
+                if install_path and os.path.exists(install_path):
+                    installed_at = info.get("installed_at", "")
+                    if not latest_time or installed_at > latest_time:
+                        latest_time = installed_at
+                        latest_version = v
+            
+            # Re-point to the latest remaining installation
+            if latest_version:
+                install_path = installations[latest_version].get("path")
+                if install_path and os.path.exists(install_path):
+                    os.symlink(install_path, current_symlink)
+                    logger.info(f"Re-pointed CUDA current symlink to: {install_path}")
+        except OSError as e:
+            logger.warning(f"Failed to update CUDA current symlink: {e}")
+
+    def _get_platform(self) -> Tuple[str, str]:
+        """Get platform (os, arch) tuple."""
+        system = platform.system().lower()
+        machine = platform.machine().lower()
+
+        if machine in ("x86_64", "amd64"):
+            arch = "x86_64"
+        else:
+            arch = machine
+
+        return system, arch
+
+    def _get_ubuntu_version(self) -> str:
+        """Get Ubuntu version for NVIDIA repository URLs."""
+        # Try to detect Ubuntu version from /etc/os-release
+        try:
+            if os.path.exists("/etc/os-release"):
+                with open("/etc/os-release", "r") as f:
+                    for line in f:
+                        if line.startswith("VERSION_ID="):
+                            version = line.split("=")[1].strip().strip('"')
+                            # Extract major.minor (e.g., "24.04" from "24.04.1")
+                            parts = version.split(".")
+                            if len(parts) >= 2:
+                                major_minor = f"{parts[0]}{parts[1]}"
+                                # Check if it's 24.04 or newer
+                                if major_minor >= "2404":
+                                    return "ubuntu2404"
+                                else:
+                                    return "ubuntu2204"
+        except Exception:
+            pass
+        
+        # Default to ubuntu2404 for Ubuntu 24.04 base image
+        return "ubuntu2404"
+
+    def _get_archive_target_version(self) -> str:
+        """Get archive target version for CUDA runfile lookups."""
+        ubuntu_version = self._get_ubuntu_version()
+        if ubuntu_version == "ubuntu2404":
+            return "24.04"
+        return "22.04"
+
+    async def _get_repo_packages(self, ubuntu_version: str) -> list:
+        """Fetch and cache NVIDIA CUDA repo package metadata."""
+        if ubuntu_version in self._repo_cache:
+            return self._repo_cache[ubuntu_version]
+
+        base_url = (
+            f"https://developer.download.nvidia.com/compute/cuda/repos/{ubuntu_version}/x86_64"
+        )
+        packages_url = f"{base_url}/Packages.gz"
+        packages_plain_url = f"{base_url}/Packages"
+        packages: list = []
+
+        async with aiohttp.ClientSession() as session:
+            data = None
+            try:
+                async with session.get(packages_url) as response:
+                    if response.status == 200:
+                        compressed = await response.read()
+                        data = gzip.decompress(compressed)
+            except Exception:
+                data = None
+
+            if data is None:
+                try:
+                    async with session.get(packages_plain_url) as response:
+                        if response.status == 200:
+                            data = await response.read()
+                except Exception:
+                    data = None
+
+        if not data:
+            self._repo_cache[ubuntu_version] = []
+            return []
+
+        text = data.decode("utf-8", errors="replace")
+        current = {}
+        for line in text.splitlines():
+            if not line.strip():
+                if current:
+                    packages.append(current)
+                    current = {}
+                continue
+            if line.startswith("Package:"):
+                current["Package"] = line.split(":", 1)[1].strip()
+            elif line.startswith("Version:"):
+                current["Version"] = line.split(":", 1)[1].strip()
+            elif line.startswith("Filename:"):
+                current["Filename"] = line.split(":", 1)[1].strip()
+
+        if current:
+            packages.append(current)
+
+        self._repo_cache[ubuntu_version] = packages
+        return packages
+
+    def _version_key(self, version: str) -> tuple:
+        """Create a sortable key for package version strings."""
+        tokens = re.split(r"[^\w]+", version)
+        key = []
+        for token in tokens:
+            if token.isdigit():
+                key.append(int(token))
+            elif token:
+                key.append(token)
+        return tuple(key)
+
+    def _select_repo_package(
+        self,
+        packages: list,
+        package_name: str,
+        version_prefix: Optional[str] = None,
+        version_contains: Optional[str] = None,
+    ) -> Optional[Dict[str, str]]:
+        """Select the best matching package from repo metadata."""
+        candidates = [
+            pkg for pkg in packages if pkg.get("Package") == package_name
+        ]
+        if version_prefix:
+            candidates = [
+                pkg
+                for pkg in candidates
+                if pkg.get("Version", "").startswith(version_prefix)
+            ]
+        if version_contains:
+            candidates = [
+                pkg
+                for pkg in candidates
+                if version_contains in pkg.get("Version", "")
+            ]
+        if not candidates:
+            return None
+        return max(candidates, key=lambda pkg: self._version_key(pkg.get("Version", "")))
+
+    def _load_state(self) -> Dict[str, Any]:
+        if not os.path.exists(self._state_path):
+            return {}
+        try:
+            with open(self._state_path, "r", encoding="utf-8") as f:
+                data = json.load(f)
+                return data if isinstance(data, dict) else {}
+        except Exception as exc:
+            logger.warning(f"Failed to load CUDA installer state: {exc}")
+            return {}
+
+    def _save_state(self, state: Dict[str, Any]) -> None:
+        tmp_path = f"{self._state_path}.tmp"
+        with open(tmp_path, "w", encoding="utf-8") as f:
+            json.dump(state, f, indent=2)
+        os.replace(tmp_path, self._state_path)
+
+    def _detect_installed_version(self) -> Optional[str]:
+        """Detect installed CUDA version by checking nvcc or state."""
+        # First check state for installed versions
+        state = self._load_state()
+        installations = state.get("installations", {})
+        if installations:
+            # Return the most recently installed version
+            latest_version = None
+            latest_time = None
+            for v, info in installations.items():
+                installed_at = info.get("installed_at", "")
+                if not latest_time or installed_at > latest_time:
+                    latest_time = installed_at
+                    latest_version = v
+            if latest_version:
+                install_path = installations[latest_version].get("path")
+                if install_path and os.path.exists(install_path):
+                    return latest_version
+
+        # Fallback: try to detect via nvcc command
+        try:
+            # Get CUDA environment to find nvcc
+            cuda_env = self.get_cuda_env()
+            env = os.environ.copy()
+            env.update(cuda_env)
+
+            nvcc_path = shutil.which("nvcc", path=env.get("PATH", ""))
+            if not nvcc_path:
+                return None
+
+            result = subprocess.run(
+                [nvcc_path, "--version"],
+                capture_output=True,
+                text=True,
+                timeout=5,
+                env=env,
+            )
+            if result.returncode == 0:
+                # Parse version from output
+                for line in result.stdout.split("\n"):
+                    if "release" in line.lower():
+                        parts = line.split()
+                        for i, part in enumerate(parts):
+                            if "release" in part.lower() and i + 1 < len(parts):
+                                version_str = parts[i + 1].rstrip(",")
+                                # Extract major.minor
+                                version_parts = version_str.split(".")
+                                if len(version_parts) >= 2:
+                                    return f"{version_parts[0]}.{version_parts[1]}"
+        except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
+            pass
+        return None
+
+    def _get_cuda_path(self, version: Optional[str] = None) -> Optional[str]:
+        """Get CUDA installation path."""
+        # First, check the current symlink (most reliable for active installation)
+        current_symlink = os.path.join(self._cuda_install_dir, "current")
+        if os.path.islink(current_symlink) or os.path.exists(current_symlink):
+            try:
+                resolved_path = os.path.realpath(current_symlink)
+                if os.path.exists(resolved_path):
+                    nvcc_path = os.path.join(resolved_path, "bin", "nvcc")
+                    if os.path.exists(nvcc_path):
+                        return resolved_path
+            except (OSError, ValueError):
+                pass
+
+        # Check state for installed versions
+        state = self._load_state()
+        installations = state.get("installations", {})
+
+        # If version specified, return that installation path
+        if version and version in installations:
+            install_path = installations[version].get("path")
+            if install_path and os.path.exists(install_path):
+                return install_path
+
+        # Check for latest installed version in state
+        if installations:
+            # Get the most recently installed version
+            latest_version = None
+            latest_time = None
+            for v, info in installations.items():
+                installed_at = info.get("installed_at", "")
+                if not latest_time or installed_at > latest_time:
+                    latest_time = installed_at
+                    latest_version = v
+
+            if latest_version:
+                install_path = installations[latest_version].get("path")
+                if install_path and os.path.exists(install_path):
+                    return install_path
+
+        # Check environment variables (only accept paths under data directory)
+        env_path = os.environ.get("CUDA_PATH") or os.environ.get("CUDA_HOME")
+        if (
+            env_path
+            and os.path.exists(env_path)
+            and os.path.abspath(env_path).startswith(self._cuda_install_dir)
+        ):
+            return env_path
+
+        # Scan the data directory for CUDA installs as fallback
+        try:
+            if os.path.exists(self._cuda_install_dir):
+                for item in sorted(os.listdir(self._cuda_install_dir), reverse=True):
+                    # Skip the current symlink
+                    if item == "current":
+                        continue
+                    full_path = os.path.join(self._cuda_install_dir, item)
+                    if os.path.isdir(full_path):
+                        nvcc_path = os.path.join(full_path, "bin", "nvcc")
+                        if os.path.exists(nvcc_path):
+                            return full_path
+        except OSError:
+            pass
+
+        return None
+
+    def get_cuda_env(self, version: Optional[str] = None) -> Dict[str, str]:
+        """Get environment variables for CUDA installation."""
+        cuda_path = self._get_cuda_path(version)
+        if not cuda_path:
+            return {}
+
+        cuda_bin = os.path.join(cuda_path, "bin")
+        cuda_lib = os.path.join(cuda_path, "lib64")
+
+        env = {
+            "CUDA_HOME": cuda_path,
+            "CUDA_PATH": cuda_path,
+        }
+
+        # Add to PATH if bin directory exists
+        if os.path.exists(cuda_bin):
+            current_path = os.environ.get("PATH", "")
+            if cuda_bin not in current_path:
+                env["PATH"] = f"{cuda_bin}:{current_path}" if current_path else cuda_bin
+
+        # Add to LD_LIBRARY_PATH if lib64 directory exists
+        if os.path.exists(cuda_lib):
+            current_ld_path = os.environ.get("LD_LIBRARY_PATH", "")
+            if cuda_lib not in current_ld_path:
+                env["LD_LIBRARY_PATH"] = (
+                    f"{cuda_lib}:{current_ld_path}" if current_ld_path else cuda_lib
+                )
+
+        # Add TensorRT path if TensorRT is installed
+        tensorrt_version = self._detect_tensorrt_version(cuda_path)
+        if tensorrt_version:
+            env["TENSORRT_PATH"] = cuda_path
+            env["TENSORRT_ROOT"] = cuda_path
+
+        return env
+
+    def _get_archive_url(self, version: str) -> str:
+        """Get NVIDIA download archive URL for a CUDA version."""
+        # Convert version like "12.8" to "12-8-0" for URL
+        version_parts = version.split(".")
+        major = version_parts[0]
+        minor = version_parts[1] if len(version_parts) > 1 else "0"
+        patch = version_parts[2] if len(version_parts) > 2 else "0"
+        version_slug = f"{major}-{minor}-{patch}"
+        target_version = self._get_archive_target_version()
+
+        return (
+            f"https://developer.nvidia.com/cuda-{version_slug}-download-archive"
+            f"?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version={target_version}&target_type=runfile_local"
+        )
+
+    async def _fetch_download_url(self, version: str) -> str:
+        """Fetch the actual download URL from NVIDIA's archive page."""
+        # Check cache first
+        cache_key = f"{version}_linux_x86_64"
+        if cache_key in self._url_cache:
+            return self._url_cache[cache_key]
+
+        archive_url = self._get_archive_url(version)
+        logger.info(f"Fetching CUDA {version} download URL from {archive_url}")
+
+        async with aiohttp.ClientSession() as session:
+            try:
+                async with session.get(
+                    archive_url, timeout=aiohttp.ClientTimeout(total=30)
+                ) as response:
+                    if response.status != 200:
+                        raise RuntimeError(
+                            f"Failed to fetch archive page: HTTP {response.status}"
+                        )
+
+                    html = await response.text()
+
+                    # The page contains JSON data with download URLs
+                    # The JSON structure has keys like "Linux/x86_64/Ubuntu/24.04/runfile_local"
+                    # The URL is in the "details" field which contains HTML with href attributes
+                    target_version = self._get_archive_target_version()
+                    json_key = f"Linux/x86_64/Ubuntu/{target_version}/runfile_local"
+
+                    # Pattern 1: Look for href in the details field (HTML may be escaped)
+                    # Match: "Linux/x86_64/Ubuntu/<version>/runfile_local":{..."details":"...href=\"URL\"..."}
+                    pattern1 = rf'"{re.escape(json_key)}"[^}}]*"details"[^"]*href[=:][\\"]*([^"\\s<>]+cuda_\d+\.\d+\.\d+_[^"\\s<>]+_linux\.run)'
+                    matches = re.findall(pattern1, html, re.IGNORECASE | re.DOTALL)
+
+                    if not matches:
+                        # Pattern 2: Look for href with escaped quotes (\u0022 or \")
+                        pattern2 = rf'"{re.escape(json_key)}"[^}}]*href[\\u0022=:]*([^"\\s<>]+cuda_\d+\.\d+\.\d+_[^"\\s<>]+_linux\.run)'
+                        matches = re.findall(pattern2, html, re.IGNORECASE | re.DOTALL)
+
+                    if not matches:
+                        # Pattern 3: Look for the filename field and construct URL
+                        pattern3 = rf'"{re.escape(json_key)}"[^}}]*"filename"[^"]*"([^"]+_linux\.run)"'
+                        filename_matches = re.findall(pattern3, html, re.IGNORECASE)
+                        if filename_matches:
+                            filename = filename_matches[0]
+                            version_full = f"{version}.0"
+                            url = f"https://developer.download.nvidia.com/compute/cuda/{version_full}/local_installers/{filename}"
+                            matches = [url]
+
+                    if not matches:
+                        # Pattern 4: Fallback - look for any URL matching the pattern
+                        version_escaped = version.replace(".", r"\.")
+                        pattern4 = rf'https://developer\.download\.nvidia\.com/compute/cuda/{version_escaped}\.0/local_installers/cuda_{version_escaped}\.0_[^"\'\s<>]+_linux\.run'
+                        matches = re.findall(pattern4, html, re.IGNORECASE)
+
+                    if matches:
+                        url = matches[0]
+                        # Cache it
+                        self._url_cache[cache_key] = url
+                        logger.info(f"Found CUDA {version} download URL: {url}")
+                        return url
+                    else:
+                        raise RuntimeError(
+                            f"Could not find download URL for CUDA {version} on archive page"
+                        )
+
+            except aiohttp.ClientError as e:
+                raise RuntimeError(f"Failed to fetch archive page: {e}")
+
+    async def _broadcast_log_line(self, line: str) -> None:
+        try:
+            await get_progress_manager().broadcast(
+                {
+                    "type": "cuda_install_log",
+                    "line": line,
+                    "timestamp": _utcnow(),
+                }
+            )
+        except Exception as exc:
+            logger.debug(f"Failed to broadcast CUDA log line: {exc}")
+
+    async def _broadcast_progress(self, progress: Dict[str, Any]) -> None:
+        """Broadcast progress updates, throttled to 1 second intervals."""
+        try:
+            current_time = time.time()
+            progress_value = progress.get("progress", 0)
+            is_complete = progress_value >= 100
+            
+            # Always send completion updates immediately
+            if is_complete:
+                await get_progress_manager().broadcast(
+                    {
+                        "type": "cuda_install_progress",
+                        **progress,
+                        "timestamp": _utcnow(),
+                    }
+                )
+                self._last_progress_broadcast_time = current_time
+                self._pending_progress = None
+                return
+            
+            # Always send the first few updates immediately (first 3 updates)
+            # then throttle to 1 second intervals
+            is_first_update = self._last_progress_broadcast_time == 0.0
+            time_since_last_broadcast = current_time - self._last_progress_broadcast_time
+            is_early_update = self._progress_broadcast_count < 3
+            should_send = is_first_update or is_early_update or time_since_last_broadcast >= 1.0
+            
+            if should_send:
+                await get_progress_manager().broadcast(
+                    {
+                        "type": "cuda_install_progress",
+                        **progress,
+                        "timestamp": _utcnow(),
+                    }
+                )
+                self._last_progress_broadcast_time = current_time
+                self._pending_progress = None
+                self._progress_broadcast_count += 1
+            else:
+                # Store the latest progress data for next send
+                self._pending_progress = progress
+        except Exception as exc:
+            logger.exception(f"Failed to broadcast CUDA progress: {exc}")
+
+    async def _set_operation(self, operation: str) -> None:
+        self._operation = operation
+        self._operation_started_at = _utcnow()
+        self._last_error = None
+        await get_progress_manager().broadcast(
+            {
+                "type": "cuda_install_status",
+                "status": operation,
+                "started_at": self._operation_started_at,
+            }
+        )
+
+    async def _finish_operation(self, success: bool, message: str = "") -> None:
+        payload = {
+            "type": "cuda_install_status",
+            "status": "completed" if success else "failed",
+            "operation": self._operation,
+            "message": message,
+            "ended_at": _utcnow(),
+        }
+        await get_progress_manager().broadcast(payload)
+        self._operation = None
+        self._operation_started_at = None
+
+    def _create_task(self, coro: Awaitable[Any]) -> None:
+        loop = asyncio.get_running_loop()
+        task = loop.create_task(coro)
+        self._current_task = task
+
+        def _cleanup(fut: asyncio.Future) -> None:
+            try:
+                fut.result()
+            except Exception as exc:
+                logger.exception("CUDA installer task error")
+            finally:
+                self._current_task = None
+
+        task.add_done_callback(_cleanup)
+
+    async def _download_installer(
+        self, version: str, url: str, installer_path: str
+    ) -> None:
+        """Download CUDA installer with progress tracking."""
+        # Check if installer already exists
+        if os.path.exists(installer_path):
+            file_size = os.path.getsize(installer_path)
+            file_size_mb = file_size / (1024 * 1024)
+            
+            # Verify existing file is not corrupted (should be at least 100MB for CUDA installers)
+            if file_size < 100 * 1024 * 1024:
+                await self._broadcast_log_line(
+                    f"Existing installer file appears corrupted (too small: {file_size_mb:.1f} MB), re-downloading..."
+                )
+                try:
+                    os.remove(installer_path)
+                except OSError:
+                    pass
+            else:
+                # Verify the file is actually valid and matches expected size from server
+                try:
+                    # First, check if it's a valid shell script
+                    with open(installer_path, "rb") as f:
+                        header = f.read(100)
+                        if not header.startswith(b"#!/"):
+                            await self._broadcast_log_line(
+                                f"Existing installer file is not a valid shell script, re-downloading..."
+                            )
+                            try:
+                                os.remove(installer_path)
+                            except OSError:
+                                pass
+                        else:
+                            # File appears valid, now verify size matches server expectation
+                            # Fetch the expected file size from the server
+                            try:
+                                async with aiohttp.ClientSession() as session:
+                                    async with session.head(url, allow_redirects=True) as head_response:
+                                        expected_size = int(head_response.headers.get("Content-Length", 0))
+                                        
+                                        if expected_size > 0:
+                                            # Verify file size matches (with small tolerance)
+                                            size_diff = abs(file_size - expected_size)
+                                            if size_diff > 1024:  # Allow 1KB tolerance
+                                                await self._broadcast_log_line(
+                                                    f"Existing installer file size mismatch: expected {expected_size / (1024*1024):.1f} MB, "
+                                                    f"got {file_size_mb:.1f} MB (difference: {size_diff} bytes). Re-downloading..."
+                                                )
+                                                try:
+                                                    os.remove(installer_path)
+                                                except OSError:
+                                                    pass
+                                            else:
+                                                # File size matches, verify it's stable (not currently being written)
+                                                await asyncio.sleep(0.2)  # Brief pause to ensure file is fully written if being written
+                                                new_size = os.path.getsize(installer_path)
+                                                if new_size != file_size:
+                                                    await self._broadcast_log_line(
+                                                        f"File size changed during verification (was {file_size_mb:.1f} MB, now {new_size / (1024*1024):.1f} MB), "
+                                                        f"file may still be downloading. Re-downloading..."
+                                                    )
+                                                    try:
+                                                        os.remove(installer_path)
+                                                    except OSError:
+                                                        pass
+                                                else:
+                                                    await self._broadcast_log_line(
+                                                        f"Installer file already exists and verified: {installer_path} ({file_size_mb:.1f} MB)"
+                                                    )
+                                                    await self._broadcast_progress(
+                                                        {
+                                                            "stage": "download",
+                                                            "progress": 100,
+                                                            "message": f"Using existing installer file ({file_size_mb:.1f} MB)",
+                                                        }
+                                                    )
+                                                    return
+                                        else:
+                                            # Couldn't get expected size, but file looks valid - use it
+                                            await self._broadcast_log_line(
+                                                f"Installer file already exists: {installer_path} ({file_size_mb:.1f} MB). "
+                                                f"Could not verify size from server, but file appears valid."
+                                            )
+                                            await self._broadcast_progress(
+                                                {
+                                                    "stage": "download",
+                                                    "progress": 100,
+                                                    "message": f"Using existing installer file ({file_size_mb:.1f} MB)",
+                                                }
+                                            )
+                                            return
+                            except Exception as size_check_error:
+                                # If we can't verify size from server, but file looks valid, use it
+                                await self._broadcast_log_line(
+                                    f"Could not verify file size from server: {size_check_error}. "
+                                    f"File appears valid, using existing file: {installer_path} ({file_size_mb:.1f} MB)"
+                                )
+                                await self._broadcast_progress(
+                                    {
+                                        "stage": "download",
+                                        "progress": 100,
+                                        "message": f"Using existing installer file ({file_size_mb:.1f} MB)",
+                                    }
+                                )
+                                return
+                except (OSError, IOError) as e:
+                    await self._broadcast_log_line(
+                        f"Failed to verify existing installer file: {e}, re-downloading..."
+                    )
+                    try:
+                        os.remove(installer_path)
+                    except OSError:
+                        pass
+
+        # Reset logging state for new download
+        self._last_logged_percentage = -1
+        self._last_progress_broadcast_time = 0.0
+        self._pending_progress = None
+        self._progress_broadcast_count = 0
+
+        log_header = f"[{_utcnow()}] Downloading CUDA {version} installer from {url}\n"
+        with open(self._log_path, "w", encoding="utf-8") as log_file:
+            log_file.write(log_header)
+
+        await self._broadcast_log_line(
+            f"Starting download of CUDA {version} installer..."
+        )
+        await self._broadcast_progress(
+            {
+                "stage": "download",
+                "progress": 0,
+                "message": f"Downloading CUDA {version} installer...",
+            }
+        )
+
+        # Configure timeout for large file downloads:
+        # - total: 1 hour (3600s) for very large files and slow connections
+        # - connect: 30s to establish connection
+        # - sock_read: 5 minutes (300s) to allow for slow network during chunk reads
+        timeout = aiohttp.ClientTimeout(
+            total=3600,  # 1 hour total timeout
+            connect=30,  # 30 seconds to connect
+            sock_read=300,  # 5 minutes per read operation
+        )
+
+        downloaded = 0
+        total_size = 0
+        try:
+            async with aiohttp.ClientSession(timeout=timeout) as session:
+                async with session.get(url) as response:
+                    response.raise_for_status()
+                    total_size = int(response.headers.get("Content-Length", 0))
+
+                    async with aiofiles.open(installer_path, "wb") as f:
+                        async for chunk in response.content.iter_chunked(8192):
+                            await f.write(chunk)
+                            downloaded += len(chunk)
+
+                        if total_size > 0:
+                            progress = int((downloaded / total_size) * 100)
+                            # Format sizes in MB
+                            downloaded_mb = downloaded / (1024 * 1024)
+                            total_mb = total_size / (1024 * 1024)
+                            await self._broadcast_progress(
+                                {
+                                    "stage": "download",
+                                    "progress": progress,
+                                    "message": f"Downloading CUDA {version} installer... ({downloaded_mb:.1f}/{total_mb:.1f} MB)",
+                                    "bytes_downloaded": downloaded,
+                                    "total_bytes": total_size,
+                                }
+                            )
+
+                        # Log progress only at key percentage milestones (10%, 25%, 50%, 75%, 90%, 100%)
+                        # Only log when we cross a milestone, not when we're within it
+                        should_log = False
+
+                        # Check if we've crossed a key percentage milestone
+                        if total_size > 0:
+                            progress = int((downloaded / total_size) * 100)
+                            if progress != self._last_logged_percentage and progress in [
+                                10,
+                                25,
+                                50,
+                                75,
+                                90,
+                                100,
+                            ]:
+                                should_log = True
+                                self._last_logged_percentage = progress
+
+                        if should_log:
+                            downloaded_mb = downloaded / (1024 * 1024)
+                            total_mb = total_size / (1024 * 1024)
+                            log_line = f"Downloaded {downloaded_mb:.1f}/{total_mb:.1f} MB ({progress}%)\n"
+                            with open(
+                                self._log_path, "a", encoding="utf-8"
+                            ) as log_file:
+                                log_file.write(log_line)
+                            await self._broadcast_log_line(
+                                f"Downloaded {downloaded_mb:.1f} MB / {total_mb:.1f} MB ({progress}%)"
+                            )
+                
+                # File is automatically flushed when the context manager exits
+        except asyncio.TimeoutError as e:
+            # Clean up partial download on timeout
+            if os.path.exists(installer_path):
+                try:
+                    os.remove(installer_path)
+                except OSError:
+                    pass
+            downloaded_mb = downloaded / (1024 * 1024) if downloaded > 0 else 0
+            total_mb = total_size / (1024 * 1024) if total_size > 0 else 0
+            error_msg = (
+                f"Download timeout: Failed to download CUDA {version} installer. "
+                f"Downloaded {downloaded_mb:.1f} MB of {total_mb:.1f} MB. "
+                f"This may be due to a slow network connection. Please try again."
+            )
+            await self._broadcast_log_line(error_msg)
+            raise RuntimeError(error_msg) from e
+        except aiohttp.ClientError as e:
+            # Clean up partial download on client error
+            if os.path.exists(installer_path):
+                try:
+                    os.remove(installer_path)
+                except OSError:
+                    pass
+            error_msg = (
+                f"Network error while downloading CUDA {version} installer: {e}. "
+                f"Please check your network connection and try again."
+            )
+            await self._broadcast_log_line(error_msg)
+            raise RuntimeError(error_msg) from e
+
+        # Wait a brief moment to ensure file system has fully written the file
+        # This helps ensure the file is completely written to disk before verification
+        await asyncio.sleep(0.5)
+        
+        # Verify downloaded file exists and is complete
+        if not os.path.exists(installer_path):
+            raise RuntimeError(f"Downloaded file not found: {installer_path}")
+        
+        # Verify file size matches expected size (with a small tolerance for filesystem differences)
+        actual_size = os.path.getsize(installer_path)
+        if total_size > 0:
+            size_diff = abs(actual_size - total_size)
+            if size_diff > 1024:  # Allow 1KB tolerance for filesystem differences
+                raise RuntimeError(
+                    f"Downloaded file size mismatch: expected {total_size} bytes, "
+                    f"got {actual_size} bytes (difference: {size_diff} bytes). File may be corrupted or incomplete."
+                )
+        
+        if actual_size < 100 * 1024 * 1024:  # Less than 100MB is suspicious
+            raise RuntimeError(
+                f"Downloaded file appears to be corrupted or incomplete: "
+                f"{installer_path} (size: {actual_size} bytes)"
+            )
+        
+        # Verify the file is a valid shell script (CUDA .run files are self-extracting)
+        try:
+            with open(installer_path, "rb") as verify_file:
+                header = verify_file.read(100)
+                if not header.startswith(b"#!/"):
+                    raise RuntimeError(
+                        f"Downloaded file does not appear to be a valid shell script: {installer_path}"
+                    )
+        except Exception as e:
+            raise RuntimeError(
+                f"Failed to verify downloaded file integrity: {installer_path}, error: {e}"
+            )
+        
+        await self._broadcast_log_line(
+            f"Download completed and verified: {installer_path} ({actual_size / (1024*1024):.1f} MB)"
+        )
+        await self._broadcast_progress(
+            {
+                "stage": "download",
+                "progress": 100,
+                "message": "Download completed and verified",
+            }
+        )
+
+    def _is_docker_container(self) -> bool:
+        """Check if running inside a Docker container."""
+        # Check for Docker-specific files
+        docker_indicators = [
+            "/.dockerenv",
+            "/proc/self/cgroup",
+        ]
+
+        # Check /.dockerenv
+        if os.path.exists("/.dockerenv"):
+            return True
+
+        # Check /proc/self/cgroup for Docker
+        try:
+            if os.path.exists("/proc/self/cgroup"):
+                with open("/proc/self/cgroup", "r") as f:
+                    content = f.read()
+                    if "docker" in content or "containerd" in content:
+                        return True
+        except (OSError, IOError):
+            pass
+
+        return False
+
+    async def _install_linux(
+        self,
+        installer_path: str,
+        version: str,
+        install_cudnn: bool = False,
+        install_tensorrt: bool = False,
+    ) -> str:
+        """
+        Install CUDA on Linux using runfile installer.
+        
+        Uses optimized installer options for custom location installation:
+        - Silent installation with EULA acceptance
+        - Toolkit-only installation (no driver)
+        - Override installation checks for custom paths
+        - Skip OpenGL libraries (not needed in Docker/headless environments)
+        - Skip man pages to reduce installation size
+        
+        Args:
+            installer_path: Path to the CUDA installer runfile
+            version: CUDA version being installed
+            install_cudnn: Whether to install cuDNN
+            install_tensorrt: Whether to install TensorRT
+        """
+        await self._broadcast_log_line("Starting CUDA installation on Linux...")
+        await self._broadcast_progress(
+            {
+                "stage": "install",
+                "progress": 0,
+                "message": "Installing CUDA Toolkit...",
+            }
+        )
+
+        # Verify installer file exists and is not corrupted
+        if not os.path.exists(installer_path):
+            raise RuntimeError(f"Installer file not found: {installer_path}")
+        
+        file_size = os.path.getsize(installer_path)
+        if file_size < 100 * 1024 * 1024:  # Less than 100MB is suspicious for CUDA installers
+            raise RuntimeError(
+                f"Installer file appears to be corrupted or incomplete: {installer_path} "
+                f"(size: {file_size / (1024*1024):.1f} MB, expected > 100 MB)"
+            )
+        
+        # Verify the file starts with a shell script header (CUDA .run files are self-extracting)
+        try:
+            with open(installer_path, "rb") as f:
+                header = f.read(100)
+                if not header.startswith(b"#!/"):
+                    raise RuntimeError(
+                        f"Installer file does not appear to be a valid shell script: {installer_path}"
+                    )
+        except Exception as e:
+            raise RuntimeError(
+                f"Failed to verify installer file: {installer_path}, error: {e}"
+            )
+        
+        await self._broadcast_log_line(
+            f"Verifying installer file: {installer_path} ({file_size / (1024*1024):.1f} MB)"
+        )
+
+        # Make installer executable
+        os.chmod(installer_path, 0o755)
+
+        # Always install to the data directory for persistence
+        install_path = os.path.join(self._cuda_install_dir, f"cuda-{version}")
+        await self._broadcast_log_line(f"Installing to data directory: {install_path}")
+        os.makedirs(install_path, exist_ok=True)
+
+        # Build installer arguments with optimized options for custom location installation
+        # 
+        # Selected options based on NVIDIA CUDA installer documentation:
+        # - --silent: Required for silent installation, implies EULA acceptance
+        # - --toolkit: Install toolkit only (not driver) - required for non-root installations
+        # - --override: Override compiler, third-party library, and toolkit detection checks
+        #   (essential for custom installation paths)
+        # - --toolkitpath: Install to custom data directory path
+        # - --no-opengl-libs: Skip OpenGL libraries (not needed in Docker/headless environments)
+        # - --no-man-page: Skip man pages to reduce installation size
+        #
+        install_args = [
+            "bash",
+            installer_path,
+            "--silent",                    # Silent installation with EULA acceptance
+            "--toolkit",                   # Install toolkit only (not driver)
+            "--override",                  # Override installation checks for custom paths
+            f"--toolkitpath={install_path}", # Install to custom data directory
+            "--no-opengl-libs",            # Skip OpenGL libraries (not needed in Docker)
+            "--no-man-page",               # Skip man pages to reduce size
+        ]
+        
+        await self._broadcast_log_line(f"Installer arguments: {' '.join(install_args[2:])}")  # Skip 'bash' and installer_path
+
+        # Set up environment to prevent /dev/tty access issues in Docker
+        env = os.environ.copy()
+        env["DEBIAN_FRONTEND"] = "noninteractive"
+        # Disable interactive prompts
+        env["PERL_BADLANG"] = "0"
+        # Ensure we're in a non-interactive environment
+        env["TERM"] = "dumb"
+        # Prevent installer from trying to access /dev/tty
+        env["CI"] = "true"  # Indicate we're in a CI/non-interactive environment
+
+        process = await asyncio.create_subprocess_exec(
+            *install_args,
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.STDOUT,
+            stdin=asyncio.subprocess.DEVNULL,  # Redirect stdin to prevent /dev/tty access
+            env=env,
+        )
+
+        # Collect output for error analysis
+        output_lines = []
+        
+        async def _stream_output():
+            if process.stdout is None:
+                return
+            with open(self._log_path, "a", encoding="utf-8", buffering=1) as log_file:
+                while True:
+                    chunk = await process.stdout.readline()
+                    if not chunk:
+                        break
+                    text = chunk.decode("utf-8", errors="replace")
+                    output_lines.append(text)
+                    log_file.write(text)
+                    await self._broadcast_log_line(text.rstrip("\n"))
+
+        await asyncio.gather(process.wait(), _stream_output())
+
+        if process.returncode != 0:
+            # Check for specific error patterns
+            output_text = "".join(output_lines)
+            
+            # Check for /dev/tty errors
+            if "/dev/tty" in output_text.lower() or "cannot create /dev/tty" in output_text.lower():
+                error_msg = (
+                    f"CUDA installer failed due to /dev/tty access issue (common in Docker). "
+                    f"This may indicate the installer file is corrupted or the environment is not properly configured. "
+                    f"Exit code: {process.returncode}. "
+                    f"Please check the installation logs for details. "
+                    f"If the file appears corrupted, try deleting it and re-downloading."
+                )
+            # Check for gzip/corruption errors
+            elif "gzip" in output_text.lower() and ("unexpected end" in output_text.lower() or "corrupt" in output_text.lower()):
+                error_msg = (
+                    f"CUDA installer file appears to be corrupted (gzip error detected). "
+                    f"Please delete the installer file at {installer_path} and try again. "
+                    f"Exit code: {process.returncode}."
+                )
+            else:
+                error_msg = (
+                    f"CUDA installer exited with code {process.returncode}. "
+                    "Please check the installation logs for details."
+                )
+            
+            raise RuntimeError(error_msg)
+
+        # Verify installation and set up environment
+        cuda_home = install_path
+        cuda_bin = os.path.join(cuda_home, "bin")
+        cuda_lib = os.path.join(cuda_home, "lib64")
+
+        # Verify key directories exist
+        if not os.path.exists(cuda_bin) or not os.path.exists(cuda_lib):
+            raise RuntimeError(
+                f"CUDA installation completed but expected directories not found. "
+                f"Expected: {cuda_bin}, {cuda_lib}"
+            )
+
+        await self._broadcast_log_line(
+            f"CUDA installed successfully to: {install_path}"
+        )
+        await self._broadcast_log_line(f"CUDA_HOME={cuda_home}")
+        await self._broadcast_log_line(f"Adding to PATH: {cuda_bin}")
+        await self._broadcast_log_line(f"Adding to LD_LIBRARY_PATH: {cuda_lib}")
+
+        # Install NCCL (required for multi-GPU and llama.cpp CUDA builds)
+        await self._install_nccl_linux(version, install_path)
+
+        # Install nvidia-smi (required for GPU monitoring)
+        await self._install_nvidia_smi_linux(install_path)
+
+        # Install cuDNN if requested
+        if install_cudnn:
+            await self._install_cudnn_linux(version, install_path)
+
+        # Install TensorRT if requested
+        if install_tensorrt:
+            await self._install_tensorrt_linux(version, install_path)
+
+        # Save installation path to state
+        state = self._load_state()
+        if "installations" not in state:
+            state["installations"] = {}
+        state["installations"][version] = {
+            "path": install_path,
+            "installed_at": _utcnow(),
+            "is_system_install": False,
+            "cudnn_installed": install_cudnn,
+            "tensorrt_installed": install_tensorrt,
+        }
+        self._save_state(state)
+
+        # Update the current symlink to point to this installation
+        self._update_current_symlink(install_path)
+        await self._broadcast_log_line(
+            f"Updated CUDA current symlink: /app/data/cuda/current -> {install_path}"
+        )
+
+        components = ["CUDA", "NCCL", "nvidia-smi"]
+        if install_cudnn:
+            components.append("cuDNN")
+        if install_tensorrt:
+            components.append("TensorRT")
+
+        await self._broadcast_progress(
+            {
+                "stage": "install",
+                "progress": 100,
+                "message": f"{', '.join(components)} installation completed",
+            }
+        )
+
+        return install_path
+
+    async def _install_nccl_linux(self, cuda_version: str, cuda_path: str) -> None:
+        """Install NCCL library for multi-GPU support."""
+        await self._broadcast_log_line(
+            "Installing NCCL (NVIDIA Collective Communications Library)..."
+        )
+        await self._broadcast_progress(
+            {
+                "stage": "nccl",
+                "progress": 0,
+                "message": "Installing NCCL...",
+            }
+        )
+
+        ubuntu_version = self._get_ubuntu_version()
+
+        # Download NCCL from NVIDIA's repo package index
+        await self._broadcast_log_line("Attempting manual NCCL installation...")
+
+        try:
+            cuda_major = cuda_version.split(".")[0]
+            packages = await self._get_repo_packages(ubuntu_version)
+            nccl_pkg = self._select_repo_package(
+                packages,
+                "libnccl2",
+                version_prefix="2.",
+                version_contains=f"+cuda{cuda_major}",
+            )
+            nccl_dev_pkg = self._select_repo_package(
+                packages,
+                "libnccl-dev",
+                version_prefix="2.",
+                version_contains=f"+cuda{cuda_major}",
+            )
+
+            if not nccl_pkg or not nccl_dev_pkg:
+                await self._broadcast_log_line(
+                    "NCCL packages not found in repository, skipping NCCL installation"
+                )
+                await self._broadcast_progress(
+                    {
+                        "stage": "nccl",
+                        "progress": 100,
+                        "message": "NCCL installation skipped (optional)",
+                    }
+                )
+                return
+
+            base_url = (
+                f"https://developer.download.nvidia.com/compute/cuda/repos/{ubuntu_version}/x86_64/"
+            )
+            nccl_url = base_url + nccl_pkg.get("Filename", "").lstrip("./")
+            nccl_dev_url = base_url + nccl_dev_pkg.get("Filename", "").lstrip("./")
+
+            nccl_path = os.path.join(self._download_dir, "libnccl2.deb")
+            nccl_dev_path = os.path.join(self._download_dir, "libnccl-dev.deb")
+
+            await self._broadcast_progress(
+                {
+                    "stage": "nccl",
+                    "progress": 25,
+                    "message": "Downloading NCCL packages...",
+                }
+            )
+
+            # Download NCCL packages
+            async with aiohttp.ClientSession() as session:
+                for url, path, name in [
+                    (nccl_url, nccl_path, "libnccl2"),
+                    (nccl_dev_url, nccl_dev_path, "libnccl-dev"),
+                ]:
+                    try:
+                        await self._broadcast_log_line(f"Downloading {name}...")
+                        async with session.get(url) as response:
+                            if response.status == 200:
+                                async with aiofiles.open(path, "wb") as f:
+                                    await f.write(await response.read())
+                                await self._broadcast_log_line(f"Downloaded {name}")
+                            else:
+                                await self._broadcast_log_line(
+                                    f"Failed to download {name}: HTTP {response.status}"
+                                )
+                                # Try alternative URL with different NCCL version
+                                continue
+                    except Exception as download_err:
+                        await self._broadcast_log_line(
+                            f"Download error for {name}: {download_err}"
+                        )
+                        continue
+
+            await self._broadcast_progress(
+                {
+                    "stage": "nccl",
+                    "progress": 50,
+                    "message": "Installing NCCL packages...",
+                }
+            )
+
+            if os.path.exists(nccl_path):
+                await self._broadcast_log_line(
+                    "Extracting NCCL to CUDA directory..."
+                )
+
+                # Extract .deb file (it's an ar archive containing data.tar)
+                extract_dir = os.path.join(self._download_dir, "nccl_extract")
+                os.makedirs(extract_dir, exist_ok=True)
+
+                for deb_path in [nccl_path, nccl_dev_path]:
+                    if os.path.exists(deb_path):
+                        # Extract using ar and tar
+                        extract_process = await asyncio.create_subprocess_exec(
+                            "bash",
+                            "-c",
+                            f"cd {extract_dir} && ar x {deb_path} && tar xf data.tar.* 2>/dev/null || tar xf data.tar 2>/dev/null",
+                            stdout=asyncio.subprocess.PIPE,
+                            stderr=asyncio.subprocess.STDOUT,
+                        )
+                        await extract_process.wait()
+
+                # Copy NCCL files to CUDA installation
+                nccl_lib_src = os.path.join(
+                    extract_dir, "usr", "lib", "x86_64-linux-gnu"
+                )
+                nccl_include_src = os.path.join(extract_dir, "usr", "include")
+
+                cuda_lib_dst = os.path.join(cuda_path, "lib64")
+                cuda_include_dst = os.path.join(cuda_path, "include")
+
+                if os.path.exists(nccl_lib_src):
+                    # First pass: collect files and symlinks, copy actual files first
+                    files_to_copy = []
+                    symlinks_to_create = []
+                    
+                    for f in os.listdir(nccl_lib_src):
+                        if "nccl" in f.lower():
+                            src = os.path.join(nccl_lib_src, f)
+                            dst = os.path.join(cuda_lib_dst, f)
+                            
+                            if os.path.islink(src):
+                                # Resolve symlink to find actual target
+                                link_target = os.readlink(src)
+                                # If relative symlink, resolve relative to source directory
+                                if not os.path.isabs(link_target):
+                                    link_target = os.path.normpath(
+                                        os.path.join(os.path.dirname(src), link_target)
+                                    )
+                                # Find the actual target file name
+                                actual_target = os.path.basename(link_target)
+                                symlinks_to_create.append((f, actual_target, dst))
+                            else:
+                                files_to_copy.append((f, src, dst))
+                    
+                    # Copy all actual files first
+                    for f, src, dst in files_to_copy:
+                        try:
+                            shutil.copy2(src, dst)
+                            await self._broadcast_log_line(
+                                f"Copied {f} to CUDA lib directory"
+                            )
+                        except Exception as copy_err:
+                            await self._broadcast_log_line(
+                                f"Failed to copy {f}: {copy_err}"
+                            )
+                    
+                    # Then create symlinks pointing to the copied files
+                    for link_name, target_name, dst in symlinks_to_create:
+                        try:
+                            if os.path.exists(dst):
+                                os.remove(dst)
+                            # Create symlink pointing to target in same directory
+                            os.symlink(target_name, dst)
+                            await self._broadcast_log_line(
+                                f"Created symlink {link_name} -> {target_name} in CUDA lib directory"
+                            )
+                        except Exception as link_err:
+                            await self._broadcast_log_line(
+                                f"Failed to create symlink {link_name}: {link_err}"
+                            )
+
+                if os.path.exists(nccl_include_src):
+                    for f in os.listdir(nccl_include_src):
+                        if "nccl" in f.lower():
+                            src = os.path.join(nccl_include_src, f)
+                            dst = os.path.join(cuda_include_dst, f)
+                            try:
+                                if os.path.isdir(src):
+                                    # Handle directories by copying recursively
+                                    if os.path.exists(dst):
+                                        shutil.rmtree(dst)
+                                    shutil.copytree(src, dst)
+                                    await self._broadcast_log_line(
+                                        f"Copied directory {f} to CUDA include directory"
+                                    )
+                                else:
+                                    # Handle regular files
+                                    shutil.copy2(src, dst)
+                                    await self._broadcast_log_line(
+                                        f"Copied {f} to CUDA include directory"
+                                    )
+                            except Exception as copy_err:
+                                await self._broadcast_log_line(
+                                    f"Failed to copy {f}: {copy_err}"
+                                )
+
+                # Cleanup temporary extract directory only (keep .deb files)
+                shutil.rmtree(extract_dir, ignore_errors=True)
+                # Keep .deb files for future use
+                logger.info(f"NCCL packages kept at: {nccl_path}, {nccl_dev_path}")
+
+                await self._broadcast_log_line("NCCL extracted to CUDA directory")
+                await self._broadcast_progress(
+                    {
+                        "stage": "nccl",
+                        "progress": 100,
+                        "message": "NCCL installed successfully",
+                    }
+                )
+            else:
+                await self._broadcast_log_line(
+                    "NCCL packages not available, skipping NCCL installation"
+                )
+                await self._broadcast_log_line(
+                    "Note: NCCL is optional but recommended for multi-GPU builds"
+                )
+                await self._broadcast_progress(
+                    {
+                        "stage": "nccl",
+                        "progress": 100,
+                        "message": "NCCL installation skipped (optional)",
+                    }
+                )
+
+        except Exception as e:
+            await self._broadcast_log_line(f"NCCL installation error: {e}")
+            await self._broadcast_log_line(
+                "Note: NCCL is optional. The build will continue without multi-GPU support."
+            )
+            await self._broadcast_progress(
+                {
+                    "stage": "nccl",
+                    "progress": 100,
+                    "message": "NCCL installation skipped (optional)",
+                }
+            )
+
+    async def _install_nvidia_smi_linux(self, cuda_path: str) -> None:
+        """Install nvidia-smi binary for GPU monitoring."""
+        await self._broadcast_log_line(
+            "Installing nvidia-smi (NVIDIA System Management Interface)..."
+        )
+        await self._broadcast_progress(
+            {
+                "stage": "nvidia-smi",
+                "progress": 0,
+                "message": "Installing nvidia-smi...",
+            }
+        )
+
+        # Check if nvidia-smi already exists in CUDA installation
+        cuda_bin = os.path.join(cuda_path, "bin")
+        nvidia_smi_dst = os.path.join(cuda_bin, "nvidia-smi")
+        if os.path.exists(nvidia_smi_dst):
+            await self._broadcast_log_line(
+                "nvidia-smi already exists in CUDA installation, skipping"
+            )
+            await self._broadcast_progress(
+                {
+                    "stage": "nvidia-smi",
+                    "progress": 100,
+                    "message": "nvidia-smi already installed",
+                }
+            )
+            return
+
+        ubuntu_version = self._get_ubuntu_version()
+
+        try:
+            # Try to find nvidia-utils package which contains nvidia-smi
+            packages = await self._get_repo_packages(ubuntu_version)
+            nvidia_utils_pkg = None
+            
+            # Try multiple package name patterns
+            for pkg_name in ["nvidia-utils", "nvidia-driver-utils", "nvidia-utils-"]:
+                nvidia_utils_pkg = self._select_repo_package(
+                    packages,
+                    pkg_name,
+                )
+                if nvidia_utils_pkg:
+                    break
+
+            if not nvidia_utils_pkg:
+                await self._broadcast_log_line(
+                    "nvidia-utils package not found in repository, skipping nvidia-smi installation"
+                )
+                await self._broadcast_log_line(
+                    "Note: nvidia-smi will not be available. GPU monitoring may be limited."
+                )
+                await self._broadcast_progress(
+                    {
+                        "stage": "nvidia-smi",
+                        "progress": 100,
+                        "message": "nvidia-smi installation skipped (package not available)",
+                    }
+                )
+                return
+
+            base_url = (
+                f"https://developer.download.nvidia.com/compute/cuda/repos/{ubuntu_version}/x86_64/"
+            )
+            nvidia_utils_url = base_url + nvidia_utils_pkg.get("Filename", "").lstrip("./")
+
+            nvidia_utils_path = os.path.join(self._download_dir, "nvidia-utils.deb")
+
+            await self._broadcast_progress(
+                {
+                    "stage": "nvidia-smi",
+                    "progress": 25,
+                    "message": "Downloading nvidia-utils package...",
+                }
+            )
+
+            # Download nvidia-utils package
+            async with aiohttp.ClientSession() as session:
+                try:
+                    await self._broadcast_log_line("Downloading nvidia-utils...")
+                    async with session.get(nvidia_utils_url) as response:
+                        if response.status == 200:
+                            async with aiofiles.open(nvidia_utils_path, "wb") as f:
+                                await f.write(await response.read())
+                            await self._broadcast_log_line("Downloaded nvidia-utils")
+                        else:
+                            await self._broadcast_log_line(
+                                f"Failed to download nvidia-utils: HTTP {response.status}"
+                            )
+                            raise RuntimeError(f"Failed to download nvidia-utils: HTTP {response.status}")
+                except Exception as download_err:
+                    await self._broadcast_log_line(
+                        f"Download error for nvidia-utils: {download_err}"
+                    )
+                    raise
+
+            await self._broadcast_progress(
+                {
+                    "stage": "nvidia-smi",
+                    "progress": 50,
+                    "message": "Extracting nvidia-smi...",
+                }
+            )
+
+            if os.path.exists(nvidia_utils_path):
+                await self._broadcast_log_line(
+                    "Extracting nvidia-smi to CUDA directory..."
+                )
+
+                # Extract .deb file
+                extract_dir = os.path.join(self._download_dir, "nvidia_utils_extract")
+                os.makedirs(extract_dir, exist_ok=True)
+
+                # Extract using ar and tar
+                extract_process = await asyncio.create_subprocess_exec(
+                    "bash",
+                    "-c",
+                    f"cd {extract_dir} && ar x {nvidia_utils_path} && tar xf data.tar.* 2>/dev/null || tar xf data.tar 2>/dev/null",
+                    stdout=asyncio.subprocess.PIPE,
+                    stderr=asyncio.subprocess.STDOUT,
+                )
+                await extract_process.wait()
+
+                # Copy nvidia-smi binary to CUDA installation
+                nvidia_smi_src = os.path.join(extract_dir, "usr", "bin", "nvidia-smi")
+                cuda_bin_dst = os.path.join(cuda_path, "bin")
+                nvidia_smi_dst = os.path.join(cuda_bin_dst, "nvidia-smi")
+
+                if os.path.exists(nvidia_smi_src):
+                    os.makedirs(cuda_bin_dst, exist_ok=True)
+                    try:
+                        shutil.copy2(nvidia_smi_src, nvidia_smi_dst)
+                        os.chmod(nvidia_smi_dst, 0o755)
+                        await self._broadcast_log_line(
+                            "Copied nvidia-smi to CUDA bin directory"
+                        )
+                        await self._broadcast_progress(
+                            {
+                                "stage": "nvidia-smi",
+                                "progress": 100,
+                                "message": "nvidia-smi installed successfully",
+                            }
+                        )
+                    except Exception as copy_err:
+                        await self._broadcast_log_line(
+                            f"Failed to copy nvidia-smi: {copy_err}"
+                        )
+                        raise
+                else:
+                    await self._broadcast_log_line(
+                        "nvidia-smi not found in extracted package"
+                    )
+                    await self._broadcast_progress(
+                        {
+                            "stage": "nvidia-smi",
+                            "progress": 100,
+                            "message": "nvidia-smi installation skipped (not in package)",
+                        }
+                    )
+
+                # Cleanup temporary extract directory only (keep .deb file)
+                shutil.rmtree(extract_dir, ignore_errors=True)
+                # Keep .deb file for future use
+                if os.path.exists(nvidia_utils_path):
+                    logger.info(f"nvidia-utils package kept at: {nvidia_utils_path}")
+
+            else:
+                await self._broadcast_log_line(
+                    "nvidia-utils package not available, skipping nvidia-smi installation"
+                )
+                await self._broadcast_progress(
+                    {
+                        "stage": "nvidia-smi",
+                        "progress": 100,
+                        "message": "nvidia-smi installation skipped (package not available)",
+                    }
+                )
+
+        except Exception as e:
+            await self._broadcast_log_line(f"nvidia-smi installation error: {e}")
+            await self._broadcast_log_line(
+                "Note: nvidia-smi installation failed. GPU monitoring may be limited."
+            )
+            await self._broadcast_progress(
+                {
+                    "stage": "nvidia-smi",
+                    "progress": 100,
+                    "message": "nvidia-smi installation skipped (error occurred)",
+                }
+            )
+
+    async def _install_cudnn_linux(self, cuda_version: str, cuda_path: str) -> None:
+        """Install cuDNN library for deep learning primitives."""
+        await self._broadcast_log_line(
+            "Installing cuDNN (CUDA Deep Neural Network library)..."
+        )
+        await self._broadcast_progress(
+            {
+                "stage": "cudnn",
+                "progress": 0,
+                "message": "Installing cuDNN...",
+            }
+        )
+
+        try:
+            # Determine CUDA major version for cuDNN compatibility
+            cuda_major = cuda_version.split(".")[0]
+            cudnn_version = self.CUDNN_VERSIONS.get(cuda_major)
+            
+            if not cudnn_version:
+                await self._broadcast_log_line(
+                    f"cuDNN version not available for CUDA {cuda_version}, skipping"
+                )
+                await self._broadcast_progress(
+                    {
+                        "stage": "cudnn",
+                        "progress": 100,
+                        "message": "cuDNN installation skipped (version not available)",
+                    }
+                )
+                return
+
+            ubuntu_version = self._get_ubuntu_version()
+            
+            # cuDNN package names vary by CUDA version
+            # For CUDA 12.x: libcudnn9-cuda-12, libcudnn9-dev-cuda-12
+            # For CUDA 11.x: libcudnn8-cuda-11, libcudnn8-dev-cuda-11
+            if cuda_major == "12" or cuda_major == "13":
+                cudnn_pkg = "libcudnn9"
+                cudnn_cuda_suffix = f"cuda-{cuda_major}"
+            else:
+                cudnn_pkg = "libcudnn8"
+                cudnn_cuda_suffix = f"cuda-{cuda_major}"
+
+            # Manual cuDNN installation
+            await self._broadcast_log_line("Installing cuDNN packages...")
+
+            cudnn_package_name = f"{cudnn_pkg}-{cudnn_cuda_suffix}"
+            cudnn_dev_package_name = f"{cudnn_pkg}-dev-{cudnn_cuda_suffix}"
+            packages = await self._get_repo_packages(ubuntu_version)
+            cudnn_pkg_entry = self._select_repo_package(
+                packages, cudnn_package_name, version_prefix=cudnn_version
+            )
+            cudnn_dev_pkg_entry = self._select_repo_package(
+                packages, cudnn_dev_package_name, version_prefix=cudnn_version
+            )
+
+            if not cudnn_pkg_entry or not cudnn_dev_pkg_entry:
+                await self._broadcast_log_line(
+                    "cuDNN packages not found in repository, skipping cuDNN installation"
+                )
+                await self._broadcast_progress(
+                    {
+                        "stage": "cudnn",
+                        "progress": 100,
+                        "message": "cuDNN installation skipped (optional)",
+                    }
+                )
+                return
+
+            base_url = (
+                f"https://developer.download.nvidia.com/compute/cuda/repos/{ubuntu_version}/x86_64/"
+            )
+            cudnn_url = base_url + cudnn_pkg_entry.get("Filename", "").lstrip("./")
+            cudnn_dev_url = base_url + cudnn_dev_pkg_entry.get("Filename", "").lstrip("./")
+
+            cudnn_path = os.path.join(self._download_dir, f"{cudnn_pkg}.deb")
+            cudnn_dev_path = os.path.join(self._download_dir, f"{cudnn_pkg}-dev.deb")
+
+            await self._broadcast_progress(
+                {
+                    "stage": "cudnn",
+                    "progress": 25,
+                    "message": "Downloading cuDNN packages...",
+                }
+            )
+
+            # Download cuDNN packages
+            async with aiohttp.ClientSession() as session:
+                for url, path, name in [
+                    (cudnn_url, cudnn_path, cudnn_pkg),
+                    (cudnn_dev_url, cudnn_dev_path, f"{cudnn_pkg}-dev"),
+                ]:
+                    try:
+                        await self._broadcast_log_line(f"Downloading {name}...")
+                        async with session.get(url) as response:
+                            if response.status == 200:
+                                async with aiofiles.open(path, "wb") as f:
+                                    await f.write(await response.read())
+                                await self._broadcast_log_line(f"Downloaded {name}")
+                            else:
+                                await self._broadcast_log_line(
+                                    f"Failed to download {name}: HTTP {response.status}"
+                                )
+                                # Try alternative URL pattern
+                                continue
+                    except Exception as download_err:
+                        await self._broadcast_log_line(
+                            f"Download error for {name}: {download_err}"
+                        )
+                        continue
+
+            await self._broadcast_progress(
+                {
+                    "stage": "cudnn",
+                    "progress": 50,
+                    "message": "Installing cuDNN packages...",
+                }
+            )
+
+            if os.path.exists(cudnn_path):
+                await self._broadcast_log_line(
+                    "Extracting cuDNN to CUDA directory..."
+                )
+
+                # Extract .deb file
+                extract_dir = os.path.join(self._download_dir, "cudnn_extract")
+                os.makedirs(extract_dir, exist_ok=True)
+
+                for deb_path in [cudnn_path, cudnn_dev_path]:
+                    if os.path.exists(deb_path):
+                        # Extract using ar and tar
+                        extract_process = await asyncio.create_subprocess_exec(
+                            "bash",
+                            "-c",
+                            f"cd {extract_dir} && ar x {deb_path} && tar xf data.tar.* 2>/dev/null || tar xf data.tar 2>/dev/null",
+                            stdout=asyncio.subprocess.PIPE,
+                            stderr=asyncio.subprocess.STDOUT,
+                        )
+                        await extract_process.wait()
+
+                # Copy cuDNN files to CUDA installation
+                cudnn_lib_src = os.path.join(
+                    extract_dir, "usr", "lib", "x86_64-linux-gnu"
+                )
+                cudnn_include_src = os.path.join(extract_dir, "usr", "include")
+
+                cuda_lib_dst = os.path.join(cuda_path, "lib64")
+                cuda_include_dst = os.path.join(cuda_path, "include")
+
+                if os.path.exists(cudnn_lib_src):
+                    for f in os.listdir(cudnn_lib_src):
+                        if "cudnn" in f.lower():
+                            src = os.path.join(cudnn_lib_src, f)
+                            dst = os.path.join(cuda_lib_dst, f)
+                            try:
+                                if os.path.islink(src):
+                                    linkto = os.readlink(src)
+                                    if os.path.exists(dst):
+                                        os.remove(dst)
+                                    os.symlink(linkto, dst)
+                                else:
+                                    shutil.copy2(src, dst)
+                                await self._broadcast_log_line(
+                                    f"Copied {f} to CUDA lib directory"
+                                )
+                            except Exception as copy_err:
+                                await self._broadcast_log_line(
+                                    f"Failed to copy {f}: {copy_err}"
+                                )
+
+                if os.path.exists(cudnn_include_src):
+                    for f in os.listdir(cudnn_include_src):
+                        if "cudnn" in f.lower():
+                            src = os.path.join(cudnn_include_src, f)
+                            dst = os.path.join(cuda_include_dst, f)
+                            try:
+                                shutil.copy2(src, dst)
+                                await self._broadcast_log_line(
+                                    f"Copied {f} to CUDA include directory"
+                                )
+                            except Exception as copy_err:
+                                await self._broadcast_log_line(
+                                    f"Failed to copy {f}: {copy_err}"
+                                )
+
+                # Cleanup temporary extract directory only (keep .deb files)
+                shutil.rmtree(extract_dir, ignore_errors=True)
+                # Keep .deb files for future use
+                logger.info(f"cuDNN packages kept at: {cudnn_path}, {cudnn_dev_path}")
+
+                await self._broadcast_log_line("cuDNN extracted to CUDA directory")
+                await self._broadcast_progress(
+                    {
+                        "stage": "cudnn",
+                        "progress": 100,
+                        "message": "cuDNN installed successfully",
+                    }
+                )
+            else:
+                await self._broadcast_log_line(
+                    "cuDNN packages not available, skipping cuDNN installation"
+                )
+                await self._broadcast_progress(
+                    {
+                        "stage": "cudnn",
+                        "progress": 100,
+                        "message": "cuDNN installation skipped (optional)",
+                    }
+                )
+
+        except Exception as e:
+            await self._broadcast_log_line(f"cuDNN installation error: {e}")
+            await self._broadcast_log_line(
+                "Note: cuDNN is optional. The build will continue without cuDNN support."
+            )
+            await self._broadcast_progress(
+                {
+                    "stage": "cudnn",
+                    "progress": 100,
+                    "message": "cuDNN installation skipped (optional)",
+                }
+            )
+
+    async def _install_tensorrt_linux(self, cuda_version: str, cuda_path: str) -> None:
+        """Install TensorRT library for inference optimization."""
+        await self._broadcast_log_line(
+            "Installing TensorRT (NVIDIA TensorRT inference library)..."
+        )
+        await self._broadcast_progress(
+            {
+                "stage": "tensorrt",
+                "progress": 0,
+                "message": "Installing TensorRT...",
+            }
+        )
+
+        try:
+            # Determine CUDA major version for TensorRT compatibility
+            cuda_major = cuda_version.split(".")[0]
+            tensorrt_version = self.TENSORRT_VERSIONS.get(cuda_major)
+            
+            if not tensorrt_version:
+                await self._broadcast_log_line(
+                    f"TensorRT version not available for CUDA {cuda_version}, skipping"
+                )
+                await self._broadcast_progress(
+                    {
+                        "stage": "tensorrt",
+                        "progress": 100,
+                        "message": "TensorRT installation skipped (version not available)",
+                    }
+                )
+                return
+
+            ubuntu_version = self._get_ubuntu_version()
+            
+            # TensorRT package names
+            # For CUDA 12.x/13.x: libnvinfer10, libnvinfer-dev, libnvinfer-plugin10, libnvinfer-plugin-dev
+            # For CUDA 11.x: libnvinfer8, libnvinfer-dev, libnvinfer-plugin8, libnvinfer-plugin-dev
+            if cuda_major == "12" or cuda_major == "13":
+                tensorrt_pkg = "libnvinfer10"
+                tensorrt_plugin_pkg = "libnvinfer-plugin10"
+            else:
+                tensorrt_pkg = "libnvinfer8"
+                tensorrt_plugin_pkg = "libnvinfer-plugin8"
+
+            # Manual TensorRT installation
+            await self._broadcast_log_line("Installing TensorRT packages...")
+
+            packages = await self._get_repo_packages(ubuntu_version)
+            tensorrt_pkg_entry = self._select_repo_package(
+                packages, tensorrt_pkg, version_prefix=tensorrt_version
+            )
+            tensorrt_dev_pkg_entry = self._select_repo_package(
+                packages, f"{tensorrt_pkg}-dev", version_prefix=tensorrt_version
+            )
+            tensorrt_plugin_entry = self._select_repo_package(
+                packages, tensorrt_plugin_pkg, version_prefix=tensorrt_version
+            )
+            tensorrt_plugin_dev_entry = self._select_repo_package(
+                packages, f"{tensorrt_plugin_pkg}-dev", version_prefix=tensorrt_version
+            )
+
+            if not all(
+                [
+                    tensorrt_pkg_entry,
+                    tensorrt_dev_pkg_entry,
+                    tensorrt_plugin_entry,
+                    tensorrt_plugin_dev_entry,
+                ]
+            ):
+                await self._broadcast_log_line(
+                    "TensorRT packages not found in repository, skipping TensorRT installation"
+                )
+                await self._broadcast_progress(
+                    {
+                        "stage": "tensorrt",
+                        "progress": 100,
+                        "message": "TensorRT installation skipped (optional)",
+                    }
+                )
+                return
+
+            base_url = (
+                f"https://developer.download.nvidia.com/compute/cuda/repos/{ubuntu_version}/x86_64/"
+            )
+            tensorrt_url = base_url + tensorrt_pkg_entry.get("Filename", "").lstrip("./")
+            tensorrt_dev_url = base_url + tensorrt_dev_pkg_entry.get("Filename", "").lstrip("./")
+            tensorrt_plugin_url = base_url + tensorrt_plugin_entry.get("Filename", "").lstrip("./")
+            tensorrt_plugin_dev_url = base_url + tensorrt_plugin_dev_entry.get("Filename", "").lstrip("./")
+
+            tensorrt_path = os.path.join(self._download_dir, f"{tensorrt_pkg}.deb")
+            tensorrt_dev_path = os.path.join(self._download_dir, f"{tensorrt_pkg}-dev.deb")
+            tensorrt_plugin_path = os.path.join(self._download_dir, f"{tensorrt_plugin_pkg}.deb")
+            tensorrt_plugin_dev_path = os.path.join(self._download_dir, f"{tensorrt_plugin_pkg}-dev.deb")
+
+            await self._broadcast_progress(
+                {
+                    "stage": "tensorrt",
+                    "progress": 25,
+                    "message": "Downloading TensorRT packages...",
+                }
+            )
+
+            # Download TensorRT packages
+            async with aiohttp.ClientSession() as session:
+                for url, path, name in [
+                    (tensorrt_url, tensorrt_path, tensorrt_pkg),
+                    (tensorrt_dev_url, tensorrt_dev_path, f"{tensorrt_pkg}-dev"),
+                    (tensorrt_plugin_url, tensorrt_plugin_path, tensorrt_plugin_pkg),
+                    (tensorrt_plugin_dev_url, tensorrt_plugin_dev_path, f"{tensorrt_plugin_pkg}-dev"),
+                ]:
+                    try:
+                        await self._broadcast_log_line(f"Downloading {name}...")
+                        async with session.get(url) as response:
+                            if response.status == 200:
+                                async with aiofiles.open(path, "wb") as f:
+                                    await f.write(await response.read())
+                                await self._broadcast_log_line(f"Downloaded {name}")
+                            else:
+                                await self._broadcast_log_line(
+                                    f"Failed to download {name}: HTTP {response.status}"
+                                )
+                                continue
+                    except Exception as download_err:
+                        await self._broadcast_log_line(
+                            f"Download error for {name}: {download_err}"
+                        )
+                        continue
+
+            await self._broadcast_progress(
+                {
+                    "stage": "tensorrt",
+                    "progress": 50,
+                    "message": "Installing TensorRT packages...",
+                }
+            )
+
+            if os.path.exists(tensorrt_path):
+                await self._broadcast_log_line(
+                    "Extracting TensorRT to CUDA directory..."
+                )
+
+                # Extract .deb file
+                extract_dir = os.path.join(self._download_dir, "tensorrt_extract")
+                os.makedirs(extract_dir, exist_ok=True)
+
+                for deb_path in [
+                    tensorrt_path,
+                    tensorrt_dev_path,
+                    tensorrt_plugin_path,
+                    tensorrt_plugin_dev_path,
+                ]:
+                    if os.path.exists(deb_path):
+                        # Extract using ar and tar
+                        extract_process = await asyncio.create_subprocess_exec(
+                            "bash",
+                            "-c",
+                            f"cd {extract_dir} && ar x {deb_path} && tar xf data.tar.* 2>/dev/null || tar xf data.tar 2>/dev/null",
+                            stdout=asyncio.subprocess.PIPE,
+                            stderr=asyncio.subprocess.STDOUT,
+                        )
+                        await extract_process.wait()
+
+                # Copy TensorRT files to CUDA installation
+                tensorrt_lib_src = os.path.join(
+                    extract_dir, "usr", "lib", "x86_64-linux-gnu"
+                )
+                tensorrt_include_src = os.path.join(extract_dir, "usr", "include")
+                tensorrt_bin_src = os.path.join(extract_dir, "usr", "bin")
+
+                cuda_lib_dst = os.path.join(cuda_path, "lib64")
+                cuda_include_dst = os.path.join(cuda_path, "include")
+                cuda_bin_dst = os.path.join(cuda_path, "bin")
+
+                # Copy libraries
+                if os.path.exists(tensorrt_lib_src):
+                    for f in os.listdir(tensorrt_lib_src):
+                        if "nvinfer" in f.lower() or "tensorrt" in f.lower():
+                            src = os.path.join(tensorrt_lib_src, f)
+                            dst = os.path.join(cuda_lib_dst, f)
+                            try:
+                                if os.path.islink(src):
+                                    linkto = os.readlink(src)
+                                    if os.path.exists(dst):
+                                        os.remove(dst)
+                                    os.symlink(linkto, dst)
+                                else:
+                                    shutil.copy2(src, dst)
+                                await self._broadcast_log_line(
+                                    f"Copied {f} to CUDA lib directory"
+                                )
+                            except Exception as copy_err:
+                                await self._broadcast_log_line(
+                                    f"Failed to copy {f}: {copy_err}"
+                                )
+
+                # Copy headers
+                if os.path.exists(tensorrt_include_src):
+                    for f in os.listdir(tensorrt_include_src):
+                        if "nvinfer" in f.lower() or "tensorrt" in f.lower():
+                            src = os.path.join(tensorrt_include_src, f)
+                            dst = os.path.join(cuda_include_dst, f)
+                            try:
+                                if os.path.isdir(src):
+                                    shutil.copytree(src, dst, dirs_exist_ok=True)
+                                else:
+                                    shutil.copy2(src, dst)
+                                await self._broadcast_log_line(
+                                    f"Copied {f} to CUDA include directory"
+                                )
+                            except Exception as copy_err:
+                                await self._broadcast_log_line(
+                                    f"Failed to copy {f}: {copy_err}"
+                                )
+
+                # Copy binaries (like trtexec)
+                if os.path.exists(tensorrt_bin_src):
+                    for f in os.listdir(tensorrt_bin_src):
+                        if "trt" in f.lower() or "nvinfer" in f.lower():
+                            src = os.path.join(tensorrt_bin_src, f)
+                            dst = os.path.join(cuda_bin_dst, f)
+                            try:
+                                shutil.copy2(src, dst)
+                                os.chmod(dst, 0o755)
+                                await self._broadcast_log_line(
+                                    f"Copied {f} to CUDA bin directory"
+                                )
+                            except Exception as copy_err:
+                                await self._broadcast_log_line(
+                                    f"Failed to copy {f}: {copy_err}"
+                                )
+
+                # Cleanup temporary extract directory only (keep .deb files)
+                shutil.rmtree(extract_dir, ignore_errors=True)
+                # Keep .deb files for future use
+                logger.info(
+                    f"TensorRT packages kept at: {tensorrt_path}, {tensorrt_dev_path}, "
+                    f"{tensorrt_plugin_path}, {tensorrt_plugin_dev_path}"
+                )
+
+                await self._broadcast_log_line("TensorRT extracted to CUDA directory")
+                await self._broadcast_progress(
+                    {
+                        "stage": "tensorrt",
+                        "progress": 100,
+                        "message": "TensorRT installed successfully",
+                    }
+                )
+            else:
+                await self._broadcast_log_line(
+                    "TensorRT packages not available, skipping TensorRT installation"
+                )
+                await self._broadcast_progress(
+                    {
+                        "stage": "tensorrt",
+                        "progress": 100,
+                        "message": "TensorRT installation skipped (optional)",
+                    }
+                )
+
+        except Exception as e:
+            await self._broadcast_log_line(f"TensorRT installation error: {e}")
+            await self._broadcast_log_line(
+                "Note: TensorRT is optional. The build will continue without TensorRT support."
+            )
+            await self._broadcast_progress(
+                {
+                    "stage": "tensorrt",
+                    "progress": 100,
+                    "message": "TensorRT installation skipped (optional)",
+                }
+            )
+
+    async def install(
+        self,
+        version: str = "12.6",
+        install_cudnn: bool = False,
+        install_tensorrt: bool = False,
+    ) -> Dict[str, Any]:
+        """Install CUDA Toolkit with optional cuDNN and TensorRT."""
+        async with self._lock:
+            if self._operation:
+                raise RuntimeError(
+                    "Another CUDA installer operation is already running"
+                )
+
+            system, arch = self._get_platform()
+
+            if system != "linux":
+                raise RuntimeError(
+                    f"CUDA installation is only supported on Linux, not {system}"
+                )
+
+            if version not in self.SUPPORTED_VERSIONS:
+                raise ValueError(
+                    f"Unsupported CUDA version: {version}. Supported versions: {', '.join(self.SUPPORTED_VERSIONS)}"
+                )
+
+            # Fetch the download URL dynamically
+            await self._broadcast_log_line(
+                f"Fetching download URL for CUDA {version}..."
+            )
+            url = await self._fetch_download_url(version)
+            installer_filename = os.path.basename(url)
+            installer_path = os.path.join(self._download_dir, installer_filename)
+
+            await self._set_operation("install")
+
+            async def _runner():
+                try:
+                    # Download installer
+                    await self._download_installer(version, url, installer_path)
+
+                    # Install (Linux only) - returns the installation path
+                    install_path = await self._install_linux(
+                        installer_path, version, install_cudnn, install_tensorrt
+                    )
+
+                    # Update state (already saved in _install_linux, but update main fields)
+                    state = self._load_state()
+                    state["installed_version"] = version
+                    state["installed_at"] = _utcnow()
+                    state["cuda_path"] = install_path
+                    if install_cudnn:
+                        state["cudnn_installed"] = True
+                    if install_tensorrt:
+                        state["tensorrt_installed"] = True
+                    self._save_state(state)
+
+                    components = ["CUDA Toolkit"]
+                    if install_cudnn:
+                        components.append("cuDNN")
+                    if install_tensorrt:
+                        components.append("TensorRT")
+                    
+                    await self._finish_operation(
+                        True, f"{', '.join(components)} installed successfully"
+                    )
+
+                    # Update current process environment with CUDA paths
+                    # This ensures the running application can use CUDA immediately
+                    cuda_env = self.get_cuda_env(version)
+                    if cuda_env:
+                        os.environ.update(cuda_env)
+                        logger.info(
+                            f"Updated process environment with CUDA {version} paths"
+                        )
+
+                    # Restart llama-swap to pick up new CUDA environment variables
+                    # llama-swap needs to be restarted because subprocess environment
+                    # variables are set at process creation time and can't be changed
+                    try:
+                        from backend.llama_swap_manager import get_llama_swap_manager
+                        llama_swap_manager = get_llama_swap_manager()
+                        await llama_swap_manager.restart_proxy()
+                        logger.info("Restarted llama-swap to pick up new CUDA environment")
+                    except Exception as restart_error:
+                        # Don't fail the installation if restart fails
+                        logger.warning(
+                            f"Failed to restart llama-swap after CUDA installation: {restart_error}. "
+                            f"You may need to manually restart llama-swap to use the new CUDA version."
+                        )
+
+                    # Keep installer file for future use (not deleting)
+                    logger.info(f"Installer file kept at: {installer_path}")
+
+                except Exception as exc:
+                    self._last_error = str(exc)
+                    await self._finish_operation(False, str(exc))
+                    raise
+
+            self._create_task(_runner())
+            return {"message": f"CUDA {version} installation started"}
+
+    def _detect_cudnn_version(self, cuda_path: Optional[str]) -> Optional[str]:
+        """Detect installed cuDNN version by checking library files."""
+        if not cuda_path:
+            return None
+        
+        lib_path = os.path.join(cuda_path, "lib64")
+        if not os.path.exists(lib_path):
+            return None
+        
+        try:
+            for f in os.listdir(lib_path):
+                if "libcudnn" in f and ".so" in f:
+                    match = re.search(r"\.so(?:\.(\d+(?:\.\d+){0,2}))?", f)
+                    if match and match.group(1):
+                        return match.group(1)
+        except Exception:
+            pass
+        
+        return None
+
+    def _detect_tensorrt_version(self, cuda_path: Optional[str]) -> Optional[str]:
+        """Detect installed TensorRT version by checking library files."""
+        if not cuda_path:
+            return None
+        
+        lib_path = os.path.join(cuda_path, "lib64")
+        if not os.path.exists(lib_path):
+            return None
+        
+        try:
+            for f in os.listdir(lib_path):
+                if "libnvinfer" in f and ".so" in f and "plugin" not in f:
+                    match = re.search(r"\.so(?:\.(\d+(?:\.\d+){0,2}))?", f)
+                    if match and match.group(1):
+                        return match.group(1)
+        except Exception:
+            pass
+        
+        return None
+
+    def status(self) -> Dict[str, Any]:
+        """Get CUDA installation status."""
+        version = self._detect_installed_version()
+        cuda_path = self._get_cuda_path()
+        installed = version is not None and cuda_path is not None
+        state = self._load_state()
+        installations = state.get("installations", {})
+
+        # Detect cuDNN and TensorRT
+        cudnn_version = None
+        tensorrt_version = None
+        if cuda_path:
+            cudnn_version = self._detect_cudnn_version(cuda_path)
+            tensorrt_version = self._detect_tensorrt_version(cuda_path)
+
+        # Get all installed versions with their details
+        installed_versions = []
+        for v, info in installations.items():
+            install_path = info.get("path")
+            if install_path and os.path.exists(install_path):
+                installed_versions.append(
+                    {
+                        "version": v,
+                        "path": install_path,
+                        "installed_at": info.get("installed_at"),
+                        "is_system_install": info.get("is_system_install", False),
+                        "is_current": v == version,
+                        "cudnn_installed": info.get("cudnn_installed", False),
+                        "tensorrt_installed": info.get("tensorrt_installed", False),
+                    }
+                )
+
+        return {
+            "installed": installed,
+            "version": version,
+            "cuda_path": cuda_path,
+            "installed_at": state.get("installed_at"),
+            "installed_versions": installed_versions,
+            "operation": self._operation,
+            "operation_started_at": self._operation_started_at,
+            "last_error": self._last_error,
+            "log_path": self._log_path,
+            "available_versions": self.SUPPORTED_VERSIONS,
+            "platform": self._get_platform(),
+            "cudnn": {
+                "installed": cudnn_version is not None,
+                "version": cudnn_version,
+            },
+            "tensorrt": {
+                "installed": tensorrt_version is not None,
+                "version": tensorrt_version,
+            },
+        }
+
+    def is_operation_running(self) -> bool:
+        return self._operation is not None
+
+    def read_log_tail(self, max_bytes: int = 8192) -> str:
+        if not os.path.exists(self._log_path):
+            return ""
+        with open(self._log_path, "rb") as log_file:
+            log_file.seek(0, os.SEEK_END)
+            size = log_file.tell()
+            log_file.seek(max(0, size - max_bytes))
+            data = log_file.read().decode("utf-8", errors="replace")
+            if size > max_bytes:
+                data = data.split("\n", 1)[-1]
+            return data.strip()
+
+    async def uninstall(self, version: Optional[str] = None) -> Dict[str, Any]:
+        """Uninstall CUDA Toolkit."""
+        async with self._lock:
+            if self._operation:
+                raise RuntimeError(
+                    "Another CUDA installer operation is already running"
+                )
+
+            # Determine which version to uninstall
+            if not version:
+                # Uninstall the currently detected version
+                version = self._detect_installed_version()
+                if not version:
+                    raise RuntimeError("No CUDA installation found to uninstall")
+
+            state = self._load_state()
+            installations = state.get("installations", {})
+
+            if version not in installations:
+                raise RuntimeError(f"CUDA {version} installation not found in state")
+
+            install_info = installations[version]
+            install_path = install_info.get("path")
+
+            if not install_path or not os.path.exists(install_path):
+                # Path doesn't exist, just remove from state
+                logger.warning(
+                    f"CUDA installation path {install_path} does not exist, removing from state only"
+                )
+                installations.pop(version, None)
+                if state.get("installed_version") == version:
+                    state["installed_version"] = None
+                    state["installed_at"] = None
+                    state["cuda_path"] = None
+                self._save_state(state)
+                return {
+                    "message": f"CUDA {version} removed from state (installation path not found)"
+                }
+
+            await self._set_operation("uninstall")
+
+            async def _runner():
+                try:
+                    await self._broadcast_log_line(
+                        f"Starting uninstallation of CUDA {version}..."
+                    )
+                    await self._broadcast_progress(
+                        {
+                            "stage": "uninstall",
+                            "progress": 0,
+                            "message": f"Uninstalling CUDA {version}...",
+                        }
+                    )
+
+                    # Remove the installation directory
+                    if os.path.exists(install_path):
+                        await self._broadcast_log_line(
+                            f"Removing installation directory: {install_path}"
+                        )
+                        try:
+                            shutil.rmtree(install_path)
+                            await self._broadcast_log_line(
+                                f"Successfully removed {install_path}"
+                            )
+                        except Exception as e:
+                            logger.error(
+                                f"Failed to remove CUDA installation directory: {e}"
+                            )
+                            raise RuntimeError(
+                                f"Failed to remove installation directory: {e}"
+                            )
+
+                    # Update state
+                    installations.pop(version, None)
+                    if state.get("installed_version") == version:
+                        state["installed_version"] = None
+                        state["installed_at"] = None
+                        state["cuda_path"] = None
+                    self._save_state(state)
+
+                    # Update or remove the current symlink
+                    self._remove_current_symlink()
+                    await self._broadcast_log_line(
+                        "Updated CUDA current symlink (removed or re-pointed to another version)"
+                    )
+
+                    await self._broadcast_progress(
+                        {
+                            "stage": "uninstall",
+                            "progress": 100,
+                            "message": "CUDA uninstallation completed",
+                        }
+                    )
+                    await self._broadcast_log_line(
+                        f"CUDA {version} uninstalled successfully"
+                    )
+                    await self._finish_operation(
+                        True, f"CUDA {version} uninstalled successfully"
+                    )
+
+                except Exception as exc:
+                    self._last_error = str(exc)
+                    await self._finish_operation(False, str(exc))
+                    raise
+
+            self._create_task(_runner())
+            return {"message": f"CUDA {version} uninstallation started"}
diff --git a/backend/data_store.py b/backend/data_store.py
new file mode 100644
index 0000000..36aebdf
--- /dev/null
+++ b/backend/data_store.py
@@ -0,0 +1,216 @@
+"""YAML-backed data store replacing SQLite."""
+
+import os
+import threading
+from typing import Any, Dict, List, Optional
+
+import yaml
+
+from backend.logging_config import get_logger
+
+logger = get_logger(__name__)
+
+
+def _get_config_dir() -> str:
+    """Return config directory (Docker: /app/data/config, local: data/config)."""
+    if os.path.exists("/app/data"):
+        return "/app/data/config"
+    return os.path.abspath("data/config")
+
+
+def generate_proxy_name(huggingface_id: str, quantization: Optional[str] = None) -> str:
+    """
+    Generate a proxy name for llama-swap using HuggingFace ID and optional quantization.
+    """
+    huggingface_slug = (
+        huggingface_id.replace("/", "-").replace(" ", "-").replace(".", "-").lower()
+    )
+    if quantization:
+        quantization_slug = quantization.replace(" ", "-").lower()
+        return f"{huggingface_slug}.{quantization_slug}"
+    return huggingface_slug
+
+
+class DataStore:
+    """Thread-safe YAML-backed data store replacing SQLite."""
+
+    def __init__(self, config_dir: Optional[str] = None):
+        self._config_dir = os.path.abspath(config_dir or _get_config_dir())
+        self._lock = threading.Lock()
+        self._ensure_files_exist()
+
+    def _ensure_files_exist(self) -> None:
+        """Create config dir and default YAML files if they don't exist."""
+        os.makedirs(self._config_dir, exist_ok=True)
+        for filename, default in [
+            ("models.yaml", {"models": []}),
+            (
+                "engines.yaml",
+                {
+                    "llama_cpp": {"active_version": None, "versions": []},
+                    "ik_llama": {"active_version": None, "versions": []},
+                    "lmdeploy": {
+                        "installed": False,
+                        "version": None,
+                        "install_type": None,
+                        "source_repo": None,
+                        "source_branch": None,
+                        "venv_path": None,
+                    },
+                    "cuda": {"installed_version": None, "install_path": None},
+                },
+            ),
+            ("settings.yaml", {"huggingface_token": "", "proxy_port": 2000}),
+        ]:
+            path = os.path.join(self._config_dir, filename)
+            if not os.path.exists(path):
+                self._write_yaml(path, default)
+
+    def _read_yaml(self, filename: str) -> dict:
+        """Read and parse a YAML file. Returns empty dict on error."""
+        path = os.path.join(self._config_dir, filename)
+        with self._lock:
+            if not os.path.exists(path):
+                return {}
+            try:
+                with open(path, "r") as f:
+                    return yaml.safe_load(f) or {}
+            except Exception as e:
+                logger.warning(f"Failed to read {path}: {e}")
+                return {}
+
+    def _write_yaml(self, path: str, data: dict) -> None:
+        """Atomic write: write to temp file then rename."""
+        tmp_path = path + ".tmp"
+        try:
+            with open(tmp_path, "w") as f:
+                yaml.dump(data, f, default_flow_style=False, sort_keys=False)
+            os.replace(tmp_path, path)
+        except Exception as e:
+            if os.path.exists(tmp_path):
+                try:
+                    os.remove(tmp_path)
+                except OSError:
+                    pass
+            raise e
+
+    def _save_yaml(self, filename: str, data: dict) -> None:
+        """Thread-safe write to a YAML file."""
+        path = os.path.join(self._config_dir, filename)
+        with self._lock:
+            self._write_yaml(path, data)
+
+    # --- Models ---
+
+    def list_models(self) -> List[dict]:
+        return self._read_yaml("models.yaml").get("models", [])
+
+    def get_model(self, model_id: str) -> Optional[dict]:
+        for m in self.list_models():
+            if m.get("id") == model_id:
+                return m
+        return None
+
+    def add_model(self, model: dict) -> dict:
+        data = self._read_yaml("models.yaml")
+        data.setdefault("models", []).append(model)
+        self._save_yaml("models.yaml", data)
+        return model
+
+    def update_model(self, model_id: str, updates: dict) -> Optional[dict]:
+        data = self._read_yaml("models.yaml")
+        for m in data.get("models", []):
+            if m.get("id") == model_id:
+                m.update(updates)
+                self._save_yaml("models.yaml", data)
+                return m
+        return None
+
+    def delete_model(self, model_id: str) -> bool:
+        data = self._read_yaml("models.yaml")
+        models = data.get("models", [])
+        new_models = [m for m in models if m.get("id") != model_id]
+        if len(new_models) == len(models):
+            return False
+        data["models"] = new_models
+        self._save_yaml("models.yaml", data)
+        return True
+
+    # --- Engines (llama_cpp, ik_llama) ---
+
+    def get_engine_versions(self, engine: str) -> List[dict]:
+        """engine is 'llama_cpp' or 'ik_llama'."""
+        return self._read_yaml("engines.yaml").get(engine, {}).get("versions", [])
+
+    def get_active_engine_version(self, engine: str) -> Optional[dict]:
+        data = self._read_yaml("engines.yaml").get(engine, {})
+        active = data.get("active_version")
+        if not active:
+            return None
+        for v in data.get("versions", []):
+            if v.get("version") == active:
+                return v
+        return None
+
+    def add_engine_version(self, engine: str, version_data: dict) -> None:
+        data = self._read_yaml("engines.yaml")
+        data.setdefault(engine, {}).setdefault("versions", []).append(version_data)
+        self._save_yaml("engines.yaml", data)
+
+    def set_active_engine_version(self, engine: str, version: str) -> None:
+        data = self._read_yaml("engines.yaml")
+        data.setdefault(engine, {})["active_version"] = version
+        self._save_yaml("engines.yaml", data)
+
+    def delete_engine_version(self, engine: str, version: str) -> bool:
+        data = self._read_yaml("engines.yaml")
+        engine_data = data.get(engine, {})
+        versions = engine_data.get("versions", [])
+        new_versions = [v for v in versions if v.get("version") != version]
+        if len(new_versions) == len(versions):
+            return False
+        engine_data["versions"] = new_versions
+        if engine_data.get("active_version") == version:
+            engine_data["active_version"] = None
+        self._save_yaml("engines.yaml", data)
+        return True
+
+    # --- LMDeploy ---
+
+    def get_lmdeploy_status(self) -> dict:
+        return self._read_yaml("engines.yaml").get("lmdeploy", {})
+
+    def update_lmdeploy(self, updates: dict) -> None:
+        data = self._read_yaml("engines.yaml")
+        data.setdefault("lmdeploy", {}).update(updates)
+        self._save_yaml("engines.yaml", data)
+
+    # --- CUDA ---
+
+    def get_cuda_status(self) -> dict:
+        return self._read_yaml("engines.yaml").get("cuda", {})
+
+    def update_cuda(self, updates: dict) -> None:
+        data = self._read_yaml("engines.yaml")
+        data.setdefault("cuda", {}).update(updates)
+        self._save_yaml("engines.yaml", data)
+
+    # --- Settings ---
+
+    def get_settings(self) -> dict:
+        return self._read_yaml("settings.yaml")
+
+    def update_settings(self, updates: dict) -> None:
+        data = self._read_yaml("settings.yaml")
+        data.update(updates)
+        self._save_yaml("settings.yaml", data)
+
+
+_store: Optional[DataStore] = None
+
+
+def get_store() -> DataStore:
+    global _store
+    if _store is None:
+        _store = DataStore()
+    return _store
diff --git a/backend/database.py b/backend/database.py
deleted file mode 100644
index 65a29fa..0000000
--- a/backend/database.py
+++ /dev/null
@@ -1,364 +0,0 @@
-from sqlalchemy import (
-    create_engine,
-    Column,
-    Integer,
-    String,
-    DateTime,
-    Boolean,
-    Text,
-    Float,
-    ForeignKey,
-    JSON,
-    text,
-    inspect,
-)
-from sqlalchemy.ext.declarative import declarative_base
-from sqlalchemy.orm import sessionmaker, relationship
-from datetime import datetime
-from typing import Dict, List
-import os
-from backend.logging_config import get_logger
-
-logger = get_logger(__name__)
-
-# Determine database path - use /app/data in Docker, ./data locally
-if os.path.exists("/app/data"):
-    db_dir = "/app/data"
-    db_path = "/app/data/db.sqlite"
-else:
-    db_dir = "data"
-    db_path = "data/db.sqlite"
-
-DATABASE_URL = f"sqlite:///{db_path}"
-
-engine = create_engine(DATABASE_URL, connect_args={"check_same_thread": False})
-SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
-
-Base = declarative_base()
-
-
-def get_db():
-    """Dependency to get database session"""
-    db = SessionLocal()
-    try:
-        yield db
-    finally:
-        db.close()
-
-
-def generate_proxy_name(huggingface_id: str, quantization: str) -> str:
-    """
-    Generate a centralized proxy name for llama-swap using HuggingFace ID and quantization.
-    This ensures consistent naming across all components.
-    """
-    # Create unique proxy name using HuggingFace ID and quantization to avoid conflicts
-    huggingface_slug = (
-        huggingface_id.replace("/", "-").replace(" ", "-").replace(".", "-").lower()
-    )
-    quantization_slug = quantization.replace(" ", "-").lower()
-    return f"{huggingface_slug}.{quantization_slug}"
-
-
-class Model(Base):
-    __tablename__ = "models"
-
-    id = Column(Integer, primary_key=True, index=True)
-    name = Column(String, index=True)
-    huggingface_id = Column(String, index=True)  # Removed unique constraint
-    base_model_name = Column(String, index=True)  # Model name without quantization
-    file_path = Column(String)
-    file_size = Column(Integer)  # in bytes
-    quantization = Column(String)  # Q4_K_M, Q8_0, etc.
-    model_type = Column(String)  # llama, mistral, etc.
-    downloaded_at = Column(DateTime)
-    is_active = Column(Boolean, default=False)
-    config = Column(JSON)  # JSON object of llama.cpp parameters
-    proxy_name = Column(String, index=True)  # Centralized proxy name for llama-swap
-    model_format = Column(String, default="gguf", server_default="gguf", index=True)
-    pipeline_tag = Column(String, index=True)
-
-
-class LlamaVersion(Base):
-    __tablename__ = "llama_versions"
-
-    id = Column(Integer, primary_key=True, index=True)
-    version = Column(String, unique=True, index=True)
-    install_type = Column(String)  # "release", "source", "patched"
-    binary_path = Column(String)
-    source_commit = Column(String)  # For source builds
-    patches = Column(Text)  # JSON array of patch URLs/metadata
-    installed_at = Column(DateTime, default=datetime.utcnow)
-    is_active = Column(Boolean, default=False)  # Changed from is_default to is_active
-    build_config = Column(JSON)  # Store BuildConfig as JSON
-    repository_source = Column(
-        String, default="llama.cpp"
-    )  # "llama.cpp" or "ik_llama.cpp"
-
-
-class RunningInstance(Base):
-    __tablename__ = "running_instances"
-
-    id = Column(Integer, primary_key=True, index=True)
-    model_id = Column(Integer, index=True)
-    llama_version = Column(String)
-    proxy_model_name = Column(String)  # NEW: Model name in llama-swap
-    started_at = Column(DateTime)
-    config = Column(Text)  # JSON string of runtime config
-    runtime_type = Column(
-        String, default="llama_cpp", server_default="llama_cpp", index=True
-    )
-
-
-def sync_model_active_status(db):
-    """Sync model is_active status with running instances"""
-
-    # Get all running instances
-    running_instances = db.query(RunningInstance).all()
-    active_model_ids = set()
-
-    for instance in running_instances:
-        active_model_ids.add(instance.model_id)
-
-    # Update all models' is_active status
-    all_models = db.query(Model).all()
-    updated_count = 0
-
-    for model in all_models:
-        new_status = model.id in active_model_ids
-        if model.is_active != new_status:
-            model.is_active = new_status
-            updated_count += 1
-
-    if updated_count > 0:
-        db.commit()
-        logger.info(f"Synced {updated_count} models' is_active status")
-
-    return updated_count
-
-
-async def init_db():
-    """Initialize database tables"""
-    # Use the same db_dir determined at module load time
-    os.makedirs(db_dir, exist_ok=True)
-    # Ensure the database directory is writable
-    import stat
-    try:
-        os.chmod(db_dir, stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH)
-    except Exception as perm_error:
-        logger.warning(f"Could not set permissions on {db_dir}: {perm_error}")
-    
-    # If database file exists, ensure it's writable
-    if os.path.exists(db_path):
-        try:
-            # Check if we can write to the database file
-            if not os.access(db_path, os.W_OK):
-                logger.error(f"Database file {db_path} is not writable. Please check file permissions.")
-                logger.error(f"Current user: {os.getuid() if hasattr(os, 'getuid') else 'unknown'}")
-                logger.error(f"File owner: {os.stat(db_path).st_uid if hasattr(os.stat(db_path), 'st_uid') else 'unknown'}")
-                raise PermissionError(f"Database file {db_path} is not writable")
-        except Exception as perm_error:
-            logger.warning(f"Could not check database file permissions: {perm_error}")
-    
-    Base.metadata.create_all(bind=engine)
-
-    try:
-        ensure_model_format_column()
-    except Exception as exc:
-        logger.warning(f"Failed to ensure model_format column: {exc}")
-    try:
-        ensure_running_instance_runtime_column()
-    except Exception as exc:
-        logger.warning(f"Failed to ensure running_instances.runtime_type column: {exc}")
-    try:
-        ensure_pipeline_tag_column()
-    except Exception as exc:
-        logger.warning(f"Failed to ensure models.pipeline_tag column: {exc}")
-    try:
-        ensure_repository_source_column()
-    except Exception as exc:
-        logger.warning(f"Failed to ensure repository_source column: {exc}")
-
-    # Migrate existing models to populate base_model_name
-    migrate_existing_models()
-
-    # Migrate safetensors models: merge multiple rows per repo into single row
-    try:
-        migrate_safetensors_models_to_unified()
-    except Exception as exc:
-        logger.warning(f"Failed to migrate safetensors models: {exc}")
-
-
-def migrate_existing_models():
-    """Migrate existing models to populate base_model_name field"""
-    db = SessionLocal()
-    try:
-        models = db.query(Model).filter(Model.base_model_name.is_(None)).all()
-
-        for model in models:
-            # Extract base model name from huggingface_id or name
-            if model.huggingface_id:
-                # Extract model name from huggingface_id (e.g., "microsoft/DialoGPT-medium" -> "DialoGPT")
-                parts = model.huggingface_id.split("/")
-                if len(parts) > 1:
-                    model.base_model_name = parts[-1].split("-")[
-                        0
-                    ]  # Remove quantization suffix
-                else:
-                    model.base_model_name = model.huggingface_id
-            elif model.name:
-                # Extract from name if no huggingface_id
-                model.base_model_name = model.name.split("-")[0]
-            else:
-                model.base_model_name = "unknown"
-
-        db.commit()
-        logger.info(f"Migrated {len(models)} models with base_model_name")
-
-    except Exception as e:
-        logger.error(f"Error migrating models: {e}")
-        db.rollback()
-    finally:
-        db.close()
-
-
-def ensure_model_format_column():
-    """Ensure the models table has the model_format column (retrofit for existing DBs)."""
-    inspector = inspect(engine)
-    columns = [column["name"] for column in inspector.get_columns("models")]
-    if "model_format" in columns:
-        return
-
-    with engine.connect() as connection:
-        connection.execute(text("ALTER TABLE models ADD COLUMN model_format VARCHAR"))
-        connection.execute(
-            text("UPDATE models SET model_format = 'gguf' WHERE model_format IS NULL")
-        )
-    logger.info("Added model_format column to models table")
-
-
-def ensure_running_instance_runtime_column():
-    """Ensure running_instances table tracks runtime_type."""
-    inspector = inspect(engine)
-    columns = [column["name"] for column in inspector.get_columns("running_instances")]
-    if "runtime_type" in columns:
-        return
-
-    with engine.connect() as connection:
-        connection.execute(
-            text("ALTER TABLE running_instances ADD COLUMN runtime_type VARCHAR")
-        )
-        connection.execute(
-            text(
-                "UPDATE running_instances SET runtime_type = 'llama_cpp' WHERE runtime_type IS NULL"
-            )
-        )
-    logger.info("Added runtime_type column to running_instances table")
-
-
-def migrate_safetensors_models_to_unified():
-    """Migrate safetensors models: merge multiple Model rows per repo into a single row."""
-    db = SessionLocal()
-    try:
-        # Find all safetensors models grouped by huggingface_id
-        safetensors_models = (
-            db.query(Model).filter(Model.model_format == "safetensors").all()
-        )
-
-        # Group by huggingface_id
-        by_repo: Dict[str, List[Model]] = {}
-        for model in safetensors_models:
-            hf_id = model.huggingface_id or "unknown"
-            by_repo.setdefault(hf_id, []).append(model)
-
-        merged_count = 0
-        for huggingface_id, models in by_repo.items():
-            if len(models) <= 1:
-                continue  # Already unified
-
-            # Keep the first model, merge others into it
-            primary = models[0]
-            others = models[1:]
-
-            # Aggregate file_size
-            total_size = sum(m.file_size or 0 for m in models)
-            if total_size:
-                primary.file_size = total_size
-
-            # Merge metadata: use most complete pipeline_tag, model_type, etc.
-            for other in others:
-                if not primary.pipeline_tag and other.pipeline_tag:
-                    primary.pipeline_tag = other.pipeline_tag
-                if not primary.model_type and other.model_type:
-                    primary.model_type = other.model_type
-                if not primary.base_model_name and other.base_model_name:
-                    primary.base_model_name = other.base_model_name
-                # Use earliest downloaded_at
-                if other.downloaded_at and (
-                    not primary.downloaded_at
-                    or other.downloaded_at < primary.downloaded_at
-                ):
-                    primary.downloaded_at = other.downloaded_at
-
-            # Update RunningInstance records to point to primary model
-            for other in others:
-                instances = (
-                    db.query(RunningInstance)
-                    .filter(RunningInstance.model_id == other.id)
-                    .all()
-                )
-                for instance in instances:
-                    instance.model_id = primary.id
-
-            # Delete duplicate models
-            for other in others:
-                db.delete(other)
-
-            merged_count += len(others)
-            logger.info(
-                f"Merged {len(others)} safetensors Model rows for {huggingface_id} into model_id={primary.id}"
-            )
-
-        if merged_count > 0:
-            db.commit()
-            logger.info(
-                f"Migration complete: merged {merged_count} safetensors Model rows"
-            )
-        else:
-            logger.debug("No safetensors Model rows to merge")
-
-    except Exception as e:
-        logger.error(f"Error migrating safetensors models: {e}")
-        db.rollback()
-    finally:
-        db.close()
-
-
-def ensure_pipeline_tag_column():
-    """Ensure the models table stores pipeline tags."""
-    inspector = inspect(engine)
-    columns = [column["name"] for column in inspector.get_columns("models")]
-    if "pipeline_tag" in columns:
-        return
-
-    with engine.connect() as connection:
-        connection.execute(text("ALTER TABLE models ADD COLUMN pipeline_tag VARCHAR"))
-    logger.info("Added pipeline_tag column to models table")
-
-
-def ensure_repository_source_column():
-    """Ensure the llama_versions table has the repository_source column."""
-    inspector = inspect(engine)
-    columns = [column["name"] for column in inspector.get_columns("llama_versions")]
-    if "repository_source" in columns:
-        return
-
-    with engine.connect() as connection:
-        connection.execute(
-            text("ALTER TABLE llama_versions ADD COLUMN repository_source VARCHAR")
-        )
-        connection.execute(
-            text(
-                "UPDATE llama_versions SET repository_source = 'llama.cpp' WHERE repository_source IS NULL"
-            )
-        )
-    logger.info("Added repository_source column to llama_versions table")
diff --git a/backend/gguf_reader.py b/backend/gguf_reader.py
index ef4c3c5..2ad9dd1 100644
--- a/backend/gguf_reader.py
+++ b/backend/gguf_reader.py
@@ -9,11 +9,27 @@
 from typing import Dict, Optional, Any, List, Tuple, BinaryIO
 
 from backend.logging_config import get_logger
-from backend.architecture_profiles import compute_layers_for_architecture
 
 logger = get_logger(__name__)
 
 
+def _compute_layers_for_architecture(
+    architecture: str,
+    metadata: dict,
+    base_block_count: int,
+) -> dict:
+    """Compute block_count and effective_layer_count from architecture and metadata."""
+    block_count = max(0, int(base_block_count))
+    # Most architectures add one output head layer
+    effective = block_count + 1
+    arch = (architecture or "").lower()
+    if arch == "glm4moe":
+        nextn = metadata.get("glm4moe.nextn_predict_layers")
+        if nextn is not None:
+            effective = block_count + int(nextn)
+    return {"block_count": block_count, "effective_layer_count": effective}
+
+
 class GGUFValueType(IntEnum):
     """
     GGUF Value Types as defined in the specification.
@@ -677,7 +693,7 @@ def read_gguf_metadata(file_path: str) -> Optional[Dict[str, Any]]:
 
             # Then compute architecture-aware block_count and effective_layer_count
             architecture = metadata.get("general.architecture", "").lower()
-            layer_info = compute_layers_for_architecture(
+            layer_info = _compute_layers_for_architecture(
                 architecture=architecture,
                 metadata=metadata,
                 base_block_count=base_block_count,
diff --git a/backend/huggingface.py b/backend/huggingface.py
index 6d59a42..288ab57 100644
--- a/backend/huggingface.py
+++ b/backend/huggingface.py
@@ -1,11 +1,9 @@
 from huggingface_hub import HfApi, hf_hub_download, list_models
 from typing import List, Dict, Optional, Tuple, Any
 import asyncio
-import aiohttp
 import json
 import os
 import threading
-from tqdm import tqdm
 import time
 import re
 import traceback
@@ -39,6 +37,45 @@
 _safetensors_metadata_ttl = 600  # 10 minutes
 
 
+def get_accurate_file_sizes(repo_id: str, paths: List[str]) -> Dict[str, Optional[int]]:
+    """Fetch accurate file sizes from HuggingFace API via get_paths_info."""
+    if not paths:
+        return {}
+    try:
+        paths_info = hf_api.get_paths_info(repo_id=repo_id, paths=paths)
+        return {
+            getattr(pi, "path", getattr(pi, "rfilename", "")): getattr(pi, "size", None)
+            for pi in paths_info
+        }
+    except Exception as e:
+        logger.warning(f"get_paths_info failed for {repo_id}: {e}")
+        return {}
+
+
+def get_mmproj_f16_filename(repo_id: str) -> Optional[str]:
+    """
+    If the repo contains vision projector (mmproj) GGUF files, return the F16 one to download.
+    Prefers mmproj-F16.gguf, then any *mmproj*F16*.gguf, then first mmproj*.gguf.
+    Returns None if no mmproj files or on API error.
+    """
+    try:
+        files = list(hf_api.list_repo_files(repo_id=repo_id))
+    except Exception as e:
+        logger.debug(f"list_repo_files failed for {repo_id}: {e}")
+        return None
+    mmproj = [f for f in files if "mmproj" in f.lower() and f.lower().endswith(".gguf")]
+    if not mmproj:
+        return None
+    # Prefer exact mmproj-F16.gguf, then any filename containing F16, then first mmproj
+    for name in mmproj:
+        if name == "mmproj-F16.gguf":
+            return name
+    for name in mmproj:
+        if "f16" in name.lower():
+            return name
+    return mmproj[0]
+
+
 def _download_repo_json(repo_id: str, filename: str) -> Optional[Dict[str, Any]]:
     try:
         path = hf_hub_download(repo_id, filename, local_dir_use_symlinks=False)
@@ -92,6 +129,127 @@ def _get_download_directory(model_format: str, huggingface_id: str) -> str:
     return MODEL_BASE_DIR
 
 
+def _hf_repo_folder_name(huggingface_id: str) -> str:
+    """Return the HF cache folder name for a model repo (e.g. models--Org--Repo)."""
+    return "models--" + huggingface_id.replace("/", "--")
+
+
+def resolve_cached_model_path(huggingface_id: str, filename: str) -> Optional[str]:
+    """Return the local path for a cached HF model file without triggering a download.
+
+    Returns None if the file is not in the HF cache.
+    """
+    try:
+        return hf_hub_download(
+            repo_id=huggingface_id,
+            filename=filename,
+            local_files_only=True,
+        )
+    except Exception:
+        return None
+
+
+def delete_cached_model_file(huggingface_id: str, filename: str) -> bool:
+    """Delete a specific model file from the HuggingFace cache.
+
+    Removes both the snapshot symlink and the underlying content blob.
+    Returns True if the file was found and deleted, False otherwise.
+    """
+    try:
+        cached_path = hf_hub_download(
+            repo_id=huggingface_id,
+            filename=filename,
+            local_files_only=True,
+        )
+    except Exception:
+        logger.warning(
+            f"delete_cached_model_file: {huggingface_id}/{filename} not found in HF cache"
+        )
+        return False
+
+    if os.path.islink(cached_path):
+        blob_path = os.path.realpath(cached_path)
+        try:
+            os.unlink(cached_path)
+        except OSError as e:
+            logger.warning(f"Could not remove symlink {cached_path}: {e}")
+        if os.path.exists(blob_path):
+            try:
+                os.remove(blob_path)
+            except OSError as e:
+                logger.warning(f"Could not remove blob {blob_path}: {e}")
+    elif os.path.exists(cached_path):
+        try:
+            os.remove(cached_path)
+        except OSError as e:
+            logger.warning(f"Could not remove file {cached_path}: {e}")
+
+    logger.info(f"Deleted cached model file: {huggingface_id}/{filename}")
+    return True
+
+
+def resolve_model_path(
+    huggingface_id: str,
+    filename: Optional[str] = None,
+    model_format: str = "gguf",
+) -> Optional[str]:
+    """
+    Resolve a model's local path from current storage (data/models/...).
+    For GGUF: returns path to the specific file if filename is given.
+    For safetensors: returns the repo directory (filename ignored).
+    Returns None if the path does not exist. Does not create directories.
+    """
+    if not huggingface_id:
+        return None
+    safe_repo = _safe_repo_name(huggingface_id)
+    base_dir = FORMAT_SUBDIRS.get(model_format, MODEL_BASE_DIR)
+    repo_dir = os.path.join(base_dir, safe_repo)
+    for prefix in ("", "/app"):
+        candidate = repo_dir if not prefix else os.path.join(prefix, repo_dir)
+        if not os.path.exists(candidate):
+            continue
+        if model_format == "gguf" and filename:
+            path = os.path.join(candidate, filename)
+            if os.path.isfile(path):
+                return path
+            continue
+        if model_format == "safetensors" or not filename:
+            if os.path.isdir(candidate):
+                return candidate
+    return None
+
+
+def get_model_disk_size(
+    huggingface_id: str,
+    filename: Optional[str] = None,
+    model_format: str = "gguf",
+) -> int:
+    """
+    Compute actual disk usage in bytes for a model in current storage.
+    For GGUF: size of the given file. For safetensors: sum of all files in repo dir.
+    """
+    path = resolve_model_path(huggingface_id, filename, model_format)
+    if not path:
+        return 0
+    if os.path.isfile(path):
+        try:
+            return os.path.getsize(path)
+        except OSError:
+            return 0
+    if os.path.isdir(path):
+        total = 0
+        try:
+            for _dirpath, _dirnames, filenames in os.walk(path):
+                for f in filenames:
+                    fp = os.path.join(_dirpath, f)
+                    if os.path.isfile(fp):
+                        total += os.path.getsize(fp)
+        except OSError:
+            pass
+        return total
+    return 0
+
+
 def _get_manifest_lock(
     model_format: str, huggingface_id: Optional[str] = None
 ) -> threading.Lock:
@@ -997,7 +1155,6 @@ async def _search_with_api(query: str, limit: int, model_format: str) -> List[Di
             search=query,
             limit=min(limit * 2, 50),  # Get more models to filter from
             sort="downloads",
-            direction=-1,
             filter=filter_value,
             expand=[
                 "author",
@@ -1069,11 +1226,13 @@ async def _process_single_model(model, model_format: str) -> Optional[Dict]:
             if model_format == "gguf":
                 # Group GGUF files by logical quantization, handling multi-part shards
                 # Accept both plain `.gguf` and multi-part patterns like `.gguf.part1of2`
+                # Exclude mmproj (vision/multimodal projection) files – they are extensions, not standalone quants
                 gguf_siblings = [
                     s
                     for s in model.siblings
                     if isinstance(getattr(s, "rfilename", None), str)
                     and re.search(r"\.gguf(\.|$)", s.rfilename)
+                    and "mmproj" not in s.rfilename.lower()
                 ]
                 logger.info(f"Model {model.id}: {len(gguf_siblings)} GGUF files found")
                 if not gguf_siblings:
@@ -1139,6 +1298,23 @@ async def _process_single_model(model, model_format: str) -> Optional[Dict]:
                         else 0.0
                     )
 
+                # Siblings from list_models often have size=None; fetch accurate sizes from Hub
+                try:
+                    all_filenames = [s.rfilename for s in gguf_siblings]
+                    accurate_sizes = get_accurate_file_sizes(model.id, all_filenames)
+                    if accurate_sizes:
+                        for entry in quantizations.values():
+                            for f in entry["files"]:
+                                f["size"] = accurate_sizes.get(f["filename"]) or f["size"] or 0
+                            entry["total_size"] = sum(f["size"] for f in entry["files"])
+                            entry["size_mb"] = (
+                                round(entry["total_size"] / (1024 * 1024), 2)
+                                if entry["total_size"]
+                                else 0.0
+                            )
+                except Exception as size_err:
+                    logger.debug(f"Could not fetch accurate sizes for {model.id}: {size_err}")
+
                 # If no quantizations were detected after grouping, skip this model
                 if not quantizations:
                     return None
@@ -1162,6 +1338,15 @@ async def _process_single_model(model, model_format: str) -> Optional[Dict]:
                 )
                 if not safetensors_files:
                     return None
+                # Fetch accurate sizes; list_models siblings often have size=None
+                try:
+                    st_filenames = [f["filename"] for f in safetensors_files]
+                    accurate_sizes = get_accurate_file_sizes(model.id, st_filenames)
+                    if accurate_sizes:
+                        for f in safetensors_files:
+                            f["size"] = accurate_sizes.get(f["filename"]) or 0
+                except Exception as size_err:
+                    logger.debug(f"Could not fetch accurate sizes for {model.id}: {size_err}")
         else:
             return None
 
@@ -1510,23 +1695,18 @@ async def get_model_details(model_id: str) -> Dict:
 async def download_model(
     huggingface_id: str, filename: str, model_format: str = "gguf"
 ) -> tuple[str, int]:
-    """Download model from HuggingFace"""
+    """Download model from HuggingFace to the native HF cache."""
     try:
-        models_dir = _get_download_directory(model_format, huggingface_id)
-
-        # Sanitize filename
         filename = _sanitize_filename(filename)
 
-        # Download the file
         file_path = hf_hub_download(
             repo_id=huggingface_id,
             filename=filename,
-            local_dir=models_dir,
-            local_dir_use_symlinks=False,
         )
 
-        # Get file size
-        file_size = os.path.getsize(file_path)
+        # Use realpath so getsize works even when file_path is a symlink
+        real_path = os.path.realpath(file_path)
+        file_size = os.path.getsize(real_path if os.path.exists(real_path) else file_path)
 
         return file_path, file_size
 
@@ -1535,331 +1715,154 @@ async def download_model(
         raise
 
 
-async def download_model_with_websocket_progress(
+async def download_model_with_progress(
     huggingface_id: str,
     filename: str,
-    websocket_manager,
+    progress_manager,
     task_id: str,
     total_bytes: int = 0,
     model_format: str = "gguf",
     huggingface_id_for_progress: str = None,
 ):
-    """Download model with WebSocket progress updates by tracking filesystem size"""
-    import asyncio
+    """Download model to the HF native cache with SSE progress updates.
+
+    Progress is tracked by monitoring the .incomplete blob file that hf_hub_download
+    writes to the HF cache during the download.
+    """
+    import threading
     import time
+    from huggingface_hub.constants import HF_HUB_CACHE
 
-    logger.info(f"=== DOWNLOAD PROGRESS START ===")
-    logger.info(f"Download task: {task_id}")
-    logger.info(f"HuggingFace ID: {huggingface_id}")
-    logger.info(f"Filename: {filename}")
-    logger.info(f"Total bytes from search: {total_bytes}")
-    logger.info(f"WebSocket manager: {websocket_manager}")
-    logger.info(f"Active connections: {len(websocket_manager.active_connections)}")
+    filename = _sanitize_filename(filename)
+    progress_hf_id = huggingface_id_for_progress or huggingface_id
 
-    try:
-        models_dir = _get_download_directory(model_format, huggingface_id)
+    logger.info(f"Starting HF-cache download: {huggingface_id}/{filename} task={task_id}")
 
-        # Sanitize filename and build path
-        filename = _sanitize_filename(filename)
-        file_path = os.path.join(models_dir, filename)
-        directory = os.path.dirname(file_path)
-        if directory and not os.path.exists(directory):
-            os.makedirs(directory, exist_ok=True)
-
-        # Send initial progress
-        logger.info(f"Sending initial progress message...")
-        progress_hf_id = huggingface_id_for_progress or huggingface_id
-        await websocket_manager.send_download_progress(
-            task_id=task_id,
-            progress=0,
-            message=f"Starting download of {filename}",
-            bytes_downloaded=0,
-            total_bytes=total_bytes,
-            speed_mbps=0,
-            eta_seconds=0,
-            filename=filename,
-            model_format=model_format,
-            huggingface_id=progress_hf_id,
-        )
-        logger.info(f"Initial progress message sent")
+    # Resolve total size if not provided
+    if total_bytes == 0:
+        try:
+            file_info = HfApi().repo_file_info(repo_id=huggingface_id, filename=filename)
+            total_bytes = file_info.size or 0
+            logger.info(f"Got file size from HuggingFace API: {total_bytes}")
+        except Exception as e:
+            logger.warning(f"Could not get file size: {e}")
+
+    await progress_manager.send_download_progress(
+        task_id=task_id,
+        progress=0,
+        message=f"Starting download of {filename}",
+        bytes_downloaded=0,
+        total_bytes=total_bytes,
+        speed_mbps=0,
+        eta_seconds=0,
+        filename=filename,
+        model_format=model_format,
+        huggingface_id=progress_hf_id,
+    )
 
-        # Get file size from HuggingFace API if not provided
-        if total_bytes == 0:
-            try:
-                from huggingface_hub import HfApi
-
-                api = HfApi()
-                file_info = api.repo_file_info(repo_id=huggingface_id, path=filename)
-                total_bytes = file_info.size
-                logger.info(f"Got file size from HuggingFace API: {total_bytes}")
-            except Exception as e:
-                logger.warning(f"Could not get file size from HuggingFace API: {e}")
-                # If we can't get the size, we'll estimate it
-                total_bytes = 0
-
-        # Send total size update
-        if total_bytes > 0:
-            await websocket_manager.send_download_progress(
+    # Run the blocking hf_hub_download in a background thread
+    repo_folder = _hf_repo_folder_name(huggingface_id)
+    blobs_dir = os.path.join(HF_HUB_CACHE, repo_folder, "blobs")
+
+    download_result: dict = {"file_path": None, "error": None, "done": False}
+
+    def _do_download():
+        try:
+            download_result["file_path"] = hf_hub_download(
+                repo_id=huggingface_id,
+                filename=filename,
+            )
+        except Exception as exc:
+            download_result["error"] = exc
+        finally:
+            download_result["done"] = True
+
+    thread = threading.Thread(target=_do_download, daemon=True)
+    thread.start()
+
+    # Poll the .incomplete blob for progress
+    start_time = time.time()
+    last_bytes = 0
+    last_poll = start_time
+
+    while not download_result["done"]:
+        await asyncio.sleep(0.5)
+
+        incomplete_bytes = 0
+        if os.path.isdir(blobs_dir):
+            for fname in os.listdir(blobs_dir):
+                if fname.endswith(".incomplete"):
+                    try:
+                        incomplete_bytes = max(
+                            incomplete_bytes,
+                            os.path.getsize(os.path.join(blobs_dir, fname)),
+                        )
+                    except OSError:
+                        pass
+
+        if incomplete_bytes > 0:
+            now = time.time()
+            elapsed_total = now - start_time
+            elapsed_poll = now - last_poll
+            delta = incomplete_bytes - last_bytes
+            speed_mbps = (delta / elapsed_poll / (1024 * 1024)) if elapsed_poll > 0 else 0
+            progress = min(99, int(incomplete_bytes / total_bytes * 100)) if total_bytes else 0
+            eta = (
+                int((total_bytes - incomplete_bytes) / (incomplete_bytes / elapsed_total))
+                if elapsed_total > 0 and incomplete_bytes > 0 and total_bytes > incomplete_bytes
+                else 0
+            )
+            await progress_manager.send_download_progress(
                 task_id=task_id,
-                progress=0,
+                progress=progress,
                 message=f"Downloading {filename}",
-                bytes_downloaded=0,
+                bytes_downloaded=incomplete_bytes,
                 total_bytes=total_bytes,
-                speed_mbps=0,
-                eta_seconds=0,
+                speed_mbps=round(speed_mbps, 2),
+                eta_seconds=eta,
                 filename=filename,
                 model_format=model_format,
                 huggingface_id=progress_hf_id,
             )
+            last_bytes = incomplete_bytes
+            last_poll = now
 
-        # Start the download with built-in progress tracking
-        logger.info(f"🚀 Starting download with built-in progress tracking...")
-
-        file_path, file_size = await download_with_progress_tracking(
-            huggingface_id,
-            filename,
-            file_path,
-            models_dir,
-            websocket_manager,
-            task_id,
-            total_bytes,
-            model_format,
-            progress_hf_id,
-        )
-
-        # Send final completion
-        await websocket_manager.send_download_progress(
+    if download_result["error"]:
+        err = download_result["error"]
+        await progress_manager.send_download_progress(
             task_id=task_id,
-            progress=100,
-            message=f"Download completed: {filename}",
-            bytes_downloaded=file_size,
-            total_bytes=file_size,
+            progress=0,
+            message=f"Download failed: {err}",
+            bytes_downloaded=0,
+            total_bytes=total_bytes,
             speed_mbps=0,
             eta_seconds=0,
             filename=filename,
             model_format=model_format,
             huggingface_id=progress_hf_id,
         )
+        raise err
+
+    # Success: get final path and size
+    file_path = download_result["file_path"]
+    real_path = os.path.realpath(file_path) if file_path else file_path
+    file_size = os.path.getsize(real_path if os.path.exists(real_path) else file_path)
+
+    await progress_manager.send_download_progress(
+        task_id=task_id,
+        progress=100,
+        message=f"Download completed: {filename}",
+        bytes_downloaded=file_size,
+        total_bytes=file_size,
+        speed_mbps=0,
+        eta_seconds=0,
+        filename=filename,
+        model_format=model_format,
+        huggingface_id=progress_hf_id,
+    )
 
-        return file_path, file_size
-
-    except Exception as e:
-        # Send error notification
-        if websocket_manager and task_id:
-            progress_hf_id = huggingface_id_for_progress or huggingface_id
-            await websocket_manager.send_download_progress(
-                task_id=task_id,
-                progress=0,
-                message=f"Download failed: {str(e)}",
-                bytes_downloaded=0,
-                total_bytes=0,
-                speed_mbps=0,
-                eta_seconds=0,
-                filename=filename,
-                model_format=model_format,
-                huggingface_id=progress_hf_id,
-            )
-            await websocket_manager.send_notification(
-                "error",
-                "Download Failed",
-                f"Failed to download {filename}: {str(e)}",
-                task_id,
-            )
-        raise
-
-
-async def download_with_progress_tracking(
-    huggingface_id: str,
-    filename: str,
-    file_path: str,
-    models_dir: str,
-    websocket_manager,
-    task_id: str,
-    total_bytes: int,
-    model_format: str,
-    huggingface_id_for_progress: str = None,
-):
-    """Download the file using custom http_get method with progress tracking"""
-    try:
-        import aiofiles
-
-        logger.info(
-            f"📁 Starting download of {filename} ({total_bytes} bytes) [{model_format}]"
-        )
-
-        # Use the standard HuggingFace resolve URL (this is the default/preferred method)
-        safe_filename = _sanitize_filename(filename)
-        download_url = (
-            f"https://huggingface.co/{huggingface_id}/resolve/main/{safe_filename}"
-        )
-        actual_file_size = total_bytes  # Start with the provided size
-
-        # Optionally get exact file size from HuggingFace API
-        try:
-            api = HfApi()
-            file_info = api.repo_file_info(
-                repo_id=huggingface_id, filename=safe_filename
-            )
-            if hasattr(file_info, "size") and file_info.size:
-                actual_file_size = file_info.size
-                logger.info(
-                    f"📊 Got file size from HuggingFace API: {actual_file_size} bytes ({actual_file_size / (1024*1024):.2f} MB)"
-                )
-        except Exception as e:
-            logger.debug(
-                f"Could not get file size from API: {e}, using provided size: {total_bytes}"
-            )
-
-        logger.info(f"📁 Download URL: {download_url}")
-
-        # Build headers manually
-        hf_headers = {
-            "User-Agent": "llama-cpp-studio/1.0.0",
-            "Accept": "*/*",
-            "Accept-Encoding": "gzip, deflate",
-        }
-
-        # Create final destination path
-        final_path = os.path.join(models_dir, safe_filename)
-        final_dir = os.path.dirname(final_path)
-        if final_dir and not os.path.exists(final_dir):
-            os.makedirs(final_dir, exist_ok=True)
-
-        # Custom progress bar that sends WebSocket updates
-        progress_hf_id = huggingface_id_for_progress or huggingface_id
-
-        class WebSocketProgressBar(tqdm):
-            def __init__(self, *args, **kwargs):
-                super().__init__(*args, **kwargs)
-                self.websocket_manager = websocket_manager
-                self.task_id = task_id
-                self.filename = filename
-                self.huggingface_id = progress_hf_id
-                self.start_time = time.time()
-                self.last_update_time = self.start_time
-
-            def update(self, n=1):
-                super().update(n)
-                # Send WebSocket update with current progress
-                current_time = time.time()
-                if (
-                    current_time - self.last_update_time >= 0.5
-                ):  # Update every 0.5 seconds
-                    if self.total > 0:
-                        progress = int((self.n / self.total) * 100)
-                        current_bytes = int(self.n)
-
-                        # Calculate speed and ETA
-                        elapsed_time = current_time - self.start_time
-                        speed_bytes_per_sec = (
-                            current_bytes / elapsed_time if elapsed_time > 0 else 0
-                        )
-                        speed_mbps = speed_bytes_per_sec / (1024 * 1024)
-
-                        remaining_bytes = self.total - self.n
-                        eta_seconds = (
-                            int(remaining_bytes / speed_bytes_per_sec)
-                            if speed_bytes_per_sec > 0
-                            else 0
-                        )
-
-                        logger.debug(
-                            f"📊 Progress: {progress}% ({current_bytes}/{self.total} bytes) - {speed_mbps:.1f} MB/s"
-                        )
-
-                        # Send WebSocket update
-                        try:
-                            loop = asyncio.get_event_loop()
-                            if loop.is_running():
-                                asyncio.create_task(
-                                    self.websocket_manager.send_download_progress(
-                                        task_id=self.task_id,
-                                        progress=progress,
-                                        message=f"Downloading {self.filename}",
-                                        bytes_downloaded=current_bytes,
-                                        total_bytes=self.total,
-                                        speed_mbps=speed_mbps,
-                                        eta_seconds=eta_seconds,
-                                        filename=self.filename,
-                                        model_format=model_format,
-                                        huggingface_id=self.huggingface_id,
-                                    )
-                                )
-                        except Exception as e:
-                            logger.error(f"Error sending progress update: {e}")
-
-                        self.last_update_time = current_time
-
-        # Create our custom progress bar
-        custom_progress_bar = WebSocketProgressBar(
-            desc=safe_filename,
-            total=actual_file_size,  # Use the actual file size
-            unit="B",
-            unit_scale=True,
-            unit_divisor=1024,
-            disable=False,
-        )
-
-        # Download using aiohttp with timeout and our custom progress bar
-        timeout = aiohttp.ClientTimeout(
-            total=3600, connect=30
-        )  # 1 hour total, 30s connect
-        async with aiohttp.ClientSession(
-            headers=hf_headers, timeout=timeout
-        ) as session:
-            async with session.get(download_url) as response:
-                if response.status != 200:
-                    raise Exception(f"Failed to download: HTTP {response.status}")
-
-                # Get actual file size from response headers
-                content_length = response.headers.get("content-length")
-                if content_length:
-                    response_size = int(content_length)
-                    if response_size != actual_file_size:
-                        logger.debug(
-                            f"📏 Size difference: API said {actual_file_size}, response says {response_size} (diff: {abs(response_size - actual_file_size)} bytes)"
-                        )
-                        # Use the response size as it's more accurate
-                        actual_file_size = response_size
-                        custom_progress_bar.total = actual_file_size
-                        logger.info(
-                            f"📊 Using response size: {actual_file_size} bytes ({actual_file_size / (1024*1024):.2f} MB)"
-                        )
-
-                # Download with progress tracking
-                # Use 64KB chunks for better performance with large files
-                chunk_size = 65536
-                downloaded_bytes = 0
-                async with aiofiles.open(final_path, "wb") as f:
-                    async for chunk in response.content.iter_chunked(chunk_size):
-                        await f.write(chunk)
-                        downloaded_bytes += len(chunk)
-                        custom_progress_bar.update(len(chunk))
-
-        # Close the progress bar
-        custom_progress_bar.close()
-
-        logger.info(f"📁 Downloaded to: {final_path}")
-
-        # Validate downloaded file size
-        file_size = os.path.getsize(final_path)
-        if actual_file_size and actual_file_size > 0 and file_size != actual_file_size:
-            logger.warning(
-                f"⚠️ Download size mismatch: expected {actual_file_size}, got {file_size}"
-            )
-            # Allow small differences (like metadata)
-            if abs(file_size - actual_file_size) > 1024:  # More than 1KB difference
-                raise Exception(
-                    f"Download incomplete: expected {actual_file_size} bytes, got {file_size} bytes"
-                )
-
-        return final_path, file_size
+    return file_path, file_size
 
-    except Exception as e:
-        logger.error(f"Download error: {str(e)}")
-        logger.error(f"Error type: {type(e).__name__}")
-        logger.error(f"Traceback:\n{traceback.format_exc()}")
-        raise
 
 
 async def get_quantization_sizes_from_hf(
@@ -1898,27 +1901,23 @@ async def get_quantization_sizes_from_hf(
         updated: Dict[str, Dict] = {}
 
         if all_filenames:
-            try:
-                # Newer API: batch query specific paths for metadata
-                paths_info = hf_api.get_paths_info(
-                    repo_id=huggingface_id, paths=all_filenames
-                )
-                # Build lookup
-                file_sizes: Dict[str, Optional[int]] = {
-                    pi.path: getattr(pi, "size", None) for pi in paths_info
-                }
-            except Exception as batch_err:
-                logger.warning(
-                    f"get_paths_info failed for {huggingface_id}: {batch_err}"
-                )
+            file_sizes = get_accurate_file_sizes(huggingface_id, all_filenames)
+            if not file_sizes:
                 # Fallback: fetch full metadata once
-                model_info = hf_api.model_info(
-                    repo_id=huggingface_id, files_metadata=True
-                )
-                file_sizes = {}
-                if hasattr(model_info, "siblings") and model_info.siblings:
-                    for sibling in model_info.siblings:
-                        file_sizes[sibling.rfilename] = getattr(sibling, "size", None)
+                try:
+                    model_info = hf_api.model_info(
+                        repo_id=huggingface_id, files_metadata=True
+                    )
+                    if hasattr(model_info, "siblings") and model_info.siblings:
+                        for sibling in model_info.siblings:
+                            key = getattr(sibling, "path", getattr(sibling, "rfilename", ""))
+                            if key:
+                                file_sizes[key] = getattr(sibling, "size", None)
+                except Exception as fallback_err:
+                    logger.warning(
+                        f"model_info fallback failed for {huggingface_id}: {fallback_err}"
+                    )
+                    file_sizes = {}
 
             for quant_name, filenames in quant_to_files.items():
                 files_with_sizes = []
diff --git a/backend/llama_manager.py b/backend/llama_manager.py
index a4bfa88..cbcb928 100644
--- a/backend/llama_manager.py
+++ b/backend/llama_manager.py
@@ -76,6 +76,8 @@ class LlamaManager:
     # Repository URLs
     LLAMA_CPP_REPO = "https://github.com/ggerganov/llama.cpp.git"
     IK_LLAMA_CPP_REPO = "https://github.com/ikawrakow/ik_llama.cpp.git"
+    # Pre-built CUDA releases (ai-dock builds; used for "Install Release")
+    LLAMA_CPP_CUDA_RELEASES_API = "https://api.github.com/repos/ai-dock/llama.cpp-cuda/releases"
 
     REPOSITORY_SOURCES = {
         "llama.cpp": LLAMA_CPP_REPO,
@@ -83,7 +85,11 @@ class LlamaManager:
     }
 
     def __init__(self):
-        self.llama_dir = "data/llama-cpp"
+        # Use absolute path so clone/build work regardless of process cwd (e.g. --app-dir backend)
+        if os.path.exists("/app/data"):
+            self.llama_dir = "/app/data/llama-cpp"
+        else:
+            self.llama_dir = os.path.abspath(os.path.join(os.getcwd(), "data", "llama-cpp"))
         os.makedirs(self.llama_dir, exist_ok=True)
         # Ensure directory has proper permissions (read, write, execute for owner)
         try:
@@ -92,6 +98,7 @@ def __init__(self):
         except Exception as e:
             logger.warning(f"Could not set permissions on {self.llama_dir}: {e}")
         self._cached_cuda_architectures: Optional[str] = None
+        self._cached_cmake_path: Optional[str] = None
 
     def _check_cuda_toolkit_available(
         self,
@@ -326,8 +333,11 @@ def _verify_cuda_toolkit_complete(self, cuda_root: str) -> Tuple[bool, List[str]
     def _get_cmake_version(self) -> Optional[Tuple[int, int, int]]:
         """Get CMake version as tuple (major, minor, patch)."""
         try:
+            cmake_exe = self._find_cmake_executable()
+            if not cmake_exe:
+                return None
             result = subprocess.run(
-                ["cmake", "--version"], capture_output=True, text=True, timeout=5
+                [cmake_exe, "--version"], capture_output=True, text=True, timeout=5
             )
             if result.returncode == 0:
                 # Parse "cmake version X.Y.Z"
@@ -342,6 +352,24 @@ def _get_cmake_version(self) -> Optional[Tuple[int, int, int]]:
             pass
         return None
 
+    def _find_cmake_executable(self) -> Optional[str]:
+        """Find a usable cmake executable from env or PATH."""
+        if self._cached_cmake_path and os.path.exists(self._cached_cmake_path):
+            return self._cached_cmake_path
+
+        candidates = [
+            os.getenv("CMAKE"),
+            os.getenv("CMAKE_EXECUTABLE"),
+            shutil.which("cmake"),
+        ]
+
+        for candidate in candidates:
+            if candidate and os.path.exists(candidate):
+                self._cached_cmake_path = candidate
+                return candidate
+
+        return None
+
     def _get_cuda_version(self, nvcc_path: str) -> Optional[Tuple[int, int]]:
         """Get CUDA version from nvcc as tuple (major, minor)."""
         try:
@@ -401,9 +429,9 @@ async def _detect_cuda_architectures(self) -> Optional[str]:
         return detected
 
     def _fetch_release(self, tag_name: str) -> Dict:
-        """Fetch release metadata for a tag."""
+        """Fetch release metadata for a tag from ai-dock/llama.cpp-cuda (CUDA builds)."""
         response = requests.get(
-            f"https://api.github.com/repos/ggerganov/llama.cpp/releases/tags/{tag_name}",
+            f"{self.LLAMA_CPP_CUDA_RELEASES_API}/tags/{tag_name}",
             allow_redirects=True,
         )
         response.raise_for_status()
@@ -413,6 +441,10 @@ def _tokenize_asset_name(self, asset_name: str) -> List[str]:
         return [token for token in re.split(r"[.\-_\s]+", asset_name.lower()) if token]
 
     def _is_asset_compatible(self, asset_name: str) -> Tuple[bool, Optional[str]]:
+        # ai-dock/llama.cpp-cuda: single .tar.gz per release (e.g. llama.cpp-b8233-cuda-12.8.tar.gz)
+        if re.match(r"^llama\.cpp-[^\-]+-cuda-[0-9.]+\.tar\.gz$", asset_name, re.IGNORECASE):
+            return True, None
+
         tokens = self._tokenize_asset_name(asset_name)
 
         if not tokens:
@@ -453,6 +485,11 @@ def _extract_asset_features(self, asset_name: str) -> List[str]:
         tokens = self._tokenize_asset_name(asset_name)
         features = []
 
+        # ai-dock tarballs contain llama-server and are CUDA builds
+        if re.match(r"^llama\.cpp-[^\-]+-cuda-[0-9.]+\.tar\.gz$", asset_name, re.IGNORECASE):
+            features.extend(["llama-server", "CUDA"])
+            return features
+
         feature_map = {
             "cuda": "CUDA",
             "vulkan": "Vulkan",
@@ -701,8 +738,31 @@ def get_optimal_build_threads(self) -> int:
         except:
             return 1  # Fallback to single thread
 
+    async def _run_command(
+        self,
+        *args,
+        cwd: Optional[str] = None,
+        env: Optional[dict] = None,
+        timeout: Optional[int] = None,
+        merge_stderr: bool = False,
+    ) -> subprocess.CompletedProcess:
+        """Run a subprocess in a thread for cross-platform compatibility."""
+
+        def _runner():
+            return subprocess.run(
+                list(args),
+                cwd=cwd,
+                env=env,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.STDOUT if merge_stderr else subprocess.PIPE,
+                timeout=timeout,
+                check=False,
+            )
+
+        return await asyncio.to_thread(_runner)
+
     async def validate_build(
-        self, binary_path: str, websocket_manager=None, task_id: str = None
+        self, binary_path: str, progress_manager=None, task_id: str = None
     ) -> bool:
         """Run basic validation on built binary"""
         try:
@@ -711,13 +771,9 @@ async def validate_build(
                 return False
 
             # Test 2: Run --version command
-            process = await asyncio.create_subprocess_exec(
-                binary_path,
-                "--version",
-                stdout=asyncio.subprocess.PIPE,
-                stderr=asyncio.subprocess.PIPE,
-            )
-            stdout, stderr = await asyncio.wait_for(process.communicate(), timeout=10)
+            process = await self._run_command(binary_path, "--version", timeout=10)
+            stdout = process.stdout or b""
+            stderr = process.stderr or b""
 
             if process.returncode != 0:
                 return False
@@ -736,15 +792,15 @@ async def validate_build(
     async def install_release(
         self,
         tag_name: str,
-        websocket_manager=None,
+        progress_manager=None,
         task_id: str = None,
         asset_id: Optional[int] = None,
     ) -> str:
-        """Install llama.cpp from GitHub release with WebSocket progress updates"""
+        """Install llama.cpp from GitHub release with SSE progress updates"""
         try:
             # Stage 1: Get release info
-            if websocket_manager and task_id:
-                await websocket_manager.send_build_progress(
+            if progress_manager and task_id:
+                await progress_manager.send_build_progress(
                     task_id=task_id,
                     stage="fetch",
                     progress=10,
@@ -768,8 +824,8 @@ async def install_release(
             )
 
             # Stage 2: Download binary
-            if websocket_manager and task_id:
-                await websocket_manager.send_build_progress(
+            if progress_manager and task_id:
+                await progress_manager.send_build_progress(
                     task_id=task_id,
                     stage="download",
                     progress=30,
@@ -817,8 +873,8 @@ async def install_release(
                     logger.warning(f"Unable to verify downloaded artifact size: {exc}")
 
             # Stage 3: Extract binary
-            if websocket_manager and task_id:
-                await websocket_manager.send_build_progress(
+            if progress_manager and task_id:
+                await progress_manager.send_build_progress(
                     task_id=task_id,
                     stage="extract",
                     progress=70,
@@ -852,8 +908,8 @@ async def install_release(
             os.remove(download_path)
 
             # Stage 4: Find and verify executable
-            if websocket_manager and task_id:
-                await websocket_manager.send_build_progress(
+            if progress_manager and task_id:
+                await progress_manager.send_build_progress(
                     task_id=task_id,
                     stage="verify",
                     progress=90,
@@ -877,8 +933,8 @@ async def install_release(
             logger.info(
                 f"llama-server executable found and verified: {final_server_path}"
             )
-            if websocket_manager and task_id:
-                await websocket_manager.send_build_progress(
+            if progress_manager and task_id:
+                await progress_manager.send_build_progress(
                     task_id=task_id,
                     stage="verify",
                     progress=100,
@@ -905,9 +961,9 @@ async def install_release(
 
         except Exception as e:
             logger.error(f"Installation failed with error: {e}")
-            if websocket_manager and task_id:
+            if progress_manager and task_id:
                 try:
-                    await websocket_manager.send_build_progress(
+                    await progress_manager.send_build_progress(
                         task_id=task_id,
                         stage="error",
                         progress=0,
@@ -915,7 +971,7 @@ async def install_release(
                         log_lines=[f"Error: {str(e)}"],
                     )
                 except Exception as ws_error:
-                    logger.error(f"Failed to send error to WebSocket: {ws_error}")
+                    logger.error(f"Failed to send error via SSE: {ws_error}")
             raise Exception(f"Failed to install release {tag_name}: {e}")
 
     async def build_source(
@@ -923,7 +979,7 @@ async def build_source(
         commit_sha: str,
         patches: List[str] = None,
         build_config: BuildConfig = None,
-        websocket_manager=None,
+        progress_manager=None,
         task_id: str = None,
         repository_url: str = None,
         version_name: str = None,
@@ -942,8 +998,8 @@ async def build_source(
                     break
 
             # Send initial progress
-            if websocket_manager and task_id:
-                await websocket_manager.send_build_progress(
+            if progress_manager and task_id:
+                await progress_manager.send_build_progress(
                     task_id=task_id,
                     stage="init",
                     progress=0,
@@ -971,8 +1027,8 @@ async def build_source(
                 logger.warning(f"Could not set permissions on {version_dir}: {e}")
 
             # Stage 1: Clone repository (simplified)
-            if websocket_manager and task_id:
-                await websocket_manager.send_build_progress(
+            if progress_manager and task_id:
+                await progress_manager.send_build_progress(
                     task_id=task_id,
                     stage="clone",
                     progress=20,
@@ -984,20 +1040,16 @@ async def build_source(
 
             # Simple git clone with timeout
             try:
-                clone_process = await asyncio.create_subprocess_exec(
+                clone_process = await self._run_command(
                     "git",
                     "clone",
                     repository_url,
                     clone_dir,
-                    stdout=asyncio.subprocess.PIPE,
-                    stderr=asyncio.subprocess.PIPE,
-                )
-
-                clone_stdout, clone_stderr = await asyncio.wait_for(
-                    clone_process.communicate(), timeout=300  # 5 minute timeout
+                    timeout=300,
                 )
 
                 if clone_process.returncode != 0:
+                    clone_stderr = clone_process.stderr or b""
                     error_msg = clone_stderr.decode().strip()
                     raise Exception(f"Git clone failed: {error_msg}")
 
@@ -1005,13 +1057,11 @@ async def build_source(
 
             except asyncio.TimeoutError:
                 logger.error("Git clone timed out")
-                clone_process.kill()
-                await clone_process.wait()
                 raise Exception("Git clone timed out - network issues")
 
             # Stage 2: Checkout specific commit/branch (simplified)
-            if websocket_manager and task_id:
-                await websocket_manager.send_build_progress(
+            if progress_manager and task_id:
+                await progress_manager.send_build_progress(
                     task_id=task_id,
                     stage="checkout",
                     progress=40,
@@ -1020,38 +1070,30 @@ async def build_source(
                 )
 
             try:
-                checkout_process = await asyncio.create_subprocess_exec(
+                checkout_process = await self._run_command(
                     "git",
                     "checkout",
                     commit_sha,
                     cwd=clone_dir,
-                    stdout=asyncio.subprocess.PIPE,
-                    stderr=asyncio.subprocess.PIPE,
-                )
-
-                checkout_stdout, checkout_stderr = await asyncio.wait_for(
-                    checkout_process.communicate(), timeout=60
+                    timeout=60,
                 )
 
                 if checkout_process.returncode != 0:
+                    checkout_stderr = checkout_process.stderr or b""
                     error_msg = checkout_stderr.decode().strip()
                     # Try main as fallback for "master" (legacy support)
                     if commit_sha == "master":
                         logger.info("Failed to checkout 'master', trying 'main'")
-                        main_process = await asyncio.create_subprocess_exec(
+                        main_process = await self._run_command(
                             "git",
                             "checkout",
                             "main",
                             cwd=clone_dir,
-                            stdout=asyncio.subprocess.PIPE,
-                            stderr=asyncio.subprocess.PIPE,
-                        )
-
-                        main_stdout, main_stderr = await asyncio.wait_for(
-                            main_process.communicate(), timeout=60
+                            timeout=60,
                         )
 
                         if main_process.returncode != 0:
+                            main_stderr = main_process.stderr or b""
                             raise Exception(
                                 f"Failed to checkout both 'master' and 'main': {main_stderr.decode()}"
                             )
@@ -1065,8 +1107,8 @@ async def build_source(
 
             # Stage 3: Apply patches (if any)
             if patches:
-                if websocket_manager and task_id:
-                    await websocket_manager.send_build_progress(
+                if progress_manager and task_id:
+                    await progress_manager.send_build_progress(
                         task_id=task_id,
                         stage="patch",
                         progress=50,
@@ -1078,8 +1120,8 @@ async def build_source(
                     await self._apply_patch(clone_dir, patch_url)
 
             # Stage 4: Build following official documentation
-            if websocket_manager and task_id:
-                await websocket_manager.send_build_progress(
+            if progress_manager and task_id:
+                await progress_manager.send_build_progress(
                     task_id=task_id,
                     stage="configure",
                     progress=60,
@@ -1133,8 +1175,8 @@ async def build_source(
                         error_msg = f"CUDA build requested but CUDA Toolkit not found.\n\n{cuda_error}"
 
                     logger.error(error_msg)
-                    if websocket_manager and task_id:
-                        await websocket_manager.send_build_progress(
+                    if progress_manager and task_id:
+                        await progress_manager.send_build_progress(
                             task_id=task_id,
                             stage="configure",
                             progress=60,
@@ -1158,8 +1200,8 @@ async def build_source(
                         if result.returncode != 0:
                             error_msg = f"nvcc found at {nvcc_path} but failed to execute (exit code {result.returncode})"
                             logger.error(error_msg)
-                            if websocket_manager and task_id:
-                                await websocket_manager.send_build_progress(
+                            if progress_manager and task_id:
+                                await progress_manager.send_build_progress(
                                     task_id=task_id,
                                     stage="configure",
                                     progress=60,
@@ -1174,8 +1216,8 @@ async def build_source(
                     except (subprocess.TimeoutExpired, FileNotFoundError, OSError) as e:
                         error_msg = f"Failed to verify nvcc at {nvcc_path}: {e}"
                         logger.error(error_msg)
-                        if websocket_manager and task_id:
-                            await websocket_manager.send_build_progress(
+                        if progress_manager and task_id:
+                            await progress_manager.send_build_progress(
                                 task_id=task_id,
                                 stage="configure",
                                 progress=60,
@@ -1186,8 +1228,8 @@ async def build_source(
                 else:
                     error_msg = f"nvcc not found at expected path {nvcc_path} (CUDA root: {cuda_root})"
                     logger.error(error_msg)
-                    if websocket_manager and task_id:
-                        await websocket_manager.send_build_progress(
+                    if progress_manager and task_id:
+                        await progress_manager.send_build_progress(
                             task_id=task_id,
                             stage="configure",
                             progress=60,
@@ -1199,8 +1241,14 @@ async def build_source(
                 # Store validated CUDA root for later use
                 validated_cuda_root = cuda_root
 
+            cmake_exe = self._find_cmake_executable()
+            if not cmake_exe:
+                raise Exception(
+                    "CMake was not found. Install CMake or set CMAKE/CMAKE_EXECUTABLE to its path."
+                )
+
             # Build CMake arguments
-            cmake_args = ["cmake", ".."]
+            cmake_args = [cmake_exe, ".."]
 
             # Add build type
             cmake_args.append(f"-DCMAKE_BUILD_TYPE={build_config.build_type}")
@@ -1581,17 +1629,15 @@ def set_flag(flag: str, value: bool):
                 # Log cmake arguments for debugging
                 logger.info(f"CMake command: {' '.join(cmake_args)}")
 
-                cmake_process = await asyncio.create_subprocess_exec(
+                cmake_process = await self._run_command(
                     *cmake_args,
                     cwd=build_dir,
                     env=env,
-                    stdout=asyncio.subprocess.PIPE,
-                    stderr=asyncio.subprocess.PIPE,
+                    timeout=180,
                 )
 
-                cmake_stdout, cmake_stderr = await asyncio.wait_for(
-                    cmake_process.communicate(), timeout=180  # 3 minute timeout
-                )
+                cmake_stdout = cmake_process.stdout or b""
+                cmake_stderr = cmake_process.stderr or b""
 
                 if cmake_process.returncode != 0:
                     error_msg = cmake_stderr.decode().strip()
@@ -1688,20 +1734,17 @@ def set_flag(flag: str, value: bool):
 
                 # List available targets for debugging (especially useful for ik_llama.cpp)
                 try:
-                    targets_process = await asyncio.create_subprocess_exec(
-                        "cmake",
+                    targets_process = await self._run_command(
+                        cmake_exe,
                         "--build",
                         ".",
                         "--target",
                         "help",
                         cwd=build_dir,
                         env=env,
-                        stdout=asyncio.subprocess.PIPE,
-                        stderr=asyncio.subprocess.PIPE,
-                    )
-                    targets_stdout, targets_stderr = await asyncio.wait_for(
-                        targets_process.communicate(), timeout=30
+                        timeout=30,
                     )
+                    targets_stdout = targets_process.stdout or b""
                     if targets_process.returncode == 0:
                         targets_output = targets_stdout.decode(
                             "utf-8", errors="replace"
@@ -1727,8 +1770,8 @@ def set_flag(flag: str, value: bool):
                                 logger.warning(
                                     f"llama-server target not found in available targets. Repository: {repo_source_name}"
                                 )
-                                if websocket_manager and task_id:
-                                    await websocket_manager.send_build_progress(
+                                if progress_manager and task_id:
+                                    await progress_manager.send_build_progress(
                                         task_id=task_id,
                                         stage="configure",
                                         progress=65,
@@ -1744,8 +1787,8 @@ def set_flag(flag: str, value: bool):
                 raise Exception("CMake configuration timed out")
 
             # Stage 5: Build
-            if websocket_manager and task_id:
-                await websocket_manager.send_build_progress(
+            if progress_manager and task_id:
+                await progress_manager.send_build_progress(
                     task_id=task_id,
                     stage="build",
                     progress=70,
@@ -1776,89 +1819,43 @@ def set_flag(flag: str, value: bool):
                     )
 
                 # Explicitly build llama-server target
-                build_process = await asyncio.create_subprocess_exec(
-                    "cmake",
+                build_process = await self._run_command(
+                    cmake_exe,
                     "--build",
                     ".",
                     "--target",
                     "llama-server",
-                    "--",
-                    "-j",
+                    "--parallel",
                     str(thread_count),
                     cwd=build_dir,
                     env=env,
-                    stdout=asyncio.subprocess.PIPE,
-                    stderr=asyncio.subprocess.STDOUT,  # Merge stderr into stdout
+                    timeout=1800,
+                    merge_stderr=True,
                 )
 
-                # Stream build output for better diagnostics
-                build_output_lines = []
-                last_progress_update = time.time()
-
-                async def read_output():
-                    nonlocal last_progress_update
-                    while True:
-                        line = await build_process.stdout.readline()
-                        if not line:
-                            break
-                        decoded_line = line.decode("utf-8", errors="replace").rstrip()
-                        build_output_lines.append(decoded_line)
-                        logger.debug(f"Build output: {decoded_line}")
-                        # Send progress updates for important lines and periodically
-                        if websocket_manager and task_id:
-                            should_send = False
-                            line_lower = decoded_line.lower()
-                            # Always send errors and warnings
-                            if any(
-                                keyword in line_lower
-                                for keyword in ["error", "warning", "fatal", "failed"]
-                            ):
-                                should_send = True
-                            # Send periodic updates (every 5 seconds) to show progress
-                            elif time.time() - last_progress_update > 5:
-                                should_send = True
-                                last_progress_update = time.time()
-                            # Send important build milestones
-                            elif any(
-                                keyword in line_lower
-                                for keyword in [
-                                    "building",
-                                    "linking",
-                                    "built target",
-                                    "scanning",
-                                    "configuring",
-                                ]
-                            ):
-                                should_send = True
-
-                            if should_send:
-                                await websocket_manager.send_build_progress(
-                                    task_id=task_id,
-                                    stage="build",
-                                    progress=70,
-                                    message="Building llama.cpp...",
-                                    log_lines=[decoded_line],
-                                )
-
-                # Start reading output
-                read_task = asyncio.create_task(read_output())
-
-                # Wait for process to complete
-                returncode = await asyncio.wait_for(
-                    build_process.wait(), timeout=1800  # 30 minute timeout for build
+                build_output = (build_process.stdout or b"").decode(
+                    "utf-8", errors="replace"
                 )
+                build_output_lines = [
+                    line.rstrip() for line in build_output.splitlines() if line.strip()
+                ]
+                returncode = build_process.returncode
 
-                # Wait for output reading to finish
-                await read_task
-
-                build_output = "\n".join(build_output_lines)
+                if progress_manager and task_id and build_output_lines:
+                    await progress_manager.send_build_progress(
+                        task_id=task_id,
+                        stage="build",
+                        progress=70,
+                        message="Building llama.cpp...",
+                        log_lines=build_output_lines[-20:],
+                    )
 
                 if returncode != 0:
                     logger.error(f"Build failed with return code {returncode}")
                     logger.error(f"Build output:\n{build_output}")
-                    # Send error output via websocket if available
-                    if websocket_manager and task_id:
-                        await websocket_manager.send_build_progress(
+                    # Send error output via SSE if available
+                    if progress_manager and task_id:
+                        await progress_manager.send_build_progress(
                             task_id=task_id,
                             stage="build",
                             progress=70,
@@ -1911,8 +1908,8 @@ async def read_output():
                     logger.warning(
                         f"Build target 'llama-server' not found, trying 'server' target (for examples/server)..."
                     )
-                    if websocket_manager and task_id:
-                        await websocket_manager.send_build_progress(
+                    if progress_manager and task_id:
+                        await progress_manager.send_build_progress(
                             task_id=task_id,
                             stage="build",
                             progress=70,
@@ -1924,41 +1921,29 @@ async def read_output():
 
                     # Try 'server' target (used when server is in examples/)
                     logger.info("Attempting to build 'server' target...")
-                    server_target_process = await asyncio.create_subprocess_exec(
-                        "cmake",
+                    server_target_process = await self._run_command(
+                        cmake_exe,
                         "--build",
                         ".",
                         "--target",
                         "server",
-                        "--",
-                        "-j",
+                        "--parallel",
                         str(thread_count),
                         cwd=build_dir,
                         env=env,
-                        stdout=asyncio.subprocess.PIPE,
-                        stderr=asyncio.subprocess.STDOUT,
+                        timeout=1800,
+                        merge_stderr=True,
                     )
 
-                    server_target_output_lines = []
-
-                    async def read_server_target_output():
-                        while True:
-                            line = await server_target_process.stdout.readline()
-                            if not line:
-                                break
-                            decoded_line = line.decode(
-                                "utf-8", errors="replace"
-                            ).rstrip()
-                            server_target_output_lines.append(decoded_line)
-                            logger.debug(f"Server target build output: {decoded_line}")
-
-                    read_server_task = asyncio.create_task(read_server_target_output())
-                    server_target_returncode = await asyncio.wait_for(
-                        server_target_process.wait(), timeout=1800
+                    server_target_output = (server_target_process.stdout or b"").decode(
+                        "utf-8", errors="replace"
                     )
-                    await read_server_task
-
-                    server_target_output = "\n".join(server_target_output_lines)
+                    server_target_output_lines = [
+                        line.rstrip()
+                        for line in server_target_output.splitlines()
+                        if line.strip()
+                    ]
+                    server_target_returncode = server_target_process.returncode
 
                     if server_target_returncode == 0:
                         logger.info("Successfully built 'server' target")
@@ -1971,8 +1956,8 @@ async def read_server_target_output():
                         logger.error(
                             f"Server target build output:\n{server_target_output}"
                         )
-                        if websocket_manager and task_id:
-                            await websocket_manager.send_build_progress(
+                        if progress_manager and task_id:
+                            await progress_manager.send_build_progress(
                                 task_id=task_id,
                                 stage="build",
                                 progress=70,
@@ -1983,39 +1968,28 @@ async def read_server_target_output():
                             )
                         # Try building all targets as last resort
                         logger.info("Attempting to build all targets as fallback...")
-                        all_targets_process = await asyncio.create_subprocess_exec(
-                            "cmake",
+                        all_targets_process = await self._run_command(
+                            cmake_exe,
                             "--build",
                             ".",
-                            "--",
-                            "-j",
+                            "--parallel",
                             str(thread_count),
                             cwd=build_dir,
                             env=env,
-                            stdout=asyncio.subprocess.PIPE,
-                            stderr=asyncio.subprocess.STDOUT,
+                            timeout=1800,
+                            merge_stderr=True,
                         )
-                    all_targets_output_lines = []
-
-                    async def read_all_targets_output():
-                        while True:
-                            line = await all_targets_process.stdout.readline()
-                            if not line:
-                                break
-                            decoded_line = line.decode(
-                                "utf-8", errors="replace"
-                            ).rstrip()
-                            all_targets_output_lines.append(decoded_line)
-                            logger.debug(f"All targets build output: {decoded_line}")
-
-                    read_all_task = asyncio.create_task(read_all_targets_output())
-                    all_targets_returncode = await asyncio.wait_for(
-                        all_targets_process.wait(), timeout=1800
+                    all_targets_output = (all_targets_process.stdout or b"").decode(
+                        "utf-8", errors="replace"
                     )
-                    await read_all_task
+                    all_targets_output_lines = [
+                        line.rstrip()
+                        for line in all_targets_output.splitlines()
+                        if line.strip()
+                    ]
+                    all_targets_returncode = all_targets_process.returncode
 
                     if all_targets_returncode != 0:
-                        all_targets_output = "\n".join(all_targets_output_lines)
                         logger.error(
                             f"Building all targets failed with return code {all_targets_returncode}"
                         )
@@ -2036,8 +2010,8 @@ async def read_all_targets_output():
                         f"Build completed with return code 0 but contains errors"
                     )
                     logger.error(f"Build output:\n{build_output}")
-                    if websocket_manager and task_id:
-                        await websocket_manager.send_build_progress(
+                    if progress_manager and task_id:
+                        await progress_manager.send_build_progress(
                             task_id=task_id,
                             stage="build",
                             progress=70,
@@ -2085,8 +2059,8 @@ async def read_all_targets_output():
                     logger.warning(
                         "Binary not found in common locations immediately after build - will search more thoroughly"
                     )
-                    if websocket_manager and task_id:
-                        await websocket_manager.send_build_progress(
+                    if progress_manager and task_id:
+                        await progress_manager.send_build_progress(
                             task_id=task_id,
                             stage="build",
                             progress=75,
@@ -2100,8 +2074,8 @@ async def read_all_targets_output():
                 raise Exception("Build timed out")
 
             # Stage 6: Find executable
-            if websocket_manager and task_id:
-                await websocket_manager.send_build_progress(
+            if progress_manager and task_id:
+                await progress_manager.send_build_progress(
                     task_id=task_id,
                     stage="verify",
                     progress=90,
@@ -2231,9 +2205,9 @@ async def read_all_targets_output():
                         error_msg += f"3. CMake configuration succeeded\n"
                         error_msg += f"4. Build output for errors"
 
-                    # Send detailed error via websocket
-                    if websocket_manager and task_id:
-                        await websocket_manager.send_build_progress(
+                    # Send detailed error via SSE
+                    if progress_manager and task_id:
+                        await progress_manager.send_build_progress(
                             task_id=task_id,
                             stage="error",
                             progress=0,
@@ -2255,8 +2229,8 @@ async def read_all_targets_output():
             logger.info(f"Build completed, validating binary: {version_server_path}")
 
             # Validate the build
-            if websocket_manager and task_id:
-                await websocket_manager.send_build_progress(
+            if progress_manager and task_id:
+                await progress_manager.send_build_progress(
                     task_id=task_id,
                     stage="validate",
                     progress=95,
@@ -2265,13 +2239,13 @@ async def read_all_targets_output():
                 )
 
             is_valid = await self.validate_build(
-                version_server_path, websocket_manager, task_id
+                version_server_path, progress_manager, task_id
             )
 
             if not is_valid:
                 logger.warning("Build validation failed")
-                if websocket_manager and task_id:
-                    await websocket_manager.send_build_progress(
+                if progress_manager and task_id:
+                    await progress_manager.send_build_progress(
                         task_id=task_id,
                         stage="validate",
                         progress=95,
@@ -2281,8 +2255,8 @@ async def read_all_targets_output():
 
             logger.info(f"Build completed successfully: {version_server_path}")
 
-            if websocket_manager and task_id:
-                await websocket_manager.send_build_progress(
+            if progress_manager and task_id:
+                await progress_manager.send_build_progress(
                     task_id=task_id,
                     stage="complete",
                     progress=100,
@@ -2297,9 +2271,9 @@ async def read_all_targets_output():
 
         except Exception as e:
             logger.error(f"Build failed: {e}")
-            if websocket_manager and task_id:
+            if progress_manager and task_id:
                 try:
-                    await websocket_manager.send_build_progress(
+                    await progress_manager.send_build_progress(
                         task_id=task_id,
                         stage="error",
                         progress=0,
@@ -2307,7 +2281,7 @@ async def read_all_targets_output():
                         log_lines=[f"Error: {str(e)}"],
                     )
                 except Exception as ws_error:
-                    logger.error(f"Failed to send error to WebSocket: {ws_error}")
+                    logger.error(f"Failed to send error via SSE: {ws_error}")
             raise Exception(f"Failed to build from source {commit_sha}: {e}")
 
     async def _apply_patch(self, repo_dir: str, patch_url: str):
@@ -2332,17 +2306,16 @@ async def _apply_patch(self, repo_dir: str, patch_url: str):
             with open(patch_file, "w") as f:
                 f.write(patch_content)
 
-            apply_process = await asyncio.create_subprocess_exec(
+            apply_process = await self._run_command(
                 "git",
                 "apply",
                 patch_file,
                 cwd=repo_dir,
-                stdout=asyncio.subprocess.PIPE,
-                stderr=asyncio.subprocess.PIPE,
+                timeout=60,
             )
-            apply_stdout, apply_stderr = await apply_process.communicate()
 
             if apply_process.returncode != 0:
+                apply_stderr = apply_process.stderr or b""
                 raise Exception(f"Failed to apply patch: {apply_stderr.decode()}")
 
             os.remove(patch_file)
diff --git a/backend/llama_swap_config.py b/backend/llama_swap_config.py
index cf717bc..3e071e7 100644
--- a/backend/llama_swap_config.py
+++ b/backend/llama_swap_config.py
@@ -265,26 +265,20 @@ def is_ik_llama_cpp(llama_server_path: Optional[str]) -> bool:
     except Exception as e:
         logger.debug(f"Error detecting ik_llama.cpp via flags: {e}")
 
-    # Fallback: Check database for repository_source
+    # Fallback: Check store for repository_source
     try:
-        from backend.database import SessionLocal, LlamaVersion
-
-        db = SessionLocal()
-        try:
-            active_version = (
-                db.query(LlamaVersion).filter(LlamaVersion.is_active == True).first()
-            )
-            if active_version and active_version.repository_source:
-                is_ik = active_version.repository_source == "ik_llama.cpp"
-                if is_ik:
-                    logger.debug(
-                        f"Detected ik_llama.cpp via database repository_source: {active_version.repository_source}"
-                    )
-                return is_ik
-        finally:
-            db.close()
+        from backend.data_store import get_store
+        store = get_store()
+        active_version = store.get_active_engine_version("ik_llama") or store.get_active_engine_version("llama_cpp")
+        if active_version and active_version.get("repository_source"):
+            is_ik = active_version.get("repository_source") == "ik_llama.cpp"
+            if is_ik:
+                logger.debug(
+                    f"Detected ik_llama.cpp via store repository_source: {active_version.get('repository_source')}"
+                )
+            return is_ik
     except Exception as e:
-        logger.debug(f"Error checking database for ik_llama.cpp: {e}")
+        logger.debug(f"Error checking store for ik_llama.cpp: {e}")
 
     return False
 
@@ -395,43 +389,67 @@ def get_param_mapping(is_ik: bool) -> Dict[str, list]:
 
 def get_active_binary_path_from_db() -> Optional[str]:
     """
-    Gets the active llama-server binary path from the database.
+    Gets the active llama-server binary path from the data store.
 
     Returns:
         Absolute path to the llama-server binary, or None if not found.
     """
     try:
-        from backend.database import SessionLocal, LlamaVersion
-
-        db = SessionLocal()
-        try:
-            active_version = (
-                db.query(LlamaVersion).filter(LlamaVersion.is_active == True).first()
-            )
-            if not active_version or not active_version.binary_path:
-                logger.warning("No active llama-cpp version found in database")
-                return None
+        from backend.data_store import get_store
 
-            # Convert to absolute path
-            binary_path = active_version.binary_path
+        store = get_store()
+        for engine in ("llama_cpp", "ik_llama"):
+            active_version = store.get_active_engine_version(engine)
+            if not active_version or not active_version.get("binary_path"):
+                continue
+            binary_path = active_version["binary_path"]
             if not os.path.isabs(binary_path):
                 binary_path = os.path.join("/app", binary_path)
-
-            # Verify the path exists
             if os.path.exists(binary_path):
                 return binary_path
-            else:
-                logger.warning(
-                    f"Binary path from database does not exist: {binary_path}"
-                )
-                return None
-        finally:
-            db.close()
+            abs_path = os.path.abspath(binary_path)
+            if os.path.exists(abs_path):
+                return abs_path
+        logger.warning("No active llama-cpp version found in data store")
+        return None
     except Exception as e:
-        logger.error(f"Error getting binary path from database: {e}")
+        logger.error(f"Error getting binary path from data store: {e}")
         return None
 
 
+def _build_lmdeploy_cmd(
+    model: Any,
+    config: Dict[str, Any],
+    lmdeploy_bin: str,
+    _model_attr: Any,
+) -> str:
+    """Build lmdeploy serve api_server command for llama-swap config."""
+    hf_id = _model_attr(model, "huggingface_id")
+    if not hf_id:
+        raise ValueError("LMDeploy model must have huggingface_id")
+    cmd_parts = [lmdeploy_bin, "serve", "api_server", hf_id]
+    cmd_parts.extend(["--server-port", "${PORT}"])
+    cmd_parts.extend(["--backend", "turbomind"])
+    if config.get("session_len") is not None:
+        cmd_parts.extend(["--session-len", str(config["session_len"])])
+    if config.get("max_batch_size") is not None:
+        cmd_parts.extend(["--max-batch-size", str(config["max_batch_size"])])
+    if config.get("tensor_parallel") is not None:
+        cmd_parts.extend(["--tp", str(config["tensor_parallel"])])
+    if config.get("dtype"):
+        cmd_parts.extend(["--dtype", str(config["dtype"])])
+    if config.get("quant_policy") is not None:
+        cmd_parts.extend(["--quant-policy", str(config["quant_policy"])])
+    if config.get("enable_prefix_caching"):
+        cmd_parts.append("--enable-prefix-caching")
+    if config.get("chat_template"):
+        cmd_parts.extend(["--chat-template", str(config["chat_template"])])
+    # Escape single quotes in the command for bash -c '...'
+    inner_cmd = " ".join(cmd_parts)
+    inner_cmd = inner_cmd.replace("'", "'\\''")
+    return f"bash -c '{inner_cmd}'"
+
+
 def generate_llama_swap_config(
     models: Dict[str, Dict[str, Any]],
     llama_server_path: Optional[str] = None,
@@ -486,20 +504,83 @@ def generate_llama_swap_config(
         "models": {},
     }
 
-    # First, add all models from the database (if provided)
+    def _model_attr(m: Any, key: str, default: Any = None) -> Any:
+        """Get attribute from model (dict or object)."""
+        if isinstance(m, dict):
+            return m.get(key, default)
+        return getattr(m, key, default)
+
+    # Resolve LMDeploy binary and build proxy->model map for overlay (used for both all_models and running overlay)
+    lmdeploy_bin = None
+    all_models_by_proxy: Dict[str, Any] = {}
+    try:
+        from backend.data_store import get_store as _get_store
+        store = _get_store()
+        lmdeploy_status = store.get_lmdeploy_status()
+        if lmdeploy_status.get("installed") and lmdeploy_status.get("venv_path"):
+            venv = lmdeploy_status["venv_path"]
+            lmdeploy_bin = os.path.join(venv, "bin", "lmdeploy")
+            if not os.path.isabs(lmdeploy_bin):
+                lmdeploy_bin = os.path.join("/app", lmdeploy_bin)
+            if not os.path.exists(lmdeploy_bin):
+                lmdeploy_bin = None
+    except Exception as e:
+        logger.debug(f"Could not resolve LMDeploy binary: {e}")
+
+    # First, add all models from the data store (if provided)
     if all_models:
+        from backend.data_store import generate_proxy_name as _gen_proxy_name
+
         for model in all_models:
-            # Use the centralized proxy name from the database
-            if not model.proxy_name:
+            proxy_model_name = _model_attr(model, "proxy_name")
+            if not proxy_model_name:
+                proxy_model_name = _gen_proxy_name(
+                    _model_attr(model, "huggingface_id", ""),
+                    _model_attr(model, "quantization"),
+                )
+            if not proxy_model_name:
                 logger.warning(
-                    f"Model '{model.name}' does not have a proxy_name set, skipping"
+                    f"Model '{_model_attr(model, 'display_name') or _model_attr(model, 'name')}' does not have a proxy_name set, skipping"
                 )
                 continue
+            all_models_by_proxy[proxy_model_name] = model
+
+            engine = _model_attr(model, "engine")
+            model_format = _model_attr(model, "format") or _model_attr(model, "model_format") or "gguf"
+            is_lmdeploy = engine == "lmdeploy" or model_format == "safetensors"
+            if is_lmdeploy and lmdeploy_bin:
+                config = _coerce_model_config(_model_attr(model, "config"))
+                try:
+                    cmd_with_env = _build_lmdeploy_cmd(model, config, lmdeploy_bin, _model_attr)
+                    config_data["models"][proxy_model_name] = {"cmd": cmd_with_env}
+                except Exception as e:
+                    logger.warning(f"Failed to build LMDeploy cmd for {proxy_model_name}: {e}")
+                continue
+
+            hf_id = _model_attr(model, "huggingface_id")
+            filename = _model_attr(model, "filename") or (
+                os.path.basename(_model_attr(model, "file_path") or "") or None
+            )
+
+            # Resolve model path: HF cache first, then legacy file_path
+            model_path = None
+            if hf_id and filename:
+                from backend.huggingface import resolve_cached_model_path
+                model_path = resolve_cached_model_path(hf_id, filename)
 
-            proxy_model_name = model.proxy_name
-            model_path = model.file_path
+            if not model_path:
+                # Legacy fallback: stored file_path (old-style records)
+                legacy = _model_attr(model, "file_path")
+                if legacy:
+                    model_path = legacy if os.path.isabs(legacy) else f"/app/{legacy}"
 
-            # Convert model path to absolute path
+            if not model_path:
+                logger.warning(
+                    f"Model '{proxy_model_name}' path could not be resolved (hf_id={hf_id}, filename={filename}), skipping"
+                )
+                continue
+
+            # Ensure absolute path (HF cache returns absolute; legacy may not)
             if not os.path.isabs(model_path):
                 model_path = f"/app/{model_path}"
 
@@ -523,7 +604,7 @@ def generate_llama_swap_config(
                 working_dir = working_dir.replace("/bin/", "/build/bin/")
 
             # Parse existing config if available
-            config = _coerce_model_config(model.config)
+            config = _coerce_model_config(_model_attr(model, "config"))
             if proxy_model_name and config.get("jinja") is not None:
                 logger.debug(
                     f"Model {proxy_model_name}: jinja={config.get('jinja')} (type: {type(config.get('jinja'))})"
@@ -539,6 +620,16 @@ def generate_llama_swap_config(
                 "--port",
                 "${PORT}",
             ]
+            # Vision: if model has mmproj (multimodal projector), add --mmproj so vision is available
+            mmproj_filename = _model_attr(model, "mmproj_filename")
+            if mmproj_filename and hf_id:
+                from backend.huggingface import resolve_cached_model_path
+                mmproj_path = resolve_cached_model_path(hf_id, mmproj_filename)
+                if mmproj_path and os.path.exists(mmproj_path):
+                    if not os.path.isabs(mmproj_path):
+                        mmproj_path = f"/app/{mmproj_path}"
+                    quoted_mmproj = _quote_arg_if_needed(mmproj_path)
+                    cmd_args.extend(["--mmproj", quoted_mmproj])
 
             # Default values to skip (these cause errors if flag isn't supported)
             default_values = {
@@ -740,6 +831,19 @@ def generate_llama_swap_config(
 
     # Then, add/update with running models (these take precedence for active models)
     for proxy_model_name, model_data in models.items():
+        overlay_model = all_models_by_proxy.get(proxy_model_name)
+        engine = _model_attr(overlay_model, "engine") if overlay_model else None
+        model_format = _model_attr(overlay_model, "format") or _model_attr(overlay_model, "model_format") if overlay_model else None
+        is_lmdeploy_overlay = (engine == "lmdeploy" or model_format == "safetensors") and lmdeploy_bin and overlay_model
+        if is_lmdeploy_overlay:
+            config = _coerce_model_config(model_data.get("config"))
+            try:
+                cmd_with_env = _build_lmdeploy_cmd(overlay_model, config, lmdeploy_bin, _model_attr)
+                config_data["models"][proxy_model_name] = {"cmd": cmd_with_env}
+            except Exception as e:
+                logger.warning(f"Failed to build LMDeploy overlay cmd for {proxy_model_name}: {e}")
+            continue
+
         model_path = model_data["model_path"]
         llama_cpp_config = model_data["config"]
 
@@ -753,6 +857,17 @@ def generate_llama_swap_config(
             "--port",
             "${PORT}",
         ]
+        # Vision: add --mmproj if model has mmproj_filename
+        if overlay_model:
+            mmproj_fn = _model_attr(overlay_model, "mmproj_filename")
+            hf_id_overlay = _model_attr(overlay_model, "huggingface_id")
+            if mmproj_fn and hf_id_overlay:
+                from backend.huggingface import resolve_cached_model_path
+                mmproj_path = resolve_cached_model_path(hf_id_overlay, mmproj_fn)
+                if mmproj_path and os.path.exists(mmproj_path):
+                    if not os.path.isabs(mmproj_path):
+                        mmproj_path = f"/app/{mmproj_path}"
+                    cmd_args.extend(["--mmproj", _quote_arg_if_needed(mmproj_path)])
 
         # Default values to skip (these cause errors if flag isn't supported)
         default_values = {
diff --git a/backend/llama_swap_manager.py b/backend/llama_swap_manager.py
index 59f6544..d08a0c1 100644
--- a/backend/llama_swap_manager.py
+++ b/backend/llama_swap_manager.py
@@ -3,9 +3,9 @@
 import os
 import yaml
 import httpx
-from typing import Dict, Any, Optional
+from typing import Any, Dict, Optional
 from backend.llama_swap_config import generate_llama_swap_config
-from backend.database import Model
+from backend.data_store import get_store
 from backend.logging_config import get_logger
 
 logger = get_logger(__name__)
@@ -57,17 +57,12 @@ async def _write_config(self, llama_server_path: str = None):
                     "No llama-server binary path provided and none found in database"
                 )
 
-        # Load all models from database to include them in config
-        from backend.database import get_db, Model
-
-        db = next(get_db())
-        try:
-            all_models = db.query(Model).all()
-            config_content = generate_llama_swap_config(
-                self.running_models, llama_server_path, all_models
-            )
-        finally:
-            db.close()
+        # Load all models from data store to include them in config
+        store = get_store()
+        all_models = store.list_models()
+        config_content = generate_llama_swap_config(
+            self.running_models, llama_server_path, all_models
+        )
 
         # Ensure directory exists
         config_dir = os.path.dirname(self.config_path)
@@ -346,32 +341,34 @@ async def restart_proxy(self):
         await self.start_proxy()
         logger.info("llama-swap proxy restarted successfully")
 
-    async def register_model(self, model: Model, config: Dict[str, Any]) -> str:
+    async def register_model(self, model: Any, config: Dict[str, Any]) -> str:
         """
         Registers a model with llama-swap by storing its configuration.
         Returns the proxy_model_name used by llama-swap.
         Note: This only stores the model info, config is written separately.
+        model can be a dict or an object with proxy_name, file_path, display_name/name.
         """
-        # Use the centralized proxy name from the database
-        if not model.proxy_name:
-            raise ValueError(f"Model '{model.name}' does not have a proxy_name set")
+        proxy_name = model.get("proxy_name") if isinstance(model, dict) else getattr(model, "proxy_name", None)
+        file_path = model.get("file_path") if isinstance(model, dict) else getattr(model, "file_path", None)
+        name = model.get("display_name") or model.get("name") if isinstance(model, dict) else (getattr(model, "display_name", None) or getattr(model, "name", None))
 
-        proxy_model_name = model.proxy_name
+        if not proxy_name:
+            raise ValueError(f"Model '{name}' does not have a proxy_name set")
 
-        if proxy_model_name in self.running_models:
+        if proxy_name in self.running_models:
             raise ValueError(
-                f"Model '{proxy_model_name}' is already registered with llama-swap."
+                f"Model '{proxy_name}' is already registered with llama-swap."
             )
 
-        self.running_models[proxy_model_name] = {
-            "model_path": model.file_path,
+        self.running_models[proxy_name] = {
+            "model_path": file_path,
             "config": config,
         }
 
         logger.info(
-            f"Model '{model.name}' registered as '{proxy_model_name}' with llama-swap"
+            f"Model '{name}' registered as '{proxy_name}' with llama-swap"
         )
-        return proxy_model_name
+        return proxy_name
 
     def _detect_correct_binary_path(self, version_dir: str) -> str:
         """
@@ -406,46 +403,35 @@ async def _ensure_correct_binary_path(self):
         Ensures the active llama-cpp version has the correct binary path.
         Automatically detects and updates if needed.
         """
-        from backend.database import SessionLocal, LlamaVersion
-
-        db = SessionLocal()
-        try:
-            active_version = (
-                db.query(LlamaVersion).filter(LlamaVersion.is_active == True).first()
-            )
+        store = get_store()
+        for engine in ("llama_cpp", "ik_llama"):
+            active_version = store.get_active_engine_version(engine)
             if not active_version:
-                logger.warning("No active llama-cpp version found")
-                return
-
-            # Convert relative path to absolute
-            version_dir = active_version.binary_path
+                continue
+            version_dir = active_version.get("binary_path")
+            if not version_dir:
+                continue
             if not os.path.isabs(version_dir):
                 version_dir = os.path.join("/app", version_dir)
-
-            # Get the directory containing the binary
             binary_dir = os.path.dirname(version_dir)
-
-            # Detect the correct binary path
             correct_binary_path = self._detect_correct_binary_path(binary_dir)
-
-            # Convert back to relative path for database storage
             relative_path = os.path.relpath(correct_binary_path, "/app")
-
-            # Update database if path has changed
-            if active_version.binary_path != relative_path:
+            if active_version.get("binary_path") != relative_path:
                 logger.info(
-                    f"Updating binary path from '{active_version.binary_path}' to '{relative_path}'"
+                    f"Updating binary path from '{active_version.get('binary_path')}' to '{relative_path}'"
                 )
-                active_version.binary_path = relative_path
-                db.commit()
+                data = store._read_yaml("engines.yaml")
+                engine_data = data.get(engine, {})
+                for i, v in enumerate(engine_data.get("versions", [])):
+                    if v.get("version") == active_version.get("version"):
+                        engine_data["versions"][i] = {**v, "binary_path": relative_path}
+                        break
+                store._save_yaml("engines.yaml", data)
                 logger.info("Binary path updated successfully")
             else:
                 logger.debug(f"Binary path is already correct: {relative_path}")
-
-        except Exception as e:
-            logger.error(f"Error ensuring correct binary path: {e}")
-        finally:
-            db.close()
+            return
+        logger.warning("No active llama-cpp version found")
 
     async def regenerate_config_with_active_version(self):
         """
@@ -454,52 +440,39 @@ async def regenerate_config_with_active_version(self):
         Automatically detects and fixes binary path if needed.
         Ensures llama-swap is running if an active version exists.
         """
-        from backend.database import SessionLocal, LlamaVersion
-
-        # First, ensure the binary path is correct
         await self._ensure_correct_binary_path()
 
-        db = SessionLocal()
-        try:
-            # Get the active version
-            active_version = (
-                db.query(LlamaVersion).filter(LlamaVersion.is_active == True).first()
-            )
-            if not active_version:
-                logger.warning(
-                    "No active llama-cpp version found, skipping config regeneration"
-                )
-                return
-
-            # Convert to absolute path for existence check
-            binary_path = active_version.binary_path
-            if not os.path.isabs(binary_path):
-                binary_path = os.path.join("/app", binary_path)
-
-            if not os.path.exists(binary_path):
-                logger.warning(f"Active version binary not found: {binary_path}")
-                return
-
-            # Sync running_models with actual llama-swap state
-            await self.sync_running_models()
-
-            # Regenerate config with active version and synced running_models
-            await self._write_config(active_version.binary_path)
-            logger.info(
-                f"Regenerated llama-swap config with active version: {active_version.version} and {len(self.running_models)} running models"
+        store = get_store()
+        active_version = None
+        for engine in ("llama_cpp", "ik_llama"):
+            active_version = store.get_active_engine_version(engine)
+            if active_version:
+                break
+        if not active_version:
+            logger.warning(
+                "No active llama-cpp version found, skipping config regeneration"
             )
+            return
 
-            # Ensure llama-swap is running when we have an active version
-            try:
-                await self.start_proxy()
-                logger.info("Ensured llama-swap is running after config regeneration")
-            except Exception as e:
-                logger.warning(f"Failed to start llama-swap after config regeneration: {e}")
+        binary_path = active_version.get("binary_path")
+        if not binary_path:
+            return
+        if not os.path.isabs(binary_path):
+            binary_path = os.path.join("/app", binary_path)
+        if not os.path.exists(binary_path):
+            logger.warning(f"Active version binary not found: {binary_path}")
+            return
 
+        await self.sync_running_models()
+        await self._write_config(active_version.get("binary_path"))
+        logger.info(
+            f"Regenerated llama-swap config with active version: {active_version.get('version')} and {len(self.running_models)} running models"
+        )
+        try:
+            await self.start_proxy()
+            logger.info("Ensured llama-swap is running after config regeneration")
         except Exception as e:
-            logger.error(f"Failed to regenerate config with active version: {e}")
-        finally:
-            db.close()
+            logger.warning(f"Failed to start llama-swap after config regeneration: {e}")
 
     async def unregister_model(self, proxy_model_name: str):
         """
diff --git a/backend/lmdeploy_installer.py b/backend/lmdeploy_installer.py
index 73e7610..875b2f7 100644
--- a/backend/lmdeploy_installer.py
+++ b/backend/lmdeploy_installer.py
@@ -1,362 +1,416 @@
-import asyncio
-import json
-import os
-import shutil
-import subprocess
-import sys
-from asyncio.subprocess import PIPE, STDOUT
-from datetime import datetime, timezone
-from typing import Any, Awaitable, Dict, Optional
-
-from backend.logging_config import get_logger
-from backend.websocket_manager import websocket_manager
-
-
-def _utcnow() -> str:
-    return datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
-
-
-logger = get_logger(__name__)
-
-_installer_instance: Optional["LMDeployInstaller"] = None
-
-
-def get_lmdeploy_installer() -> "LMDeployInstaller":
-    global _installer_instance
-    if _installer_instance is None:
-        _installer_instance = LMDeployInstaller()
-    return _installer_instance
-
-
-class LMDeployInstaller:
-    """Install or remove LMDeploy inside the runtime environment on demand."""
-
-    def __init__(
-        self,
-        *,
-        log_path: Optional[str] = None,
-        state_path: Optional[str] = None,
-        base_dir: Optional[str] = None,
-    ) -> None:
-        self._lock = asyncio.Lock()
-        self._operation: Optional[str] = None
-        self._operation_started_at: Optional[str] = None
-        self._current_task: Optional[asyncio.Task] = None
-        self._last_error: Optional[str] = None
-        data_root = os.path.abspath("data")
-        base_path = base_dir or os.path.join(data_root, "lmdeploy")
-        self._base_dir = os.path.abspath(base_path)
-        self._venv_path = os.path.join(self._base_dir, "venv")
-        log_path = log_path or os.path.join(data_root, "logs", "lmdeploy_install.log")
-        state_path = state_path or os.path.join(
-            data_root, "configs", "lmdeploy_installer.json"
-        )
-        self._log_path = os.path.abspath(log_path)
-        self._state_path = os.path.abspath(state_path)
-        self._ensure_directories()
-
-    def _ensure_directories(self) -> None:
-        os.makedirs(self._base_dir, exist_ok=True)
-        os.makedirs(os.path.dirname(self._log_path), exist_ok=True)
-        os.makedirs(os.path.dirname(self._state_path), exist_ok=True)
-
-    def _venv_bin(self, executable: str) -> str:
-        if os.name == "nt":
-            exe = (
-                executable
-                if executable.lower().endswith(".exe")
-                else f"{executable}.exe"
-            )
-            return os.path.join(self._venv_path, "Scripts", exe)
-        return os.path.join(self._venv_path, "bin", executable)
-
-    def _venv_python(self) -> str:
-        return self._venv_bin("python")
-
-    def _ensure_venv(self) -> None:
-        python_path = self._venv_python()
-        if os.path.exists(python_path):
-            return
-        os.makedirs(self._base_dir, exist_ok=True)
-        try:
-            subprocess.run([sys.executable, "-m", "venv", self._venv_path], check=True)
-        except subprocess.CalledProcessError as exc:
-            raise RuntimeError(
-                f"Failed to create LMDeploy virtual environment: {exc}"
-            ) from exc
-
-    def _load_state(self) -> Dict[str, Any]:
-        if not os.path.exists(self._state_path):
-            return {}
-        try:
-            with open(self._state_path, "r", encoding="utf-8") as handle:
-                data = json.load(handle)
-                return data if isinstance(data, dict) else {}
-        except Exception as exc:
-            logger.warning(f"Failed to load LMDeploy installer state: {exc}")
-            return {}
-
-    def _save_state(self, state: Dict[str, Any]) -> None:
-        tmp_path = f"{self._state_path}.tmp"
-        with open(tmp_path, "w", encoding="utf-8") as handle:
-            json.dump(state, handle, indent=2)
-        os.replace(tmp_path, self._state_path)
-
-    def _detect_installed_version(self) -> Optional[str]:
-        python_exe = self._venv_python()
-        if not os.path.exists(python_exe):
-            return None
-        script = (
-            "import importlib, sys\n"
-            "try:\n"
-            "    from importlib import metadata\n"
-            "except ImportError:\n"
-            "    import importlib_metadata as metadata\n"
-            "try:\n"
-            "    print(metadata.version('lmdeploy'))\n"
-            "except metadata.PackageNotFoundError:\n"
-            "    sys.exit(1)\n"
-        )
-        try:
-            output = subprocess.check_output(
-                [python_exe, "-c", script], text=True
-            ).strip()
-            return output or None
-        except subprocess.CalledProcessError:
-            return None
-        except Exception as exc:  # pragma: no cover
-            logger.debug(f"Unable to determine LMDeploy version: {exc}")
-            return None
-
-    def _resolve_binary_path(self) -> Optional[str]:
-        override = os.getenv("LMDEPLOY_BIN")
-        if override:
-            override_path = os.path.abspath(os.path.expanduser(override))
-            if os.path.exists(override_path):
-                return override_path
-            resolved_override = shutil.which(override)
-            if resolved_override:
-                return resolved_override
-
-        candidate = self._venv_bin("lmdeploy")
-        if os.path.exists(candidate) and os.access(candidate, os.X_OK):
-            return os.path.abspath(candidate)
-
-        resolved = shutil.which("lmdeploy")
-        return resolved
-
-    def _update_installed_state(
-        self, installed: bool, version: Optional[str] = None
-    ) -> None:
-        state = self._load_state()
-        if installed:
-            state["installed_at"] = _utcnow()
-            if version:
-                state["installed_version"] = version
-            state["venv_path"] = self._venv_path
-        else:
-            state["installed_version"] = None
-            state["installed_at"] = None
-            state["removed_at"] = _utcnow()
-            state["venv_path"] = self._venv_path
-        self._save_state(state)
-
-    def _refresh_state_from_environment(self) -> None:
-        state = self._load_state()
-        version = self._detect_installed_version()
-        state["installed_version"] = version
-        if version is None:
-            state["removed_at"] = _utcnow()
-        state["venv_path"] = self._venv_path
-        self._save_state(state)
-
-    async def _run_pip(
-        self, args: list[str], operation: str, ensure_venv: bool = True
-    ) -> int:
-        if ensure_venv:
-            self._ensure_venv()
-        python_exe = self._venv_python()
-        if not os.path.exists(python_exe):
-            raise RuntimeError(
-                "LMDeploy virtual environment is missing; cannot run pip."
-            )
-        header = (
-            f"[{_utcnow()}] Starting LMDeploy {operation} via pip {' '.join(args)}\n"
-        )
-        with open(self._log_path, "w", encoding="utf-8") as log_file:
-            log_file.write(header)
-        process = await asyncio.create_subprocess_exec(
-            python_exe,
-            "-m",
-            "pip",
-            *args,
-            stdout=PIPE,
-            stderr=STDOUT,
-        )
-
-        async def _stream_output() -> None:
-            if process.stdout is None:
-                return
-            with open(self._log_path, "a", encoding="utf-8", buffering=1) as log_file:
-                while True:
-                    chunk = await process.stdout.readline()
-                    if not chunk:
-                        break
-                    text = chunk.decode("utf-8", errors="replace")
-                    log_file.write(text)
-                    await self._broadcast_log_line(text.rstrip("\n"))
-
-        await asyncio.gather(process.wait(), _stream_output())
-        return process.returncode or 0
-
-    async def _broadcast_log_line(self, line: str) -> None:
-        try:
-            await websocket_manager.broadcast(
-                {
-                    "type": "lmdeploy_install_log",
-                    "line": line,
-                    "timestamp": _utcnow(),
-                }
-            )
-        except Exception as exc:  # pragma: no cover
-            logger.debug(f"Failed to broadcast LMDeploy log line: {exc}")
-
-    async def _set_operation(self, operation: str) -> None:
-        self._operation = operation
-        self._operation_started_at = _utcnow()
-        self._last_error = None
-        await websocket_manager.broadcast(
-            {
-                "type": "lmdeploy_install_status",
-                "status": operation,
-                "started_at": self._operation_started_at,
-            }
-        )
-
-    async def _finish_operation(self, success: bool, message: str = "") -> None:
-        payload = {
-            "type": "lmdeploy_install_status",
-            "status": "completed" if success else "failed",
-            "operation": self._operation,
-            "message": message,
-            "ended_at": _utcnow(),
-        }
-        await websocket_manager.broadcast(payload)
-        self._operation = None
-        self._operation_started_at = None
-
-    def _create_task(self, coro: Awaitable[Any]) -> None:
-        loop = asyncio.get_running_loop()
-        task = loop.create_task(coro)
-        self._current_task = task
-
-        def _cleanup(fut: asyncio.Future) -> None:
-            try:
-                fut.result()
-            except Exception as exc:  # pragma: no cover - surfaced via status
-                logger.error(f"LMDeploy installer task error: {exc}")
-            finally:
-                self._current_task = None
-
-        task.add_done_callback(_cleanup)
-
-    async def install(
-        self, version: Optional[str] = None, force_reinstall: bool = False
-    ) -> Dict[str, Any]:
-        async with self._lock:
-            if self._operation:
-                raise RuntimeError(
-                    "Another LMDeploy installer operation is already running"
-                )
-            await self._set_operation("install")
-            args = ["install", "--upgrade"]
-            if force_reinstall:
-                args.append("--force-reinstall")
-            package = "lmdeploy"
-            if version:
-                package = f"lmdeploy=={version}"
-            args.append(package)
-
-            async def _runner():
-                try:
-                    code = await self._run_pip(args, "install")
-                    if code != 0:
-                        raise RuntimeError(f"pip exited with status {code}")
-                    detected_version = self._detect_installed_version()
-                    self._update_installed_state(True, detected_version)
-                    await self._finish_operation(True, "LMDeploy installed")
-                except Exception as exc:
-                    self._last_error = str(exc)
-                    self._refresh_state_from_environment()
-                    await self._finish_operation(False, str(exc))
-
-            self._create_task(_runner())
-            return {"message": "LMDeploy installation started"}
-
-    async def remove(self) -> Dict[str, Any]:
-        async with self._lock:
-            if self._operation:
-                raise RuntimeError(
-                    "Another LMDeploy installer operation is already running"
-                )
-            await self._set_operation("remove")
-            args = ["uninstall", "-y", "lmdeploy"]
-
-            async def _runner():
-                try:
-                    python_exists = os.path.exists(self._venv_python())
-                    if python_exists:
-                        code = await self._run_pip(args, "remove", ensure_venv=False)
-                        if code != 0:
-                            raise RuntimeError(f"pip exited with status {code}")
-                    shutil.rmtree(self._venv_path, ignore_errors=True)
-                    self._update_installed_state(False)
-                    await self._finish_operation(True, "LMDeploy removed")
-                except Exception as exc:
-                    self._last_error = str(exc)
-                    self._refresh_state_from_environment()
-                    await self._finish_operation(False, str(exc))
-
-            self._create_task(_runner())
-            return {"message": "LMDeploy removal started"}
-
-    def status(self) -> Dict[str, Any]:
-        version = self._detect_installed_version()
-        binary_path = self._resolve_binary_path()
-        installed = version is not None and binary_path is not None
-        state = self._load_state()
-        return {
-            "installed": installed,
-            "version": version,
-            "binary_path": binary_path,
-            "venv_path": state.get("venv_path") or self._venv_path,
-            "installed_at": state.get("installed_at"),
-            "removed_at": state.get("removed_at"),
-            "operation": self._operation,
-            "operation_started_at": self._operation_started_at,
-            "last_error": self._last_error,
-            "log_path": self._log_path,
-        }
-
-    async def _broadcast_status(self) -> None:
-        """Broadcast current status via WebSocket."""
-        try:
-            status_data = self.status()
-            await websocket_manager.send_lmdeploy_status(status_data)
-        except Exception as exc:
-            logger.debug(f"Failed to broadcast LMDeploy status: {exc}")
-
-    def is_operation_running(self) -> bool:
-        return self._operation is not None
-
-    def read_log_tail(self, max_bytes: int = 8192) -> str:
-        if not os.path.exists(self._log_path):
-            return ""
-        with open(self._log_path, "rb") as log_file:
-            log_file.seek(0, os.SEEK_END)
-            size = log_file.tell()
-            log_file.seek(max(0, size - max_bytes))
-            data = log_file.read().decode("utf-8", errors="replace")
-            if size > max_bytes:
-                data = data.split("\n", 1)[-1]
-            return data.strip()
+import asyncio
+import json
+import os
+import shutil
+import subprocess
+import sys
+from asyncio.subprocess import PIPE, STDOUT
+from datetime import datetime, timezone
+from typing import Any, Awaitable, Dict, Optional
+
+from backend.logging_config import get_logger
+from backend.progress_manager import get_progress_manager
+
+
+def _utcnow() -> str:
+    return datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
+
+
+logger = get_logger(__name__)
+
+_installer_instance: Optional["LMDeployInstaller"] = None
+
+
+def get_lmdeploy_installer() -> "LMDeployInstaller":
+    global _installer_instance
+    if _installer_instance is None:
+        _installer_instance = LMDeployInstaller()
+    return _installer_instance
+
+
+class LMDeployInstaller:
+    """Install or remove LMDeploy inside the runtime environment on demand."""
+
+    def __init__(
+        self,
+        *,
+        log_path: Optional[str] = None,
+        state_path: Optional[str] = None,
+        base_dir: Optional[str] = None,
+    ) -> None:
+        self._lock = asyncio.Lock()
+        self._operation: Optional[str] = None
+        self._operation_started_at: Optional[str] = None
+        self._current_task: Optional[asyncio.Task] = None
+        self._last_error: Optional[str] = None
+        data_root = os.path.abspath("data")
+        base_path = base_dir or os.path.join(data_root, "lmdeploy")
+        self._base_dir = os.path.abspath(base_path)
+        self._venv_path = os.path.join(self._base_dir, "venv")
+        log_path = log_path or os.path.join(data_root, "logs", "lmdeploy_install.log")
+        state_path = state_path or os.path.join(
+            data_root, "configs", "lmdeploy_installer.json"
+        )
+        self._log_path = os.path.abspath(log_path)
+        self._state_path = os.path.abspath(state_path)
+        self._ensure_directories()
+
+    def _ensure_directories(self) -> None:
+        os.makedirs(self._base_dir, exist_ok=True)
+        os.makedirs(os.path.dirname(self._log_path), exist_ok=True)
+        os.makedirs(os.path.dirname(self._state_path), exist_ok=True)
+
+    def _venv_bin(self, executable: str) -> str:
+        if os.name == "nt":
+            exe = (
+                executable
+                if executable.lower().endswith(".exe")
+                else f"{executable}.exe"
+            )
+            return os.path.join(self._venv_path, "Scripts", exe)
+        return os.path.join(self._venv_path, "bin", executable)
+
+    def _venv_python(self) -> str:
+        return self._venv_bin("python")
+
+    def _ensure_venv(self) -> None:
+        python_path = self._venv_python()
+        if os.path.exists(python_path):
+            return
+        os.makedirs(self._base_dir, exist_ok=True)
+        try:
+            subprocess.run([sys.executable, "-m", "venv", self._venv_path], check=True)
+        except subprocess.CalledProcessError as exc:
+            raise RuntimeError(
+                f"Failed to create LMDeploy virtual environment: {exc}"
+            ) from exc
+
+    def _load_state(self) -> Dict[str, Any]:
+        if not os.path.exists(self._state_path):
+            return {}
+        try:
+            with open(self._state_path, "r", encoding="utf-8") as handle:
+                data = json.load(handle)
+                return data if isinstance(data, dict) else {}
+        except Exception as exc:
+            logger.warning(f"Failed to load LMDeploy installer state: {exc}")
+            return {}
+
+    def _save_state(self, state: Dict[str, Any]) -> None:
+        tmp_path = f"{self._state_path}.tmp"
+        with open(tmp_path, "w", encoding="utf-8") as handle:
+            json.dump(state, handle, indent=2)
+        os.replace(tmp_path, self._state_path)
+
+    def _detect_installed_version(self) -> Optional[str]:
+        python_exe = self._venv_python()
+        if not os.path.exists(python_exe):
+            return None
+        script = (
+            "import importlib, sys\n"
+            "try:\n"
+            "    from importlib import metadata\n"
+            "except ImportError:\n"
+            "    import importlib_metadata as metadata\n"
+            "try:\n"
+            "    print(metadata.version('lmdeploy'))\n"
+            "except metadata.PackageNotFoundError:\n"
+            "    sys.exit(1)\n"
+        )
+        try:
+            output = subprocess.check_output(
+                [python_exe, "-c", script], text=True
+            ).strip()
+            return output or None
+        except subprocess.CalledProcessError:
+            return None
+        except Exception as exc:  # pragma: no cover
+            logger.debug(f"Unable to determine LMDeploy version: {exc}")
+            return None
+
+    def _resolve_binary_path(self) -> Optional[str]:
+        override = os.getenv("LMDEPLOY_BIN")
+        if override:
+            override_path = os.path.abspath(os.path.expanduser(override))
+            if os.path.exists(override_path):
+                return override_path
+            resolved_override = shutil.which(override)
+            if resolved_override:
+                return resolved_override
+
+        candidate = self._venv_bin("lmdeploy")
+        if os.path.exists(candidate) and os.access(candidate, os.X_OK):
+            return os.path.abspath(candidate)
+
+        resolved = shutil.which("lmdeploy")
+        return resolved
+
+    def _update_installed_state(
+        self, installed: bool, version: Optional[str] = None
+    ) -> None:
+        state = self._load_state()
+        if installed:
+            state["installed_at"] = _utcnow()
+            if version:
+                state["installed_version"] = version
+            state["venv_path"] = self._venv_path
+        else:
+            state["installed_version"] = None
+            state["installed_at"] = None
+            state["removed_at"] = _utcnow()
+            state["venv_path"] = self._venv_path
+        self._save_state(state)
+
+    def _refresh_state_from_environment(self) -> None:
+        state = self._load_state()
+        version = self._detect_installed_version()
+        state["installed_version"] = version
+        if version is None:
+            state["removed_at"] = _utcnow()
+        state["venv_path"] = self._venv_path
+        self._save_state(state)
+
+    async def _run_pip(
+        self,
+        args: list[str],
+        operation: str,
+        ensure_venv: bool = True,
+        cwd: Optional[str] = None,
+    ) -> int:
+        if ensure_venv:
+            self._ensure_venv()
+        python_exe = self._venv_python()
+        if not os.path.exists(python_exe):
+            raise RuntimeError(
+                "LMDeploy virtual environment is missing; cannot run pip."
+            )
+        header = (
+            f"[{_utcnow()}] Starting LMDeploy {operation} via pip {' '.join(args)}\n"
+        )
+        with open(self._log_path, "w", encoding="utf-8") as log_file:
+            log_file.write(header)
+        process = await asyncio.create_subprocess_exec(
+            python_exe,
+            "-m",
+            "pip",
+            *args,
+            stdout=PIPE,
+            stderr=STDOUT,
+            cwd=cwd,
+        )
+
+        async def _stream_output() -> None:
+            if process.stdout is None:
+                return
+            with open(self._log_path, "a", encoding="utf-8", buffering=1) as log_file:
+                while True:
+                    chunk = await process.stdout.readline()
+                    if not chunk:
+                        break
+                    text = chunk.decode("utf-8", errors="replace")
+                    log_file.write(text)
+                    await self._broadcast_log_line(text.rstrip("\n"))
+
+        await asyncio.gather(process.wait(), _stream_output())
+        return process.returncode or 0
+
+    async def _broadcast_log_line(self, line: str) -> None:
+        try:
+            await get_progress_manager().broadcast(
+                {
+                    "type": "lmdeploy_install_log",
+                    "line": line,
+                    "timestamp": _utcnow(),
+                }
+            )
+        except Exception as exc:  # pragma: no cover
+            logger.debug(f"Failed to broadcast LMDeploy log line: {exc}")
+
+    async def _set_operation(self, operation: str) -> None:
+        self._operation = operation
+        self._operation_started_at = _utcnow()
+        self._last_error = None
+        await get_progress_manager().broadcast(
+            {
+                "type": "lmdeploy_install_status",
+                "status": operation,
+                "started_at": self._operation_started_at,
+            }
+        )
+
+    async def _finish_operation(self, success: bool, message: str = "") -> None:
+        payload = {
+            "type": "lmdeploy_install_status",
+            "status": "completed" if success else "failed",
+            "operation": self._operation,
+            "message": message,
+            "ended_at": _utcnow(),
+        }
+        await get_progress_manager().broadcast(payload)
+        self._operation = None
+        self._operation_started_at = None
+
+    def _create_task(self, coro: Awaitable[Any]) -> None:
+        loop = asyncio.get_running_loop()
+        task = loop.create_task(coro)
+        self._current_task = task
+
+        def _cleanup(fut: asyncio.Future) -> None:
+            try:
+                fut.result()
+            except Exception as exc:  # pragma: no cover - surfaced via status
+                logger.error(f"LMDeploy installer task error: {exc}")
+            finally:
+                self._current_task = None
+
+        task.add_done_callback(_cleanup)
+
+    async def install(
+        self, version: Optional[str] = None, force_reinstall: bool = False
+    ) -> Dict[str, Any]:
+        async with self._lock:
+            if self._operation:
+                raise RuntimeError(
+                    "Another LMDeploy installer operation is already running"
+                )
+            await self._set_operation("install")
+            args = ["install", "--upgrade"]
+            if force_reinstall:
+                args.append("--force-reinstall")
+            package = "lmdeploy"
+            if version:
+                package = f"lmdeploy=={version}"
+            args.append(package)
+
+            async def _runner():
+                try:
+                    code = await self._run_pip(args, "install")
+                    if code != 0:
+                        raise RuntimeError(f"pip exited with status {code}")
+                    detected_version = self._detect_installed_version()
+                    self._update_installed_state(True, detected_version)
+                    await self._finish_operation(True, "LMDeploy installed")
+                except Exception as exc:
+                    self._last_error = str(exc)
+                    self._refresh_state_from_environment()
+                    await self._finish_operation(False, str(exc))
+
+            self._create_task(_runner())
+            return {"message": "LMDeploy installation started"}
+
+    async def install_from_source(
+        self,
+        repo_url: str = "https://github.com/InternLM/lmdeploy.git",
+        branch: str = "main",
+    ) -> Dict[str, Any]:
+        """Install LMDeploy from a git repo and branch (for development)."""
+        async with self._lock:
+            if self._operation:
+                raise RuntimeError(
+                    "Another LMDeploy installer operation is already running"
+                )
+            await self._set_operation("install_source")
+            clone_dir = os.path.join(self._base_dir, "source")
+            async def _runner():
+                try:
+                    self._ensure_venv()
+                    if os.path.exists(clone_dir):
+                        shutil.rmtree(clone_dir)
+                    os.makedirs(clone_dir, exist_ok=True)
+                    proc = await asyncio.create_subprocess_exec(
+                        "git", "clone", "--depth", "1", "--branch", branch, repo_url, clone_dir,
+                        stdout=PIPE, stderr=STDOUT,
+                    )
+                    await proc.wait()
+                    if proc.returncode != 0:
+                        raise RuntimeError(f"git clone failed with code {proc.returncode}")
+                    code = await self._run_pip(
+                        ["install", "-e", "."],
+                        "install_source",
+                        cwd=clone_dir,
+                    )
+                    if code != 0:
+                        raise RuntimeError(f"pip install -e . failed with code {code}")
+                    detected = self._detect_installed_version()
+                    self._update_installed_state(True, detected)
+                    from backend.data_store import get_store
+                    get_store().update_lmdeploy({
+                        "install_type": "source",
+                        "source_repo": repo_url,
+                        "source_branch": branch,
+                    })
+                    await self._finish_operation(True, f"Installed from {branch}")
+                except Exception as exc:
+                    self._last_error = str(exc)
+                    self._refresh_state_from_environment()
+                    await self._finish_operation(False, str(exc))
+            self._create_task(_runner())
+            return {"message": "LMDeploy install from source started", "repo": repo_url, "branch": branch}
+
+    async def remove(self) -> Dict[str, Any]:
+        async with self._lock:
+            if self._operation:
+                raise RuntimeError(
+                    "Another LMDeploy installer operation is already running"
+                )
+            await self._set_operation("remove")
+            args = ["uninstall", "-y", "lmdeploy"]
+
+            async def _runner():
+                try:
+                    python_exists = os.path.exists(self._venv_python())
+                    if python_exists:
+                        code = await self._run_pip(args, "remove", ensure_venv=False)
+                        if code != 0:
+                            raise RuntimeError(f"pip exited with status {code}")
+                    shutil.rmtree(self._venv_path, ignore_errors=True)
+                    self._update_installed_state(False)
+                    await self._finish_operation(True, "LMDeploy removed")
+                except Exception as exc:
+                    self._last_error = str(exc)
+                    self._refresh_state_from_environment()
+                    await self._finish_operation(False, str(exc))
+
+            self._create_task(_runner())
+            return {"message": "LMDeploy removal started"}
+
+    def status(self) -> Dict[str, Any]:
+        version = self._detect_installed_version()
+        binary_path = self._resolve_binary_path()
+        installed = version is not None and binary_path is not None
+        state = self._load_state()
+        return {
+            "installed": installed,
+            "version": version,
+            "binary_path": binary_path,
+            "venv_path": state.get("venv_path") or self._venv_path,
+            "installed_at": state.get("installed_at"),
+            "removed_at": state.get("removed_at"),
+            "operation": self._operation,
+            "operation_started_at": self._operation_started_at,
+            "last_error": self._last_error,
+            "log_path": self._log_path,
+        }
+
+    async def _broadcast_status(self) -> None:
+        """Broadcast current status via SSE."""
+        try:
+            status_data = self.status()
+            get_progress_manager().emit("lmdeploy_status", {**status_data, "timestamp": _utcnow()})
+        except Exception as exc:
+            logger.debug(f"Failed to broadcast LMDeploy status: {exc}")
+
+    def is_operation_running(self) -> bool:
+        return self._operation is not None
+
+    def read_log_tail(self, max_bytes: int = 8192) -> str:
+        if not os.path.exists(self._log_path):
+            return ""
+        with open(self._log_path, "rb") as log_file:
+            log_file.seek(0, os.SEEK_END)
+            size = log_file.tell()
+            log_file.seek(max(0, size - max_bytes))
+            data = log_file.read().decode("utf-8", errors="replace")
+            if size > max_bytes:
+                data = data.split("\n", 1)[-1]
+            return data.strip()
diff --git a/backend/lmdeploy_manager.py b/backend/lmdeploy_manager.py
index 59e7d01..6328d71 100644
--- a/backend/lmdeploy_manager.py
+++ b/backend/lmdeploy_manager.py
@@ -1,877 +1,841 @@
-import asyncio
-import json
-import os
-import shlex
-import shutil
-from datetime import datetime
-from typing import Optional, Dict, Any, List
-
-import httpx
-import psutil
-from asyncio.subprocess import Process, STDOUT
-
-from backend.logging_config import get_logger
-from backend.database import SessionLocal, Model, RunningInstance
-from backend.huggingface import DEFAULT_LMDEPLOY_CONTEXT, MAX_LMDEPLOY_CONTEXT
-from backend.websocket_manager import websocket_manager
-
-logger = get_logger(__name__)
-
-_lmdeploy_manager_instance: Optional["LMDeployManager"] = None
-
-
-def get_lmdeploy_manager() -> "LMDeployManager":
-    """Return singleton LMDeploy manager."""
-    global _lmdeploy_manager_instance
-    if _lmdeploy_manager_instance is None:
-        _lmdeploy_manager_instance = LMDeployManager()
-    return _lmdeploy_manager_instance
-
-
-class LMDeployManager:
-    """Manage LMDeploy TurboMind runtime lifecycle."""
-
-    def __init__(
-        self,
-        binary_path: Optional[str] = None,
-        host: str = "0.0.0.0",
-        port: int = 2001,
-    ):
-        self.binary_path = binary_path or os.getenv("LMDEPLOY_BIN", "lmdeploy")
-        self.host = host
-        self.port = int(os.getenv("LMDEPLOY_PORT", port))
-        self._process: Optional[Process] = None
-        self._log_file = None
-        self._lock = asyncio.Lock()
-        self._current_instance: Optional[Dict[str, Any]] = None
-        self._started_at: Optional[str] = None
-        self._log_path = os.path.join("data", "logs", "lmdeploy.log")
-        self._health_timeout = 180  # seconds
-        self._last_health_status: Optional[Dict[str, Any]] = None
-        self._last_detected_external: Optional[Dict[str, Any]] = None
-        self._last_broadcast_log_position = 0
-
-    async def start(
-        self, model_entry: Dict[str, Any], config: Dict[str, Any]
-    ) -> Dict[str, Any]:
-        """Start LMDeploy serving the provided model. Only one model may run at once."""
-        async with self._lock:
-            if self._process and self._process.returncode is None:
-                raise RuntimeError("LMDeploy runtime is already running")
-
-            model_path = model_entry.get("file_path")
-            if not model_path or not os.path.exists(model_path):
-                raise FileNotFoundError(f"Model file not found at {model_path}")
-            model_dir = model_entry.get("model_dir") or os.path.dirname(model_path)
-            if not os.path.isdir(model_dir):
-                raise FileNotFoundError(f"Model directory not found at {model_dir}")
-            model_dir_abs = os.path.abspath(model_dir)
-
-            # Derive a stable model name for LMDeploy's --model-name flag.
-            # Preference order:
-            # 1) Explicit model_name passed in model_entry
-            # 2) Base model / display name from model_entry
-            # 3) Hugging Face repo id
-            # 4) Directory name
-            model_name = (
-                model_entry.get("model_name")
-                or model_entry.get("display_name")
-                or model_entry.get("huggingface_id")
-                or os.path.basename(model_dir_abs.rstrip(os.sep))
-            )
-
-            # Inject model_name into config passed to LMDeploy so the command builder
-            # can add --model-name and we persist it in status/config reflection.
-            effective_config = dict(config or {})
-            if model_name and not effective_config.get("model_name"):
-                effective_config["model_name"] = model_name
-
-            binary = self._resolve_binary()
-            command = self._build_command(binary, model_dir_abs, effective_config)
-            env = os.environ.copy()
-            env.setdefault("LMDEPLOY_LOG_DIR", os.path.dirname(self._log_path))
-            os.makedirs(os.path.dirname(self._log_path), exist_ok=True)
-            self._log_file = open(self._log_path, "ab", buffering=0)
-
-            logger.info(f"Starting LMDeploy with command: {' '.join(command)}")
-            self._process = await asyncio.create_subprocess_exec(
-                *command,
-                stdout=self._log_file,
-                stderr=STDOUT,
-                cwd=model_dir_abs,
-                env=env,
-            )
-            self._started_at = datetime.utcnow().isoformat() + "Z"
-            self._current_instance = {
-                "model_id": model_entry.get("model_id"),
-                "huggingface_id": model_entry.get("huggingface_id"),
-                "file_path": model_path,
-                "config": effective_config,
-                "pid": self._process.pid,
-            }
-
-        try:
-            await self._wait_for_ready()
-        except Exception as exc:
-            await self.stop(force=True)
-            raise exc
-
-        return self.status()
-
-    async def stop(self, force: bool = False) -> None:
-        """Stop LMDeploy process if running."""
-        async with self._lock:
-            if not self._process:
-                return
-            if self._process.returncode is None:
-                try:
-                    self._process.terminate()
-                    await asyncio.wait_for(self._process.wait(), timeout=30)
-                except asyncio.TimeoutError:
-                    logger.warning(
-                        "LMDeploy did not terminate gracefully; killing process"
-                    )
-                    self._process.kill()
-                    await self._process.wait()
-                except ProcessLookupError:
-                    logger.debug("LMDeploy process already stopped")
-            elif force:
-                try:
-                    self._process.kill()
-                except ProcessLookupError:
-                    pass
-            self._cleanup_process_state()
-
-    async def restart(
-        self, model_entry: Dict[str, Any], config: Dict[str, Any]
-    ) -> Dict[str, Any]:
-        """Restart LMDeploy with a new model/config."""
-        await self.stop()
-        return await self.start(model_entry, config)
-
-    def status(self) -> Dict[str, Any]:
-        """Return status payload describing the running instance."""
-        running = bool(self._process and self._process.returncode is None)
-        detection = None
-        if not running:
-            detection = self._detect_external_process()
-            if detection:
-                running = True
-                self._last_detected_external = detection
-                if not self._current_instance:
-                    self._current_instance = detection.get("instance")
-                if not self._started_at:
-                    self._started_at = detection.get("started_at")
-            else:
-                self._last_detected_external = None
-        else:
-            self._last_detected_external = None
-
-        return {
-            "running": running,
-            "port": self.port,
-            "host": self.host,
-            "process_id": self._process.pid if running else None,
-            "started_at": self._started_at,
-            "current_instance": self._current_instance if running else None,
-            "health": self._last_health_status,
-            "binary_path": self._current_binary_path(),
-            "log_path": self._log_path,
-            "auto_detected": bool(detection),
-            "detection": detection,
-        }
-
-    def _current_binary_path(self) -> Optional[str]:
-        try:
-            return self._resolve_binary()
-        except FileNotFoundError:
-            return None
-
-    def _resolve_binary(self) -> str:
-        try:
-            from backend.lmdeploy_installer import get_lmdeploy_installer
-
-            installer_binary = get_lmdeploy_installer().status().get("binary_path")
-            if installer_binary and os.path.exists(installer_binary):
-                return installer_binary
-        except Exception as exc:
-            logger.debug(
-                f"Failed to resolve LMDeploy binary via installer status: {exc}"
-            )
-
-        resolved = shutil.which(self.binary_path)
-        if resolved:
-            return resolved
-
-        candidate = os.path.expanduser(self.binary_path)
-        if os.path.isabs(candidate) and os.path.exists(candidate):
-            return candidate
-        raise FileNotFoundError(
-            "LMDeploy binary not found in PATH. Install LMDeploy from the LMDeploy page or set LMDEPLOY_BIN."
-        )
-
-    def _build_command(
-        self, binary: str, model_dir: str, config: Dict[str, Any]
-    ) -> list:
-        """Convert stored config into lmdeploy CLI arguments."""
-        tensor_parallel = max(1, int(config.get("tensor_parallel") or 1))
-        base_session_len = max(
-            1024,
-            int(
-                config.get("session_len")
-                or config.get("context_length")
-                or DEFAULT_LMDEPLOY_CONTEXT
-            ),
-        )
-        rope_scaling_mode = str(config.get("rope_scaling_mode") or "disabled").lower()
-        rope_scaling_factor = float(config.get("rope_scaling_factor") or 1.0)
-        scaling_enabled = (
-            rope_scaling_mode not in {"", "none", "disabled"}
-            and rope_scaling_factor > 1.0
-        )
-        effective_session_len = base_session_len
-        if scaling_enabled:
-            scaled = int(base_session_len * rope_scaling_factor)
-            effective_session_len = max(
-                base_session_len, min(scaled, MAX_LMDEPLOY_CONTEXT)
-            )
-        max_batch_size = max(1, int(config.get("max_batch_size") or 4))
-        base_prefill = int(
-            config.get("max_prefill_token_num")
-            or config.get("max_batch_tokens")
-            or (base_session_len * 2)
-        )
-        if scaling_enabled:
-            scaled_prefill = int(base_prefill * rope_scaling_factor)
-            max_prefill_token_num = scaled_prefill
-        else:
-            max_prefill_token_num = base_prefill
-
-        command = [
-            binary,
-            "serve",
-            "api_server",
-            model_dir,
-            "--backend",
-            "turbomind",
-            "--server-name",
-            self.host,
-            "--server-port",
-            str(self.port),
-            "--tp",
-            str(tensor_parallel),
-            "--session-len",
-            str(effective_session_len),
-            "--max-batch-size",
-            str(max_batch_size),
-        ]
-
-        # Optional model identity for OpenAI-style /v1/models listing
-        model_name = config.get("model_name")
-        if model_name and str(model_name).strip():
-            command.extend(["--model-name", str(model_name).strip()])
-
-        # Optional inference settings
-        dtype = config.get("dtype")
-        if dtype and str(dtype).strip():
-            command.extend(["--dtype", str(dtype).strip()])
-        if max_prefill_token_num:
-            command.extend(["--max-prefill-token-num", str(max_prefill_token_num)])
-        cache_max_entry_count = config.get("cache_max_entry_count")
-        if cache_max_entry_count is not None:
-            command.extend(["--cache-max-entry-count", str(cache_max_entry_count)])
-        cache_block_seq_len = config.get("cache_block_seq_len")
-        if cache_block_seq_len:
-            command.extend(["--cache-block-seq-len", str(cache_block_seq_len)])
-        if config.get("enable_prefix_caching"):
-            command.append("--enable-prefix-caching")
-        quant_policy = config.get("quant_policy")
-        if quant_policy is not None:
-            command.extend(["--quant-policy", str(quant_policy)])
-        model_format = config.get("model_format")
-        if model_format and str(model_format).strip():
-            command.extend(["--model-format", str(model_format).strip()])
-        hf_overrides = config.get("hf_overrides")
-        if isinstance(hf_overrides, dict) and hf_overrides:
-
-            def _flatten(prefix: str, value: Any):
-                if isinstance(value, dict):
-                    for key, nested in value.items():
-                        if not isinstance(key, str) or not key:
-                            continue
-                        new_prefix = f"{prefix}.{key}" if prefix else key
-                        yield from _flatten(new_prefix, nested)
-                else:
-                    yield prefix, value
-
-            def _format_override_value(val: Any) -> str:
-                if isinstance(val, bool):
-                    return "true" if val else "false"
-                if val is None:
-                    return "null"
-                return str(val)
-
-            for path, value in _flatten("", hf_overrides):
-                if not path:
-                    continue
-                command.extend(
-                    [f"--hf-overrides.{path}", _format_override_value(value)]
-                )
-        elif isinstance(hf_overrides, str) and hf_overrides.strip():
-            command.extend(["--hf-overrides", hf_overrides.strip()])
-        # LMDeploy uses --disable-metrics (inverted logic)
-        # When enable_metrics=false, send --disable-metrics
-        # When enable_metrics=true (default), don't send anything (metrics enabled by default)
-        if not config.get("enable_metrics", True):
-            command.append("--disable-metrics")
-        if scaling_enabled:
-            command.extend(["--rope-scaling-factor", str(rope_scaling_factor)])
-        num_tokens_per_iter = config.get("num_tokens_per_iter")
-        if num_tokens_per_iter:
-            command.extend(["--num-tokens-per-iter", str(num_tokens_per_iter)])
-        max_prefill_iters = config.get("max_prefill_iters")
-        if max_prefill_iters:
-            command.extend(["--max-prefill-iters", str(max_prefill_iters)])
-        communicator = config.get("communicator")
-        if communicator and str(communicator).strip():
-            command.extend(["--communicator", str(communicator).strip()])
-
-        # Server configuration parameters
-        allow_origins = config.get("allow_origins")
-        if allow_origins:
-            if isinstance(allow_origins, list):
-                command.extend(
-                    ["--allow-origins"] + [str(origin) for origin in allow_origins]
-                )
-            elif isinstance(allow_origins, str):
-                command.extend(["--allow-origins", allow_origins])
-        if config.get("allow_credentials"):
-            command.append("--allow-credentials")
-        allow_methods = config.get("allow_methods")
-        if allow_methods:
-            if isinstance(allow_methods, list):
-                command.extend(
-                    ["--allow-methods"] + [str(method) for method in allow_methods]
-                )
-            elif isinstance(allow_methods, str):
-                command.extend(["--allow-methods", allow_methods])
-        allow_headers = config.get("allow_headers")
-        if allow_headers:
-            if isinstance(allow_headers, list):
-                command.extend(
-                    ["--allow-headers"] + [str(header) for header in allow_headers]
-                )
-            elif isinstance(allow_headers, str):
-                command.extend(["--allow-headers", allow_headers])
-        proxy_url = config.get("proxy_url")
-        if proxy_url and str(proxy_url).strip():
-            command.extend(["--proxy-url", str(proxy_url).strip()])
-        max_concurrent_requests = config.get("max_concurrent_requests")
-        if max_concurrent_requests is not None:
-            command.extend(
-                ["--max-concurrent-requests", str(int(max_concurrent_requests))]
-            )
-        log_level = config.get("log_level")
-        if log_level and str(log_level).strip():
-            command.extend(["--log-level", str(log_level).strip()])
-        api_keys = config.get("api_keys")
-        if api_keys:
-            if isinstance(api_keys, list):
-                command.extend(["--api-keys"] + [str(key) for key in api_keys])
-            elif isinstance(api_keys, str):
-                command.extend(["--api-keys", api_keys])
-        if config.get("ssl"):
-            command.append("--ssl")
-        max_log_len = config.get("max_log_len")
-        if max_log_len is not None:
-            command.extend(["--max-log-len", str(int(max_log_len))])
-        if config.get("disable_fastapi_docs"):
-            command.append("--disable-fastapi-docs")
-        if config.get("allow_terminate_by_client"):
-            command.append("--allow-terminate-by-client")
-        if config.get("enable_abort_handling"):
-            command.append("--enable-abort-handling")
-
-        # Model configuration parameters
-        chat_template = config.get("chat_template")
-        if chat_template and str(chat_template).strip():
-            command.extend(["--chat-template", str(chat_template).strip()])
-        tool_call_parser = config.get("tool_call_parser")
-        if tool_call_parser and str(tool_call_parser).strip():
-            command.extend(["--tool-call-parser", str(tool_call_parser).strip()])
-        reasoning_parser = config.get("reasoning_parser")
-        if reasoning_parser and str(reasoning_parser).strip():
-            command.extend(["--reasoning-parser", str(reasoning_parser).strip()])
-        revision = config.get("revision")
-        if revision and str(revision).strip():
-            command.extend(["--revision", str(revision).strip()])
-        download_dir = config.get("download_dir")
-        if download_dir and str(download_dir).strip():
-            command.extend(["--download-dir", str(download_dir).strip()])
-        adapters = config.get("adapters")
-        if adapters:
-            if isinstance(adapters, list):
-                command.extend(["--adapters"] + [str(adapter) for adapter in adapters])
-            elif isinstance(adapters, str):
-                command.extend(["--adapters", adapters])
-        device = config.get("device")
-        if device and str(device).strip():
-            command.extend(["--device", str(device).strip()])
-        if config.get("eager_mode"):
-            command.append("--eager-mode")
-        if config.get("disable_vision_encoder"):
-            command.append("--disable-vision-encoder")
-        logprobs_mode = config.get("logprobs_mode")
-        if logprobs_mode is not None:
-            command.extend(["--logprobs-mode", str(logprobs_mode)])
-
-        # DLLM parameters
-        dllm_block_length = config.get("dllm_block_length")
-        if dllm_block_length is not None:
-            command.extend(["--dllm-block-length", str(int(dllm_block_length))])
-        dllm_unmasking_strategy = config.get("dllm_unmasking_strategy")
-        if dllm_unmasking_strategy and str(dllm_unmasking_strategy).strip():
-            command.extend(
-                ["--dllm-unmasking-strategy", str(dllm_unmasking_strategy).strip()]
-            )
-        dllm_denoising_steps = config.get("dllm_denoising_steps")
-        if dllm_denoising_steps is not None:
-            command.extend(["--dllm-denoising-steps", str(int(dllm_denoising_steps))])
-        dllm_confidence_threshold = config.get("dllm_confidence_threshold")
-        if dllm_confidence_threshold is not None:
-            command.extend(
-                ["--dllm-confidence-threshold", str(float(dllm_confidence_threshold))]
-            )
-
-        # Distributed/Multi-node parameters
-        dp = config.get("dp")
-        if dp is not None:
-            command.extend(["--dp", str(int(dp))])
-        ep = config.get("ep")
-        if ep is not None:
-            command.extend(["--ep", str(int(ep))])
-        if config.get("enable_microbatch"):
-            command.append("--enable-microbatch")
-        if config.get("enable_eplb"):
-            command.append("--enable-eplb")
-        role = config.get("role")
-        if role and str(role).strip():
-            command.extend(["--role", str(role).strip()])
-        migration_backend = config.get("migration_backend")
-        if migration_backend and str(migration_backend).strip():
-            command.extend(["--migration-backend", str(migration_backend).strip()])
-        node_rank = config.get("node_rank")
-        if node_rank is not None:
-            command.extend(["--node-rank", str(int(node_rank))])
-        nnodes = config.get("nnodes")
-        if nnodes is not None:
-            command.extend(["--nnodes", str(int(nnodes))])
-        cp = config.get("cp")
-        if cp is not None:
-            command.extend(["--cp", str(int(cp))])
-        if config.get("enable_return_routed_experts"):
-            command.append("--enable-return-routed-experts")
-        distributed_executor_backend = config.get("distributed_executor_backend")
-        if distributed_executor_backend and str(distributed_executor_backend).strip():
-            command.extend(
-                [
-                    "--distributed-executor-backend",
-                    str(distributed_executor_backend).strip(),
-                ]
-            )
-
-        # Vision parameters
-        vision_max_batch_size = config.get("vision_max_batch_size")
-        if vision_max_batch_size is not None:
-            command.extend(["--vision-max-batch-size", str(int(vision_max_batch_size))])
-
-        # Speculative decoding parameters
-        speculative_algorithm = config.get("speculative_algorithm")
-        if speculative_algorithm and str(speculative_algorithm).strip():
-            command.extend(
-                ["--speculative-algorithm", str(speculative_algorithm).strip()]
-            )
-        speculative_draft_model = config.get("speculative_draft_model")
-        if speculative_draft_model and str(speculative_draft_model).strip():
-            command.extend(
-                ["--speculative-draft-model", str(speculative_draft_model).strip()]
-            )
-        speculative_num_draft_tokens = config.get("speculative_num_draft_tokens")
-        if speculative_num_draft_tokens is not None:
-            command.extend(
-                [
-                    "--speculative-num-draft-tokens",
-                    str(int(speculative_num_draft_tokens)),
-                ]
-            )
-
-        additional_args = config.get("additional_args")
-        if isinstance(additional_args, str) and additional_args.strip():
-            command.extend(shlex.split(additional_args.strip()))
-
-        return command
-
-    async def _wait_for_ready(self) -> None:
-        """Poll LMDeploy server until healthy or timeout."""
-        start_time = asyncio.get_event_loop().time()
-        url = f"http://{self.host}:{self.port}/v1/models"
-        async with httpx.AsyncClient(timeout=5.0) as client:
-            while True:
-                if self._process and self._process.returncode not in (None, 0):
-                    self._raise_with_logs(
-                        f"LMDeploy exited unexpectedly with code {self._process.returncode}"
-                    )
-                try:
-                    response = await client.get(url)
-                    if response.status_code == 200:
-                        self._last_health_status = {
-                            "status": "ready",
-                            "checked_at": datetime.utcnow().isoformat() + "Z",
-                        }
-                        return
-                except Exception as exc:
-                    logger.debug(f"LMDeploy health check pending: {exc}")
-                if asyncio.get_event_loop().time() - start_time > self._health_timeout:
-                    self._raise_with_logs(
-                        "Timed out waiting for LMDeploy server to become ready"
-                    )
-                await asyncio.sleep(2)
-
-    def _cleanup_process_state(self) -> None:
-        if self._log_file:
-            try:
-                self._log_file.close()
-            except Exception:
-                pass
-            self._log_file = None
-        self._process = None
-        self._current_instance = None
-        self._started_at = None
-        self._last_health_status = {
-            "status": "stopped",
-            "checked_at": datetime.utcnow().isoformat() + "Z",
-        }
-
-    def read_log_tail(self, max_bytes: int = 8192) -> str:
-        """Return the tail of the lmdeploy log file for debugging."""
-        try:
-            with open(self._log_path, "rb") as log_file:
-                log_file.seek(0, os.SEEK_END)
-                file_size = log_file.tell()
-                seek_pos = max(0, file_size - max_bytes)
-                log_file.seek(seek_pos)
-                data = log_file.read().decode("utf-8", errors="replace")
-                if seek_pos > 0:
-                    # Remove potential partial first line
-                    data = data.split("\n", 1)[-1]
-                return data.strip()
-        except Exception as exc:
-            logger.error(f"Failed to read LMDeploy log tail: {exc}")
-            return ""
-
-    async def _broadcast_runtime_logs(self) -> None:
-        """Broadcast new runtime log lines via WebSocket."""
-        try:
-            if not os.path.exists(self._log_path):
-                return
-            
-            # Read new content since last broadcast
-            current_size = os.path.getsize(self._log_path)
-            if current_size <= self._last_broadcast_log_position:
-                return  # No new content
-            
-            # Read only new content
-            with open(self._log_path, "rb") as log_file:
-                log_file.seek(self._last_broadcast_log_position)
-                new_content = log_file.read().decode("utf-8", errors="replace")
-                self._last_broadcast_log_position = current_size
-            
-            if new_content:
-                # Split into lines and broadcast each non-empty line
-                lines = new_content.split('\n')
-                for line in lines:
-                    if line.strip():  # Only send non-empty lines
-                        await websocket_manager.send_lmdeploy_runtime_log(line.strip())
-        except Exception as exc:
-            logger.debug(f"Failed to broadcast LMDeploy runtime logs: {exc}")
-
-    def _read_log_tail(self, max_bytes: int = 8192) -> str:
-        """Private alias for backward compatibility."""
-        return self.read_log_tail(max_bytes)
-
-    def _raise_with_logs(self, message: str) -> None:
-        """Raise a runtime error that includes the recent LMDeploy logs."""
-        log_tail = self.read_log_tail()
-        if log_tail:
-            logger.error(
-                f"{message}\n--- LMDeploy log tail ---\n{log_tail}\n--- end ---"
-            )
-            raise RuntimeError(f"{message}. See logs for details.\n{log_tail}")
-        raise RuntimeError(message)
-
-    def _detect_external_process(self) -> Optional[Dict[str, Any]]:
-        """Scan system processes for an LMDeploy server launched outside the manager."""
-        try:
-            for proc in psutil.process_iter(attrs=["pid", "cmdline", "create_time"]):
-                cmdline: List[str] = proc.info.get("cmdline") or []
-                if not cmdline:
-                    continue
-                lowered = " ".join(cmdline).lower()
-                if "lmdeploy" not in lowered:
-                    continue
-                if "serve" not in lowered or "api_server" not in lowered:
-                    continue
-
-                try:
-                    api_server_idx = cmdline.index("api_server")
-                except ValueError:
-                    continue
-                model_dir = (
-                    cmdline[api_server_idx + 1]
-                    if len(cmdline) > api_server_idx + 1
-                    else None
-                )
-                detection = {
-                    "pid": proc.info["pid"],
-                    "cmdline": cmdline,
-                    "model_dir": model_dir,
-                    "detected_at": datetime.utcnow().isoformat() + "Z",
-                }
-
-                config = self._config_from_cmdline(cmdline)
-                model_entry = (
-                    self._lookup_model_by_dir(model_dir) if model_dir else None
-                )
-                if model_entry:
-                    self._ensure_running_instance_record(model_entry.id, config)
-                    detection["instance"] = {
-                        "model_id": model_entry.id,
-                        "huggingface_id": model_entry.huggingface_id,
-                        "file_path": model_entry.file_path,
-                        "config": config,
-                        "pid": proc.info["pid"],
-                        "auto_detected": True,
-                    }
-                    detection["model_id"] = model_entry.id
-                    detection["huggingface_id"] = model_entry.huggingface_id
-                else:
-                    detection["instance"] = {
-                        "model_id": None,
-                        "huggingface_id": None,
-                        "file_path": model_dir,
-                        "config": config,
-                        "pid": proc.info["pid"],
-                        "auto_detected": True,
-                    }
-
-                started_at = proc.info.get("create_time")
-                if started_at:
-                    detection["started_at"] = (
-                        datetime.utcfromtimestamp(started_at).isoformat() + "Z"
-                    )
-                else:
-                    detection["started_at"] = datetime.utcnow().isoformat() + "Z"
-                return detection
-        except Exception as exc:
-            logger.debug(f"LMDeploy external scan failed: {exc}")
-        return None
-
-    def _config_from_cmdline(self, cmdline: List[str]) -> Dict[str, Any]:
-        """Reconstruct a minimal config dict from lmdeploy CLI arguments."""
-
-        def _extract(flag: str, cast, default=None):
-            if flag in cmdline:
-                idx = cmdline.index(flag)
-                if idx + 1 < len(cmdline):
-                    try:
-                        return cast(cmdline[idx + 1])
-                    except (ValueError, TypeError):
-                        return default
-            return default
-
-        def _extract_list(flag: str, default=None):
-            """Extract list of values for flags that accept multiple arguments."""
-            if flag not in cmdline:
-                return default
-            idx = cmdline.index(flag)
-            result = []
-            i = idx + 1
-            while i < len(cmdline) and not cmdline[i].startswith("--"):
-                result.append(cmdline[i])
-                i += 1
-            return result if result else default
-
-        session_len = _extract("--session-len", int, DEFAULT_LMDEPLOY_CONTEXT)
-        max_prefill = _extract("--max-prefill-token-num", int, session_len)
-        # Note: --max-context-token-num doesn't exist in LMDeploy, so derive from session_len
-        max_context = session_len
-
-        rope_scaling_factor = _extract("--rope-scaling-factor", float, 1.0)
-        rope_scaling_mode = "disabled"
-        if rope_scaling_factor and rope_scaling_factor > 1.0:
-            rope_scaling_mode = "detected"
-
-        hf_overrides: Dict[str, Any] = {}
-
-        def _assign_nested(target: Dict[str, Any], path: List[str], value: Any) -> None:
-            current = target
-            for segment in path[:-1]:
-                current = current.setdefault(segment, {})
-            current[path[-1]] = value
-
-        def _coerce_override_value(raw: str) -> Any:
-            lowered = raw.lower()
-            if lowered in {"true", "false"}:
-                return lowered == "true"
-            if lowered == "null":
-                return None
-            try:
-                if "." in raw:
-                    return float(raw)
-                return int(raw)
-            except ValueError:
-                return raw
-
-        i = 0
-        while i < len(cmdline):
-            token = cmdline[i]
-            if token.startswith("--hf-overrides."):
-                path_str = token[len("--hf-overrides.") :]
-                if path_str and i + 1 < len(cmdline):
-                    value = _coerce_override_value(cmdline[i + 1])
-                    _assign_nested(hf_overrides, path_str.split("."), value)
-                    i += 2
-                    continue
-            i += 1
-
-        config = {
-            "session_len": session_len,
-            "tensor_parallel": _extract("--tp", int, 1),
-            "max_batch_size": _extract("--max-batch-size", int, 4),
-            "max_prefill_token_num": max_prefill,
-            "max_context_token_num": max_context,
-            "dtype": _extract("--dtype", str, "auto"),
-            "cache_max_entry_count": _extract("--cache-max-entry-count", float, 0.8),
-            "cache_block_seq_len": _extract("--cache-block-seq-len", int, 64),
-            "enable_prefix_caching": "--enable-prefix-caching" in cmdline,
-            "quant_policy": _extract("--quant-policy", int, 0),
-            "model_format": _extract("--model-format", str, ""),
-            "hf_overrides": hf_overrides or _extract("--hf-overrides", str, ""),
-            # LMDeploy uses --disable-metrics, so enable_metrics=True when flag is NOT present
-            "enable_metrics": "--disable-metrics" not in cmdline,
-            "rope_scaling_factor": rope_scaling_factor,
-            "rope_scaling_mode": rope_scaling_mode,
-            "num_tokens_per_iter": _extract("--num-tokens-per-iter", int, 0),
-            "max_prefill_iters": _extract("--max-prefill-iters", int, 1),
-            "communicator": _extract("--communicator", str, "nccl"),
-            "model_name": _extract("--model-name", str, ""),
-            # Server configuration
-            "allow_origins": _extract_list("--allow-origins"),
-            "allow_credentials": "--allow-credentials" in cmdline,
-            "allow_methods": _extract_list("--allow-methods"),
-            "allow_headers": _extract_list("--allow-headers"),
-            "proxy_url": _extract("--proxy-url", str, ""),
-            "max_concurrent_requests": _extract("--max-concurrent-requests", int),
-            "log_level": _extract("--log-level", str, ""),
-            "api_keys": _extract_list("--api-keys"),
-            "ssl": "--ssl" in cmdline,
-            "max_log_len": _extract("--max-log-len", int),
-            "disable_fastapi_docs": "--disable-fastapi-docs" in cmdline,
-            "allow_terminate_by_client": "--allow-terminate-by-client" in cmdline,
-            "enable_abort_handling": "--enable-abort-handling" in cmdline,
-            # Model configuration
-            "chat_template": _extract("--chat-template", str, ""),
-            "tool_call_parser": _extract("--tool-call-parser", str, ""),
-            "reasoning_parser": _extract("--reasoning-parser", str, ""),
-            "revision": _extract("--revision", str, ""),
-            "download_dir": _extract("--download-dir", str, ""),
-            "adapters": _extract_list("--adapters"),
-            "device": _extract("--device", str, ""),
-            "eager_mode": "--eager-mode" in cmdline,
-            "disable_vision_encoder": "--disable-vision-encoder" in cmdline,
-            "logprobs_mode": _extract("--logprobs-mode", str),
-            # DLLM parameters
-            "dllm_block_length": _extract("--dllm-block-length", int),
-            "dllm_unmasking_strategy": _extract("--dllm-unmasking-strategy", str, ""),
-            "dllm_denoising_steps": _extract("--dllm-denoising-steps", int),
-            "dllm_confidence_threshold": _extract("--dllm-confidence-threshold", float),
-            # Distributed/Multi-node parameters
-            "dp": _extract("--dp", int),
-            "ep": _extract("--ep", int),
-            "enable_microbatch": "--enable-microbatch" in cmdline,
-            "enable_eplb": "--enable-eplb" in cmdline,
-            "role": _extract("--role", str, ""),
-            "migration_backend": _extract("--migration-backend", str, ""),
-            "node_rank": _extract("--node-rank", int),
-            "nnodes": _extract("--nnodes", int),
-            "cp": _extract("--cp", int),
-            "enable_return_routed_experts": "--enable-return-routed-experts" in cmdline,
-            "distributed_executor_backend": _extract(
-                "--distributed-executor-backend", str, ""
-            ),
-            # Vision parameters
-            "vision_max_batch_size": _extract("--vision-max-batch-size", int),
-            # Speculative decoding parameters
-            "speculative_algorithm": _extract("--speculative-algorithm", str, ""),
-            "speculative_draft_model": _extract("--speculative-draft-model", str, ""),
-            "speculative_num_draft_tokens": _extract(
-                "--speculative-num-draft-tokens", int
-            ),
-            "additional_args": "",
-        }
-
-        return config
-
-    def _lookup_model_by_dir(self, model_dir: Optional[str]) -> Optional[Model]:
-        if not model_dir:
-            return None
-        db = SessionLocal()
-        try:
-            candidates = (
-                db.query(Model).filter(Model.model_format == "safetensors").all()
-            )
-            for candidate in candidates:
-                if (
-                    candidate.file_path
-                    and os.path.dirname(candidate.file_path) == model_dir
-                ):
-                    return candidate
-        finally:
-            db.close()
-        return None
-
-    def _ensure_running_instance_record(
-        self, model_id: Optional[int], config: Dict[str, Any]
-    ) -> None:
-        if not model_id:
-            return
-        db = SessionLocal()
-        try:
-            existing = (
-                db.query(RunningInstance)
-                .filter(
-                    RunningInstance.model_id == model_id,
-                    RunningInstance.runtime_type == "lmdeploy",
-                )
-                .first()
-            )
-            if existing:
-                return
-            instance = RunningInstance(
-                model_id=model_id,
-                llama_version="lmdeploy",
-                proxy_model_name=f"lmdeploy::{model_id}",
-                started_at=datetime.utcnow(),
-                config=json.dumps({"lmdeploy": config}),
-                runtime_type="lmdeploy",
-            )
-            db.add(instance)
-            model = db.query(Model).filter(Model.id == model_id).first()
-            if model:
-                model.is_active = True
-            db.commit()
-        except Exception as exc:
-            logger.warning(f"Failed to create LMDeploy running instance record: {exc}")
-            db.rollback()
-        finally:
-            db.close()
+import asyncio
+import json
+import os
+import shlex
+import shutil
+from datetime import datetime
+from typing import Optional, Dict, Any, List
+
+import httpx
+import psutil
+from asyncio.subprocess import Process, STDOUT
+
+from backend.logging_config import get_logger
+from backend.data_store import get_store
+from backend.huggingface import DEFAULT_LMDEPLOY_CONTEXT, MAX_LMDEPLOY_CONTEXT
+from backend.progress_manager import get_progress_manager
+
+logger = get_logger(__name__)
+
+_lmdeploy_manager_instance: Optional["LMDeployManager"] = None
+
+
+def get_lmdeploy_manager() -> "LMDeployManager":
+    """Return singleton LMDeploy manager."""
+    global _lmdeploy_manager_instance
+    if _lmdeploy_manager_instance is None:
+        _lmdeploy_manager_instance = LMDeployManager()
+    return _lmdeploy_manager_instance
+
+
+class LMDeployManager:
+    """Manage LMDeploy TurboMind runtime lifecycle."""
+
+    def __init__(
+        self,
+        binary_path: Optional[str] = None,
+        host: str = "0.0.0.0",
+        port: int = 2001,
+    ):
+        self.binary_path = binary_path or os.getenv("LMDEPLOY_BIN", "lmdeploy")
+        self.host = host
+        self.port = int(os.getenv("LMDEPLOY_PORT", port))
+        self._process: Optional[Process] = None
+        self._log_file = None
+        self._lock = asyncio.Lock()
+        self._current_instance: Optional[Dict[str, Any]] = None
+        self._started_at: Optional[str] = None
+        self._log_path = os.path.join("data", "logs", "lmdeploy.log")
+        self._health_timeout = 180  # seconds
+        self._last_health_status: Optional[Dict[str, Any]] = None
+        self._last_detected_external: Optional[Dict[str, Any]] = None
+        self._last_broadcast_log_position = 0
+
+    async def start(
+        self, model_entry: Dict[str, Any], config: Dict[str, Any]
+    ) -> Dict[str, Any]:
+        """Start LMDeploy serving the provided model. Only one model may run at once."""
+        async with self._lock:
+            if self._process and self._process.returncode is None:
+                raise RuntimeError("LMDeploy runtime is already running")
+
+            model_path = model_entry.get("file_path")
+            if not model_path or not os.path.exists(model_path):
+                raise FileNotFoundError(f"Model file not found at {model_path}")
+            model_dir = model_entry.get("model_dir") or os.path.dirname(model_path)
+            if not os.path.isdir(model_dir):
+                raise FileNotFoundError(f"Model directory not found at {model_dir}")
+            model_dir_abs = os.path.abspath(model_dir)
+
+            # Derive a stable model name for LMDeploy's --model-name flag.
+            # Preference order:
+            # 1) Explicit model_name passed in model_entry
+            # 2) Base model / display name from model_entry
+            # 3) Hugging Face repo id
+            # 4) Directory name
+            model_name = (
+                model_entry.get("model_name")
+                or model_entry.get("display_name")
+                or model_entry.get("huggingface_id")
+                or os.path.basename(model_dir_abs.rstrip(os.sep))
+            )
+
+            # Inject model_name into config passed to LMDeploy so the command builder
+            # can add --model-name and we persist it in status/config reflection.
+            effective_config = dict(config or {})
+            if model_name and not effective_config.get("model_name"):
+                effective_config["model_name"] = model_name
+
+            binary = self._resolve_binary()
+            command = self._build_command(binary, model_dir_abs, effective_config)
+            env = os.environ.copy()
+            env.setdefault("LMDEPLOY_LOG_DIR", os.path.dirname(self._log_path))
+            os.makedirs(os.path.dirname(self._log_path), exist_ok=True)
+            self._log_file = open(self._log_path, "ab", buffering=0)
+
+            logger.info(f"Starting LMDeploy with command: {' '.join(command)}")
+            self._process = await asyncio.create_subprocess_exec(
+                *command,
+                stdout=self._log_file,
+                stderr=STDOUT,
+                cwd=model_dir_abs,
+                env=env,
+            )
+            self._started_at = datetime.utcnow().isoformat() + "Z"
+            self._current_instance = {
+                "model_id": model_entry.get("model_id"),
+                "huggingface_id": model_entry.get("huggingface_id"),
+                "file_path": model_path,
+                "config": effective_config,
+                "pid": self._process.pid,
+            }
+
+        try:
+            await self._wait_for_ready()
+        except Exception as exc:
+            await self.stop(force=True)
+            raise exc
+
+        return self.status()
+
+    async def stop(self, force: bool = False) -> None:
+        """Stop LMDeploy process if running."""
+        async with self._lock:
+            if not self._process:
+                return
+            if self._process.returncode is None:
+                try:
+                    self._process.terminate()
+                    await asyncio.wait_for(self._process.wait(), timeout=30)
+                except asyncio.TimeoutError:
+                    logger.warning(
+                        "LMDeploy did not terminate gracefully; killing process"
+                    )
+                    self._process.kill()
+                    await self._process.wait()
+                except ProcessLookupError:
+                    logger.debug("LMDeploy process already stopped")
+            elif force:
+                try:
+                    self._process.kill()
+                except ProcessLookupError:
+                    pass
+            self._cleanup_process_state()
+
+    async def restart(
+        self, model_entry: Dict[str, Any], config: Dict[str, Any]
+    ) -> Dict[str, Any]:
+        """Restart LMDeploy with a new model/config."""
+        await self.stop()
+        return await self.start(model_entry, config)
+
+    def status(self) -> Dict[str, Any]:
+        """Return status payload describing the running instance."""
+        running = bool(self._process and self._process.returncode is None)
+        detection = None
+        if not running:
+            detection = self._detect_external_process()
+            if detection:
+                running = True
+                self._last_detected_external = detection
+                if not self._current_instance:
+                    self._current_instance = detection.get("instance")
+                if not self._started_at:
+                    self._started_at = detection.get("started_at")
+            else:
+                self._last_detected_external = None
+        else:
+            self._last_detected_external = None
+
+        return {
+            "running": running,
+            "port": self.port,
+            "host": self.host,
+            "process_id": self._process.pid if running else None,
+            "started_at": self._started_at,
+            "current_instance": self._current_instance if running else None,
+            "health": self._last_health_status,
+            "binary_path": self._current_binary_path(),
+            "log_path": self._log_path,
+            "auto_detected": bool(detection),
+            "detection": detection,
+        }
+
+    def _current_binary_path(self) -> Optional[str]:
+        try:
+            return self._resolve_binary()
+        except FileNotFoundError:
+            return None
+
+    def _resolve_binary(self) -> str:
+        try:
+            from backend.lmdeploy_installer import get_lmdeploy_installer
+
+            installer_binary = get_lmdeploy_installer().status().get("binary_path")
+            if installer_binary and os.path.exists(installer_binary):
+                return installer_binary
+        except Exception as exc:
+            logger.debug(
+                f"Failed to resolve LMDeploy binary via installer status: {exc}"
+            )
+
+        resolved = shutil.which(self.binary_path)
+        if resolved:
+            return resolved
+
+        candidate = os.path.expanduser(self.binary_path)
+        if os.path.isabs(candidate) and os.path.exists(candidate):
+            return candidate
+        raise FileNotFoundError(
+            "LMDeploy binary not found in PATH. Install LMDeploy from the LMDeploy page or set LMDEPLOY_BIN."
+        )
+
+    def _build_command(
+        self, binary: str, model_dir: str, config: Dict[str, Any]
+    ) -> list:
+        """Convert stored config into lmdeploy CLI arguments."""
+        tensor_parallel = max(1, int(config.get("tensor_parallel") or 1))
+        base_session_len = max(
+            1024,
+            int(
+                config.get("session_len")
+                or config.get("context_length")
+                or DEFAULT_LMDEPLOY_CONTEXT
+            ),
+        )
+        rope_scaling_mode = str(config.get("rope_scaling_mode") or "disabled").lower()
+        rope_scaling_factor = float(config.get("rope_scaling_factor") or 1.0)
+        scaling_enabled = (
+            rope_scaling_mode not in {"", "none", "disabled"}
+            and rope_scaling_factor > 1.0
+        )
+        effective_session_len = base_session_len
+        if scaling_enabled:
+            scaled = int(base_session_len * rope_scaling_factor)
+            effective_session_len = max(
+                base_session_len, min(scaled, MAX_LMDEPLOY_CONTEXT)
+            )
+        max_batch_size = max(1, int(config.get("max_batch_size") or 4))
+        base_prefill = int(
+            config.get("max_prefill_token_num")
+            or config.get("max_batch_tokens")
+            or (base_session_len * 2)
+        )
+        if scaling_enabled:
+            scaled_prefill = int(base_prefill * rope_scaling_factor)
+            max_prefill_token_num = scaled_prefill
+        else:
+            max_prefill_token_num = base_prefill
+
+        command = [
+            binary,
+            "serve",
+            "api_server",
+            model_dir,
+            "--backend",
+            "turbomind",
+            "--server-name",
+            self.host,
+            "--server-port",
+            str(self.port),
+            "--tp",
+            str(tensor_parallel),
+            "--session-len",
+            str(effective_session_len),
+            "--max-batch-size",
+            str(max_batch_size),
+        ]
+
+        # Optional model identity for OpenAI-style /v1/models listing
+        model_name = config.get("model_name")
+        if model_name and str(model_name).strip():
+            command.extend(["--model-name", str(model_name).strip()])
+
+        # Optional inference settings
+        dtype = config.get("dtype")
+        if dtype and str(dtype).strip():
+            command.extend(["--dtype", str(dtype).strip()])
+        if max_prefill_token_num:
+            command.extend(["--max-prefill-token-num", str(max_prefill_token_num)])
+        cache_max_entry_count = config.get("cache_max_entry_count")
+        if cache_max_entry_count is not None:
+            command.extend(["--cache-max-entry-count", str(cache_max_entry_count)])
+        cache_block_seq_len = config.get("cache_block_seq_len")
+        if cache_block_seq_len:
+            command.extend(["--cache-block-seq-len", str(cache_block_seq_len)])
+        if config.get("enable_prefix_caching"):
+            command.append("--enable-prefix-caching")
+        quant_policy = config.get("quant_policy")
+        if quant_policy is not None:
+            command.extend(["--quant-policy", str(quant_policy)])
+        model_format = config.get("model_format")
+        if model_format and str(model_format).strip():
+            command.extend(["--model-format", str(model_format).strip()])
+        hf_overrides = config.get("hf_overrides")
+        if isinstance(hf_overrides, dict) and hf_overrides:
+
+            def _flatten(prefix: str, value: Any):
+                if isinstance(value, dict):
+                    for key, nested in value.items():
+                        if not isinstance(key, str) or not key:
+                            continue
+                        new_prefix = f"{prefix}.{key}" if prefix else key
+                        yield from _flatten(new_prefix, nested)
+                else:
+                    yield prefix, value
+
+            def _format_override_value(val: Any) -> str:
+                if isinstance(val, bool):
+                    return "true" if val else "false"
+                if val is None:
+                    return "null"
+                return str(val)
+
+            for path, value in _flatten("", hf_overrides):
+                if not path:
+                    continue
+                command.extend(
+                    [f"--hf-overrides.{path}", _format_override_value(value)]
+                )
+        elif isinstance(hf_overrides, str) and hf_overrides.strip():
+            command.extend(["--hf-overrides", hf_overrides.strip()])
+        # LMDeploy uses --disable-metrics (inverted logic)
+        # When enable_metrics=false, send --disable-metrics
+        # When enable_metrics=true (default), don't send anything (metrics enabled by default)
+        if not config.get("enable_metrics", True):
+            command.append("--disable-metrics")
+        if scaling_enabled:
+            command.extend(["--rope-scaling-factor", str(rope_scaling_factor)])
+        num_tokens_per_iter = config.get("num_tokens_per_iter")
+        if num_tokens_per_iter:
+            command.extend(["--num-tokens-per-iter", str(num_tokens_per_iter)])
+        max_prefill_iters = config.get("max_prefill_iters")
+        if max_prefill_iters:
+            command.extend(["--max-prefill-iters", str(max_prefill_iters)])
+        communicator = config.get("communicator")
+        if communicator and str(communicator).strip():
+            command.extend(["--communicator", str(communicator).strip()])
+
+        # Server configuration parameters
+        allow_origins = config.get("allow_origins")
+        if allow_origins:
+            if isinstance(allow_origins, list):
+                command.extend(
+                    ["--allow-origins"] + [str(origin) for origin in allow_origins]
+                )
+            elif isinstance(allow_origins, str):
+                command.extend(["--allow-origins", allow_origins])
+        if config.get("allow_credentials"):
+            command.append("--allow-credentials")
+        allow_methods = config.get("allow_methods")
+        if allow_methods:
+            if isinstance(allow_methods, list):
+                command.extend(
+                    ["--allow-methods"] + [str(method) for method in allow_methods]
+                )
+            elif isinstance(allow_methods, str):
+                command.extend(["--allow-methods", allow_methods])
+        allow_headers = config.get("allow_headers")
+        if allow_headers:
+            if isinstance(allow_headers, list):
+                command.extend(
+                    ["--allow-headers"] + [str(header) for header in allow_headers]
+                )
+            elif isinstance(allow_headers, str):
+                command.extend(["--allow-headers", allow_headers])
+        proxy_url = config.get("proxy_url")
+        if proxy_url and str(proxy_url).strip():
+            command.extend(["--proxy-url", str(proxy_url).strip()])
+        max_concurrent_requests = config.get("max_concurrent_requests")
+        if max_concurrent_requests is not None:
+            command.extend(
+                ["--max-concurrent-requests", str(int(max_concurrent_requests))]
+            )
+        log_level = config.get("log_level")
+        if log_level and str(log_level).strip():
+            command.extend(["--log-level", str(log_level).strip()])
+        api_keys = config.get("api_keys")
+        if api_keys:
+            if isinstance(api_keys, list):
+                command.extend(["--api-keys"] + [str(key) for key in api_keys])
+            elif isinstance(api_keys, str):
+                command.extend(["--api-keys", api_keys])
+        if config.get("ssl"):
+            command.append("--ssl")
+        max_log_len = config.get("max_log_len")
+        if max_log_len is not None:
+            command.extend(["--max-log-len", str(int(max_log_len))])
+        if config.get("disable_fastapi_docs"):
+            command.append("--disable-fastapi-docs")
+        if config.get("allow_terminate_by_client"):
+            command.append("--allow-terminate-by-client")
+        if config.get("enable_abort_handling"):
+            command.append("--enable-abort-handling")
+
+        # Model configuration parameters
+        chat_template = config.get("chat_template")
+        if chat_template and str(chat_template).strip():
+            command.extend(["--chat-template", str(chat_template).strip()])
+        tool_call_parser = config.get("tool_call_parser")
+        if tool_call_parser and str(tool_call_parser).strip():
+            command.extend(["--tool-call-parser", str(tool_call_parser).strip()])
+        reasoning_parser = config.get("reasoning_parser")
+        if reasoning_parser and str(reasoning_parser).strip():
+            command.extend(["--reasoning-parser", str(reasoning_parser).strip()])
+        revision = config.get("revision")
+        if revision and str(revision).strip():
+            command.extend(["--revision", str(revision).strip()])
+        download_dir = config.get("download_dir")
+        if download_dir and str(download_dir).strip():
+            command.extend(["--download-dir", str(download_dir).strip()])
+        adapters = config.get("adapters")
+        if adapters:
+            if isinstance(adapters, list):
+                command.extend(["--adapters"] + [str(adapter) for adapter in adapters])
+            elif isinstance(adapters, str):
+                command.extend(["--adapters", adapters])
+        device = config.get("device")
+        if device and str(device).strip():
+            command.extend(["--device", str(device).strip()])
+        if config.get("eager_mode"):
+            command.append("--eager-mode")
+        if config.get("disable_vision_encoder"):
+            command.append("--disable-vision-encoder")
+        logprobs_mode = config.get("logprobs_mode")
+        if logprobs_mode is not None:
+            command.extend(["--logprobs-mode", str(logprobs_mode)])
+
+        # DLLM parameters
+        dllm_block_length = config.get("dllm_block_length")
+        if dllm_block_length is not None:
+            command.extend(["--dllm-block-length", str(int(dllm_block_length))])
+        dllm_unmasking_strategy = config.get("dllm_unmasking_strategy")
+        if dllm_unmasking_strategy and str(dllm_unmasking_strategy).strip():
+            command.extend(
+                ["--dllm-unmasking-strategy", str(dllm_unmasking_strategy).strip()]
+            )
+        dllm_denoising_steps = config.get("dllm_denoising_steps")
+        if dllm_denoising_steps is not None:
+            command.extend(["--dllm-denoising-steps", str(int(dllm_denoising_steps))])
+        dllm_confidence_threshold = config.get("dllm_confidence_threshold")
+        if dllm_confidence_threshold is not None:
+            command.extend(
+                ["--dllm-confidence-threshold", str(float(dllm_confidence_threshold))]
+            )
+
+        # Distributed/Multi-node parameters
+        dp = config.get("dp")
+        if dp is not None:
+            command.extend(["--dp", str(int(dp))])
+        ep = config.get("ep")
+        if ep is not None:
+            command.extend(["--ep", str(int(ep))])
+        if config.get("enable_microbatch"):
+            command.append("--enable-microbatch")
+        if config.get("enable_eplb"):
+            command.append("--enable-eplb")
+        role = config.get("role")
+        if role and str(role).strip():
+            command.extend(["--role", str(role).strip()])
+        migration_backend = config.get("migration_backend")
+        if migration_backend and str(migration_backend).strip():
+            command.extend(["--migration-backend", str(migration_backend).strip()])
+        node_rank = config.get("node_rank")
+        if node_rank is not None:
+            command.extend(["--node-rank", str(int(node_rank))])
+        nnodes = config.get("nnodes")
+        if nnodes is not None:
+            command.extend(["--nnodes", str(int(nnodes))])
+        cp = config.get("cp")
+        if cp is not None:
+            command.extend(["--cp", str(int(cp))])
+        if config.get("enable_return_routed_experts"):
+            command.append("--enable-return-routed-experts")
+        distributed_executor_backend = config.get("distributed_executor_backend")
+        if distributed_executor_backend and str(distributed_executor_backend).strip():
+            command.extend(
+                [
+                    "--distributed-executor-backend",
+                    str(distributed_executor_backend).strip(),
+                ]
+            )
+
+        # Vision parameters
+        vision_max_batch_size = config.get("vision_max_batch_size")
+        if vision_max_batch_size is not None:
+            command.extend(["--vision-max-batch-size", str(int(vision_max_batch_size))])
+
+        # Speculative decoding parameters
+        speculative_algorithm = config.get("speculative_algorithm")
+        if speculative_algorithm and str(speculative_algorithm).strip():
+            command.extend(
+                ["--speculative-algorithm", str(speculative_algorithm).strip()]
+            )
+        speculative_draft_model = config.get("speculative_draft_model")
+        if speculative_draft_model and str(speculative_draft_model).strip():
+            command.extend(
+                ["--speculative-draft-model", str(speculative_draft_model).strip()]
+            )
+        speculative_num_draft_tokens = config.get("speculative_num_draft_tokens")
+        if speculative_num_draft_tokens is not None:
+            command.extend(
+                [
+                    "--speculative-num-draft-tokens",
+                    str(int(speculative_num_draft_tokens)),
+                ]
+            )
+
+        additional_args = config.get("additional_args")
+        if isinstance(additional_args, str) and additional_args.strip():
+            command.extend(shlex.split(additional_args.strip()))
+
+        return command
+
+    async def _wait_for_ready(self) -> None:
+        """Poll LMDeploy server until healthy or timeout."""
+        start_time = asyncio.get_event_loop().time()
+        url = f"http://{self.host}:{self.port}/v1/models"
+        async with httpx.AsyncClient(timeout=5.0) as client:
+            while True:
+                if self._process and self._process.returncode not in (None, 0):
+                    self._raise_with_logs(
+                        f"LMDeploy exited unexpectedly with code {self._process.returncode}"
+                    )
+                try:
+                    response = await client.get(url)
+                    if response.status_code == 200:
+                        self._last_health_status = {
+                            "status": "ready",
+                            "checked_at": datetime.utcnow().isoformat() + "Z",
+                        }
+                        return
+                except Exception as exc:
+                    logger.debug(f"LMDeploy health check pending: {exc}")
+                if asyncio.get_event_loop().time() - start_time > self._health_timeout:
+                    self._raise_with_logs(
+                        "Timed out waiting for LMDeploy server to become ready"
+                    )
+                await asyncio.sleep(2)
+
+    def _cleanup_process_state(self) -> None:
+        if self._log_file:
+            try:
+                self._log_file.close()
+            except Exception:
+                pass
+            self._log_file = None
+        self._process = None
+        self._current_instance = None
+        self._started_at = None
+        self._last_health_status = {
+            "status": "stopped",
+            "checked_at": datetime.utcnow().isoformat() + "Z",
+        }
+
+    def read_log_tail(self, max_bytes: int = 8192) -> str:
+        """Return the tail of the lmdeploy log file for debugging."""
+        try:
+            with open(self._log_path, "rb") as log_file:
+                log_file.seek(0, os.SEEK_END)
+                file_size = log_file.tell()
+                seek_pos = max(0, file_size - max_bytes)
+                log_file.seek(seek_pos)
+                data = log_file.read().decode("utf-8", errors="replace")
+                if seek_pos > 0:
+                    # Remove potential partial first line
+                    data = data.split("\n", 1)[-1]
+                return data.strip()
+        except Exception as exc:
+            logger.error(f"Failed to read LMDeploy log tail: {exc}")
+            return ""
+
+    async def _broadcast_runtime_logs(self) -> None:
+        """Broadcast new runtime log lines via SSE."""
+        try:
+            if not os.path.exists(self._log_path):
+                return
+            
+            # Read new content since last broadcast
+            current_size = os.path.getsize(self._log_path)
+            if current_size <= self._last_broadcast_log_position:
+                return  # No new content
+            
+            # Read only new content
+            with open(self._log_path, "rb") as log_file:
+                log_file.seek(self._last_broadcast_log_position)
+                new_content = log_file.read().decode("utf-8", errors="replace")
+                self._last_broadcast_log_position = current_size
+            
+            if new_content:
+                # Split into lines and broadcast each non-empty line via SSE
+                lines = new_content.split('\n')
+                for line in lines:
+                    if line.strip():
+                        get_progress_manager().emit("lmdeploy_runtime_log", {"line": line.strip(), "timestamp": datetime.utcnow().isoformat()})
+        except Exception as exc:
+            logger.debug(f"Failed to broadcast LMDeploy runtime logs: {exc}")
+
+    def _read_log_tail(self, max_bytes: int = 8192) -> str:
+        """Private alias for backward compatibility."""
+        return self.read_log_tail(max_bytes)
+
+    def _raise_with_logs(self, message: str) -> None:
+        """Raise a runtime error that includes the recent LMDeploy logs."""
+        log_tail = self.read_log_tail()
+        if log_tail:
+            logger.error(
+                f"{message}\n--- LMDeploy log tail ---\n{log_tail}\n--- end ---"
+            )
+            raise RuntimeError(f"{message}. See logs for details.\n{log_tail}")
+        raise RuntimeError(message)
+
+    def _detect_external_process(self) -> Optional[Dict[str, Any]]:
+        """Scan system processes for an LMDeploy server launched outside the manager."""
+        try:
+            for proc in psutil.process_iter(attrs=["pid", "cmdline", "create_time"]):
+                cmdline: List[str] = proc.info.get("cmdline") or []
+                if not cmdline:
+                    continue
+                lowered = " ".join(cmdline).lower()
+                if "lmdeploy" not in lowered:
+                    continue
+                if "serve" not in lowered or "api_server" not in lowered:
+                    continue
+
+                try:
+                    api_server_idx = cmdline.index("api_server")
+                except ValueError:
+                    continue
+                model_dir = (
+                    cmdline[api_server_idx + 1]
+                    if len(cmdline) > api_server_idx + 1
+                    else None
+                )
+                detection = {
+                    "pid": proc.info["pid"],
+                    "cmdline": cmdline,
+                    "model_dir": model_dir,
+                    "detected_at": datetime.utcnow().isoformat() + "Z",
+                }
+
+                config = self._config_from_cmdline(cmdline)
+                model_entry = (
+                    self._lookup_model_by_dir(model_dir) if model_dir else None
+                )
+                if model_entry:
+                    self._ensure_running_instance_record(model_entry.get("id"), config)
+                    detection["instance"] = {
+                        "model_id": model_entry.get("id"),
+                        "huggingface_id": model_entry.get("huggingface_id"),
+                        "file_path": model_entry.get("file_path"),
+                        "config": config,
+                        "pid": proc.info["pid"],
+                        "auto_detected": True,
+                    }
+                    detection["model_id"] = model_entry.get("id")
+                    detection["huggingface_id"] = model_entry.get("huggingface_id")
+                else:
+                    detection["instance"] = {
+                        "model_id": None,
+                        "huggingface_id": None,
+                        "file_path": model_dir,
+                        "config": config,
+                        "pid": proc.info["pid"],
+                        "auto_detected": True,
+                    }
+
+                started_at = proc.info.get("create_time")
+                if started_at:
+                    detection["started_at"] = (
+                        datetime.utcfromtimestamp(started_at).isoformat() + "Z"
+                    )
+                else:
+                    detection["started_at"] = datetime.utcnow().isoformat() + "Z"
+                return detection
+        except Exception as exc:
+            logger.debug(f"LMDeploy external scan failed: {exc}")
+        return None
+
+    def _config_from_cmdline(self, cmdline: List[str]) -> Dict[str, Any]:
+        """Reconstruct a minimal config dict from lmdeploy CLI arguments."""
+
+        def _extract(flag: str, cast, default=None):
+            if flag in cmdline:
+                idx = cmdline.index(flag)
+                if idx + 1 < len(cmdline):
+                    try:
+                        return cast(cmdline[idx + 1])
+                    except (ValueError, TypeError):
+                        return default
+            return default
+
+        def _extract_list(flag: str, default=None):
+            """Extract list of values for flags that accept multiple arguments."""
+            if flag not in cmdline:
+                return default
+            idx = cmdline.index(flag)
+            result = []
+            i = idx + 1
+            while i < len(cmdline) and not cmdline[i].startswith("--"):
+                result.append(cmdline[i])
+                i += 1
+            return result if result else default
+
+        session_len = _extract("--session-len", int, DEFAULT_LMDEPLOY_CONTEXT)
+        max_prefill = _extract("--max-prefill-token-num", int, session_len)
+        # Note: --max-context-token-num doesn't exist in LMDeploy, so derive from session_len
+        max_context = session_len
+
+        rope_scaling_factor = _extract("--rope-scaling-factor", float, 1.0)
+        rope_scaling_mode = "disabled"
+        if rope_scaling_factor and rope_scaling_factor > 1.0:
+            rope_scaling_mode = "detected"
+
+        hf_overrides: Dict[str, Any] = {}
+
+        def _assign_nested(target: Dict[str, Any], path: List[str], value: Any) -> None:
+            current = target
+            for segment in path[:-1]:
+                current = current.setdefault(segment, {})
+            current[path[-1]] = value
+
+        def _coerce_override_value(raw: str) -> Any:
+            lowered = raw.lower()
+            if lowered in {"true", "false"}:
+                return lowered == "true"
+            if lowered == "null":
+                return None
+            try:
+                if "." in raw:
+                    return float(raw)
+                return int(raw)
+            except ValueError:
+                return raw
+
+        i = 0
+        while i < len(cmdline):
+            token = cmdline[i]
+            if token.startswith("--hf-overrides."):
+                path_str = token[len("--hf-overrides.") :]
+                if path_str and i + 1 < len(cmdline):
+                    value = _coerce_override_value(cmdline[i + 1])
+                    _assign_nested(hf_overrides, path_str.split("."), value)
+                    i += 2
+                    continue
+            i += 1
+
+        config = {
+            "session_len": session_len,
+            "tensor_parallel": _extract("--tp", int, 1),
+            "max_batch_size": _extract("--max-batch-size", int, 4),
+            "max_prefill_token_num": max_prefill,
+            "max_context_token_num": max_context,
+            "dtype": _extract("--dtype", str, "auto"),
+            "cache_max_entry_count": _extract("--cache-max-entry-count", float, 0.8),
+            "cache_block_seq_len": _extract("--cache-block-seq-len", int, 64),
+            "enable_prefix_caching": "--enable-prefix-caching" in cmdline,
+            "quant_policy": _extract("--quant-policy", int, 0),
+            "model_format": _extract("--model-format", str, ""),
+            "hf_overrides": hf_overrides or _extract("--hf-overrides", str, ""),
+            # LMDeploy uses --disable-metrics, so enable_metrics=True when flag is NOT present
+            "enable_metrics": "--disable-metrics" not in cmdline,
+            "rope_scaling_factor": rope_scaling_factor,
+            "rope_scaling_mode": rope_scaling_mode,
+            "num_tokens_per_iter": _extract("--num-tokens-per-iter", int, 0),
+            "max_prefill_iters": _extract("--max-prefill-iters", int, 1),
+            "communicator": _extract("--communicator", str, "nccl"),
+            "model_name": _extract("--model-name", str, ""),
+            # Server configuration
+            "allow_origins": _extract_list("--allow-origins"),
+            "allow_credentials": "--allow-credentials" in cmdline,
+            "allow_methods": _extract_list("--allow-methods"),
+            "allow_headers": _extract_list("--allow-headers"),
+            "proxy_url": _extract("--proxy-url", str, ""),
+            "max_concurrent_requests": _extract("--max-concurrent-requests", int),
+            "log_level": _extract("--log-level", str, ""),
+            "api_keys": _extract_list("--api-keys"),
+            "ssl": "--ssl" in cmdline,
+            "max_log_len": _extract("--max-log-len", int),
+            "disable_fastapi_docs": "--disable-fastapi-docs" in cmdline,
+            "allow_terminate_by_client": "--allow-terminate-by-client" in cmdline,
+            "enable_abort_handling": "--enable-abort-handling" in cmdline,
+            # Model configuration
+            "chat_template": _extract("--chat-template", str, ""),
+            "tool_call_parser": _extract("--tool-call-parser", str, ""),
+            "reasoning_parser": _extract("--reasoning-parser", str, ""),
+            "revision": _extract("--revision", str, ""),
+            "download_dir": _extract("--download-dir", str, ""),
+            "adapters": _extract_list("--adapters"),
+            "device": _extract("--device", str, ""),
+            "eager_mode": "--eager-mode" in cmdline,
+            "disable_vision_encoder": "--disable-vision-encoder" in cmdline,
+            "logprobs_mode": _extract("--logprobs-mode", str),
+            # DLLM parameters
+            "dllm_block_length": _extract("--dllm-block-length", int),
+            "dllm_unmasking_strategy": _extract("--dllm-unmasking-strategy", str, ""),
+            "dllm_denoising_steps": _extract("--dllm-denoising-steps", int),
+            "dllm_confidence_threshold": _extract("--dllm-confidence-threshold", float),
+            # Distributed/Multi-node parameters
+            "dp": _extract("--dp", int),
+            "ep": _extract("--ep", int),
+            "enable_microbatch": "--enable-microbatch" in cmdline,
+            "enable_eplb": "--enable-eplb" in cmdline,
+            "role": _extract("--role", str, ""),
+            "migration_backend": _extract("--migration-backend", str, ""),
+            "node_rank": _extract("--node-rank", int),
+            "nnodes": _extract("--nnodes", int),
+            "cp": _extract("--cp", int),
+            "enable_return_routed_experts": "--enable-return-routed-experts" in cmdline,
+            "distributed_executor_backend": _extract(
+                "--distributed-executor-backend", str, ""
+            ),
+            # Vision parameters
+            "vision_max_batch_size": _extract("--vision-max-batch-size", int),
+            # Speculative decoding parameters
+            "speculative_algorithm": _extract("--speculative-algorithm", str, ""),
+            "speculative_draft_model": _extract("--speculative-draft-model", str, ""),
+            "speculative_num_draft_tokens": _extract(
+                "--speculative-num-draft-tokens", int
+            ),
+            "additional_args": "",
+        }
+
+        return config
+
+    def _lookup_model_by_dir(self, model_dir: Optional[str]) -> Optional[Dict[str, Any]]:
+        if not model_dir:
+            return None
+        store = get_store()
+        for candidate in store.list_models():
+            if (candidate.get("format") or candidate.get("model_format")) != "safetensors":
+                continue
+            fp = candidate.get("file_path")
+            if fp and os.path.dirname(fp) == model_dir:
+                return candidate
+        return None
+
+    def _ensure_running_instance_record(
+        self, model_id: Optional[Any], config: Dict[str, Any]
+    ) -> None:
+        # No-op: running state is not persisted to DB (Phase 1 YAML store)
+        pass
diff --git a/backend/main.py b/backend/main.py
index 40fc33b..c692ee4 100644
--- a/backend/main.py
+++ b/backend/main.py
@@ -3,13 +3,13 @@
 import uvicorn
 import time
 from datetime import datetime
-from fastapi import FastAPI, WebSocket, WebSocketDisconnect, Request
+from fastapi import FastAPI, Request
 from fastapi.staticfiles import StaticFiles
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import FileResponse
 from contextlib import asynccontextmanager
 
-from backend.database import init_db, LlamaVersion
+from backend.data_store import get_store
 from backend.routes import (
     models,
     llama_versions,
@@ -18,9 +18,7 @@
     llama_version_manager,
     lmdeploy,
 )
-from backend.websocket_manager import websocket_manager
 from backend.huggingface import set_huggingface_token
-from backend.unified_monitor import unified_monitor
 from backend.logging_config import setup_logging, get_logger
 from backend.lmdeploy_installer import get_lmdeploy_installer
 from backend.lmdeploy_manager import get_lmdeploy_manager
@@ -38,7 +36,7 @@ def ensure_data_directories():
     else:
         data_dir = "data"
     
-    subdirs = ["models", "configs", "logs", "llama-cpp", "lmdeploy", "temp"]
+    subdirs = ["config", "configs", "logs", "llama-cpp", "lmdeploy", "temp"]
 
     try:
         # Ensure main data directory exists
@@ -70,7 +68,8 @@ def ensure_data_directories():
             logger.info(f"Data directory {data_dir} is writable")
         except PermissionError as e:
             logger.error(f"Data directory {data_dir} is not writable: {e}")
-            logger.warning(f"Current user: {os.getuid() if hasattr(os, 'getuid') else 'unknown'}, directory owner check needed")("Attempting to fix permissions...")
+            logger.warning(f"Current user: {os.getuid() if hasattr(os, 'getuid') else 'unknown'}, directory owner check needed")
+            logger.warning("Attempting to fix permissions...")
             # Try to fix permissions (may fail if not running as root)
             try:
                 import stat
@@ -95,74 +94,72 @@ def ensure_data_directories():
 
 async def register_all_models_with_llama_swap():
     """Register all downloaded models with llama-swap on startup"""
-    from backend.database import SessionLocal, Model
-    from backend.llama_manager import LlamaManager
-
-    db = SessionLocal()
-    try:
-        # Get all downloaded models
-        models = db.query(Model).all()
-        if not models:
-            logger.info("No models found to register with llama-swap")
-            return
-
-        logger.info(f"Found {len(models)} models to register with llama-swap")
-
-        llama_server_path = None
-        # Get llama-server path from active version
-        active_version = (
-            db.query(LlamaVersion).filter(LlamaVersion.is_active == True).first()
-        )
-        if active_version and os.path.exists(active_version.binary_path):
-            llama_server_path = active_version.binary_path
-            logger.info(f"Using active llama-cpp version: {active_version.version}")
-        else:
-            # Fallback: try to find llama-server in the llama-cpp directory
-            llama_cpp_dir = (
-                "data/llama-cpp" if os.path.exists("data") else "/app/data/llama-cpp"
-            )
-            if os.path.exists(llama_cpp_dir):
-                for version_dir in os.listdir(llama_cpp_dir):
-                    server_path = os.path.join(
-                        llama_cpp_dir, version_dir, "build", "bin", "llama-server"
-                    )
-                    if os.path.exists(server_path) and os.access(server_path, os.X_OK):
-                        llama_server_path = server_path
-                        logger.info(f"Found llama-server at: {llama_server_path}")
-                        break
-
-        if not llama_server_path:
-            logger.warning("llama-server not found, skipping model registration")
-            return
-
-        # Register each model with llama-swap (without binary path)
-        for model in models:
-            try:
-                # Create a basic config for the model
-                config = {
-                    "model": model.file_path,
-                    "host": "0.0.0.0",
-                    "ctx_size": 2048,
-                    "batch_size": 512,
-                    "threads": 4,
-                }
-
-                # Register with llama-swap (no binary path needed)
-                proxy_name = await llama_swap_manager.register_model(model, config)
-                logger.info(
-                    f"Registered model '{model.name}' as '{proxy_name}' with llama-swap"
+    store = get_store()
+    model_list = store.list_models()
+    if not model_list:
+        logger.info("No models found to register with llama-swap")
+        return
+
+    logger.info(f"Found {len(model_list)} models to register with llama-swap")
+
+    llama_server_path = None
+    for engine in ("llama_cpp", "ik_llama"):
+        active_version = store.get_active_engine_version(engine)
+        if active_version and active_version.get("binary_path"):
+            path = active_version["binary_path"]
+            if os.path.isabs(path) and os.path.exists(path):
+                llama_server_path = path
+            else:
+                abs_path = os.path.abspath(path)
+                if os.path.exists(abs_path):
+                    llama_server_path = abs_path
+            if llama_server_path:
+                logger.info(f"Using active {engine} version: {active_version.get('version')}")
+                break
+
+    if not llama_server_path:
+        llama_cpp_dir = "data/llama-cpp" if os.path.exists("data") else "/app/data/llama-cpp"
+        if os.path.exists(llama_cpp_dir):
+            for version_dir in os.listdir(llama_cpp_dir):
+                server_path = os.path.join(
+                    llama_cpp_dir, version_dir, "build", "bin", "llama-server"
                 )
+                if os.path.exists(server_path) and os.access(server_path, os.X_OK):
+                    llama_server_path = server_path
+                    logger.info(f"Found llama-server at: {llama_server_path}")
+                    break
+
+    if not llama_server_path:
+        logger.warning("llama-server not found, skipping model registration")
+        return
+
+    from backend.routes.models import _get_model_file_path
+    from backend.data_store import generate_proxy_name
+
+    for model in model_list:
+        file_path = _get_model_file_path(model)
+        if not file_path or not os.path.exists(file_path):
+            logger.debug(f"Model '{model.get('id')}' not found in HF cache, skipping")
+            continue
+        try:
+            proxy_name = generate_proxy_name(
+                model.get("huggingface_id", ""),
+                model.get("quantization"),
+            )
+            config = (model.get("config") or {}).copy()
+            config.setdefault("host", "0.0.0.0")
+            config.setdefault("ctx_size", 2048)
+            config.setdefault("batch_size", 512)
+            config.setdefault("threads", 4)
+            model_with_proxy = dict(model, proxy_name=proxy_name)
+            await llama_swap_manager.register_model(model_with_proxy, config)
+            logger.info(
+                f"Registered model '{model.get('display_name', model.get('id'))}' as '{proxy_name}' with llama-swap"
+            )
+        except Exception as e:
+            logger.error(f"Failed to register model '{model.get('id')}' with llama-swap: {e}")
 
-            except Exception as e:
-                logger.error(
-                    f"Failed to register model '{model.name}' with llama-swap: {e}"
-                )
-
-        # Generate config with the active version
-        await llama_swap_manager.regenerate_config_with_active_version()
-
-    finally:
-        db.close()
+    await llama_swap_manager.regenerate_config_with_active_version()
 
 
 @asynccontextmanager
@@ -171,7 +168,7 @@ async def lifespan(app: FastAPI):
 
     # Startup
     ensure_data_directories()
-    await init_db()
+    get_store()  # Ensure YAML config files exist
 
     huggingface_api_key = os.getenv("HUGGINGFACE_API_KEY")
     if huggingface_api_key:
@@ -182,15 +179,20 @@ async def lifespan(app: FastAPI):
 
     llama_swap_manager = get_llama_swap_manager()
 
-    from backend.database import SessionLocal, LlamaVersion, RunningInstance, Model
-
-    session = SessionLocal()
-    active_version = (
-        session.query(LlamaVersion).filter(LlamaVersion.is_active == True).first()
-    )
-    session.close()
-
-    if active_version and active_version.binary_path:
+    store = get_store()
+    active_version = None
+    for engine in ("llama_cpp", "ik_llama"):
+        v = store.get_active_engine_version(engine)
+        if v and v.get("binary_path"):
+            path = v["binary_path"]
+            if os.path.isabs(path) and os.path.exists(path):
+                active_version = v
+                break
+            if os.path.exists(os.path.abspath(path)):
+                active_version = v
+                break
+
+    if active_version and active_version.get("binary_path"):
         try:
             await llama_swap_manager.start_proxy()
             logger.info("llama-swap proxy started on port 2000")
@@ -203,64 +205,14 @@ async def lifespan(app: FastAPI):
             "Install or activate a llama.cpp build to enable multi-model serving."
         )
 
-    db = SessionLocal()
-    try:
-        stale_instances = db.query(RunningInstance).all()
-        if stale_instances:
-            logger.info(f"Cleaning {len(stale_instances)} stale instances")
-            for instance in stale_instances:
-                model = db.query(Model).filter(Model.id == instance.model_id).first()
-                if model:
-                    model.is_active = False
-                db.delete(instance)
-            db.commit()
-    finally:
-        db.close()
-
     try:
         await register_all_models_with_llama_swap()
     except Exception as e:
         logger.error(f"Failed to register models with llama-swap: {e}")
 
-    await unified_monitor.start_monitoring()
-
-    # Start background task for LMDeploy status and logs broadcasting
-    lmdeploy_broadcast_task = None
-    
-    async def broadcast_lmdeploy_updates():
-        """Periodically broadcast LMDeploy status and runtime logs."""
-        installer = get_lmdeploy_installer()
-        manager = get_lmdeploy_manager()
-        last_runtime_log_position = 0
-        
-        while True:
-            try:
-                # Broadcast status every 2 seconds
-                await installer._broadcast_status()
-                
-                # Broadcast new runtime log lines every 1 second
-                await manager._broadcast_runtime_logs()
-                
-                await asyncio.sleep(1)  # Check every 1 second
-            except Exception as e:
-                logger.debug(f"Error in LMDeploy broadcast task: {e}")
-                await asyncio.sleep(2)  # Wait longer on error
-    
-    lmdeploy_broadcast_task = asyncio.create_task(broadcast_lmdeploy_updates())
-    logger.info("Started LMDeploy WebSocket broadcasting task")
-
     yield
 
     # Shutdown
-    if lmdeploy_broadcast_task:
-        lmdeploy_broadcast_task.cancel()
-        try:
-            await lmdeploy_broadcast_task
-        except asyncio.CancelledError:
-            pass
-        logger.info("Stopped LMDeploy WebSocket broadcasting task")
-    
-    await unified_monitor.stop_monitoring()
 
     # Stop llama-swap (automatically stops all models)
     if llama_swap_manager:
@@ -282,9 +234,16 @@ async def broadcast_lmdeploy_updates():
 # CORS configuration via environment variables (safer defaults)
 # BACKEND_CORS_ORIGINS: comma-separated list of origins. Example: "http://localhost:5173,http://localhost:8080"
 # BACKEND_CORS_ALLOW_CREDENTIALS: "true"/"false" (default false; forced false when origins == ["*"])
-cors_origins_env = os.getenv("BACKEND_CORS_ORIGINS", "http://localhost:5173").strip()
+cors_origins_env = os.getenv(
+    "BACKEND_CORS_ORIGINS",
+    "http://localhost:5173,http://localhost:5174,http://localhost:5175,http://localhost:5176,http://localhost:8080",
+).strip()
 allow_origins = [o.strip() for o in cors_origins_env.split(",") if o.strip()] or [
-    "http://localhost:5173"
+    "http://localhost:5173",
+    "http://localhost:5174",
+    "http://localhost:5175",
+    "http://localhost:5176",
+    "http://localhost:8080",
 ]
 
 allow_credentials_env = (
@@ -302,8 +261,6 @@ async def broadcast_lmdeploy_updates():
     allow_headers=["*"],
 )
 
-# Use the global WebSocket manager instance
-
 # Include routers
 app.include_router(models.router, prefix="/api/models", tags=["models"])
 app.include_router(
@@ -316,42 +273,34 @@ async def broadcast_lmdeploy_updates():
 app.include_router(gpu_info.router, prefix="/api", tags=["gpu"])
 app.include_router(lmdeploy.router, prefix="/api", tags=["lmdeploy"])
 
-# Include monitoring routes
-from backend.routes import unified_monitoring
-
-app.include_router(unified_monitoring.router, prefix="/api", tags=["monitoring"])
-
-
-# WebSocket endpoint for real-time updates (must be before static file serving)
-@app.websocket("/ws")
-async def websocket_endpoint(websocket: WebSocket):
-    import json
-
-    try:
-        logger.info("New WebSocket connection attempt")
-        await websocket_manager.connect(websocket)
-        logger.info(
-            f"WebSocket connected successfully. Total connections: {len(websocket_manager.active_connections)}"
-        )
-
-        try:
-            while True:
-                # Keep connection alive and handle any incoming messages
-                data = await websocket.receive_text()
-                message = json.loads(data)
-
-                # Handle any client messages if needed
-                logger.debug(f"Received WebSocket message: {message}")
-
-        except WebSocketDisconnect:
-            logger.info("WebSocket disconnected by client")
-            websocket_manager.disconnect(websocket)
-        except Exception as e:
-            logger.error(f"WebSocket error: {e}")
-            websocket_manager.disconnect(websocket)
-    except Exception as e:
-        logger.error(f"Failed to establish WebSocket connection: {e}")
-        websocket_manager.disconnect(websocket)
+# SSE endpoint for progress tracking
+from backend.progress_manager import get_progress_manager
+from fastapi.responses import StreamingResponse
+
+
+@app.get("/api/events")
+async def sse_events(request: Request):
+    """Server-Sent Events endpoint for progress tracking."""
+    logger.info("SSE /api/events: client connected")
+    pm = get_progress_manager()
+
+    async def logged_stream():
+        first = True
+        async for chunk in pm.subscribe():
+            if first:
+                logger.info("SSE: sending first chunk to client")
+                first = False
+            yield chunk
+
+    return StreamingResponse(
+        logged_stream(),
+        media_type="text/event-stream",
+        headers={
+            "Cache-Control": "no-cache, no-store, must-revalidate",
+            "Connection": "keep-alive",
+            "X-Accel-Buffering": "no",  # Disable proxy buffering (nginx, etc.)
+        },
+    )
 
 
 # Serve static files (built frontend)
@@ -399,8 +348,8 @@ async def serve_favicon():
     # Catch-all route for Vue Router (must be after API routes)
     @app.get("/{full_path:path}")
     async def serve_spa(full_path: str):
-        # If it's an API route or WebSocket route, let it pass through
-        if full_path.startswith("api/") or full_path.startswith("ws"):
+        # If it's an API route, let it pass through
+        if full_path.startswith("api/"):
             return {"error": "Not found"}
 
         # Serve index.html for all other routes (Vue Router will handle routing)
@@ -455,9 +404,12 @@ async def serve_spa(full_path: str):
 
 
 if __name__ == "__main__":
-    # Enable hot reload in development (set RELOAD=true environment variable)
-    enable_reload = os.getenv("RELOAD", "false").lower() in ("true", "1", "yes")
-    reload_dirs = ["/app/backend"] if enable_reload else None
+    # Auto-reload in development: on by default when not in Docker; set RELOAD=false to disable
+    in_docker = os.path.exists("/app/data")
+    enable_reload = os.getenv("RELOAD", "true" if not in_docker else "false").lower() in ("true", "1", "yes")
+    # Watch the backend package directory (works when run from repo root with --app-dir backend)
+    backend_dir = os.path.abspath(os.path.dirname(__file__))
+    reload_dirs = [backend_dir] if enable_reload else None
 
     uvicorn.run(
         "main:app",
diff --git a/backend/param_registry.py b/backend/param_registry.py
new file mode 100644
index 0000000..9b8e5bb
--- /dev/null
+++ b/backend/param_registry.py
@@ -0,0 +1,117 @@
+"""
+Registry of model config parameters for llama.cpp (and optionally LMDeploy).
+Used by the frontend to render basic vs advanced settings from a single source of truth.
+"""
+
+import copy
+from typing import Any, Dict, List
+
+# Param entry: key, label, type ("int"|"float"|"bool"|"string"), default, min, max (optional), description (optional)
+ParamDef = Dict[str, Any]
+
+# Basic params shown by default (most common for chat/embedding)
+# Host and port are not included: they are managed by llama-swap (--port ${PORT}, host default 0.0.0.0)
+LLAMA_CPP_BASIC: List[ParamDef] = [
+    {"key": "ctx_size", "label": "Context size", "type": "int", "default": 2048, "min": 512, "max": 1_000_000, "description": "Maximum context length in tokens"},
+    {"key": "n_gpu_layers", "label": "GPU layers", "type": "int", "default": -1, "min": -1, "max": 1000, "description": "Number of layers to offload to GPU (-1 = all)"},
+    {"key": "batch_size", "label": "Batch size", "type": "int", "default": 512, "min": 1, "max": 2048, "description": "Batch size for prompt processing"},
+    {"key": "threads", "label": "Threads", "type": "int", "default": 4, "min": 1, "max": 64, "description": "Number of threads"},
+    {"key": "embedding", "label": "Embedding mode", "type": "bool", "default": False, "description": "Enable embedding-only mode"},
+]
+
+# Advanced params (shown in expandable "Advanced" section)
+LLAMA_CPP_ADVANCED: List[ParamDef] = [
+    {"key": "n_predict", "label": "Max tokens to predict", "type": "int", "default": -1, "min": -1, "max": 100_000},
+    {"key": "ubatch_size", "label": "Ubatch size", "type": "int", "default": 512, "min": 1, "max": 2048},
+    {"key": "temp", "label": "Temperature", "type": "float", "default": 0.8, "min": 0, "max": 2},
+    {"key": "top_k", "label": "Top K", "type": "int", "default": 40, "min": 0, "max": 1000},
+    {"key": "top_p", "label": "Top P", "type": "float", "default": 0.9, "min": 0, "max": 1},
+    {"key": "min_p", "label": "Min P", "type": "float", "default": 0.0, "min": 0, "max": 1},
+    {"key": "typical_p", "label": "Typical P", "type": "float", "default": 1.0, "min": 0, "max": 1},
+    {"key": "repeat_penalty", "label": "Repeat penalty", "type": "float", "default": 1.1, "min": 1, "max": 2},
+    {"key": "presence_penalty", "label": "Presence penalty", "type": "float", "default": 0, "min": -2, "max": 2},
+    {"key": "frequency_penalty", "label": "Frequency penalty", "type": "float", "default": 0, "min": -2, "max": 2},
+    {"key": "seed", "label": "Seed", "type": "int", "default": -1, "min": -1, "max": 2**31 - 1},
+    {"key": "threads_batch", "label": "Threads (batch)", "type": "int", "default": -1, "min": -1, "max": 64},
+    {"key": "parallel", "label": "Parallel", "type": "int", "default": 1, "min": 1, "max": 64},
+    {"key": "rope_freq_base", "label": "RoPE freq base", "type": "float", "default": 0, "min": 0},
+    {"key": "rope_freq_scale", "label": "RoPE freq scale", "type": "float", "default": 0, "min": 0},
+    {"key": "flash_attn", "label": "Flash attention", "type": "bool", "default": False},
+    {"key": "yarn_ext_factor", "label": "YaRN ext factor", "type": "float", "default": -1, "min": -1},
+    {"key": "yarn_attn_factor", "label": "YaRN attn factor", "type": "float", "default": 1, "min": 0},
+    {"key": "no_mmap", "label": "No mmap", "type": "bool", "default": False},
+    {"key": "mlock", "label": "MLock", "type": "bool", "default": False},
+    {"key": "low_vram", "label": "Low VRAM", "type": "bool", "default": False},
+    {"key": "logits_all", "label": "Logits all", "type": "bool", "default": False},
+    {"key": "cont_batching", "label": "Continuous batching", "type": "bool", "default": True},
+    {"key": "no_kv_offload", "label": "No KV offload", "type": "bool", "default": False},
+    {"key": "tensor_split", "label": "Tensor split", "type": "string", "default": ""},
+    {"key": "main_gpu", "label": "Main GPU", "type": "int", "default": 0, "min": 0},
+    {"key": "split_mode", "label": "Split mode", "type": "string", "default": ""},
+    {"key": "cache_type_k", "label": "Cache type K", "type": "string", "default": ""},
+    {"key": "cache_type_v", "label": "Cache type V", "type": "string", "default": ""},
+    {"key": "grammar", "label": "Grammar", "type": "string", "default": ""},
+    {"key": "json_schema", "label": "JSON schema", "type": "string", "default": ""},
+    {"key": "cpu_moe", "label": "CPU MoE", "type": "bool", "default": False},
+    {"key": "n_cpu_moe", "label": "N CPU MoE", "type": "int", "default": 0, "min": 0},
+    {"key": "override_tensor", "label": "Override tensor", "type": "string", "default": ""},
+    {"key": "rope_scaling", "label": "RoPE scaling", "type": "string", "default": ""},
+    {"key": "mirostat", "label": "Mirostat", "type": "int", "default": 0, "min": 0, "max": 2},
+    {"key": "mirostat_tau", "label": "Mirostat tau", "type": "float", "default": 5.0, "min": 0},
+    {"key": "mirostat_eta", "label": "Mirostat eta", "type": "float", "default": 0.1, "min": 0},
+]
+
+# ik_llama.cpp: same as llama_cpp plus these extras (and different mirostat flag names)
+IK_LLAMA_EXTRA: List[ParamDef] = [
+    {"key": "mla_attn", "label": "MLA attention", "type": "bool", "default": False, "description": "Enable MLA attention"},
+    {"key": "attn_max_batch", "label": "Attention max batch", "type": "int", "default": 0, "min": 0, "description": "Max attention batch size"},
+    {"key": "fused_moe", "label": "Fused MoE", "type": "bool", "default": True, "description": "Enable fused MoE"},
+    {"key": "smart_expert_reduction", "label": "Smart expert reduction", "type": "bool", "default": False, "description": "Enable smart expert reduction"},
+]
+
+# LMDeploy (safetensors / TurboMind)
+LMDEPLOY_BASIC: List[ParamDef] = [
+    {"key": "session_len", "label": "Session length", "type": "int", "default": 2048, "min": 512, "max": 1_000_000, "description": "Maximum session length"},
+    {"key": "max_batch_size", "label": "Max batch size", "type": "int", "default": 128, "min": 1, "max": 1024, "description": "Maximum batch size"},
+    {"key": "tensor_parallel", "label": "Tensor parallel", "type": "int", "default": 1, "min": 1, "max": 8, "description": "Tensor parallelism degree"},
+]
+LMDEPLOY_ADVANCED: List[ParamDef] = [
+    {"key": "dtype", "label": "Dtype", "type": "string", "default": "auto", "description": "Model dtype (auto, float16, bfloat16)"},
+    {"key": "quant_policy", "label": "Quantization policy", "type": "int", "default": 0, "min": 0, "max": 8, "description": "KV cache quantization (0=off, 4=4bit, 8=8bit)"},
+    {"key": "enable_prefix_caching", "label": "Prefix caching", "type": "bool", "default": False, "description": "Enable prefix caching"},
+    {"key": "chat_template", "label": "Chat template", "type": "string", "default": "", "description": "Override chat template"},
+]
+
+
+def get_llama_cpp_param_registry() -> Dict[str, List[ParamDef]]:
+    """Return basic and advanced param definitions for llama.cpp config forms."""
+    return {
+        "basic": LLAMA_CPP_BASIC,
+        "advanced": LLAMA_CPP_ADVANCED,
+    }
+
+
+def get_ik_llama_param_registry() -> Dict[str, List[ParamDef]]:
+    """Return param definitions for ik_llama.cpp (llama_cpp params plus ik_llama extras)."""
+    basic = copy.deepcopy(LLAMA_CPP_BASIC)
+    advanced = copy.deepcopy(LLAMA_CPP_ADVANCED) + copy.deepcopy(IK_LLAMA_EXTRA)
+    return {"basic": basic, "advanced": advanced}
+
+
+def get_lmdeploy_param_registry() -> Dict[str, List[ParamDef]]:
+    """Return param definitions for LMDeploy (safetensors / TurboMind)."""
+    return {
+        "basic": LMDEPLOY_BASIC,
+        "advanced": LMDEPLOY_ADVANCED,
+    }
+
+
+def get_param_registry(engine: str = "llama_cpp") -> Dict[str, List[ParamDef]]:
+    """Return param registry for the given engine."""
+    if engine == "llama_cpp":
+        return get_llama_cpp_param_registry()
+    if engine == "ik_llama":
+        return get_ik_llama_param_registry()
+    if engine == "lmdeploy":
+        return get_lmdeploy_param_registry()
+    return {"basic": [], "advanced": []}
diff --git a/backend/presets.py b/backend/presets.py
deleted file mode 100644
index 517a9cf..0000000
--- a/backend/presets.py
+++ /dev/null
@@ -1,170 +0,0 @@
-from typing import Dict, Any, Tuple
-import os
-
-from backend.gguf_reader import get_model_layer_info
-from backend.logging_config import get_logger
-
-logger = get_logger(__name__)
-
-
-def _detect_architecture_from_name(model_name: str) -> str:
-    """Detect model architecture from model name"""
-    name = (model_name or "").lower()
-
-    if "llama" in name:
-        if "codellama" in name:
-            return "codellama"
-        elif "llama3" in name or "llama-3" in name:
-            return "llama3"
-        elif "llama2" in name or "llama-2" in name:
-            return "llama2"
-        return "llama"
-    elif "mistral" in name:
-        return "mistral"
-    elif "phi" in name:
-        return "phi"
-    elif "glm" in name or "chatglm" in name:
-        if "glm-4" in name or "glm4" in name:
-            return "glm4"
-        return "glm"
-    elif "deepseek" in name:
-        if "v3" in name or "v3.1" in name:
-            return "deepseek-v3"
-        return "deepseek"
-    elif "qwen" in name:
-        if "qwen3" in name or "qwen-3" in name:
-            return "qwen3"
-        elif "qwen2" in name or "qwen-2" in name:
-            return "qwen2"
-        return "qwen"
-    elif "gemma" in name:
-        if "gemma3" in name or "gemma-3" in name:
-            return "gemma3"
-        return "gemma"
-
-    return "unknown"
-
-
-def get_architecture_and_presets(model) -> Tuple[str, Dict[str, Dict[str, Any]]]:
-    """
-    Source of truth for presets. Returns (architecture, presets dict).
-    Presets include keys like temp, top_p, top_k, repeat_penalty.
-    """
-    # Import normalize_architecture from model_metadata to ensure consistency
-    from backend.smart_auto.architecture_config import (
-        normalize_architecture,
-        detect_architecture_from_name,
-    )
-
-    # Try GGUF metadata
-    architecture = "unknown"
-    try:
-        if model.file_path and os.path.exists(model.file_path):
-            layer_info = get_model_layer_info(model.file_path)
-            if layer_info:
-                raw_architecture = layer_info.get("architecture", "")
-                architecture = normalize_architecture(raw_architecture)
-                if architecture != "unknown" and raw_architecture != architecture:
-                    logger.debug(
-                        f"Normalized architecture for presets: '{raw_architecture}' -> '{architecture}'"
-                    )
-    except Exception as e:
-        logger.warning(f"Failed to get layer info for presets: {e}")
-
-    # Fallback to name-based detection if architecture is still unknown or empty
-    if not architecture or architecture == "unknown":
-        architecture = detect_architecture_from_name(model.name)
-        if architecture != "unknown":
-            logger.debug(
-                f"Detected architecture from name for presets: '{architecture}'"
-            )
-
-    # Defaults
-    presets: Dict[str, Dict[str, Any]] = {
-        "coding": {},
-        "conversational": {},
-    }
-
-    model_lower = (model.name or "").lower()
-    is_coding_model = "code" in model_lower or architecture in ["codellama", "deepseek"]
-
-    if architecture in ["glm", "glm4"]:
-        presets["coding"] = {
-            "temp": 1.0,
-            "top_p": 0.95,
-            "top_k": 40,
-            "repeat_penalty": 1.05,
-        }
-        presets["conversational"] = {
-            "temp": 1.0,
-            "top_p": 0.95,
-            "top_k": 40,
-            "repeat_penalty": 1.1,
-        }
-    elif architecture in ["deepseek", "deepseek-v3"]:
-        presets["coding"] = {
-            "temp": 1.0,
-            "top_p": 0.95,
-            "top_k": 40,
-            "repeat_penalty": 1.05,
-        }
-        presets["conversational"] = {
-            "temp": 0.9,
-            "top_p": 0.95,
-            "top_k": 40,
-            "repeat_penalty": 1.1,
-        }
-    elif architecture in ["qwen", "qwen2", "qwen3"]:
-        presets["coding"] = {
-            "temp": 0.7,
-            "top_p": 0.8,
-            "top_k": 20,
-            "repeat_penalty": 1.05,
-        }
-        presets["conversational"] = {
-            "temp": 0.7,
-            "top_p": 0.8,
-            "top_k": 20,
-            "repeat_penalty": 1.05,
-        }
-    elif architecture in ["gemma", "gemma3"]:
-        presets["coding"] = {
-            "temp": 0.9,
-            "top_p": 0.95,
-            "top_k": 40,
-            "repeat_penalty": 1.05,
-        }
-        presets["conversational"] = {
-            "temp": 0.9,
-            "top_p": 0.95,
-            "top_k": 40,
-            "repeat_penalty": 1.1,
-        }
-    elif is_coding_model:
-        presets["coding"] = {
-            "temp": 0.1,
-            "top_p": 0.95,
-            "top_k": 40,
-            "repeat_penalty": 1.05,
-        }
-        presets["conversational"] = {
-            "temp": 0.7,
-            "top_p": 0.95,
-            "top_k": 40,
-            "repeat_penalty": 1.1,
-        }
-    else:
-        presets["coding"] = {
-            "temp": 0.7,
-            "top_p": 0.95,
-            "top_k": 40,
-            "repeat_penalty": 1.1,
-        }
-        presets["conversational"] = {
-            "temp": 0.8,
-            "top_p": 0.95,
-            "top_k": 40,
-            "repeat_penalty": 1.1,
-        }
-
-    return architecture, presets
diff --git a/backend/progress_manager.py b/backend/progress_manager.py
new file mode 100644
index 0000000..46269e4
--- /dev/null
+++ b/backend/progress_manager.py
@@ -0,0 +1,236 @@
+"""SSE-based progress tracking."""
+
+import asyncio
+import json
+import time
+import uuid
+from datetime import datetime
+from typing import Any, AsyncGenerator, Dict, List, Optional
+
+
+class ProgressManager:
+    """In-memory task tracker with SSE streaming."""
+
+    def __init__(self):
+        self._tasks: Dict[str, dict] = {}
+        self._subscribers: list[asyncio.Queue] = []
+
+    def create_task(
+        self,
+        task_type: str,
+        description: str,
+        metadata: Optional[dict] = None,
+        task_id: Optional[str] = None,
+    ) -> str:
+        """Create a new tracked task. Returns task_id (uses provided task_id if given)."""
+        task_id = task_id or str(uuid.uuid4())[:8]
+        self._tasks[task_id] = {
+            "task_id": task_id,
+            "type": task_type,
+            "description": description,
+            "progress": 0.0,
+            "status": "running",
+            "message": "",
+            "metadata": metadata or {},
+            "created_at": time.time(),
+        }
+        self._broadcast({"event": "task_created", "data": self._tasks[task_id]})
+        return task_id
+
+    def update_task(
+        self,
+        task_id: str,
+        progress: Optional[float] = None,
+        message: Optional[str] = None,
+        status: Optional[str] = None,
+        metadata_update: Optional[dict] = None,
+    ):
+        """Update a task's progress/status."""
+        task = self._tasks.get(task_id)
+        if not task:
+            return
+        if progress is not None:
+            task["progress"] = min(100.0, max(0.0, progress))
+        if message is not None:
+            task["message"] = message
+        if status is not None:
+            task["status"] = status
+        if metadata_update:
+            task["metadata"].update(metadata_update)
+        self._broadcast({"event": "task_updated", "data": task})
+
+    def complete_task(self, task_id: str, message: str = "Done"):
+        self.update_task(task_id, progress=100.0, status="completed", message=message)
+
+    def fail_task(self, task_id: str, error: str):
+        self.update_task(task_id, status="failed", message=error)
+
+    def get_task(self, task_id: str) -> Optional[dict]:
+        return self._tasks.get(task_id)
+
+    def get_active_tasks(self) -> list:
+        return [t for t in self._tasks.values() if t["status"] == "running"]
+
+    def _broadcast(self, event: dict):
+        dead = []
+        for q in self._subscribers:
+            try:
+                q.put_nowait(event)
+            except asyncio.QueueFull:
+                dead.append(q)
+        for q in dead:
+            self._subscribers.remove(q)
+
+    def emit(self, event_type: str, data: Any):
+        """Emit a generic event (e.g. log, notification, model_status) to SSE subscribers."""
+        self._broadcast({"event": event_type, "data": data})
+
+    @property
+    def active_connections(self) -> List:
+        """SSE has no persistent connection list; returns empty for compatibility."""
+        return []
+
+    async def send_download_progress(
+        self,
+        task_id: str,
+        progress: int,
+        message: str = "",
+        bytes_downloaded: int = 0,
+        total_bytes: int = 0,
+        speed_mbps: float = 0,
+        eta_seconds: int = 0,
+        filename: str = "",
+        model_format: str = "gguf",
+        files_completed: int = None,
+        files_total: int = None,
+        current_filename: str = None,
+        huggingface_id: str = None,
+        **kwargs,
+    ):
+        self.update_task(
+            task_id,
+            progress=float(progress),
+            message=message or filename,
+            metadata_update=kwargs,
+        )
+        self.emit(
+            "download_progress",
+            {
+                "task_id": task_id,
+                "progress": progress,
+                "message": message,
+                "bytes_downloaded": bytes_downloaded,
+                "total_bytes": total_bytes,
+                "speed_mbps": speed_mbps,
+                "eta_seconds": eta_seconds,
+                "filename": filename,
+                "model_format": model_format,
+                "files_completed": files_completed,
+                "files_total": files_total,
+                "current_filename": current_filename or filename,
+                "huggingface_id": huggingface_id,
+                "timestamp": datetime.utcnow().isoformat(),
+                **kwargs,
+            },
+        )
+
+    async def broadcast(self, message: dict):
+        msg_type = message.get("type", "broadcast")
+        self.emit(msg_type, message)
+
+    async def send_model_status_update(
+        self, model_id: Any, status: str, details: dict = None
+    ):
+        self.emit(
+            "model_status",
+            {
+                "model_id": model_id,
+                "status": status,
+                "details": details or {},
+                "timestamp": datetime.utcnow().isoformat(),
+            },
+        )
+
+    async def send_notification(
+        self,
+        title: str = "",
+        message: str = "",
+        type: str = "info",
+        actions: List[dict] = None,
+        *args,
+        **kwargs,
+    ):
+        # Support (title, message, type) keyword and (type, title, message, task_id) positional
+        if args and len(args) >= 3:
+            type, title, message = args[0], args[1], args[2]
+        else:
+            type = kwargs.get("type", type)
+            title = kwargs.get("title", title)
+            message = kwargs.get("message", message)
+        self.emit(
+            "notification",
+            {
+                "title": title,
+                "message": message,
+                "type": type,
+                "notification_type": type,
+                "actions": actions or [],
+                "timestamp": datetime.utcnow().isoformat(),
+                **{k: v for k, v in kwargs.items() if k not in ("title", "message", "type", "actions")},
+            },
+        )
+
+    async def send_build_progress(
+        self,
+        task_id: str,
+        stage: str,
+        progress: int,
+        message: str = "",
+        log_lines: List[str] = None,
+    ):
+        self.update_task(
+            task_id,
+            progress=float(progress),
+            message=message,
+            metadata_update={"stage": stage, "log_lines": log_lines or []},
+        )
+        self.emit(
+            "build_progress",
+            {
+                "task_id": task_id,
+                "stage": stage,
+                "progress": progress,
+                "message": message,
+                "log_lines": log_lines or [],
+                "timestamp": datetime.utcnow().isoformat(),
+            },
+        )
+
+    async def subscribe(self) -> AsyncGenerator[str, None]:
+        """Yields SSE-formatted strings. Sends an initial comment so the client connection opens."""
+        queue: asyncio.Queue = asyncio.Queue(maxsize=100)
+        self._subscribers.append(queue)
+        try:
+            # Send immediate heartbeat so EventSource receives data and fires onopen
+            yield ": heartbeat\n\n"
+            await asyncio.sleep(0)  # Allow first chunk to be flushed to the client
+            for task in self.get_active_tasks():
+                yield f"event: task_updated\ndata: {json.dumps(task)}\n\n"
+            while True:
+                event = await queue.get()
+                yield f"event: {event['event']}\ndata: {json.dumps(event['data'])}\n\n"
+        except asyncio.CancelledError:
+            pass
+        finally:
+            if queue in self._subscribers:
+                self._subscribers.remove(queue)
+
+
+_progress_manager: Optional[ProgressManager] = None
+
+
+def get_progress_manager() -> ProgressManager:
+    global _progress_manager
+    if _progress_manager is None:
+        _progress_manager = ProgressManager()
+    return _progress_manager
diff --git a/backend/routes/llama_version_manager.py b/backend/routes/llama_version_manager.py
index 47e3bd5..6b9ee9a 100644
--- a/backend/routes/llama_version_manager.py
+++ b/backend/routes/llama_version_manager.py
@@ -1,13 +1,10 @@
-from fastapi import APIRouter, Depends, HTTPException, BackgroundTasks
-from sqlalchemy.orm import Session
-from typing import List, Dict, Any
+from fastapi import APIRouter, HTTPException
 import os
 import shutil
 import stat
 import time
-from datetime import datetime
 
-from backend.database import get_db, LlamaVersion, Model
+from backend.data_store import get_store
 from backend.logging_config import get_logger
 
 logger = get_logger(__name__)
@@ -15,7 +12,6 @@
 
 
 def _remove_readonly(func, path, exc):
-    """Helper function to handle readonly files on Windows"""
     try:
         os.chmod(path, stat.S_IWRITE)
         func(path)
@@ -24,198 +20,140 @@ def _remove_readonly(func, path, exc):
 
 
 def _robust_rmtree(path: str, max_retries: int = 3) -> None:
-    """Robustly remove a directory tree, handling Windows file locks"""
     if not os.path.exists(path):
         return
-
     for attempt in range(max_retries):
         try:
-            # Use onerror callback to handle readonly files (common on Windows)
             shutil.rmtree(path, onerror=_remove_readonly)
             logger.info(f"Successfully deleted directory: {path}")
             return
-        except PermissionError as e:
-            if attempt < max_retries - 1:
-                logger.warning(
-                    f"Permission error deleting {path}, attempt {attempt + 1}/{max_retries}: {e}"
-                )
-                time.sleep(0.5)  # Wait a bit before retrying
-            else:
-                logger.error(
-                    f"Failed to delete {path} after {max_retries} attempts: {e}"
-                )
-                raise
-        except OSError as e:
+        except (PermissionError, OSError) as e:
             if attempt < max_retries - 1:
-                logger.warning(
-                    f"OS error deleting {path}, attempt {attempt + 1}/{max_retries}: {e}"
-                )
                 time.sleep(0.5)
             else:
-                logger.error(
-                    f"Failed to delete {path} after {max_retries} attempts: {e}"
-                )
+                logger.error(f"Failed to delete {path} after {max_retries} attempts: {e}")
                 raise
 
 
-@router.get("/llama-versions")
-async def list_llama_versions(db: Session = Depends(get_db)):
-    """List all installed llama-cpp versions"""
-    versions = db.query(LlamaVersion).all()
-
-    # Also scan the filesystem for any versions not in the database
-    llama_cpp_dir = (
-        "data/llama-cpp" if os.path.exists("data") else "/app/data/llama-cpp"
-    )
-    if os.path.exists(llama_cpp_dir):
-        for version_dir in os.listdir(llama_cpp_dir):
-            if os.path.isdir(os.path.join(llama_cpp_dir, version_dir)):
-                # Check if this version is already in the database
-                existing_version = (
-                    db.query(LlamaVersion)
-                    .filter(LlamaVersion.version == version_dir)
-                    .first()
-                )
-                if not existing_version:
-                    # Add to database
-                    binary_path = os.path.join(
-                        llama_cpp_dir, version_dir, "build", "bin", "llama-server"
-                    )
-                    if os.path.exists(binary_path):
-                        new_version = LlamaVersion(
-                            version=version_dir,
-                            install_type="source",
-                            source_commit=version_dir,
-                            is_active=False,
-                            binary_path=binary_path,
-                        )
-                        db.add(new_version)
-                        db.commit()
-                        logger.info(
-                            f"Added llama-cpp version {version_dir} to database"
-                        )
-
-    # Refresh the list
-    versions = db.query(LlamaVersion).all()
+def _resolve_binary_path(binary_path: str) -> str:
+    if not binary_path:
+        return ""
+    if os.path.isabs(binary_path):
+        return binary_path
+    return os.path.join("/app", binary_path)
 
-    return {
-        "versions": [
-            {
-                "id": v.id,
-                "version": v.version,
-                "install_type": v.install_type,
-                "source_commit": v.source_commit,
-                "is_active": v.is_active,
-                "installed_at": v.installed_at.isoformat() if v.installed_at else None,
-                "binary_path": v.binary_path,
-                "exists": os.path.exists(v.binary_path) if v.binary_path else False,
-            }
-            for v in versions
-        ]
-    }
+
+@router.get("/llama-versions")
+async def list_llama_versions():
+    """List all installed llama-cpp versions (llama_cpp engine)."""
+    store = get_store()
+    versions = store.get_engine_versions("llama_cpp")
+    result = []
+    for i, v in enumerate(versions):
+        binary_path = _resolve_binary_path(v.get("binary_path"))
+        result.append({
+            "id": i,
+            "version": v.get("version"),
+            "install_type": v.get("type", "source"),
+            "source_commit": v.get("source_commit"),
+            "is_active": store.get_active_engine_version("llama_cpp") and store.get_active_engine_version("llama_cpp").get("version") == v.get("version"),
+            "installed_at": v.get("installed_at"),
+            "binary_path": v.get("binary_path"),
+            "exists": os.path.exists(binary_path) if binary_path else False,
+        })
+    return {"versions": result}
 
 
 @router.post("/llama-versions/{version_id}/activate")
-async def activate_llama_version(version_id: int, db: Session = Depends(get_db)):
-    """Activate a specific llama-cpp version"""
-    # Deactivate all versions first
-    db.query(LlamaVersion).update({"is_active": False})
-
-    # Activate the selected version
-    version = db.query(LlamaVersion).filter(LlamaVersion.id == version_id).first()
-    if not version:
+async def activate_llama_version(version_id: str):
+    """Activate a specific llama-cpp version (version_id can be index, version string, or "llama_cpp:version")."""
+    store = get_store()
+    versions = store.get_engine_versions("llama_cpp")
+    # Frontend may send id from list endpoint: "llama_cpp:version_str"
+    lookup_id = version_id
+    if ":" in str(version_id):
+        parts = str(version_id).split(":", 1)
+        if parts[0] == "llama_cpp":
+            lookup_id = parts[1]
+    version_entry = None
+    try:
+        idx = int(lookup_id)
+        if 0 <= idx < len(versions):
+            version_entry = versions[idx]
+    except ValueError:
+        pass
+    if not version_entry:
+        version_entry = next((v for v in versions if str(v.get("version")) == str(lookup_id)), None)
+    if not version_entry:
         raise HTTPException(status_code=404, detail="Version not found")
-
-    if not os.path.exists(version.binary_path):
+    binary_path = _resolve_binary_path(version_entry.get("binary_path"))
+    if not os.path.exists(binary_path):
         raise HTTPException(status_code=400, detail="Binary file does not exist")
-
-    version.is_active = True
-    db.commit()
-
-    # Ensure binary path is correct for the newly activated version
+    version_str = str(version_entry.get("version"))
+    store.set_active_engine_version("llama_cpp", version_str)
     try:
         from backend.llama_swap_manager import get_llama_swap_manager
-
         llama_swap_manager = get_llama_swap_manager()
-
-        # Check and fix binary path if needed
         await llama_swap_manager._ensure_correct_binary_path()
-        logger.info(f"Binary path verified for activated version: {version.version}")
-
-        # Regenerate llama-swap configuration with new binary path
-        # This will also ensure llama-swap is started
         await llama_swap_manager.regenerate_config_with_active_version()
-
-        logger.info(
-            f"Regenerated llama-swap config with new active version: {version.version}"
-        )
-        
-        # Explicitly ensure llama-swap is running after activation
         try:
             await llama_swap_manager.start_proxy()
-            logger.info("Ensured llama-swap is running after version activation")
         except Exception as e:
             logger.warning(f"Failed to start llama-swap after version activation: {e}")
     except Exception as e:
         logger.error(f"Failed to regenerate llama-swap config: {e}")
-        # Don't fail the activation if config regeneration fails
-
-    logger.info(f"Activated llama-cpp version: {version.version}")
-    return {"message": f"Activated llama-cpp version {version.version}"}
+    logger.info(f"Activated llama-cpp version: {version_str}")
+    return {"message": f"Activated llama-cpp version {version_str}"}
 
 
 @router.delete("/llama-versions/{version_id}")
-async def delete_llama_version(version_id: int, db: Session = Depends(get_db)):
-    """Delete a llama-cpp version"""
-    version = db.query(LlamaVersion).filter(LlamaVersion.id == version_id).first()
-    if not version:
+async def delete_llama_version(version_id: str):
+    """Delete a llama-cpp version (version_id can be index or version string)."""
+    store = get_store()
+    versions = store.get_engine_versions("llama_cpp")
+    version_entry = None
+    try:
+        idx = int(version_id)
+        if 0 <= idx < len(versions):
+            version_entry = versions[idx]
+    except ValueError:
+        pass
+    if not version_entry:
+        version_entry = next((v for v in versions if str(v.get("version")) == str(version_id)), None)
+    if not version_entry:
         raise HTTPException(status_code=404, detail="Version not found")
-
-    if version.is_active:
+    version_str = str(version_entry.get("version"))
+    active = store.get_active_engine_version("llama_cpp")
+    if active and str(active.get("version")) == version_str:
         raise HTTPException(status_code=400, detail="Cannot delete active version")
-
-    # Delete the directory
-    version_dir = os.path.dirname(
-        os.path.dirname(version.binary_path)
-    )  # Go up from build/bin/llama-server
-    if os.path.exists(version_dir):
+    binary_path = _resolve_binary_path(version_entry.get("binary_path"))
+    version_dir = os.path.dirname(os.path.dirname(binary_path)) if binary_path else None
+    if version_dir and os.path.exists(version_dir):
         try:
             _robust_rmtree(version_dir)
         except Exception as e:
             logger.error(f"Failed to delete directory {version_dir}: {e}")
-            raise HTTPException(
-                status_code=500, detail=f"Failed to delete directory: {e}"
-            )
-
-    # Remove from database
-    db.delete(version)
-    db.commit()
-
-    logger.info(f"Deleted llama-cpp version: {version.version}")
-    return {"message": f"Deleted llama-cpp version {version.version}"}
+            raise HTTPException(status_code=500, detail=f"Failed to delete directory: {e}")
+    store.delete_engine_version("llama_cpp", version_str)
+    logger.info(f"Deleted llama-cpp version: {version_str}")
+    return {"message": f"Deleted llama-cpp version {version_str}"}
 
 
 @router.get("/llama-versions/active")
-async def get_active_llama_version(db: Session = Depends(get_db)):
-    """Get the currently active llama-cpp version"""
-    active_version = (
-        db.query(LlamaVersion).filter(LlamaVersion.is_active == True).first()
-    )
-
+async def get_active_llama_version():
+    """Get the currently active llama-cpp version."""
+    store = get_store()
+    active_version = store.get_active_engine_version("llama_cpp")
     if not active_version:
         return {"active_version": None}
-
+    binary_path = _resolve_binary_path(active_version.get("binary_path"))
     return {
         "active_version": {
-            "id": active_version.id,
-            "version": active_version.version,
-            "install_type": active_version.install_type,
-            "source_commit": active_version.source_commit,
-            "binary_path": active_version.binary_path,
-            "exists": (
-                os.path.exists(active_version.binary_path)
-                if active_version.binary_path
-                else False
-            ),
+            "id": 0,
+            "version": active_version.get("version"),
+            "install_type": active_version.get("type"),
+            "source_commit": active_version.get("source_commit"),
+            "binary_path": active_version.get("binary_path"),
+            "exists": os.path.exists(binary_path) if binary_path else False,
         }
     }
diff --git a/backend/routes/llama_versions.py b/backend/routes/llama_versions.py
index 4f296c8..4941c5c 100644
--- a/backend/routes/llama_versions.py
+++ b/backend/routes/llama_versions.py
@@ -1,6 +1,6 @@
-from fastapi import APIRouter, Depends, HTTPException, BackgroundTasks
-from sqlalchemy.orm import Session
+from fastapi import APIRouter, HTTPException, Body
 from typing import List, Optional
+import asyncio
 import json
 import os
 import subprocess
@@ -11,9 +11,9 @@
 import stat
 from datetime import datetime
 
-from backend.database import get_db, LlamaVersion
+from backend.data_store import get_store
 from backend.llama_manager import LlamaManager, BuildConfig
-from backend.websocket_manager import websocket_manager
+from backend.progress_manager import get_progress_manager
 from backend.logging_config import get_logger
 from backend.gpu_detector import get_gpu_info, detect_build_capabilities
 from backend.cuda_installer import get_cuda_installer
@@ -69,62 +69,77 @@ def _robust_rmtree(path: str, max_retries: int = 3) -> None:
 
 @router.get("")
 @router.get("/")
-async def list_llama_versions(db: Session = Depends(get_db)):
-    """List all installed llama.cpp versions"""
-    versions = db.query(LlamaVersion).all()
-    return [
-        {
-            "id": version.id,
-            "version": version.version,
-            "install_type": version.install_type,
-            "binary_path": version.binary_path,
-            "source_commit": version.source_commit,
-            "patches": json.loads(version.patches) if version.patches else [],
-            "installed_at": version.installed_at,
-            "is_active": version.is_active,
-            "build_config": version.build_config,
-            "repository_source": version.repository_source or "llama.cpp",
-        }
-        for version in versions
-    ]
+async def list_llama_versions():
+    """List all installed llama.cpp and ik_llama versions"""
+    store = get_store()
+    result = []
+    for engine, repo_label in [("llama_cpp", "llama.cpp"), ("ik_llama", "ik_llama.cpp")]:
+        active = store.get_active_engine_version(engine)
+        active_version = active.get("version") if active else None
+        for i, v in enumerate(store.get_engine_versions(engine)):
+            version_str = v.get("version")
+            result.append({
+                "id": f"{engine}:{version_str}",
+                "version": version_str,
+                "install_type": v.get("type", "source"),
+                "binary_path": v.get("binary_path"),
+                "source_commit": v.get("source_commit"),
+                "patches": [],  # No longer storing patches in YAML
+                "installed_at": v.get("installed_at"),
+                "is_active": v.get("version") == active_version,
+                "build_config": v.get("build_config"),
+                "repository_source": v.get("repository_source") or repo_label,
+            })
+    return result
 
 
 @router.get("/check-updates")
-async def check_updates():
-    """Check for llama.cpp updates (both releases and source)"""
+async def check_updates(source: str | None = None):
+    """Check for llama.cpp or ik_llama.cpp updates (releases and/or source).
+    source: None or 'llama_cpp' for ggerganov/llama.cpp; 'ik_llama' for ikawrakow/ik_llama.cpp.
+    """
     try:
-        # Use the original URLs with redirect handling
-        releases_url = "https://api.github.com/repos/ggerganov/llama.cpp/releases"
-        commits_url = (
-            "https://api.github.com/repos/ggerganov/llama.cpp/commits?per_page=1"
-        )
-
-        # Check GitHub releases
-        releases_response = requests.get(releases_url, allow_redirects=True)
-        releases_response.raise_for_status()
-        releases = releases_response.json()
-
-        latest_release = releases[0] if releases else None
+        is_ik = source == "ik_llama"
+        if is_ik:
+            commits_url = (
+                "https://api.github.com/repos/ikawrakow/ik_llama.cpp/commits?per_page=1"
+            )
+            latest_release = None
+        else:
+            # ai-dock/llama.cpp-cuda: pre-built releases with CUDA support
+            releases_url = "https://api.github.com/repos/ai-dock/llama.cpp-cuda/releases"
+            commits_url = (
+                "https://api.github.com/repos/ggerganov/llama.cpp/commits?per_page=1"
+            )
+            releases_response = requests.get(releases_url, allow_redirects=True)
+            releases_response.raise_for_status()
+            releases = releases_response.json()
+            latest_release = releases[0] if releases else None
 
-        # Check latest commit from main branch
         commits_response = requests.get(commits_url, allow_redirects=True)
         commits_response.raise_for_status()
         commits = commits_response.json()
         latest_commit = commits[0] if commits else None
 
         return {
-            "latest_release": {
-                "tag_name": latest_release["tag_name"] if latest_release else None,
-                "published_at": (
-                    latest_release["published_at"] if latest_release else None
-                ),
-                "html_url": latest_release["html_url"] if latest_release else None,
-            },
-            "latest_commit": {
-                "sha": latest_commit["sha"],
-                "commit_date": latest_commit["commit"]["committer"]["date"],
-                "message": latest_commit["commit"]["message"],
-            },
+            "latest_release": (
+                {
+                    "tag_name": latest_release["tag_name"],
+                    "published_at": latest_release["published_at"],
+                    "html_url": latest_release["html_url"],
+                }
+                if latest_release
+                else None
+            ),
+            "latest_commit": (
+                {
+                    "sha": latest_commit["sha"],
+                    "commit_date": latest_commit["commit"]["committer"]["date"],
+                    "message": latest_commit["commit"]["message"],
+                }
+                if latest_commit
+                else None
+            ),
         }
     except requests.exceptions.HTTPError as e:
         if e.response.status_code == 403:
@@ -199,10 +214,8 @@ async def get_build_capabilities_endpoint():
 
 
 @router.post("/install-release")
-async def install_release(
-    request: dict, background_tasks: BackgroundTasks, db: Session = Depends(get_db)
-):
-    """Install llama.cpp from GitHub release"""
+async def install_release(request: dict):
+    """Install llama.cpp from ai-dock/llama.cpp-cuda release (CUDA builds)."""
     try:
         tag_name = request.get("tag_name")
         if not tag_name:
@@ -241,17 +254,12 @@ async def install_release(
 
         version_name = preview.get("version_name")
 
-        # Check if version already exists
-        if version_name:
-            existing = (
-                db.query(LlamaVersion)
-                .filter(LlamaVersion.version == version_name)
-                .first()
-            )
-        else:
-            existing = (
-                db.query(LlamaVersion).filter(LlamaVersion.version == tag_name).first()
-            )
+        store = get_store()
+        existing_versions = store.get_engine_versions("llama_cpp")
+        existing = next(
+            (v for v in existing_versions if v.get("version") in (version_name, tag_name)),
+            None,
+        )
         if existing:
             detail = "400: Version already installed"
             if version_name:
@@ -261,10 +269,10 @@ async def install_release(
         # Generate task ID for tracking
         task_id = f"install_release_{tag_name}_{int(time.time())}"
 
-        # Start installation in background
-        background_tasks.add_task(
-            install_release_task, tag_name, websocket_manager, task_id, asset_id
-        )
+        # Start installation in background (asyncio.create_task so it runs regardless of middleware)
+        pm = get_progress_manager()
+        pm.create_task("install_release", f"Install {tag_name}", {"tag_name": tag_name}, task_id=task_id)
+        asyncio.create_task(install_release_task(tag_name, pm, task_id, asset_id))
 
         return {
             "message": f"Installing release {tag_name}",
@@ -280,19 +288,15 @@ async def install_release(
 
 async def install_release_task(
     tag_name: str,
-    websocket_manager=None,
+    progress_manager=None,
     task_id: str = None,
     asset_id: Optional[int] = None,
 ):
-    """Background task to install release with WebSocket progress updates"""
-    # Create a new database session for the background task
-    from backend.database import SessionLocal
-
-    db = SessionLocal()
-
+    """Background task to install release with SSE progress updates"""
+    store = get_store()
     try:
         install_result = await llama_manager.install_release(
-            tag_name, websocket_manager, task_id, asset_id
+            tag_name, progress_manager, task_id, asset_id
         )
         binary_path = install_result.get("binary_path")
         asset_info = install_result.get("asset")
@@ -301,39 +305,36 @@ async def install_release_task(
         if not binary_path:
             raise Exception("Installation completed without returning a binary path.")
 
-        # Save to database
-        version = LlamaVersion(
-            version=version_name,
-            install_type="release",
-            binary_path=binary_path,
-            installed_at=datetime.utcnow(),
-            build_config=(
+        version_data = {
+            "version": version_name,
+            "type": "release",
+            "binary_path": binary_path,
+            "installed_at": datetime.utcnow().isoformat() + "Z",
+            "build_config": (
                 {"release_asset": asset_info, "tag_name": tag_name}
                 if asset_info
                 else None
             ),
-        )
-        db.add(version)
-        db.commit()
+            "repository_source": "llama.cpp",
+        }
+        store.add_engine_version("llama_cpp", version_data)
 
-        # If this is the first version or if there's an active version, ensure llama-swap is running
         from backend.llama_swap_manager import get_llama_swap_manager
-        active_version = db.query(LlamaVersion).filter(LlamaVersion.is_active == True).first()
-        if active_version and os.path.exists(active_version.binary_path):
+        active_version = store.get_active_engine_version("llama_cpp")
+        if active_version and active_version.get("binary_path") and os.path.exists(active_version.get("binary_path", "")):
             try:
                 llama_swap_manager = get_llama_swap_manager()
-                # Regenerate config to include any new models, and ensure llama-swap is running
                 await llama_swap_manager.regenerate_config_with_active_version()
                 logger.info("Ensured llama-swap is running after release installation")
             except Exception as e:
                 logger.warning(f"Failed to ensure llama-swap is running after release installation: {e}")
 
-        # Send success notification
-        if websocket_manager:
+        if progress_manager:
             asset_label = ""
             if asset_info and asset_info.get("name"):
                 asset_label = f" ({asset_info['name']})"
-            await websocket_manager.send_notification(
+            progress_manager.complete_task(task_id, f"Installed {version_name}")
+            await progress_manager.send_notification(
                 title="Installation Complete",
                 message=f"Successfully installed llama.cpp release {version_name}{asset_label}",
                 type="success",
@@ -341,21 +342,18 @@ async def install_release_task(
 
     except Exception as e:
         logger.error(f"Release installation failed: {e}")
-        if websocket_manager:
-            await websocket_manager.send_notification(
+        if progress_manager and task_id:
+            progress_manager.fail_task(task_id, str(e))
+        if progress_manager:
+            await progress_manager.send_notification(
                 title="Installation Failed",
                 message=f"Failed to install llama.cpp release: {str(e)}",
                 type="error",
             )
-    finally:
-        # Always close the database session
-        db.close()
 
 
 @router.post("/build-source")
-async def build_source(
-    request: dict, background_tasks: BackgroundTasks, db: Session = Depends(get_db)
-):
+async def build_source(request: dict):
     """Build llama.cpp from source with optional patches"""
     try:
         commit_sha = request.get("commit_sha")
@@ -367,19 +365,17 @@ async def build_source(
         if not commit_sha:
             raise HTTPException(status_code=400, detail="commit_sha is required")
 
-        # Generate unique version name
         commit_short = commit_sha[:8]
         if version_suffix:
             version_name = f"source-{commit_short}-{version_suffix}"
         else:
-            # Use timestamp for unique naming
             timestamp = int(time.time())
             version_name = f"source-{commit_short}-{timestamp}"
 
-        # Check if version already exists (still check to prevent accidental duplicates)
-        existing = (
-            db.query(LlamaVersion).filter(LlamaVersion.version == version_name).first()
-        )
+        store = get_store()
+        engine = "ik_llama" if repository_source == "ik_llama.cpp" else "llama_cpp"
+        existing_versions = store.get_engine_versions(engine)
+        existing = next((v for v in existing_versions if v.get("version") == version_name), None)
         if existing:
             raise HTTPException(
                 status_code=400, detail=f"Version '{version_name}' already installed"
@@ -393,25 +389,48 @@ async def build_source(
                 detail=f"Unknown repository source: {repository_source}",
             )
 
-        # Parse build_config if provided
+        # Parse build_config if provided (map frontend keys to BuildConfig field names)
         build_config = None
-        if build_config_dict:
-            build_config = BuildConfig(**build_config_dict)
+        if build_config_dict and isinstance(build_config_dict, dict):
+            def _bool(v):
+                if isinstance(v, bool):
+                    return v
+                if isinstance(v, str):
+                    return v.strip().lower() in ("1", "true", "yes", "on")
+                return bool(v)
+
+            # Frontend sends cuda, flash_attention, native, backend_dl, cpu_all_variants
+            mapped = {
+                "enable_cuda": _bool(build_config_dict.get("cuda", False)),
+                "enable_flash_attention": _bool(build_config_dict.get("flash_attention", False)),
+                "enable_native": _bool(build_config_dict.get("native", True)),
+                "enable_backend_dl": _bool(build_config_dict.get("backend_dl", False)),
+                "enable_cpu_all_variants": _bool(build_config_dict.get("cpu_all_variants", False)),
+                "cuda_architectures": str(build_config_dict.get("cuda_architectures") or ""),
+            }
+            try:
+                build_config = BuildConfig(**mapped)
+            except (TypeError, ValueError) as e:
+                logger.warning("BuildConfig from request failed (%s), using defaults", e)
+                build_config = BuildConfig()
 
         # Generate task ID for tracking
         task_id = f"build_{version_name}_{int(time.time())}"
 
-        # Start build in background
-        background_tasks.add_task(
-            build_source_task,
-            commit_sha,
-            patches,
-            build_config,
-            version_name,
-            repository_source,
-            repository_url,
-            websocket_manager,
-            task_id,
+        # Start build in background (asyncio.create_task so it runs regardless of middleware)
+        pm = get_progress_manager()
+        pm.create_task("build", f"Build {repository_source} {commit_sha[:8]}", {"version_name": version_name}, task_id=task_id)
+        asyncio.create_task(
+            build_source_task(
+                commit_sha,
+                patches,
+                build_config or BuildConfig(),
+                version_name,
+                repository_source,
+                repository_url,
+                pm,
+                task_id,
+            )
         )
 
         return {
@@ -433,106 +452,94 @@ async def build_source_task(
     version_name: str,
     repository_source: str,
     repository_url: str,
-    websocket_manager=None,
+    progress_manager=None,
     task_id: str = None,
 ):
-    """Background task to build from source with WebSocket progress"""
-    # Create a new database session for the background task
-    from backend.database import SessionLocal
-    from dataclasses import asdict
-
-    db = SessionLocal()
-
+    """Background task to build from source with SSE progress"""
+    logger.info(
+        "Build task started: version_name=%s, repository_source=%s, commit_sha=%s",
+        version_name, repository_source, commit_sha[:8] if commit_sha else "",
+    )
     try:
+        from dataclasses import asdict
+        store = get_store()
+        engine = "ik_llama" if repository_source == "ik_llama.cpp" else "llama_cpp"
+
         binary_path = await llama_manager.build_source(
             commit_sha,
             patches,
             build_config,
-            websocket_manager,
+            progress_manager,
             task_id,
             repository_url=repository_url,
             version_name=version_name,
         )
 
-        # Save to database with build_config
         build_config_dict = None
         if build_config:
             build_config_dict = asdict(build_config)
-            # Add repository_source to build_config for completeness
             build_config_dict["repository_source"] = repository_source
 
-        version = LlamaVersion(
-            version=version_name,
-            install_type="patched" if patches else "source",
-            binary_path=binary_path,
-            source_commit=commit_sha,
-            patches=json.dumps(patches),
-            build_config=build_config_dict,
-            repository_source=repository_source,
-            installed_at=datetime.utcnow(),
-        )
-        db.add(version)
-        db.commit()
+        version_data = {
+            "version": version_name,
+            "type": "patched" if patches else "source",
+            "binary_path": binary_path,
+            "source_commit": commit_sha,
+            "build_config": build_config_dict,
+            "repository_source": repository_source,
+            "installed_at": datetime.utcnow().isoformat() + "Z",
+        }
+        store.add_engine_version(engine, version_data)
 
-        # If there's an active version, ensure llama-swap is running
         from backend.llama_swap_manager import get_llama_swap_manager
-        active_version = db.query(LlamaVersion).filter(LlamaVersion.is_active == True).first()
-        if active_version and os.path.exists(active_version.binary_path):
+        active_version = store.get_active_engine_version(engine)
+        if active_version and active_version.get("binary_path") and os.path.exists(active_version.get("binary_path", "")):
             try:
                 llama_swap_manager = get_llama_swap_manager()
-                # Regenerate config to include any new models, and ensure llama-swap is running
                 await llama_swap_manager.regenerate_config_with_active_version()
                 logger.info("Ensured llama-swap is running after source build")
             except Exception as e:
                 logger.warning(f"Failed to ensure llama-swap is running after source build: {e}")
 
-        # Send success notification
-        if websocket_manager:
-            await websocket_manager.send_notification(
+        if progress_manager:
+            if task_id:
+                progress_manager.complete_task(task_id, f"Built {version_name}")
+            await progress_manager.send_notification(
                 title="Build Complete",
                 message=f"Successfully built {repository_source} from source {commit_sha[:8]}",
                 type="success",
             )
 
     except Exception as e:
-        logger.error(f"Source build failed: {e}")
-        if websocket_manager:
+        logger.exception("Source build failed: %s", e)
+        if progress_manager:
             try:
-                logger.info(f"Sending build failure notification for task {task_id}")
-                await websocket_manager.send_notification(
+                if task_id:
+                    progress_manager.fail_task(task_id, str(e))
+                await progress_manager.send_notification(
                     title="Build Failed",
                     message=f"Failed to build llama.cpp from source: {str(e)}",
                     type="error",
                 )
-                # Also send a build progress error message
                 if task_id:
-                    await websocket_manager.send_build_progress(
+                    await progress_manager.send_build_progress(
                         task_id=task_id,
                         stage="error",
                         progress=0,
                         message=f"Build task failed: {str(e)}",
-                        log_lines=[
-                            f"Task error: {str(e)}",
-                            f"Error type: {type(e).__name__}",
-                        ],
+                        log_lines=[f"Task error: {str(e)}", f"Error type: {type(e).__name__}"],
                     )
-                logger.info(f"Build failure notifications sent successfully")
             except Exception as ws_error:
                 logger.error(f"Failed to send build failure notification: {ws_error}")
-    finally:
-        # Always close the database session
-        db.close()
 
 
 @router.get("/task-status/{task_id}")
 async def get_task_status(task_id: str):
     """Get the status of a background task"""
-    # This is a simple implementation - in production you might want to store task status in Redis or database
-    # For now, we'll just return a basic response since the WebSocket provides real-time updates
     return {
         "task_id": task_id,
-        "status": "running",  # Could be "running", "completed", "failed"
-        "message": "Task is running. Use WebSocket for real-time progress updates.",
+        "status": "running",
+        "message": "Task is running. Subscribe to GET /api/events for real-time SSE progress updates.",
     }
 
 
@@ -563,33 +570,130 @@ async def get_version_commands(version: str):
         raise HTTPException(status_code=500, detail=str(e))
 
 
-@router.delete("/{version_id}")
-async def delete_version(version_id: int, db: Session = Depends(get_db)):
-    """Delete llama.cpp version"""
-    version = db.query(LlamaVersion).filter(LlamaVersion.id == version_id).first()
-    if not version:
+def _resolve_binary_path(binary_path: str) -> str:
+    if not binary_path:
+        return ""
+    if os.path.isabs(binary_path):
+        return binary_path
+    # Docker: paths relative to /app; local: relative to project root
+    if os.path.exists("/app/data"):
+        return os.path.normpath(os.path.join("/app", binary_path))
+    cwd = os.getcwd()
+    resolved = os.path.normpath(os.path.join(cwd, binary_path))
+    if os.path.exists(resolved):
+        return resolved
+    # When run with --app-dir backend, cwd may be backend/; project root is parent
+    parent = os.path.dirname(cwd)
+    return os.path.normpath(os.path.join(parent, binary_path))
+
+
+def _find_version_entry(store, version_id: str):
+    """Resolve version_id ('engine:version' or plain version) to (version_entry, engine). Returns (None, None) if not found."""
+    version_entry = None
+    engine = None
+    if ":" in version_id:
+        parts = version_id.split(":", 1)
+        eng, version_str = parts[0], parts[1]
+        if eng in ("llama_cpp", "ik_llama"):
+            version_entry = next(
+                (v for v in store.get_engine_versions(eng) if str(v.get("version")) == version_str),
+                None,
+            )
+            if version_entry:
+                engine = eng
+    if not version_entry:
+        for eng in ("llama_cpp", "ik_llama"):
+            versions = store.get_engine_versions(eng)
+            version_entry = next((v for v in versions if str(v.get("version")) == str(version_id)), None)
+            if version_entry:
+                engine = eng
+                break
+    return version_entry, engine
+
+
+@router.post("/versions/activate")
+async def activate_version_body(payload: dict = Body(...)):
+    """Activate a version; body: { \"version_id\": \"llama_cpp:version\" or \"version\" }."""
+    version_id = (payload or {}).get("version_id")
+    if not version_id:
+        raise HTTPException(status_code=400, detail="version_id required")
+    return await _do_activate_version(version_id)
+
+
+async def _do_activate_version(version_id: str):
+    store = get_store()
+    version_entry, engine = _find_version_entry(store, version_id)
+    if not version_entry or not engine:
+        logger.warning(
+            "activate_version: version not found, version_id=%r, llama_cpp versions=%s",
+            version_id,
+            [v.get("version") for v in store.get_engine_versions("llama_cpp")],
+        )
         raise HTTPException(status_code=404, detail="Version not found")
+    version_str = str(version_entry.get("version"))
+    binary_path = _resolve_binary_path(version_entry.get("binary_path"))
+    if not os.path.exists(binary_path):
+        raise HTTPException(status_code=400, detail="Binary file does not exist")
+    store.set_active_engine_version(engine, version_str)
+    if engine == "llama_cpp":
+        try:
+            from backend.llama_swap_manager import get_llama_swap_manager
+            llama_swap_manager = get_llama_swap_manager()
+            await llama_swap_manager._ensure_correct_binary_path()
+            await llama_swap_manager.regenerate_config_with_active_version()
+            try:
+                await llama_swap_manager.start_proxy()
+            except Exception as e:
+                logger.warning("Failed to start llama-swap after version activation: %s", e)
+        except Exception as e:
+            logger.error("Failed to regenerate llama-swap config: %s", e)
+    logger.info("Activated %s version: %s", engine, version_str)
+    return {"message": f"Activated {engine} version {version_str}"}
 
-    # Prevent deletion of active version
-    if version.is_active:
-        raise HTTPException(status_code=400, detail="Cannot delete active version")
 
+@router.delete("/{version_id}")
+async def delete_version(version_id: str):
+    """Delete llama.cpp version (version_id is 'engine:version' or version string)."""
+    store = get_store()
+    version_entry = None
+    if ":" in version_id:
+        parts = version_id.split(":", 1)
+        engine, version_str = parts[0], parts[1]
+        if engine in ("llama_cpp", "ik_llama"):
+            version_entry = next(
+                (v for v in store.get_engine_versions(engine) if str(v.get("version")) == version_str),
+                None,
+            )
+            if version_entry:
+                version_entry["_engine"] = engine
+    if not version_entry:
+        for engine in ("llama_cpp", "ik_llama"):
+            versions = store.get_engine_versions(engine)
+            version_entry = next((v for v in versions if str(v.get("version")) == str(version_id)), None)
+            if version_entry:
+                version_entry["_engine"] = engine
+                break
+    if not version_entry:
+        raise HTTPException(status_code=404, detail="Version not found")
+    engine = version_entry.get("_engine", "llama_cpp")
+    version_str = str(version_entry.get("version"))
+    active = store.get_active_engine_version(engine)
+    if active and str(active.get("version")) == version_str:
+        raise HTTPException(status_code=400, detail="Cannot delete active version")
     try:
-        # Delete the entire version directory
-        if version.binary_path and os.path.exists(version.binary_path):
-            # Go up two levels from build/bin/llama-server to get the version directory
-            version_dir = os.path.dirname(os.path.dirname(version.binary_path))
-            if os.path.exists(version_dir):
-                _robust_rmtree(version_dir)
-
-        # Delete from database
-        db.delete(version)
-        db.commit()
-
-        logger.info(f"Deleted llama-cpp version: {version.version}")
-        return {"message": f"Deleted llama-cpp version {version.version}"}
+        binary_path = version_entry.get("binary_path")
+        if binary_path:
+            if not os.path.isabs(binary_path):
+                binary_path = os.path.join("/app", binary_path)
+            if os.path.exists(binary_path):
+                version_dir = os.path.dirname(os.path.dirname(binary_path))
+                if os.path.exists(version_dir):
+                    _robust_rmtree(version_dir)
+        store.delete_engine_version(engine, version_str)
+        logger.info(f"Deleted version: {version_str}")
+        return {"message": f"Deleted version {version_str}"}
     except Exception as e:
-        logger.error(f"Failed to delete version {version.version}: {e}")
+        logger.error(f"Failed to delete version {version_str}: {e}")
         raise HTTPException(status_code=500, detail=f"Failed to delete version: {e}")
 
 
diff --git a/backend/routes/lmdeploy.py b/backend/routes/lmdeploy.py
index 946096f..d24e5cf 100644
--- a/backend/routes/lmdeploy.py
+++ b/backend/routes/lmdeploy.py
@@ -1,51 +1,82 @@
-from typing import Dict, Optional
-
-from fastapi import APIRouter, HTTPException
-
-from backend.lmdeploy_installer import get_lmdeploy_installer
-from backend.lmdeploy_manager import get_lmdeploy_manager
-
-router = APIRouter()
-
-
-@router.get("/lmdeploy/status")
-async def lmdeploy_installer_status() -> Dict:
-    installer = get_lmdeploy_installer()
-    return installer.status()
-
-
-@router.post("/lmdeploy/install")
-async def lmdeploy_install(request: Optional[Dict[str, str]] = None) -> Dict:
-    installer = get_lmdeploy_installer()
-    payload = request or {}
-    version = payload.get("version")
-    force_reinstall = bool(payload.get("force_reinstall"))
-    try:
-        return await installer.install(version=version, force_reinstall=force_reinstall)
-    except RuntimeError as exc:
-        raise HTTPException(status_code=409, detail=str(exc))
-
-
-@router.post("/lmdeploy/remove")
-async def lmdeploy_remove() -> Dict:
-    installer = get_lmdeploy_installer()
-    try:
-        return await installer.remove()
-    except RuntimeError as exc:
-        raise HTTPException(status_code=409, detail=str(exc))
-
-
-@router.get("/lmdeploy/logs")
-async def lmdeploy_logs(max_bytes: int = 8192) -> Dict[str, str]:
-    """Get LMDeploy installer logs."""
-    installer = get_lmdeploy_installer()
-    max_bytes = max(1024, min(max_bytes, 1024 * 1024))
-    return {"log": installer.read_log_tail(max_bytes)}
-
-
-@router.get("/lmdeploy/runtime-logs")
-async def lmdeploy_runtime_logs(max_bytes: int = 8192) -> Dict[str, str]:
-    """Get LMDeploy runtime logs (from running server instances)."""
-    manager = get_lmdeploy_manager()
-    max_bytes = max(1024, min(max_bytes, 1024 * 1024))
-    return {"log": manager.read_log_tail(max_bytes)}
+from typing import Dict, Optional
+
+import httpx
+from fastapi import APIRouter, HTTPException
+
+from backend.lmdeploy_installer import get_lmdeploy_installer
+from backend.lmdeploy_manager import get_lmdeploy_manager
+
+router = APIRouter()
+
+
+@router.get("/lmdeploy/check-updates")
+async def lmdeploy_check_updates() -> Dict:
+    """Check PyPI for latest LMDeploy version."""
+    try:
+        async with httpx.AsyncClient() as client:
+            r = await client.get("https://pypi.org/pypi/lmdeploy/json", timeout=10.0)
+            r.raise_for_status()
+            data = r.json()
+            info = data.get("info", {})
+            return {
+                "latest_version": info.get("version"),
+                "releases": list(data.get("releases", {}).keys()),
+            }
+    except Exception as exc:
+        raise HTTPException(status_code=500, detail=f"Failed to check PyPI: {exc}")
+
+
+@router.get("/lmdeploy/status")
+async def lmdeploy_installer_status() -> Dict:
+    installer = get_lmdeploy_installer()
+    return installer.status()
+
+
+@router.post("/lmdeploy/install")
+async def lmdeploy_install(request: Optional[Dict[str, str]] = None) -> Dict:
+    installer = get_lmdeploy_installer()
+    payload = request or {}
+    version = payload.get("version")
+    force_reinstall = bool(payload.get("force_reinstall"))
+    try:
+        return await installer.install(version=version, force_reinstall=force_reinstall)
+    except RuntimeError as exc:
+        raise HTTPException(status_code=409, detail=str(exc))
+
+
+@router.post("/lmdeploy/install-source")
+async def lmdeploy_install_source(request: Optional[Dict[str, str]] = None) -> Dict:
+    """Install LMDeploy from a git repo and branch (for development)."""
+    installer = get_lmdeploy_installer()
+    payload = request or {}
+    repo_url = payload.get("repo_url", "https://github.com/InternLM/lmdeploy.git")
+    branch = payload.get("branch", "main")
+    try:
+        return await installer.install_from_source(repo_url=repo_url, branch=branch)
+    except RuntimeError as exc:
+        raise HTTPException(status_code=409, detail=str(exc))
+
+
+@router.post("/lmdeploy/remove")
+async def lmdeploy_remove() -> Dict:
+    installer = get_lmdeploy_installer()
+    try:
+        return await installer.remove()
+    except RuntimeError as exc:
+        raise HTTPException(status_code=409, detail=str(exc))
+
+
+@router.get("/lmdeploy/logs")
+async def lmdeploy_logs(max_bytes: int = 8192) -> Dict[str, str]:
+    """Get LMDeploy installer logs."""
+    installer = get_lmdeploy_installer()
+    max_bytes = max(1024, min(max_bytes, 1024 * 1024))
+    return {"log": installer.read_log_tail(max_bytes)}
+
+
+@router.get("/lmdeploy/runtime-logs")
+async def lmdeploy_runtime_logs(max_bytes: int = 8192) -> Dict[str, str]:
+    """Get LMDeploy runtime logs (from running server instances)."""
+    manager = get_lmdeploy_manager()
+    max_bytes = max(1024, min(max_bytes, 1024 * 1024))
+    return {"log": manager.read_log_tail(max_bytes)}
diff --git a/backend/routes/models.py b/backend/routes/models.py
index e9eee78..5859db4 100644
--- a/backend/routes/models.py
+++ b/backend/routes/models.py
@@ -1,6 +1,4 @@
-from fastapi import APIRouter, Depends, HTTPException, BackgroundTasks
-from sqlalchemy.orm import Session
-from sqlalchemy import or_
+from fastapi import APIRouter, Depends, HTTPException, BackgroundTasks, Query
 from typing import List, Optional, Dict, Any, Tuple
 from pydantic import BaseModel
 import json
@@ -10,17 +8,12 @@
 import re
 from datetime import datetime
 
-from backend.database import (
-    get_db,
-    Model,
-    RunningInstance,
-    generate_proxy_name,
-    LlamaVersion,
-)
+from backend.data_store import get_store, generate_proxy_name
+from backend.progress_manager import get_progress_manager
 from backend.huggingface import (
     search_models,
     download_model,
-    download_model_with_websocket_progress,
+    download_model_with_progress,
     set_huggingface_token,
     get_huggingface_token,
     get_model_details,
@@ -40,27 +33,21 @@
     DEFAULT_LMDEPLOY_CONTEXT,
     MAX_LMDEPLOY_CONTEXT,
     MAX_ROPE_SCALING_FACTOR,
-)
-from backend.smart_auto import SmartAutoConfig
-from backend.smart_auto.model_metadata import get_model_metadata
-from backend.smart_auto.architecture_config import (
-    normalize_architecture,
-    detect_architecture_from_name,
+    get_model_disk_size,
+    get_accurate_file_sizes,
+    get_mmproj_f16_filename,
 )
 from backend.gpu_detector import get_gpu_info
 from backend.gguf_reader import get_model_layer_info
-from backend.presets import get_architecture_and_presets
 from backend.logging_config import get_logger
 
 logger = get_logger(__name__)
 from backend.llama_swap_config import get_supported_flags
-from backend.logging_config import get_logger
 from backend.lmdeploy_manager import get_lmdeploy_manager
 from backend.lmdeploy_installer import get_lmdeploy_installer
 import psutil
 
 router = APIRouter()
-logger = get_logger(__name__)
 
 # Common embedding indicators for automatic detection
 EMBEDDING_PIPELINE_TAGS = {
@@ -100,19 +87,76 @@ def _looks_like_embedding_model(
     return any(keyword in combined for keyword in EMBEDDING_KEYWORDS)
 
 
-def _model_is_embedding(model: Model) -> bool:
+def _model_is_embedding(model: dict) -> bool:
     """Determine if a stored model should run in embedding mode."""
-    config = _coerce_model_config(model.config)
+    config = _coerce_model_config(model.get("config"))
     if config.get("embedding"):
         return True
     return _looks_like_embedding_model(
-        model.pipeline_tag,
-        model.huggingface_id,
-        model.name,
-        model.base_model_name,
+        model.get("pipeline_tag"),
+        model.get("huggingface_id"),
+        model.get("display_name") or model.get("name"),
+        model.get("base_model_name"),
     )
 
 
+def _get_model_or_404(store, model_id: str) -> dict:
+    """Return model dict from store or raise 404. Accepts str model_id (YAML id)."""
+    if model_id is None:
+        raise HTTPException(status_code=404, detail="Model not found")
+    model_id = str(model_id)
+    model = store.get_model(model_id)
+    if not model:
+        raise HTTPException(status_code=404, detail="Model not found")
+    return model
+
+
+def _get_actual_file_size(file_path: Optional[str]) -> Optional[int]:
+    """Return actual file size in bytes from disk, or None if not available."""
+    if not file_path:
+        return None
+    path = _normalize_model_path(file_path)
+    if not path or not os.path.exists(path):
+        return None
+    try:
+        real = os.path.realpath(path)
+        return os.path.getsize(real if os.path.exists(real) else path)
+    except OSError:
+        return None
+
+
+def _get_model_filename(model: dict) -> Optional[str]:
+    """Return the filename for a model record.
+
+    Prefers the dedicated ``filename`` field (new records). Falls back to
+    deriving it from the legacy ``file_path`` field (old records).
+    """
+    fname = model.get("filename")
+    if fname:
+        return fname
+    return _extract_filename(model.get("file_path")) or None
+
+
+def _get_model_file_path(model: dict) -> Optional[str]:
+    """Return the actual filesystem path for a model file.
+
+    Resolution order:
+    1. HF cache via huggingface_id + filename (new records).
+    2. Stored file_path (legacy records that still reference custom storage).
+    """
+    from backend.huggingface import resolve_cached_model_path
+
+    hf_id = model.get("huggingface_id")
+    filename = _get_model_filename(model)
+
+    if hf_id and filename:
+        cached = resolve_cached_model_path(hf_id, filename)
+        if cached:
+            return cached
+
+    return _normalize_model_path(model.get("file_path")) or None
+
+
 def _normalize_model_path(file_path: Optional[str]) -> Optional[str]:
     if not file_path:
         return None
@@ -129,45 +173,28 @@ def _extract_filename(file_path: Optional[str]) -> str:
     return parts[-1] if parts else normalized
 
 
-def _cleanup_model_folder_if_no_quantizations(
-    db: Session,
-    huggingface_id: Optional[str],
-    model_format: Optional[str],
-) -> None:
-    """
-    If there are no remaining quantizations for a given Hugging Face repo and format,
-    delete the corresponding local model folder (e.g. data/models/gguf/<repo_safe>).
-    """
-    if not huggingface_id or not model_format:
-        return
-
-    model_format = (model_format or "").lower()
-    if model_format not in ("gguf", "safetensors"):
-        return
+def normalize_architecture(raw_architecture: str) -> str:
+    """Normalize GGUF architecture string (stub after smart_auto removal)."""
+    if not raw_architecture or not isinstance(raw_architecture, str):
+        return "unknown"
+    return raw_architecture.strip() or "unknown"
 
-    # Check for remaining models of this repo/format, excluding any pending deletions
-    remaining = (
-        db.query(Model)
-        .filter(
-            Model.huggingface_id == huggingface_id,
-            Model.model_format == model_format,
-        )
-        .count()
-    )
-    if remaining > 0:
-        return
 
-    safe_repo = (huggingface_id or "unknown").replace("/", "_") or "unknown"
-    base_dir = os.path.join("data", "models", model_format)
-    repo_dir = os.path.join(base_dir, safe_repo)
+def detect_architecture_from_name(name: str) -> str:
+    """Infer architecture from model name (stub after smart_auto removal)."""
+    if not name or not isinstance(name, str):
+        return "unknown"
+    name_lower = name.lower()
+    if "llama" in name_lower:
+        return "llama"
+    if "qwen" in name_lower:
+        return "qwen2"
+    if "mistral" in name_lower:
+        return "mistral"
+    if "phi" in name_lower:
+        return "phi-2"
+    return "unknown"
 
-    if os.path.isdir(repo_dir):
-        try:
-            if not os.listdir(repo_dir):
-                os.rmdir(repo_dir)
-                logger.info(f"Removed empty model folder: {repo_dir}")
-        except Exception as exc:
-            logger.warning(f"Failed to remove model folder {repo_dir}: {exc}")
 
 
 def _derive_hf_defaults(metadata: Dict[str, Any]) -> Dict[str, Any]:
@@ -206,13 +233,13 @@ def _assign_numeric(src_key: str, dest_keys):
     return defaults
 
 
-def _apply_hf_defaults_to_model(model: Model, metadata: Dict[str, Any], db: Session):
+def _apply_hf_defaults_to_model(model: dict, metadata: Dict[str, Any], store) -> None:
     if not metadata:
         return
     defaults = _derive_hf_defaults(metadata)
     if not defaults:
         return
-    config = _coerce_model_config(model.config)
+    config = _coerce_model_config(model.get("config"))
     changed = False
     for key, value in defaults.items():
         if value is None:
@@ -222,9 +249,7 @@ def _apply_hf_defaults_to_model(model: Model, metadata: Dict[str, Any], db: Sess
             config[key] = value
             changed = True
     if changed:
-        model.config = config
-        db.commit()
-        db.refresh(model)
+        store.update_model(model["id"], {"config": config})
 
 
 def _coerce_model_config(config_value: Optional[Any]) -> Dict[str, Any]:
@@ -242,12 +267,12 @@ def _coerce_model_config(config_value: Optional[Any]) -> Dict[str, Any]:
     return {}
 
 
-def _refresh_model_metadata_from_file(model: Model, db: Session) -> Dict[str, Any]:
+def _refresh_model_metadata_from_file(model: dict, store) -> Dict[str, Any]:
     """
-    Re-read GGUF metadata from disk and update the model record similar to the refresh endpoint.
+    Re-read GGUF metadata from disk and update the model record.
     Returns metadata details for downstream consumers.
     """
-    normalized_path = _normalize_model_path(model.file_path)
+    normalized_path = _get_model_file_path(model)
     if not normalized_path or not os.path.exists(normalized_path):
         raise FileNotFoundError("Model file not found on disk")
 
@@ -259,26 +284,19 @@ def _refresh_model_metadata_from_file(model: Model, db: Session) -> Dict[str, An
     normalized_architecture = normalize_architecture(raw_architecture)
     if not normalized_architecture or normalized_architecture == "unknown":
         normalized_architecture = detect_architecture_from_name(
-            model.name or model.huggingface_id or ""
+            model.get("display_name") or model.get("name") or model.get("huggingface_id") or ""
         )
 
     update_fields = {}
     if (
         normalized_architecture
         and normalized_architecture != "unknown"
-        and normalized_architecture != model.model_type
+        and normalized_architecture != model.get("model_type")
     ):
         update_fields["model_type"] = normalized_architecture
 
-    file_size = os.path.getsize(model.file_path)
-    if file_size != model.file_size:
-        update_fields["file_size"] = file_size
-
     if update_fields:
-        for key, value in update_fields.items():
-            setattr(model, key, value)
-        db.commit()
-        db.refresh(model)
+        store.update_model(model["id"], update_fields)
 
     return {
         "updated_fields": update_fields,
@@ -286,9 +304,7 @@ def _refresh_model_metadata_from_file(model: Model, db: Session) -> Dict[str, An
             "architecture": normalized_architecture,
             "layer_count": layer_info.get("layer_count", 0),
             "context_length": layer_info.get("context_length", 0),
-            "parameter_count": layer_info.get(
-                "parameter_count"
-            ),  # Formatted as "32B", "36B", etc.
+            "parameter_count": layer_info.get("parameter_count"),
             "vocab_size": layer_info.get("vocab_size", 0),
             "embedding_length": layer_info.get("embedding_length", 0),
             "attention_head_count": layer_info.get("attention_head_count", 0),
@@ -419,97 +435,68 @@ def _coerce_positive_float(value: Any) -> Optional[float]:
 
 
 async def _save_safetensors_download(
-    db: Session,
+    store,
     huggingface_id: str,
     filename: str,
     file_path: str,
     file_size: int,
     pipeline_tag: Optional[str] = None,
-) -> Model:
+) -> dict:
     """
-    Persist safetensors download information using a single logical Model row per repo.
-
-    Historically we created one Model row per .safetensors file. This caused
-    multi‑file repositories to appear as multiple independent models. The new
-    behavior is:
-      * Exactly one Model row per Hugging Face repo (huggingface_id) with
-        model_format == "safetensors".
-      * All individual .safetensors files for that repo are tracked in the
-        safetensors manifest and share the same model_id.
-      * The logical Model.file_size reflects the aggregate size of all files.
+    Persist safetensors download information using a single logical model entry per repo.
+    Returns the model dict with "id" (string, YAML model id).
     """
     safetensors_metadata, tensor_summary, max_context = (
         await _collect_safetensors_runtime_metadata(huggingface_id, filename)
     )
-    # Determine / reuse logical Model for this Hugging Face repo
     detected_pipeline = pipeline_tag or safetensors_metadata.get("pipeline_tag")
     is_embedding_like = _looks_like_embedding_model(
         detected_pipeline, huggingface_id, filename
     )
-
-    # Try to find an existing logical model for this repo
-    model_record = (
-        db.query(Model)
-        .filter(
-            Model.huggingface_id == huggingface_id, Model.model_format == "safetensors"
-        )
-        .first()
-    )
+    model_id = huggingface_id.replace("/", "--")
+    model_record = store.get_model(model_id)
 
     if not model_record:
-        # Create a single logical model entry for the whole repo
-        model_record = Model(
-            name=filename.replace(".safetensors", ""),
-            huggingface_id=huggingface_id,
-            base_model_name=extract_base_model_name(filename),
-            file_path=file_path,
-            file_size=file_size,
-            quantization=os.path.splitext(filename)[0],
-            model_type=extract_model_type(filename),
-            downloaded_at=datetime.utcnow(),
-            model_format="safetensors",
-            pipeline_tag=detected_pipeline,
-        )
-        if is_embedding_like:
-            model_record.config = {"embedding": True}
-        db.add(model_record)
-        db.commit()
-        db.refresh(model_record)
+        from datetime import timezone as _tz
+        model_record = {
+            "id": model_id,
+            "huggingface_id": huggingface_id,
+            "filename": filename,
+            "display_name": filename.replace(".safetensors", ""),
+            "base_model_name": extract_base_model_name(filename),
+            "file_size": file_size,
+            "quantization": os.path.splitext(filename)[0],
+            "model_type": extract_model_type(filename),
+            "downloaded_at": datetime.now(_tz.utc).isoformat(),
+            "format": "safetensors",
+            "model_format": "safetensors",
+            "pipeline_tag": detected_pipeline,
+            "config": {"embedding": True} if is_embedding_like else {},
+        }
+        store.add_model(model_record)
     else:
-        # Update existing logical model with any missing metadata and aggregate size
-        updated = False
-        if not model_record.pipeline_tag and detected_pipeline:
-            model_record.pipeline_tag = detected_pipeline
-            updated = True
-        if is_embedding_like and not (model_record.config or {}).get("embedding"):
-            # Ensure embedding flag is propagated
-            current_config = _coerce_model_config(model_record.config)
-            current_config["embedding"] = True
-            model_record.config = current_config
-            updated = True
-        # Aggregate size across all files for this repo by summing manifest entries.
-        # This avoids double‑counting if a file is redownloaded.
+        updates = {}
+        if not model_record.get("pipeline_tag") and detected_pipeline:
+            updates["pipeline_tag"] = detected_pipeline
+        if is_embedding_like and not _coerce_model_config(model_record.get("config")).get("embedding"):
+            cfg = _coerce_model_config(model_record.get("config"))
+            cfg["embedding"] = True
+            updates["config"] = cfg
         try:
             from backend.huggingface import list_safetensors_downloads
-
             manifests = list_safetensors_downloads()
             total_size = 0
             for manifest in manifests:
                 if manifest.get("huggingface_id") == huggingface_id:
-                    total_size = sum(
-                        (f.get("file_size") or 0) for f in manifest.get("files", [])
-                    )
+                    total_size = sum((f.get("file_size") or 0) for f in manifest.get("files", []))
                     break
-            if total_size and total_size != (model_record.file_size or 0):
-                model_record.file_size = total_size
-                updated = True
+            if total_size and total_size != (model_record.get("file_size") or 0):
+                updates["file_size"] = total_size
         except Exception as exc:
-            logger.warning(
-                f"Failed to aggregate safetensors file sizes for {huggingface_id}: {exc}"
-            )
-        if updated:
-            db.commit()
-            db.refresh(model_record)
+            logger.warning(f"Failed to aggregate safetensors file sizes for {huggingface_id}: {exc}")
+        if updates:
+            store.update_model(model_id, updates)
+        model_record = store.get_model(model_id) or model_record
 
     lmdeploy_config = get_default_lmdeploy_config(max_context)
     record_safetensors_download(
@@ -520,33 +507,28 @@ async def _save_safetensors_download(
         metadata=safetensors_metadata,
         tensor_summary=tensor_summary,
         lmdeploy_config=lmdeploy_config,
-        model_id=model_record.id,
-    )
-    logger.info(
-        f"Safetensors download recorded for {huggingface_id}/{filename} (model_id={model_record.id})"
+        model_id=model_record.get("id"),
     )
+    logger.info(f"Safetensors download recorded for {huggingface_id}/{filename} (model_id={model_record.get('id')})")
     return model_record
 
 
-def _get_safetensors_model(model_id: int, db: Session) -> Model:
-    model = db.query(Model).filter(Model.id == model_id).first()
-    if not model:
-        raise HTTPException(status_code=404, detail="Model not found")
-    model_format = (model.model_format or "gguf").lower()
+def _get_safetensors_model(store, model_id: str) -> dict:
+    model = _get_model_or_404(store, model_id)
+    model_format = (model.get("model_format") or model.get("format") or "gguf").lower()
     if model_format != "safetensors":
-        raise HTTPException(
-            status_code=400, detail="Model is not a safetensors download"
-        )
-    normalized_path = _normalize_model_path(model.file_path)
-    if not normalized_path or not os.path.exists(normalized_path):
+        raise HTTPException(status_code=400, detail="Model is not a safetensors download")
+    resolved_path = _get_model_file_path(model)
+    if not resolved_path or not os.path.exists(resolved_path):
         raise HTTPException(status_code=400, detail="Model file not found on disk")
-    model.file_path = normalized_path
+    model = dict(model)
+    model["file_path"] = resolved_path
     return model
 
 
-def _load_manifest_entry_for_model(model: Model) -> Dict[str, Any]:
+def _load_manifest_entry_for_model(model: dict) -> Dict[str, Any]:
     """Load unified manifest for a safetensors model (repo-level, not per-file)."""
-    manifest = get_safetensors_manifest_entries(model.huggingface_id)
+    manifest = get_safetensors_manifest_entries(model.get("huggingface_id"))
     if not manifest:
         raise HTTPException(status_code=404, detail="Safetensors manifest not found")
     return manifest
@@ -1143,7 +1125,7 @@ def _as_list(key: str) -> list:
 
 
 class BundleProgressProxy:
-    """Proxy websocket manager that converts per-file progress into bundle-level updates."""
+    """Proxy progress manager that converts per-file progress into bundle-level updates."""
 
     def __init__(
         self,
@@ -1247,7 +1229,7 @@ async def get_cached_gpu_info() -> Dict[str, Any]:
 
 
 class EstimationRequest(BaseModel):
-    model_id: int
+    model_id: str  # YAML model id
     config: dict
     usage_mode: Optional[str] = "single_user"
 
@@ -1258,76 +1240,82 @@ class SafetensorsBundleRequest(BaseModel):
     files: List[Dict[str, Any]]
 
 
+@router.get("/param-registry")
+async def get_param_registry_endpoint(engine: str = "llama_cpp"):
+    """Return param definitions (basic + advanced) for config forms."""
+    from backend.param_registry import get_param_registry
+    return get_param_registry(engine)
+
+
 @router.get("")
 @router.get("/")
-async def list_models(db: Session = Depends(get_db)):
+async def list_models():
     """List all managed models grouped by base model"""
-    # Sync is_active status before returning models
-    from backend.database import sync_model_active_status
-
-    sync_model_active_status(db)
+    from backend.llama_swap_client import LlamaSwapClient
 
-    models = (
-        db.query(Model)
-        .filter(or_(Model.model_format.is_(None), Model.model_format == "gguf"))
-        .all()
-    )
+    store = get_store()
+    models = [m for m in store.list_models() if (m.get("format") or m.get("model_format") or "gguf") == "gguf"]
+    try:
+        running_data = await LlamaSwapClient().get_running_models()
+        running_list = running_data.get("running") or []
+        running_names = {item.get("model") for item in running_list if item.get("state") in ("running", "ready")}
+    except Exception:
+        running_names = set()
 
-    # Group models by huggingface_id and base_model_name
     grouped_models = {}
     for model in models:
+        hf_id = model.get("huggingface_id") or ""
+        base_name = model.get("base_model_name") or (hf_id.split("/")[-1] if hf_id else model.get("display_name") or "unknown")
+        proxy_name = generate_proxy_name(hf_id, model.get("quantization"))
+        is_active = proxy_name in running_names
         is_embedding = _model_is_embedding(model)
-        key = f"{model.huggingface_id}_{model.base_model_name}"
+        key = f"{hf_id}_{base_name}"
         if key not in grouped_models:
-            # derive author/owner from huggingface_id
-            hf_id = model.huggingface_id or ""
-            author = (
-                hf_id.split("/")[0] if isinstance(hf_id, str) and "/" in hf_id else ""
-            )
+            author = hf_id.split("/")[0] if isinstance(hf_id, str) and "/" in hf_id else ""
             grouped_models[key] = {
-                "base_model_name": model.base_model_name,
-                "huggingface_id": model.huggingface_id,
-                "model_type": model.model_type,
+                "base_model_name": base_name,
+                "huggingface_id": hf_id,
+                "model_type": model.get("model_type"),
                 "author": author,
-                "pipeline_tag": model.pipeline_tag,
+                "pipeline_tag": model.get("pipeline_tag"),
                 "is_embedding_model": is_embedding,
                 "quantizations": [],
             }
         else:
-            if model.pipeline_tag and not grouped_models[key].get("pipeline_tag"):
-                grouped_models[key]["pipeline_tag"] = model.pipeline_tag
+            if model.get("pipeline_tag") and not grouped_models[key].get("pipeline_tag"):
+                grouped_models[key]["pipeline_tag"] = model.get("pipeline_tag")
             if is_embedding and not grouped_models[key].get("is_embedding_model"):
                 grouped_models[key]["is_embedding_model"] = True
 
-        grouped_models[key]["quantizations"].append(
-            {
-                "id": model.id,
-                "name": model.name,
-                "file_path": model.file_path,
-                "file_size": model.file_size,
-                "quantization": model.quantization,
-                "downloaded_at": model.downloaded_at,
-                "is_active": model.is_active,
-                "has_config": bool(model.config),
-                "huggingface_id": model.huggingface_id,
-                "base_model_name": model.base_model_name,
-                "model_type": model.model_type,
-                "config": _coerce_model_config(model.config),
-                "proxy_name": model.proxy_name,
-                "pipeline_tag": model.pipeline_tag,
-                "is_embedding_model": is_embedding,
-            }
-        )
+        # Resolve actual disk size: prefer HF cache, fall back to stored value
+        resolved_path = _get_model_file_path(model)
+        file_size = _get_actual_file_size(resolved_path) or model.get("file_size") or 0
+
+        grouped_models[key]["quantizations"].append({
+            "id": model.get("id"),
+            "name": model.get("display_name") or model.get("name"),
+            "filename": _get_model_filename(model),
+            "file_size": file_size,
+            "quantization": model.get("quantization"),
+            "format": model.get("format") or model.get("model_format") or "gguf",
+            "engine": model.get("engine") or "llama_cpp",
+            "downloaded_at": model.get("downloaded_at"),
+            "is_active": is_active,
+            "has_config": bool(model.get("config")),
+            "huggingface_id": hf_id,
+            "base_model_name": base_name,
+            "model_type": model.get("model_type"),
+            "config": _coerce_model_config(model.get("config")),
+            "proxy_name": proxy_name,
+            "pipeline_tag": model.get("pipeline_tag"),
+            "is_embedding_model": is_embedding,
+        })
 
-    # Convert to list and sort quantizations by file size (smallest first)
     result = []
     for group in grouped_models.values():
-        group["quantizations"].sort(key=lambda x: x["file_size"] or 0)
+        group["quantizations"].sort(key=lambda x: x.get("file_size") or 0)
         result.append(group)
-
-    # Sort groups by base model name
-    result.sort(key=lambda x: x["base_model_name"])
-
+    result.sort(key=lambda x: x.get("base_model_name") or "")
     return result
 
 
@@ -1363,6 +1351,19 @@ async def clear_search_cache_endpoint():
         raise HTTPException(status_code=500, detail=str(e))
 
 
+@router.get("/search/{model_id:path}/file-sizes")
+async def get_search_file_sizes(
+    model_id: str,
+    filenames: str = Query(..., description="Comma-separated list of file paths in the repo"),
+):
+    """Get accurate file sizes for specific files in a repo via HuggingFace API."""
+    file_list = [f.strip() for f in filenames.split(",") if f.strip()]
+    if not file_list:
+        raise HTTPException(status_code=400, detail="At least one filename is required")
+    sizes = get_accurate_file_sizes(model_id, file_list)
+    return {"sizes": sizes}
+
+
 @router.get("/search/{model_id}/details")
 async def get_model_details_endpoint(model_id: str):
     """Get detailed model information including config and architecture"""
@@ -1419,58 +1420,43 @@ async def list_safetensors_models():
 
 
 @router.delete("/safetensors")
-async def delete_safetensors_model(request: dict, db: Session = Depends(get_db)):
+async def delete_safetensors_model(request: dict):
     """Delete entire safetensors model (all files for the repo)."""
     try:
         huggingface_id = request.get("huggingface_id")
         if not huggingface_id:
             raise HTTPException(status_code=400, detail="huggingface_id is required")
 
-        # Prevent deletion while runtime is active for this logical model
-        active_instance = (
-            db.query(RunningInstance)
-            .filter(RunningInstance.runtime_type == "lmdeploy")
-            .first()
-        )
-        target_model = (
-            db.query(Model)
-            .filter(
-                Model.huggingface_id == huggingface_id,
-                Model.model_format == "safetensors",
-            )
-            .first()
-        )
-        if (
-            active_instance
-            and target_model
-            and active_instance.model_id == target_model.id
-        ):
-            raise HTTPException(
-                status_code=400,
-                detail="Cannot delete a model currently served by LMDeploy",
-            )
+        store = get_store()
+        model_id = huggingface_id.replace("/", "--")
+        target_model = store.get_model(model_id)
+        if not target_model or (target_model.get("format") or target_model.get("model_format")) != "safetensors":
+            raise HTTPException(status_code=404, detail="Safetensors model not found")
+
+        manager = get_lmdeploy_manager()
+        status = manager.status()
+        if status.get("running"):
+            current = status.get("current_instance") or {}
+            if str(current.get("model_id")) == str(model_id):
+                raise HTTPException(
+                    status_code=400,
+                    detail="Cannot delete a model currently served by LMDeploy",
+                )
 
-        # Get unified manifest and delete all files
         from backend.huggingface import (
             get_safetensors_manifest_entries,
             delete_safetensors_download,
         )
-
         manifest = get_safetensors_manifest_entries(huggingface_id)
         if not manifest or not manifest.get("files"):
             raise HTTPException(status_code=404, detail="Safetensors model not found")
 
-        # Delete all files in the unified manifest
         for file_entry in manifest.get("files", []):
             entry_filename = file_entry.get("filename")
             if entry_filename:
                 delete_safetensors_download(huggingface_id, entry_filename)
 
-        # Delete the single logical Model row
-        if target_model:
-            db.delete(target_model)
-            db.commit()
-
+        store.delete_model(model_id)
         return {"message": f"Safetensors model {huggingface_id} deleted"}
     except HTTPException:
         raise
@@ -1479,8 +1465,8 @@ async def delete_safetensors_model(request: dict, db: Session = Depends(get_db))
 
 
 @router.post("/safetensors/reload-from-disk")
-async def reload_safetensors_from_disk(db: Session = Depends(get_db)):
-    """Reset all safetensors database entries and reload them from file storage."""
+async def reload_safetensors_from_disk():
+    """Reset all safetensors store entries and reload them from file storage."""
     try:
         from backend.huggingface import (
             SAFETENSORS_DIR,
@@ -1488,27 +1474,22 @@ async def reload_safetensors_from_disk(db: Session = Depends(get_db)):
             get_default_lmdeploy_config,
         )
 
-        # Prevent reload while runtime is active
-        active_instance = (
-            db.query(RunningInstance)
-            .filter(RunningInstance.runtime_type == "lmdeploy")
-            .first()
-        )
-        if active_instance:
+        manager = get_lmdeploy_manager()
+        if manager.status().get("running"):
             raise HTTPException(
                 status_code=400,
                 detail="Cannot reload safetensors models while LMDeploy runtime is active. Please stop the runtime first.",
             )
 
-        # Delete all existing safetensors Model entries
-        safetensors_models = (
-            db.query(Model).filter(Model.model_format == "safetensors").all()
-        )
+        store = get_store()
+        safetensors_models = [
+            m for m in store.list_models()
+            if (m.get("format") or m.get("model_format")) == "safetensors"
+        ]
         deleted_count = len(safetensors_models)
         for model in safetensors_models:
-            db.delete(model)
-        db.commit()
-        logger.info(f"Deleted {deleted_count} safetensors model entries from database")
+            store.delete_model(model.get("id"))
+        logger.info(f"Deleted {deleted_count} safetensors model entries from store")
 
         # Delete all existing manifest files to regenerate from HuggingFace with defaults
         from backend.huggingface import _get_manifest_path
@@ -1567,99 +1548,25 @@ async def reload_safetensors_from_disk(db: Session = Depends(get_db)):
             if not safetensors_files:
                 continue
 
-            # Process each file to rebuild database entries
-            model_record = None
+            # Process each file to rebuild store entries (one model per repo via _save_safetensors_download)
             for file_info in safetensors_files:
                 try:
                     filename = file_info["filename"]
                     file_path = file_info["file_path"]
                     file_size = file_info["file_size"]
-
-                    # Collect metadata (this will also create/update the manifest)
-                    safetensors_metadata, tensor_summary, max_context = (
-                        await _collect_safetensors_runtime_metadata(
-                            huggingface_id, filename
-                        )
-                    )
-
-                    # Get or create model record (one per repo)
-                    if not model_record:
-                        detected_pipeline = safetensors_metadata.get("pipeline_tag")
-                        is_embedding_like = _looks_like_embedding_model(
-                            detected_pipeline, huggingface_id, filename
-                        )
-
-                        model_record = (
-                            db.query(Model)
-                            .filter(
-                                Model.huggingface_id == huggingface_id,
-                                Model.model_format == "safetensors",
-                            )
-                            .first()
-                        )
-
-                        if not model_record:
-                            model_record = Model(
-                                name=filename.replace(".safetensors", ""),
-                                huggingface_id=huggingface_id,
-                                base_model_name=extract_base_model_name(filename),
-                                file_path=file_path,  # Use first file's path
-                                file_size=0,  # Will be aggregated below
-                                quantization=os.path.splitext(filename)[0],
-                                model_type=extract_model_type(filename),
-                                downloaded_at=datetime.utcnow(),
-                                model_format="safetensors",
-                                pipeline_tag=detected_pipeline,
-                            )
-                            if is_embedding_like:
-                                model_record.config = {"embedding": True}
-                            db.add(model_record)
-                            db.commit()
-                            db.refresh(model_record)
-
-                    # Record file in manifest
-                    lmdeploy_config = get_default_lmdeploy_config(max_context)
-                    record_safetensors_download(
-                        huggingface_id=huggingface_id,
-                        filename=filename,
-                        file_path=file_path,
-                        file_size=file_size,
-                        metadata=safetensors_metadata,
-                        tensor_summary=tensor_summary,
-                        lmdeploy_config=lmdeploy_config,
-                        model_id=model_record.id,
+                    await _save_safetensors_download(
+                        store,
+                        huggingface_id,
+                        filename,
+                        file_path,
+                        file_size,
                     )
-
                 except Exception as exc:
                     error_msg = f"Failed to reload {huggingface_id}/{file_info.get('filename', 'unknown')}: {exc}"
                     logger.error(error_msg)
                     errors.append(error_msg)
                     continue
-
-            # Update model record with aggregated size
-            if model_record:
-                try:
-                    from backend.huggingface import list_safetensors_downloads
-
-                    manifests = list_safetensors_downloads()
-                    total_size = 0
-                    for manifest in manifests:
-                        if manifest.get("huggingface_id") == huggingface_id:
-                            total_size = sum(
-                                (f.get("file_size") or 0)
-                                for f in manifest.get("files", [])
-                            )
-                            break
-                    if total_size:
-                        model_record.file_size = total_size
-                        db.commit()
-                        db.refresh(model_record)
-                except Exception as exc:
-                    logger.warning(
-                        f"Failed to update aggregate size for {huggingface_id}: {exc}"
-                    )
-
-                reloaded_count += 1
+            reloaded_count += 1
 
         result = {
             "message": f"Reloaded {reloaded_count} safetensors models from disk",
@@ -1680,10 +1587,11 @@ async def reload_safetensors_from_disk(db: Session = Depends(get_db)):
         raise HTTPException(status_code=500, detail=str(e))
 
 
-@router.get("/safetensors/{model_id}/lmdeploy/config")
-async def get_lmdeploy_config_endpoint(model_id: int, db: Session = Depends(get_db)):
+@router.get("/safetensors/{model_id:path}/lmdeploy/config")
+async def get_lmdeploy_config_endpoint(model_id: str):
     """Return stored LMDeploy config and metadata for a safetensors model."""
-    model = _get_safetensors_model(model_id, db)
+    store = get_store()
+    model = _get_safetensors_model(store, model_id)
     manifest_entry = _load_manifest_entry_for_model(model)
     metadata = manifest_entry.get("metadata") or {}
     tensor_summary = manifest_entry.get("tensor_summary") or {}
@@ -1705,28 +1613,26 @@ async def get_lmdeploy_config_endpoint(model_id: int, db: Session = Depends(get_
     }
 
 
-@router.put("/safetensors/{model_id}/lmdeploy/config")
-async def update_lmdeploy_config_endpoint(
-    model_id: int, request: Dict[str, Any], db: Session = Depends(get_db)
-):
+@router.put("/safetensors/{model_id:path}/lmdeploy/config")
+async def update_lmdeploy_config_endpoint(model_id: str, request: Dict[str, Any]):
     """Persist LMDeploy configuration changes for a safetensors model."""
-    model = _get_safetensors_model(model_id, db)
+    store = get_store()
+    model = _get_safetensors_model(store, model_id)
     manifest_entry = _load_manifest_entry_for_model(model)
     validated_config = _validate_lmdeploy_config(request, manifest_entry)
-    updated_entry = update_lmdeploy_config(model.huggingface_id, validated_config)
+    updated_entry = update_lmdeploy_config(model.get("huggingface_id"), validated_config)
     return {
         "config": updated_entry.get("lmdeploy", {}).get("config", validated_config),
         "updated_at": updated_entry.get("lmdeploy", {}).get("updated_at"),
     }
 
 
-@router.post("/safetensors/{model_id}/metadata/regenerate")
-async def regenerate_safetensors_metadata_endpoint(
-    model_id: int, db: Session = Depends(get_db)
-):
+@router.post("/safetensors/{model_id:path}/metadata/regenerate")
+async def regenerate_safetensors_metadata_endpoint(model_id: str):
     """Refresh safetensors metadata/manifest entries without redownloading files."""
-    model = _get_safetensors_model(model_id, db)
-    huggingface_id = model.huggingface_id
+    store = get_store()
+    model = _get_safetensors_model(store, model_id)
+    huggingface_id = model.get("huggingface_id")
     manifest = get_safetensors_manifest_entries(huggingface_id)
     if not manifest or not manifest.get("files"):
         raise HTTPException(
@@ -1813,7 +1719,7 @@ async def regenerate_safetensors_metadata_endpoint(
 
 
 @router.get("/safetensors/lmdeploy/status")
-async def get_lmdeploy_status(db: Session = Depends(get_db)):
+async def get_lmdeploy_status():
     """Return LMDeploy runtime status and running instance info."""
     installer = get_lmdeploy_installer()
     installer_status = installer.status()
@@ -1829,45 +1735,18 @@ async def get_lmdeploy_status(db: Session = Depends(get_db)):
         )
 
     manager = get_lmdeploy_manager()
-    installer = get_lmdeploy_installer()
     manager_status = manager.status()
 
-    # Only return running_instance if LMDeploy is actually running
+    # Use manager's in-memory current_instance (no DB)
     instance_payload = None
     if manager_status.get("running"):
-        running_instance = (
-            db.query(RunningInstance)
-            .filter(RunningInstance.runtime_type == "lmdeploy")
-            .first()
-        )
-        if running_instance:
+        current_instance = manager_status.get("current_instance")
+        if current_instance:
             instance_payload = {
-                "model_id": running_instance.model_id,
-                "started_at": (
-                    running_instance.started_at.isoformat()
-                    if running_instance.started_at
-                    else None
-                ),
-                "config": (
-                    json.loads(running_instance.config)
-                    if running_instance.config
-                    else {}
-                ),
+                "model_id": current_instance.get("model_id"),
+                "started_at": current_instance.get("started_at"),
+                "config": current_instance.get("config") if isinstance(current_instance.get("config"), dict) else {},
             }
-    else:
-        # Clean up stale RunningInstance records if LMDeploy is not running
-        stale_instances = (
-            db.query(RunningInstance)
-            .filter(RunningInstance.runtime_type == "lmdeploy")
-            .all()
-        )
-        if stale_instances:
-            for instance in stale_instances:
-                model = db.query(Model).filter(Model.id == instance.model_id).first()
-                if model:
-                    model.is_active = False
-                db.delete(instance)
-            db.commit()
 
     return {
         "manager": manager_status,
@@ -1876,27 +1755,25 @@ async def get_lmdeploy_status(db: Session = Depends(get_db)):
     }
 
 
-@router.post("/safetensors/{model_id}/lmdeploy/start")
+@router.post("/safetensors/{model_id:path}/lmdeploy/start")
 async def start_lmdeploy_runtime(
-    model_id: int,
+    model_id: str,
     request: Optional[Dict[str, Any]] = None,
-    db: Session = Depends(get_db),
 ):
     """Start LMDeploy runtime for a safetensors model."""
-    model = _get_safetensors_model(model_id, db)
+    store = get_store()
+    model = _get_safetensors_model(store, model_id)
     manifest_entry = _load_manifest_entry_for_model(model)
     requested_config = (
         (request or {}).get("config") if isinstance(request, dict) else None
     )
     validated_config = _validate_lmdeploy_config(requested_config, manifest_entry)
 
-    existing_instance = (
-        db.query(RunningInstance)
-        .filter(RunningInstance.runtime_type == "lmdeploy")
-        .first()
-    )
-    if existing_instance:
-        if existing_instance.model_id == model.id:
+    manager = get_lmdeploy_manager()
+    status = manager.status()
+    current_instance = status.get("current_instance") or {}
+    if status.get("running"):
+        if current_instance.get("model_id") == model.get("id"):
             raise HTTPException(
                 status_code=400, detail="LMDeploy is already running for this model"
             )
@@ -1905,41 +1782,30 @@ async def start_lmdeploy_runtime(
             detail="Another safetensors model is already running via LMDeploy",
         )
 
-    manager = get_lmdeploy_manager()
-    status = manager.status()
-    current_instance = status.get("current_instance") or {}
-    if status.get("running") and current_instance.get("model_id") not in (
-        None,
-        model.id,
-    ):
-        raise HTTPException(
-            status_code=400, detail="LMDeploy runtime is already serving another model"
-        )
-
-    update_lmdeploy_config(model.huggingface_id, validated_config)
+    update_lmdeploy_config(model.get("huggingface_id"), validated_config)
 
-    from backend.main import websocket_manager
-
-    await websocket_manager.send_model_status_update(
-        model_id=model.id,
-        status="starting",
-        details={
-            "runtime": "lmdeploy",
-            "message": f"Starting LMDeploy for {model.name}",
-        },
-    )
+    try:
+        pm = get_progress_manager()
+        await pm.send_model_status_update(
+            model_id=model.get("id"),
+            status="starting",
+            details={
+                "runtime": "lmdeploy",
+                "message": f"Starting LMDeploy for {model.get('display_name') or model.get('name')}",
+            },
+        )
+    except Exception:
+        pass
 
     try:
-        # Derive a human-friendly model name for LMDeploy (used by --model-name).
-        # For unified safetensors models, use the Hugging Face repo id.
-        display_name = model.huggingface_id or model.base_model_name or model.name
-        # For unified manifests, use the model directory (contains all files)
-        model_dir = os.path.dirname(model.file_path)
+        display_name = model.get("huggingface_id") or model.get("base_model_name") or model.get("display_name") or model.get("name")
+        resolved_file_path = _get_model_file_path(model)
+        model_dir = os.path.dirname(resolved_file_path or "")
         runtime_status = await manager.start(
             {
-                "model_id": model.id,
-                "huggingface_id": model.huggingface_id,
-                "file_path": model.file_path,
+                "model_id": model.get("id"),
+                "huggingface_id": model.get("huggingface_id"),
+                "file_path": resolved_file_path,
                 "model_dir": model_dir,
                 "model_name": display_name,
                 "display_name": display_name,
@@ -1947,81 +1813,61 @@ async def start_lmdeploy_runtime(
             validated_config,
         )
     except Exception as exc:
-        await websocket_manager.send_model_status_update(
-            model_id=model.id,
-            status="error",
-            details={"runtime": "lmdeploy", "message": str(exc)},
-        )
+        try:
+            await get_progress_manager().send_model_status_update(
+                model_id=model.get("id"),
+                status="error",
+                details={"runtime": "lmdeploy", "message": str(exc)},
+            )
+        except Exception:
+            pass
         raise HTTPException(status_code=500, detail=str(exc))
 
-    running_instance = RunningInstance(
-        model_id=model.id,
-        llama_version="lmdeploy",
-        proxy_model_name=f"lmdeploy::{model.id}",
-        started_at=datetime.utcnow(),
-        config=json.dumps({"lmdeploy": validated_config}),
-        runtime_type="lmdeploy",
-    )
-    db.add(running_instance)
-    model.is_active = True
-    db.commit()
-
-    from backend.unified_monitor import unified_monitor
-
-    await unified_monitor._collect_and_send_unified_data()
-    await websocket_manager.send_model_status_update(
-        model_id=model.id,
-        status="running",
-        details={"runtime": "lmdeploy", "message": "LMDeploy is ready"},
-    )
+    try:
+        await get_progress_manager().send_model_status_update(
+            model_id=model.get("id"),
+            status="running",
+            details={"runtime": "lmdeploy", "message": "LMDeploy is ready"},
+        )
+    except Exception:
+        pass
 
     return {"manager": runtime_status, "config": validated_config}
 
 
-@router.post("/safetensors/{model_id}/lmdeploy/stop")
-async def stop_lmdeploy_runtime(model_id: int, db: Session = Depends(get_db)):
+@router.post("/safetensors/{model_id:path}/lmdeploy/stop")
+async def stop_lmdeploy_runtime(model_id: str):
     """Stop the LMDeploy runtime if it is running."""
-    running_instance = (
-        db.query(RunningInstance)
-        .filter(RunningInstance.runtime_type == "lmdeploy")
-        .first()
-    )
-    if not running_instance:
+    manager = get_lmdeploy_manager()
+    status = manager.status()
+    if not status.get("running"):
         raise HTTPException(status_code=404, detail="No LMDeploy runtime is active")
-    if running_instance.model_id != model_id:
+    current_instance = status.get("current_instance") or {}
+    if str(current_instance.get("model_id")) != str(model_id):
         raise HTTPException(
             status_code=400, detail="A different model is currently running in LMDeploy"
         )
 
-    manager = get_lmdeploy_manager()
     try:
         await manager.stop()
     except Exception as exc:
         raise HTTPException(status_code=500, detail=str(exc))
 
-    db.delete(running_instance)
-    model = db.query(Model).filter(Model.id == model_id).first()
-    if model:
-        model.is_active = False
-    db.commit()
-
-    from backend.unified_monitor import unified_monitor
-
-    await unified_monitor._collect_and_send_unified_data()
-    from backend.main import websocket_manager
-
-    await websocket_manager.send_model_status_update(
-        model_id=model_id,
-        status="stopped",
-        details={"runtime": "lmdeploy", "message": "LMDeploy runtime stopped"},
-    )
+    try:
+        await get_progress_manager().send_model_status_update(
+            model_id=model_id,
+            status="stopped",
+            details={"runtime": "lmdeploy", "message": "LMDeploy runtime stopped"},
+        )
+    except Exception:
+        pass
 
     return {"message": "LMDeploy runtime stopped"}
 
 
 @router.post("/download")
 async def download_huggingface_model(
-    request: dict, background_tasks: BackgroundTasks, db: Session = Depends(get_db)
+    request: dict, background_tasks: BackgroundTasks
 ):
     """Download model from HuggingFace"""
     try:
@@ -2053,17 +1899,12 @@ async def download_huggingface_model(
                 detail="filename must end with .safetensors for Safetensors downloads",
             )
 
-        # Check if this specific quantization already exists in database
+        store = get_store()
+        # Check if this specific quantization already exists
         if model_format == "gguf":
-            existing = (
-                db.query(Model)
-                .filter(
-                    Model.huggingface_id == huggingface_id,
-                    Model.name == filename.replace(".gguf", ""),
-                )
-                .first()
-            )
-            if existing:
+            quantization = _extract_quantization(filename)
+            model_id = f"{huggingface_id.replace('/', '--')}--{quantization}"
+            if store.get_model(model_id):
                 raise HTTPException(
                     status_code=400, detail="This quantization is already downloaded"
                 )
@@ -2100,15 +1941,14 @@ async def download_huggingface_model(
                 "model_format": model_format,
             }
 
-        # Get websocket manager from main app
-        from backend.main import websocket_manager
-
-        # Start download in background (REMOVE db parameter, pass task_id)
+        # Start download in background with progress_manager for SSE
+        pm = get_progress_manager()
+        pm.create_task("download", f"Download {filename}", {"huggingface_id": huggingface_id, "filename": filename}, task_id=task_id)
         background_tasks.add_task(
             download_model_task,
             huggingface_id,
             filename,
-            websocket_manager,
+            pm,
             task_id,
             total_bytes,
             model_format,
@@ -2171,26 +2011,24 @@ async def set_huggingface_token_endpoint(request: dict):
 async def download_model_task(
     huggingface_id: str,
     filename: str,
-    websocket_manager=None,
+    progress_manager=None,
     task_id: str = None,
     total_bytes: int = 0,
     model_format: str = "gguf",
     pipeline_tag: Optional[str] = None,
 ):
-    """Background task to download model with WebSocket progress"""
-    from backend.database import SessionLocal
-
-    db = SessionLocal()
+    """Background task to download model with SSE progress"""
+    store = get_store()
 
     try:
         model_record = None
         metadata_result = None
 
-        if websocket_manager and task_id:
-            file_path, file_size = await download_model_with_websocket_progress(
+        if progress_manager and task_id:
+            file_path, file_size = await download_model_with_progress(
                 huggingface_id,
                 filename,
-                websocket_manager,
+                progress_manager,
                 task_id,
                 total_bytes,
                 model_format,
@@ -2203,16 +2041,38 @@ async def download_model_task(
 
         if model_format == "gguf":
             model_record, metadata_result = await _record_gguf_download_post_fetch(
-                db,
+                store,
                 huggingface_id,
                 filename,
                 file_path,
                 file_size,
                 pipeline_tag=pipeline_tag,
             )
+            # If vision (mmproj) is available, download F16 projector so the model can run with vision
+            if model_record:
+                mmproj_filename = get_mmproj_f16_filename(huggingface_id)
+                if mmproj_filename:
+                    try:
+                        await download_model(
+                            huggingface_id, mmproj_filename, "gguf"
+                        )
+                        store.update_model(
+                            model_record["id"], {"mmproj_filename": mmproj_filename}
+                        )
+                        model_record = store.get_model(model_record["id"]) or model_record
+                        if progress_manager and task_id:
+                            await progress_manager.send_notification(
+                                title="Vision extension",
+                                message=f"Downloaded {mmproj_filename} for vision support",
+                                type="info",
+                            )
+                    except Exception as mmproj_err:
+                        logger.warning(
+                            f"Could not download vision projector {mmproj_filename} for {huggingface_id}: {mmproj_err}"
+                        )
         else:
             model_record = await _save_safetensors_download(
-                db,
+                store,
                 huggingface_id,
                 filename,
                 file_path,
@@ -2220,20 +2080,21 @@ async def download_model_task(
                 pipeline_tag=pipeline_tag,
             )
 
-        # Send download complete WebSocket event (NEW)
-        if websocket_manager:
+        # Send download complete via SSE
+        if progress_manager and task_id:
+            progress_manager.complete_task(task_id, f"Downloaded {filename}")
             payload = {
                 "type": "download_complete",
                 "huggingface_id": huggingface_id,
                 "filename": filename,
                 "model_format": model_format,
-                "quantization": model_record.quantization if model_record else None,
-                "model_id": model_record.id if model_record else None,
+                "quantization": model_record.get("quantization") if model_record else None,
+                "model_id": model_record.get("id") if model_record else None,
                 "base_model_name": (
-                    model_record.base_model_name if model_record else None
+                    model_record.get("base_model_name") if model_record else None
                 ),
                 "pipeline_tag": (
-                    model_record.pipeline_tag if model_record else pipeline_tag
+                    model_record.get("pipeline_tag") if model_record else pipeline_tag
                 ),
                 "is_embedding_model": (
                     _model_is_embedding(model_record) if model_record else False
@@ -2248,40 +2109,38 @@ async def download_model_task(
                 "file_size": file_size,
                 "file_path": file_path,
             }
-            await websocket_manager.broadcast({**payload})
-
-            await websocket_manager.send_notification(
+            await progress_manager.broadcast({**payload})
+            await progress_manager.send_notification(
                 title="Download Complete",
                 message=f"Successfully downloaded {filename} ({model_format})",
                 type="success",
             )
 
     except Exception as e:
-        if websocket_manager:
-            await websocket_manager.send_notification(
+        if progress_manager and task_id:
+            progress_manager.fail_task(task_id, str(e))
+            await progress_manager.send_notification(
                 title="Download Failed",
                 message=f"Failed to download {filename}: {str(e)}",
                 type="error",
             )
     finally:
-        # Cleanup: remove from active downloads and close session
         if task_id:
             async with download_lock:
                 active_downloads.pop(task_id, None)
-        db.close()
 
 
 async def _record_gguf_download_post_fetch(
-    db: Session,
+    store,
     huggingface_id: str,
     filename: str,
     file_path: str,
     file_size: int,
     pipeline_tag: Optional[str] = None,
-) -> Tuple[Model, Optional[Dict[str, Any]]]:
+) -> Tuple[dict, Optional[Dict[str, Any]]]:
     """
-    Shared helper to create GGUF Model rows and manifest entries after a file has been downloaded.
-    Returns (model_record, metadata_result).
+    Shared helper to create GGUF model entries and manifest after a file has been downloaded.
+    Returns (model_record dict, metadata_result).
     """
     quantization = _extract_quantization(filename)
     base_model_name = extract_base_model_name(filename)
@@ -2296,89 +2155,68 @@ async def _record_gguf_download_post_fetch(
         detected_pipeline = "text-embedding"
     metadata_result: Optional[Dict[str, Any]] = None
 
-    # Reuse a single logical Model row per (huggingface_id, quantization) to avoid
-    # creating one entry per GGUF shard. Additional shards for the same quantization
-    # simply update size/metadata and are tracked in the GGUF manifest.
-    model_record = (
-        db.query(Model)
-        .filter(
-            Model.huggingface_id == huggingface_id,
-            Model.quantization == quantization,
-            Model.model_format == "gguf",
-        )
-        .first()
-    )
+    model_id = f"{huggingface_id.replace('/', '--')}--{quantization}"
+    model_record = store.get_model(model_id)
 
     if not model_record:
-        model_record = Model(
-            name=filename.replace(".gguf", ""),
-            huggingface_id=huggingface_id,
-            base_model_name=base_model_name,
-            file_path=file_path,
-            file_size=file_size,
-            quantization=quantization,
-            model_type=extract_model_type(filename),
-            proxy_name=generate_proxy_name(huggingface_id, quantization),
-            model_format="gguf",
-            downloaded_at=datetime.utcnow(),
-            pipeline_tag=detected_pipeline,
-        )
-        if is_embedding_like:
-            model_record.config = {"embedding": True}
-        db.add(model_record)
-        db.commit()
-        db.refresh(model_record)
+        from datetime import timezone as _tz
+        model_record = {
+            "id": model_id,
+            "huggingface_id": huggingface_id,
+            "filename": filename,
+            "display_name": filename.replace(".gguf", ""),
+            "base_model_name": base_model_name,
+            "file_size": file_size,
+            "quantization": quantization,
+            "model_type": extract_model_type(filename),
+            "proxy_name": generate_proxy_name(huggingface_id, quantization),
+            "format": "gguf",
+            "model_format": "gguf",
+            "downloaded_at": datetime.now(_tz.utc).isoformat(),
+            "pipeline_tag": detected_pipeline,
+            "config": {"embedding": True} if is_embedding_like else {},
+        }
+        store.add_model(model_record)
     else:
-        updated = False
-        # Keep first file_path as canonical; just update aggregate size.
+        updates = {}
         if file_size and file_size > 0:
-            current_size = model_record.file_size or 0
-            model_record.file_size = current_size + file_size
-            updated = True
-        if not model_record.pipeline_tag and detected_pipeline:
-            model_record.pipeline_tag = detected_pipeline
-            updated = True
+            current_size = model_record.get("file_size") or 0
+            updates["file_size"] = current_size + file_size
+        if not model_record.get("pipeline_tag") and detected_pipeline:
+            updates["pipeline_tag"] = detected_pipeline
         if is_embedding_like:
-            current_config = _coerce_model_config(model_record.config)
+            current_config = _coerce_model_config(model_record.get("config"))
             if not current_config.get("embedding"):
                 current_config["embedding"] = True
-                model_record.config = current_config
-                updated = True
-        if updated:
-            db.commit()
-            db.refresh(model_record)
+                updates["config"] = current_config
+        if updates:
+            store.update_model(model_id, updates)
+        model_record = store.get_model(model_id) or model_record
 
+    metadata_result = None
     try:
-        metadata_result = _refresh_model_metadata_from_file(model_record, db)
+        metadata_result = _refresh_model_metadata_from_file(model_record, store)
     except FileNotFoundError:
-        logger.warning(
-            f"Model file missing during metadata refresh for {model_record.id}"
-        )
+        logger.warning(f"Model file missing during metadata refresh for {model_record.get('id')}")
     except Exception as meta_exc:
-        logger.warning(
-            f"Failed to refresh metadata for model {model_record.id}: {meta_exc}"
-        )
+        logger.warning(f"Failed to refresh metadata for model {model_record.get('id')}: {meta_exc}")
 
     manifest_entry = None
     try:
         manifest_entry = await create_gguf_manifest_entry(
-            model_record.huggingface_id,
+            model_record.get("huggingface_id"),
             file_path,
             file_size,
-            model_id=model_record.id,
+            model_id=model_record.get("id"),
         )
     except Exception as manifest_exc:
-        logger.warning(
-            f"Failed to record GGUF manifest entry for {filename}: {manifest_exc}"
-        )
+        logger.warning(f"Failed to record GGUF manifest entry for {filename}: {manifest_exc}")
     if manifest_entry:
         metadata_for_defaults = manifest_entry.get("metadata") or {}
         try:
-            _apply_hf_defaults_to_model(model_record, metadata_for_defaults, db)
+            _apply_hf_defaults_to_model(model_record, metadata_for_defaults, store)
         except Exception as default_exc:
-            logger.warning(
-                f"Failed to apply HF defaults for model {model_record.id}: {default_exc}"
-            )
+            logger.warning(f"Failed to apply HF defaults for model {model_record.get('id')}: {default_exc}")
 
     return model_record, metadata_result
 
@@ -2386,13 +2224,11 @@ async def _record_gguf_download_post_fetch(
 async def download_safetensors_bundle_task(
     huggingface_id: str,
     files: List[Dict[str, Any]],
-    websocket_manager,
+    progress_manager,
     task_id: str,
     total_bundle_bytes: int = 0,
 ):
-    from backend.database import SessionLocal
-
-    db = SessionLocal()
+    store = get_store()
     try:
         total_files = len(files)
         bytes_completed = 0
@@ -2405,7 +2241,7 @@ async def download_safetensors_bundle_task(
             filename = file_info["filename"]
             size_hint = max(file_info.get("size") or 0, 0)
             proxy = BundleProgressProxy(
-                websocket_manager,
+                progress_manager,
                 task_id,
                 bytes_completed,
                 aggregate_total or 0,
@@ -2416,7 +2252,7 @@ async def download_safetensors_bundle_task(
                 "safetensors-bundle",
             )
 
-            file_path, file_size = await download_model_with_websocket_progress(
+            file_path, file_size = await download_model_with_progress(
                 huggingface_id,
                 filename,
                 proxy,
@@ -2429,7 +2265,7 @@ async def download_safetensors_bundle_task(
             if filename.endswith(".safetensors"):
                 try:
                     await _save_safetensors_download(
-                        db, huggingface_id, filename, file_path, file_size
+                        store, huggingface_id, filename, file_path, file_size
                     )
                 except Exception as exc:
                     logger.error(
@@ -2439,7 +2275,7 @@ async def download_safetensors_bundle_task(
             bytes_completed += file_size
 
         final_total = aggregate_total or bytes_completed
-        await websocket_manager.send_download_progress(
+        await progress_manager.send_download_progress(
             task_id=task_id,
             progress=100,
             message=f"Safetensors bundle downloaded ({total_files} files)",
@@ -2454,8 +2290,9 @@ async def download_safetensors_bundle_task(
             current_filename=files[-1]["filename"] if files else "",
             huggingface_id=huggingface_id,
         )
-
-        await websocket_manager.broadcast(
+        if progress_manager:
+            progress_manager.complete_task(task_id, "Safetensors bundle downloaded")
+        await progress_manager.broadcast(
             {
                 "type": "download_complete",
                 "huggingface_id": huggingface_id,
@@ -2466,14 +2303,15 @@ async def download_safetensors_bundle_task(
         )
     except Exception as exc:
         logger.error(f"Safetensors bundle download failed: {exc}")
-        if websocket_manager:
-            await websocket_manager.send_notification(
+        if progress_manager:
+            await progress_manager.send_notification(
                 "error",
                 "Download Failed",
                 f"Safetensors bundle failed: {str(exc)}",
                 task_id,
             )
-        await websocket_manager.broadcast(
+            progress_manager.fail_task(task_id, str(exc))
+        await progress_manager.broadcast(
             {
                 "type": "download_complete",
                 "huggingface_id": huggingface_id,
@@ -2483,36 +2321,23 @@ async def download_safetensors_bundle_task(
                 "error": str(exc),
             }
         )
-    else:
-        await websocket_manager.broadcast(
-            {
-                "type": "download_complete",
-                "huggingface_id": huggingface_id,
-                "model_format": "safetensors_bundle",
-                "filenames": [f["filename"] for f in files],
-                "timestamp": datetime.utcnow().isoformat(),
-            }
-        )
 
     finally:
         if task_id:
             async with download_lock:
                 active_downloads.pop(task_id, None)
-        db.close()
 
 
 async def download_gguf_bundle_task(
     huggingface_id: str,
     quantization: str,
     files: List[Dict[str, Any]],
-    websocket_manager,
+    progress_manager,
     task_id: str,
     total_bundle_bytes: int = 0,
     pipeline_tag: Optional[str] = None,
 ):
-    from backend.database import SessionLocal
-
-    db = SessionLocal()
+    store = get_store()
     try:
         total_files = len(files)
         bytes_completed = 0
@@ -2525,7 +2350,7 @@ async def download_gguf_bundle_task(
             filename = file_info["filename"]
             size_hint = max(file_info.get("size") or 0, 0)
             proxy = BundleProgressProxy(
-                websocket_manager,
+                progress_manager,
                 task_id,
                 bytes_completed,
                 aggregate_total or 0,
@@ -2536,7 +2361,7 @@ async def download_gguf_bundle_task(
                 "gguf-bundle",
             )
 
-            file_path, file_size = await download_model_with_websocket_progress(
+            file_path, file_size = await download_model_with_progress(
                 huggingface_id,
                 filename,
                 proxy,
@@ -2546,10 +2371,9 @@ async def download_gguf_bundle_task(
                 huggingface_id,
             )
 
-            # Reuse the standard GGUF recording path to keep DB and manifest consistent
             try:
                 await _record_gguf_download_post_fetch(
-                    db,
+                    store,
                     huggingface_id,
                     filename,
                     file_path,
@@ -2562,7 +2386,7 @@ async def download_gguf_bundle_task(
             bytes_completed += file_size
 
         final_total = aggregate_total or bytes_completed
-        await websocket_manager.send_download_progress(
+        await progress_manager.send_download_progress(
             task_id=task_id,
             progress=100,
             message=f"GGUF bundle downloaded ({total_files} files)",
@@ -2577,8 +2401,9 @@ async def download_gguf_bundle_task(
             current_filename=files[-1]["filename"] if files else "",
             huggingface_id=huggingface_id,
         )
-
-        await websocket_manager.broadcast(
+        if progress_manager:
+            progress_manager.complete_task(task_id, "GGUF bundle downloaded")
+        await progress_manager.broadcast(
             {
                 "type": "download_complete",
                 "huggingface_id": huggingface_id,
@@ -2590,14 +2415,15 @@ async def download_gguf_bundle_task(
         )
     except Exception as exc:
         logger.error(f"GGUF bundle download failed: {exc}")
-        if websocket_manager:
-            await websocket_manager.send_notification(
+        if progress_manager:
+            await progress_manager.send_notification(
                 "error",
                 "Download Failed",
                 f"GGUF bundle failed: {str(exc)}",
                 task_id,
             )
-        await websocket_manager.broadcast(
+            progress_manager.fail_task(task_id, str(exc))
+        await progress_manager.broadcast(
             {
                 "type": "download_complete",
                 "huggingface_id": huggingface_id,
@@ -2612,7 +2438,6 @@ async def download_gguf_bundle_task(
         if task_id:
             async with download_lock:
                 active_downloads.pop(task_id, None)
-        db.close()
 
 
 @router.post("/safetensors/download-bundle")
@@ -2656,13 +2481,13 @@ async def download_safetensors_bundle(
             "model_format": "safetensors_bundle",
         }
 
-    from backend.main import websocket_manager
-
+    pm = get_progress_manager()
+    pm.create_task("download", f"Safetensors bundle {huggingface_id}", {"huggingface_id": huggingface_id}, task_id=task_id)
     background_tasks.add_task(
         download_safetensors_bundle_task,
         huggingface_id,
         sanitized_files,
-        websocket_manager,
+        pm,
         task_id,
         declared_total,
     )
@@ -2724,14 +2549,14 @@ async def download_gguf_bundle(
             "model_format": "gguf-bundle",
         }
 
-    from backend.main import websocket_manager
-
+    pm = get_progress_manager()
+    pm.create_task("download", f"GGUF bundle {huggingface_id} ({quantization})", {"huggingface_id": huggingface_id, "quantization": quantization}, task_id=task_id)
     background_tasks.add_task(
         download_gguf_bundle_task,
         huggingface_id,
         quantization,
         sanitized_files,
-        websocket_manager,
+        pm,
         task_id,
         declared_total,
         pipeline_tag,
@@ -2787,342 +2612,174 @@ def extract_base_model_name(filename: str) -> str:
     return name if name else filename
 
 
-@router.get("/{model_id}/config")
-async def get_model_config(model_id: int, db: Session = Depends(get_db)):
+@router.get("/{model_id:path}/config")
+async def get_model_config(model_id: str):
     """Get model's llama.cpp configuration"""
-    model = db.query(Model).filter(Model.id == model_id).first()
-    if not model:
-        raise HTTPException(status_code=404, detail="Model not found")
+    store = get_store()
+    model = _get_model_or_404(store, model_id)
+    return _coerce_model_config(model.get("config"))
 
-    return _coerce_model_config(model.config)
 
-
-@router.put("/{model_id}/config")
-async def update_model_config(
-    model_id: int, config: dict, db: Session = Depends(get_db)
-):
+@router.put("/{model_id:path}/config")
+async def update_model_config(model_id: str, config: dict):
     """Update model's llama.cpp configuration"""
-    model = db.query(Model).filter(Model.id == model_id).first()
-    if not model:
-        raise HTTPException(status_code=404, detail="Model not found")
-
-    model.config = config
-    db.commit()
+    store = get_store()
+    model = _get_model_or_404(store, model_id)
+    store.update_model(model_id, {"config": config})
 
-    # Regenerate llama-swap configuration to reflect the updated model config
     try:
         from backend.llama_swap_manager import get_llama_swap_manager
-
         llama_swap_manager = get_llama_swap_manager()
         await llama_swap_manager.regenerate_config_with_active_version()
         logger.info(
-            f"Regenerated llama-swap config after updating model {model.name} configuration"
+            f"Regenerated llama-swap config after updating model {model.get('display_name') or model.get('name')} configuration"
         )
     except Exception as e:
-        logger.warning(
-            f"Failed to regenerate llama-swap config after model config update: {e}"
-        )
+        logger.warning(f"Failed to regenerate llama-swap config after model config update: {e}")
 
     return {"message": "Configuration updated"}
 
 
-@router.post("/{model_id}/auto-config")
-async def generate_auto_config(model_id: int, db: Session = Depends(get_db)):
-    """Generate optimal configuration using Smart-Auto"""
-    model = db.query(Model).filter(Model.id == model_id).first()
-    if not model:
-        raise HTTPException(status_code=404, detail="Model not found")
-
-    try:
-        gpu_info = await get_gpu_info()
-        smart_auto = SmartAutoConfig()
-        config = await smart_auto.generate_config(model, gpu_info)
-
-        # Save the generated config
-        model.config = config
-        db.commit()
+# DEPRECATED: remove with ModelConfig.vue rewrite
+@router.post("/{model_id:path}/auto-config")
+async def generate_auto_config(model_id: str):
+    """Stub: return current config (Smart Auto removed). Optionally apply defaults."""
+    store = get_store()
+    model = _get_model_or_404(store, model_id)
+    config = (model.get("config") or {}).copy()
+    config.setdefault("ctx_size", 2048)
+    config.setdefault("batch_size", 512)
+    config.setdefault("threads", 4)
+    config.setdefault("n_gpu_layers", -1)
+    store.update_model(model_id, {"config": config})
+    return config
 
-        # Regenerate llama-swap configuration to reflect the updated model config
-        try:
-            from backend.llama_swap_manager import get_llama_swap_manager
 
-            llama_swap_manager = get_llama_swap_manager()
-            await llama_swap_manager.regenerate_config_with_active_version()
-            logger.info(
-                f"Regenerated llama-swap config after auto-config for model {model.name}"
-            )
-        except Exception as e:
-            logger.warning(
-                f"Failed to regenerate llama-swap config after auto-config: {e}"
-            )
-
-        return config
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
-
-
-@router.post("/{model_id}/smart-auto")
+# DEPRECATED: remove with ModelConfig.vue rewrite
+@router.post("/{model_id:path}/smart-auto")
 async def generate_smart_auto_config(
-    model_id: int,
+    model_id: str,
     preset: Optional[str] = None,
     usage_mode: str = "single_user",
     speed_quality: Optional[int] = None,
     use_case: Optional[str] = None,
     debug: Optional[bool] = False,
-    db: Session = Depends(get_db),
 ):
-    """
-    Generate smart auto configuration with optional preset tuning, speed/quality balance, and use case.
-
-    preset: Optional preset name (coding, conversational, long_context) to use as tuning parameters
-    usage_mode: 'single_user' (sequential, peak KV cache) or 'multi_user' (server, typical usage)
-    speed_quality: Speed/quality balance (0-100), where 0 = max speed, 100 = max quality. Default: 50 (balanced)
-    use_case: Optional use case ('chat', 'code', 'creative', 'analysis') for targeted optimization
-    """
-    model = db.query(Model).filter(Model.id == model_id).first()
-    if not model:
-        raise HTTPException(status_code=404, detail="Model not found")
-
-    try:
-        gpu_info = await get_gpu_info()
-        smart_auto = SmartAutoConfig()
-        debug_map = {} if debug else None
-
-        # Validate usage_mode
-        if usage_mode not in ["single_user", "multi_user"]:
-            usage_mode = "single_user"  # Default to single_user if invalid
-
-        # Validate and normalize speed_quality (0-100, default 50)
-        if speed_quality is not None:
-            speed_quality = max(0, min(100, int(speed_quality)))
-        else:
-            speed_quality = 50
-
-        # Validate use_case
-        if use_case is not None and use_case not in [
-            "chat",
-            "code",
-            "creative",
-            "analysis",
-        ]:
-            use_case = None  # Invalid use case, ignore it
-
-        # If preset is provided, pass it to generate_config for tuning
-        # Also pass speed_quality and use_case for wizard-based configuration
-        config = await smart_auto.generate_config(
-            model,
-            gpu_info,
-            preset=preset,
-            usage_mode=usage_mode,
-            speed_quality=speed_quality,
-            use_case=use_case,
-            debug=debug_map,
-        )
-
-        if debug_map is not None:
-            return {"config": config, "debug": debug_map}
-        return config
-
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
-
-
-@router.post("/{model_id}/start")
-async def start_model(model_id: int, db: Session = Depends(get_db)):
+    """Stub: apply defaults (Smart Auto removed)."""
+    store = get_store()
+    model = _get_model_or_404(store, model_id)
+    config = (model.get("config") or {}).copy()
+    config.setdefault("ctx_size", 2048)
+    config.setdefault("batch_size", 512)
+    config.setdefault("threads", 4)
+    config.setdefault("n_gpu_layers", -1)
+    store.update_model(model_id, {"config": config})
+    return config
+
+
+@router.post("/{model_id:path}/start")
+async def start_model(model_id: str):
     """Start model via llama-swap"""
-    model = db.query(Model).filter(Model.id == model_id).first()
-    if not model:
-        raise HTTPException(status_code=404, detail="Model not found")
+    from backend.llama_swap_client import LlamaSwapClient
 
-    # Check if already running
-    existing = (
-        db.query(RunningInstance).filter(RunningInstance.model_id == model_id).first()
+    store = get_store()
+    model = _get_model_or_404(store, model_id)
+    proxy_model_name = model.get("proxy_name") or generate_proxy_name(
+        model.get("huggingface_id"), model.get("quantization")
     )
-    if existing:
-        raise HTTPException(status_code=400, detail="Model already running")
 
     try:
-        from backend.unified_monitor import unified_monitor
-        from backend.main import websocket_manager
+        running_data = await LlamaSwapClient().get_running_models()
+        running_list = running_data.get("running") or []
+        running_names = {item.get("model") for item in running_list if item.get("state") in ("running", "ready")}
+    except Exception:
+        running_names = set()
+    if proxy_model_name in running_names:
+        raise HTTPException(status_code=400, detail="Model already running")
 
-        await websocket_manager.send_model_status_update(
+    try:
+        await get_progress_manager().send_model_status_update(
             model_id=model_id,
             status="starting",
-            details={"message": f"Starting {model.name}"},
-        )
-
-        # Get proxy name from database (config already contains this model)
-        if not model.proxy_name:
-            raise ValueError(f"Model '{model.name}' does not have a proxy_name set")
-        proxy_model_name = model.proxy_name
-
-        # Get model configuration (for database record, not config file)
-        config = _coerce_model_config(model.config)
-        if _looks_like_embedding_model(
-            model.pipeline_tag, model.huggingface_id, model.name, model.base_model_name
-        ) and not config.get("embedding"):
-            config["embedding"] = True
-            model.config = config
-            db.commit()
-
-        # NOTE: We do NOT trigger model loading here.
-        # The model will load on-demand when the first API request is made.
-        # This avoids memory issues from making inference requests during load.
-        #
-        # With sendLoadingState: true (llama-swap v171+), the first request will
-        # stream loading progress to the user.
-        logger.info(
-            f"Model {proxy_model_name} registered - will load on first API request"
+            details={"message": f"Starting {model.get('display_name') or model.get('name')}"},
         )
+    except Exception:
+        pass
 
-        # Save to database
-        running_instance = RunningInstance(
-            model_id=model_id,
-            llama_version=config.get("llama_version", "default"),
-            proxy_model_name=proxy_model_name,
-            started_at=datetime.utcnow(),
-            config=json.dumps(config),
-            runtime_type="llama_cpp",
-        )
-        db.add(running_instance)
-        model.is_active = True
-        db.commit()
-
-        # Broadcast ready event - model is registered and available for requests
-        # The actual loading happens on first API request (on-demand)
-        await unified_monitor.broadcast_model_event(
-            "ready", proxy_model_name, {"model_id": model_id, "model_name": model.name}
-        )
-        await unified_monitor.trigger_status_update()
-
-        return {
-            "model_id": model_id,
-            "proxy_model_name": proxy_model_name,
-            "port": 2000,
-            "api_endpoint": f"http://localhost:2000/v1/chat/completions",
-        }
-
-    except Exception as e:
-        # Clear loading state on error
-        if model.proxy_name:
-            unified_monitor.mark_model_stopped(model.proxy_name)
-
-        await websocket_manager.send_model_status_update(
-            model_id=model_id,
-            status="error",
-            details={"message": f"Failed to start: {str(e)}"},
-        )
-        raise HTTPException(status_code=500, detail=str(e))
-
-
-@router.post("/{model_id}/stop")
-async def stop_model(model_id: int, db: Session = Depends(get_db)):
-    """Stop model via llama-swap"""
-    running_instance = (
-        db.query(RunningInstance).filter(RunningInstance.model_id == model_id).first()
-    )
-    if not running_instance:
-        raise HTTPException(status_code=404, detail="No running instance found")
+    config = _coerce_model_config(model.get("config"))
+    if _model_is_embedding(model) and not config.get("embedding"):
+        config["embedding"] = True
+        store.update_model(model_id, {"config": config})
 
     try:
         from backend.llama_swap_manager import get_llama_swap_manager
-        from backend.main import websocket_manager
-        from backend.unified_monitor import unified_monitor
-
         llama_swap_manager = get_llama_swap_manager()
-
-        proxy_name = running_instance.proxy_model_name
-
-        # Clear loading state if model was still loading
-        if proxy_name:
-            unified_monitor.mark_model_stopped(proxy_name)
-
-        # Unregister from llama-swap (it stops the process)
-        if proxy_name:
-            logger.info(f"Calling unregister_model with proxy_model_name: {proxy_name}")
-            await llama_swap_manager.unregister_model(proxy_name)
-            logger.info("unregister_model call completed")
-
-        # Update database
-        db.delete(running_instance)
-        model = db.query(Model).filter(Model.id == model_id).first()
-        if model:
-            model.is_active = False
-        db.commit()
-
-        # Broadcast stopped event immediately (event-driven, no polling)
-        if proxy_name:
-            await unified_monitor.broadcast_model_event(
-                "stopped", proxy_name, {"model_id": model_id}
-            )
-        await unified_monitor.trigger_status_update()
-
-        return {"message": "Model stopped"}
-
+        await llama_swap_manager.regenerate_config_with_active_version()
+        model_with_proxy = {**(model or {}), "proxy_name": proxy_model_name}
+        await llama_swap_manager.register_model(model_with_proxy, config)
     except Exception as e:
-        await websocket_manager.send_model_status_update(
-            model_id=model_id,
-            status="error",
-            details={"message": f"Failed to stop: {str(e)}"},
-        )
+        try:
+            await get_progress_manager().send_model_status_update(
+                model_id=model_id,
+                status="error",
+                details={"message": f"Failed to start: {str(e)}"},
+            )
+        except Exception:
+            pass
         raise HTTPException(status_code=500, detail=str(e))
 
+    try:
+        get_progress_manager().emit("model_event", {"event": "ready", "proxy_name": proxy_model_name, "model_id": model_id, "model_name": model.get("display_name") or model.get("name")})
+    except Exception:
+        pass
 
-@router.post("/vram-estimate")
-async def estimate_vram_usage(
-    request: EstimationRequest, db: Session = Depends(get_db)
-):
-    """Estimate VRAM usage for given configuration"""
-    model = db.query(Model).filter(Model.id == request.model_id).first()
-    if not model:
-        raise HTTPException(status_code=404, detail="Model not found")
+    return {
+        "model_id": model_id,
+        "proxy_model_name": proxy_model_name,
+        "port": 2000,
+        "api_endpoint": "http://localhost:2000/v1/chat/completions",
+    }
 
-    try:
-        gpu_info = await get_cached_gpu_info()
-        smart_auto = SmartAutoConfig()
-        usage_mode = (
-            request.usage_mode
-            if request.usage_mode in ["single_user", "multi_user"]
-            else "single_user"
-        )
-        metadata = get_model_metadata(model)
-        vram_estimate = smart_auto.estimate_vram_usage(
-            model,
-            request.config,
-            gpu_info,
-            usage_mode=usage_mode,
-            metadata=metadata,
-        )
 
-        return vram_estimate
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
+@router.post("/{model_id:path}/stop")
+async def stop_model(model_id: str):
+    """Stop model via llama-swap"""
+    from backend.llama_swap_client import LlamaSwapClient
 
+    store = get_store()
+    model = _get_model_or_404(store, model_id)
+    proxy_name = model.get("proxy_name") or generate_proxy_name(
+        model.get("huggingface_id"), model.get("quantization")
+    )
 
-@router.post("/ram-estimate")
-async def estimate_ram_usage(request: EstimationRequest, db: Session = Depends(get_db)):
-    """Estimate RAM usage for given configuration"""
     try:
-        model = db.query(Model).filter(Model.id == request.model_id).first()
-        if not model:
-            raise HTTPException(status_code=404, detail="Model not found")
-
-        smart_auto = SmartAutoConfig()
-        usage_mode = (
-            request.usage_mode
-            if request.usage_mode in ["single_user", "multi_user"]
-            else "single_user"
-        )
-        metadata = get_model_metadata(model)
-        ram_estimate = smart_auto.estimate_ram_usage(
-            model,
-            request.config,
-            usage_mode=usage_mode,
-            metadata=metadata,
-        )
+        running_data = await LlamaSwapClient().get_running_models()
+        running_list = running_data.get("running") or []
+        running_names = {item.get("model") for item in running_list if item.get("state") in ("running", "ready", "loading")}
+    except Exception:
+        running_names = set()
+    if proxy_name not in running_names:
+        raise HTTPException(status_code=404, detail="No running instance found")
 
-        return ram_estimate
+    try:
+        from backend.llama_swap_manager import get_llama_swap_manager
+        llama_swap_manager = get_llama_swap_manager()
+        logger.info(f"Calling unregister_model with proxy_model_name: {proxy_name}")
+        await llama_swap_manager.unregister_model(proxy_name)
+        try:
+            get_progress_manager().emit("model_event", {"event": "stopped", "proxy_name": proxy_name, "model_id": model_id})
+        except Exception:
+            pass
+        return {"message": "Model stopped"}
     except Exception as e:
+        try:
+            await get_progress_manager().send_model_status_update(
+                model_id=model_id,
+                status="error",
+                details={"message": f"Failed to stop: {str(e)}"},
+            )
+        except Exception:
+            pass
         raise HTTPException(status_code=500, detail=str(e))
 
 
@@ -3186,116 +2843,96 @@ class DeleteGroupRequest(BaseModel):
 
 
 @router.post("/delete-group")
-async def delete_model_group(
-    request: DeleteGroupRequest, db: Session = Depends(get_db)
-):
+async def delete_model_group(request: DeleteGroupRequest):
     """Delete all quantizations of a model group"""
+    from backend.llama_swap_client import LlamaSwapClient
+
     huggingface_id = request.huggingface_id
-    models = db.query(Model).filter(Model.huggingface_id == huggingface_id).all()
+    store = get_store()
+    models = [m for m in store.list_models() if m.get("huggingface_id") == huggingface_id]
     if not models:
         raise HTTPException(status_code=404, detail="Model group not found")
 
+    try:
+        running_data = await LlamaSwapClient().get_running_models()
+        running_list = running_data.get("running") or []
+        running_names = {item.get("model") for item in running_list if item.get("state") in ("running", "ready", "loading")}
+    except Exception:
+        running_names = set()
+
     deleted_count = 0
     for model in models:
-        # Stop if running
-        running_instance = (
-            db.query(RunningInstance)
-            .filter(RunningInstance.model_id == model.id)
-            .first()
-        )
-        if running_instance:
-            # Stop via llama-swap
+        proxy_name = model.get("proxy_name") or generate_proxy_name(model.get("huggingface_id"), model.get("quantization"))
+        if proxy_name in running_names:
             try:
                 from backend.llama_swap_manager import get_llama_swap_manager
-
-                llama_swap_manager = get_llama_swap_manager()
-                if running_instance.proxy_model_name:
-                    await llama_swap_manager.unregister_model(
-                        running_instance.proxy_model_name
-                    )
+                await get_llama_swap_manager().unregister_model(proxy_name)
             except Exception as e:
-                logger.warning(
-                    f"Failed to stop model {running_instance.proxy_model_name}: {e}"
-                )
-            db.delete(running_instance)
-
-        # Delete file
-        normalized_path = _normalize_model_path(model.file_path)
-        if normalized_path and os.path.exists(normalized_path):
-            os.remove(normalized_path)
-
-        # Delete from database
-        db.delete(model)
+                logger.warning(f"Failed to stop model {proxy_name}: {e}")
+
+        fname = _get_model_filename(model)
+        if model.get("huggingface_id") and fname:
+            from backend.huggingface import delete_cached_model_file
+            deleted_file = delete_cached_model_file(model.get("huggingface_id"), fname)
+            if not deleted_file:
+                legacy_path = _normalize_model_path(model.get("file_path"))
+                if legacy_path and os.path.exists(legacy_path):
+                    os.remove(legacy_path)
+
+        store.delete_model(model.get("id"))
         deleted_count += 1
 
-    db.commit()
-
-    # If this was a GGUF group and no models remain, clean up the repo folder
-    remaining_gguf = (
-        db.query(Model)
-        .filter(Model.huggingface_id == huggingface_id, Model.model_format == "gguf")
-        .count()
-    )
-    if remaining_gguf == 0:
-        _cleanup_model_folder_if_no_quantizations(db, huggingface_id, "gguf")
-
     return {"message": f"Deleted {deleted_count} quantizations"}
 
 
-@router.delete("/{model_id}")
-async def delete_model(model_id: int, db: Session = Depends(get_db)):
+@router.delete("/{model_id:path}")
+async def delete_model(model_id: str):
     """Delete individual model quantization and its files"""
-    model = db.query(Model).filter(Model.id == model_id).first()
-    if not model:
-        raise HTTPException(status_code=404, detail="Model not found")
+    from backend.llama_swap_client import LlamaSwapClient
 
-    # Stop if running
-    running_instance = (
-        db.query(RunningInstance).filter(RunningInstance.model_id == model_id).first()
-    )
-    if running_instance:
-        # Stop via llama-swap
+    store = get_store()
+    model = _get_model_or_404(store, model_id)
+    proxy_name = model.get("proxy_name") or generate_proxy_name(model.get("huggingface_id"), model.get("quantization"))
+
+    try:
+        running_data = await LlamaSwapClient().get_running_models()
+        running_list = running_data.get("running") or []
+        running_names = {item.get("model") for item in running_list if item.get("state") in ("running", "ready", "loading")}
+    except Exception:
+        running_names = set()
+    if proxy_name in running_names:
         try:
             from backend.llama_swap_manager import get_llama_swap_manager
-
-            llama_swap_manager = get_llama_swap_manager()
-            if running_instance.proxy_model_name:
-                await llama_swap_manager.unregister_model(
-                    running_instance.proxy_model_name
-                )
+            await get_llama_swap_manager().unregister_model(proxy_name)
         except Exception as e:
-            logger.warning(
-                f"Failed to stop model {running_instance.proxy_model_name}: {e}"
-            )
-        db.delete(running_instance)
-
-    huggingface_id = model.huggingface_id
-    model_format = (model.model_format or "gguf").lower()
-
-    # Delete file
-    normalized_path = _normalize_model_path(model.file_path)
-    if normalized_path and os.path.exists(normalized_path):
-        os.remove(normalized_path)
-
-    # Delete from database
-    db.delete(model)
-    db.commit()
-
-    # If this was the last quantization for this repo/format, remove its folder
-    _cleanup_model_folder_if_no_quantizations(db, huggingface_id, model_format)
-
+            logger.warning(f"Failed to stop model {proxy_name}: {e}")
+
+    huggingface_id = model.get("huggingface_id")
+    filename = _get_model_filename(model)
+
+    if huggingface_id and filename:
+        from backend.huggingface import delete_cached_model_file
+        deleted = delete_cached_model_file(huggingface_id, filename)
+        if not deleted:
+            # Fall back to direct removal for legacy records with file_path
+            legacy_path = _normalize_model_path(model.get("file_path"))
+            if legacy_path and os.path.exists(legacy_path):
+                os.remove(legacy_path)
+                logger.info(f"Removed legacy model file: {legacy_path}")
+
+    store.delete_model(model_id)
     return {"message": "Model quantization deleted"}
 
 
-@router.get("/{model_id}/layer-info")
-async def get_model_layer_info_endpoint(model_id: int, db: Session = Depends(get_db)):
+# DEPRECATED: remove with ModelConfig.vue rewrite
+@router.get("/{model_id:path}/layer-info")
+async def get_model_layer_info_endpoint(model_id: str):
     """Get model layer information from GGUF metadata"""
-    model = db.query(Model).filter(Model.id == model_id).first()
-    if not model:
-        raise HTTPException(status_code=404, detail="Model not found")
+    store = get_store()
+    model = _get_model_or_404(store, model_id)
 
     layer_info = None
-    normalized_path = _normalize_model_path(model.file_path)
+    normalized_path = _get_model_file_path(model)
     if normalized_path and os.path.exists(normalized_path):
         try:
             layer_info = get_model_layer_info(normalized_path)
@@ -3338,85 +2975,59 @@ async def get_model_layer_info_endpoint(model_id: int, db: Session = Depends(get
     }
 
 
-@router.get("/{model_id}/recommendations")
-async def get_model_recommendations_endpoint(
-    model_id: int, db: Session = Depends(get_db)
-):
-    """Get configuration recommendations for a model based on its architecture"""
-    from backend.smart_auto.recommendations import get_model_recommendations
-
-    model = db.query(Model).filter(Model.id == model_id).first()
-    if not model:
-        raise HTTPException(status_code=404, detail="Model not found")
+# DEPRECATED: remove with ModelConfig.vue rewrite
+@router.get("/{model_id:path}/recommendations")
+async def get_model_recommendations_endpoint(model_id: str):
+    """Stub: recommendations removed with smart_auto. Returns empty defaults."""
+    return {"gpu_layers": None, "context_size": None, "batch_size": None}
 
-    normalized_path = _normalize_model_path(model.file_path)
-    file_path = (
-        normalized_path if normalized_path and os.path.exists(normalized_path) else None
-    )
 
-    try:
-        # Get layer info from GGUF metadata (if available)
-        layer_info = get_model_layer_info(file_path) if file_path else None
-    except Exception as e:
-        logger.error(
-            f"Failed to get layer info for recommendations (model {model_id}): {e}"
-        )
-        layer_info = None
+# DEPRECATED: remove with ModelConfig.vue rewrite
+@router.get("/{model_id:path}/architecture-presets")
+async def get_architecture_presets_endpoint(model_id: str):
+    """Stub: presets removed. Returns minimal structure."""
+    return {"architecture": "unknown", "presets": {}, "available_presets": []}
 
-    if not layer_info:
-        layer_info = {
-            "layer_count": 32,
-            "architecture": "unknown",
-            "context_length": 0,
-            "attention_head_count": 0,
-            "embedding_length": 0,
-        }
 
-    try:
-        recommendations = await get_model_recommendations(
-            model_layer_info=layer_info,
-            model_name=model.name or model.huggingface_id or "",
-            file_path=file_path,
-        )
-        return recommendations
-    except Exception as e:
-        logger.error(f"Failed to get recommendations for model {model_id}: {e}")
-        raise HTTPException(
-            status_code=500, detail=f"Failed to get recommendations: {str(e)}"
-        )
-
-
-@router.get("/{model_id}/architecture-presets")
-async def get_architecture_presets_endpoint(
-    model_id: int, db: Session = Depends(get_db)
-):
-    """Get architecture-specific presets for a model"""
-    model = db.query(Model).filter(Model.id == model_id).first()
-    if not model:
-        raise HTTPException(status_code=404, detail="Model not found")
-
-    architecture, presets = get_architecture_and_presets(model)
-    return {
-        "architecture": architecture,
-        "presets": presets,
-        "available_presets": list(presets.keys()),
-    }
+# DEPRECATED: remove with ModelConfig.vue rewrite
+@router.post("/vram-estimate")
+async def estimate_vram_usage(request: EstimationRequest):
+    """Stub: simple VRAM estimate (smart_auto removed)."""
+    store = get_store()
+    _get_model_or_404(store, request.model_id)
+    cfg = request.config or {}
+    ngl = int(cfg.get("n_gpu_layers") or -1)
+    ctx = int(cfg.get("ctx_size") or 2048)
+    # Very rough: ~1GB base + per-layer and context
+    estimate_mb = 1024 + (abs(ngl) * 50 if ngl != -1 else 2000) + (ctx // 64)
+    return {"vram_estimate_mb": min(estimate_mb, 96 * 1024), "vram_estimate_gb": round(estimate_mb / 1024, 2)}
+
+
+# DEPRECATED: remove with ModelConfig.vue rewrite
+@router.post("/ram-estimate")
+async def estimate_ram_usage(request: EstimationRequest):
+    """Stub: simple RAM estimate (smart_auto removed)."""
+    store = get_store()
+    _get_model_or_404(store, request.model_id)
+    cfg = request.config or {}
+    ctx = int(cfg.get("ctx_size") or 2048)
+    estimate_mb = 512 + (ctx // 32)
+    return {"ram_estimate_mb": estimate_mb, "ram_estimate_gb": round(estimate_mb / 1024, 2)}
 
 
-@router.get("/{model_id}/hf-metadata")
-async def get_model_hf_metadata(model_id: int, db: Session = Depends(get_db)):
-    model = db.query(Model).filter(Model.id == model_id).first()
-    if not model:
-        raise HTTPException(status_code=404, detail="Model not found")
+@router.get("/{model_id:path}/hf-metadata")
+async def get_model_hf_metadata(model_id: str):
+    store = get_store()
+    model = _get_model_or_404(store, model_id)
 
     metadata_entry = None
-    if (model.model_format or "gguf").lower() == "safetensors":
+    if (model.get("model_format") or model.get("format") or "gguf").lower() == "safetensors":
         metadata_entry = _load_manifest_entry_for_model(model)
     else:
-        filename = _extract_filename(model.file_path)
+        filename = _get_model_filename(model)
         if not filename:
-            raise HTTPException(status_code=400, detail="Model file path is not set")
-        metadata_entry = get_gguf_manifest_entry(model.huggingface_id, filename)
+            raise HTTPException(status_code=400, detail="Model filename is not set")
+        metadata_entry = get_gguf_manifest_entry(model.get("huggingface_id"), filename)
 
     if not metadata_entry:
         raise HTTPException(status_code=404, detail="Metadata not found for model")
@@ -3432,19 +3043,16 @@ async def get_model_hf_metadata(model_id: int, db: Session = Depends(get_db)):
     }
 
 
-@router.post("/{model_id}/regenerate-info")
-async def regenerate_model_info_endpoint(model_id: int, db: Session = Depends(get_db)):
+@router.post("/{model_id:path}/regenerate-info")
+async def regenerate_model_info_endpoint(model_id: str):
     """
-    Regenerate model information from GGUF metadata and update the database.
-    This will re-read the model file and update architecture, layer count, and other metadata.
+    Regenerate model information from GGUF metadata and update the store.
     """
-    model = db.query(Model).filter(Model.id == model_id).first()
-
-    if not model:
-        raise HTTPException(status_code=404, detail="Model not found")
+    store = get_store()
+    model = _get_model_or_404(store, model_id)
 
     try:
-        metadata = _refresh_model_metadata_from_file(model, db)
+        metadata = _refresh_model_metadata_from_file(model, store)
         return {
             "success": True,
             "model_id": model_id,
@@ -3456,32 +3064,27 @@ async def regenerate_model_info_endpoint(model_id: int, db: Session = Depends(ge
     except ValueError as ve:
         raise HTTPException(status_code=500, detail=str(ve))
     except Exception as e:
-        logger.error(
-            f"Failed to regenerate model info for model {model_id}: {e}", exc_info=True
-        )
-        db.rollback()
-        raise HTTPException(
-            status_code=500, detail=f"Failed to regenerate model info: {str(e)}"
-        )
+        logger.error(f"Failed to regenerate model info for model {model_id}: {e}", exc_info=True)
+        raise HTTPException(status_code=500, detail=f"Failed to regenerate model info: {str(e)}")
 
 
 @router.get("/supported-flags")
-async def get_supported_flags_endpoint(db: Session = Depends(get_db)):
+async def get_supported_flags_endpoint():
     """Get the list of supported flags for the active llama-server binary"""
     try:
-        # Get the active llama-cpp version
-        active_version = (
-            db.query(LlamaVersion).filter(LlamaVersion.is_active == True).first()
-        )
+        store = get_store()
+        active_version = store.get_active_engine_version("llama_cpp")
+        if not active_version:
+            active_version = store.get_active_engine_version("ik_llama")
 
-        if not active_version or not active_version.binary_path:
+        if not active_version or not active_version.get("binary_path"):
             return {
                 "supported_flags": [],
                 "binary_path": None,
                 "error": "No active llama-cpp version found",
             }
 
-        binary_path = active_version.binary_path
+        binary_path = active_version.get("binary_path")
 
         # Convert to absolute path if needed
         if not os.path.isabs(binary_path):
diff --git a/backend/routes/status.py b/backend/routes/status.py
index bef69ae..c4211d0 100644
--- a/backend/routes/status.py
+++ b/backend/routes/status.py
@@ -1,50 +1,55 @@
-from fastapi import APIRouter, Depends
-from sqlalchemy.orm import Session
+from fastapi import APIRouter
 import psutil
 import os
 
-from backend.database import get_db, RunningInstance
+from backend.llama_swap_client import LlamaSwapClient
 from backend.lmdeploy_manager import get_lmdeploy_manager
 from backend.lmdeploy_installer import get_lmdeploy_installer
 
 router = APIRouter()
 
+DEFAULT_PROXY_PORT = 2000
+LMDEPLOY_PORT = 2001
 
-@router.get("/status")
-async def get_system_status(db: Session = Depends(get_db)):
-    """Get system status and running instances"""
-    running_instances = db.query(RunningInstance).all()
 
-    # Get system info
-    cpu_percent = psutil.cpu_percent(interval=1)
-    memory = psutil.virtual_memory()
-    # Use data directory at project root or /app/data for Docker
-    data_dir = "data" if os.path.exists("data") else "/app/data"
+@router.get("/status")
+async def get_system_status():
+    """Get system status and running instances (from llama-swap)."""
+    client = LlamaSwapClient()
     try:
-        disk = psutil.disk_usage(data_dir)
-    except FileNotFoundError:
-        # Fallback to root directory if data doesn't exist
-        disk = psutil.disk_usage("/")
+        running_data = await client.get_running_models()
+    except Exception:
+        running_data = {"running": []}
+    if isinstance(running_data, list):
+        running_list = running_data
+    else:
+        running_list = running_data.get("running") or []
 
-    # Format running instances (no process checking needed)
-    DEFAULT_PROXY_PORT = 2000
-    LMDEPLOY_PORT = 2001
     active_instances = []
-    for instance in running_instances:
-        port = (
-            LMDEPLOY_PORT if instance.runtime_type == "lmdeploy" else DEFAULT_PROXY_PORT
-        )
+    for i, item in enumerate(running_list):
+        proxy_model_name = item.get("model", "")
+        state = item.get("state", "")
+        runtime_type = "lmdeploy" if state == "lmdeploy" else "llama_cpp"
+        port = LMDEPLOY_PORT if runtime_type == "lmdeploy" else DEFAULT_PROXY_PORT
         active_instances.append(
             {
-                "id": instance.id,
-                "model_id": instance.model_id,
+                "id": i,
+                "model_id": proxy_model_name,
                 "port": port,
-                "runtime_type": instance.runtime_type,
-                "proxy_model_name": instance.proxy_model_name,
-                "started_at": instance.started_at,
+                "runtime_type": runtime_type,
+                "proxy_model_name": proxy_model_name,
+                "started_at": None,
             }
         )
 
+    cpu_percent = psutil.cpu_percent(interval=1)
+    memory = psutil.virtual_memory()
+    data_dir = "data" if os.path.exists("data") else "/app/data"
+    try:
+        disk = psutil.disk_usage(data_dir)
+    except FileNotFoundError:
+        disk = psutil.disk_usage("/")
+
     lmdeploy_manager = get_lmdeploy_manager()
     lmdeploy_status = lmdeploy_manager.status()
     installer_status = get_lmdeploy_installer().status()
diff --git a/backend/routes/unified_monitoring.py b/backend/routes/unified_monitoring.py
deleted file mode 100644
index 8faf061..0000000
--- a/backend/routes/unified_monitoring.py
+++ /dev/null
@@ -1,75 +0,0 @@
-from fastapi import APIRouter, WebSocket, WebSocketDisconnect
-from backend.unified_monitor import unified_monitor
-from backend.logging_config import get_logger
-
-logger = get_logger(__name__)
-
-router = APIRouter()
-
-
-@router.get("/monitoring/status")
-async def get_system_status():
-    """Get comprehensive system status"""
-    return await unified_monitor.get_system_status()
-
-
-@router.get("/monitoring/models")
-async def get_running_models():
-    """Get currently running models from llama-swap"""
-    return await unified_monitor.get_running_models()
-
-
-@router.post("/monitoring/unload-all")
-async def unload_all_models():
-    """Unload all models via llama-swap"""
-    return await unified_monitor.unload_all_models()
-
-
-@router.get("/monitoring/health")
-async def get_system_health():
-    """Get llama-swap and system health status"""
-    return await unified_monitor.get_system_health()
-
-
-@router.get("/monitoring/debug")
-async def debug_monitoring_data():
-    """Debug endpoint to see what data is being collected"""
-    from backend.unified_monitor import unified_monitor
-    from backend.llama_swap_client import LlamaSwapClient
-
-    # Get raw data from external source
-    external_client = LlamaSwapClient()
-    try:
-        external_models = await external_client.get_running_models()
-    except Exception as e:
-        external_models = {"error": str(e)}
-
-    # Get system status
-    try:
-        system_status = await unified_monitor.get_system_status()
-    except Exception as e:
-        system_status = {"error": str(e)}
-
-    return {
-        "running_models": external_models,
-        "system_status": system_status,
-        "timestamp": "2024-01-01T00:00:00Z",
-    }
-
-
-@router.websocket("/monitoring/ws")
-async def monitoring_websocket(websocket: WebSocket):
-    """WebSocket endpoint for real-time monitoring data"""
-    await unified_monitor.add_subscriber(websocket)
-
-    try:
-        while True:
-            # Keep the connection alive and handle any incoming messages
-            data = await websocket.receive_text()
-            # Echo back any received data (for testing)
-            await websocket.send_text(f"Echo: {data}")
-    except WebSocketDisconnect:
-        await unified_monitor.remove_subscriber(websocket)
-    except Exception as e:
-        logger.error(f"WebSocket error: {e}")
-        await unified_monitor.remove_subscriber(websocket)
diff --git a/backend/smart_auto/__init__.py b/backend/smart_auto/__init__.py
deleted file mode 100644
index 57742a3..0000000
--- a/backend/smart_auto/__init__.py
+++ /dev/null
@@ -1,480 +0,0 @@
-from typing import Dict, Any, Optional
-import psutil
-from backend.database import Model
-from backend.logging_config import get_logger
-
-# Import all required modules at module level for better performance
-from .model_metadata import get_model_metadata
-from .architecture_config import get_architecture_default_context
-from .cpu_config import generate_cpu_config
-from .gpu_config import generate_gpu_config, parse_compute_capability
-from .memory_estimator import get_cpu_memory_gb, estimate_vram_usage, estimate_ram_usage
-from .kv_cache import get_optimal_kv_cache_quant
-from .moe_handler import get_architecture_specific_flags
-from .generation_params import build_generation_params
-from .config_builder import generate_server_params, sanitize_config, apply_preset_tuning
-from .models import SystemResources, ModelMetadata
-
-logger = get_logger(__name__)
-
-
-class SmartAutoConfig:
-    """Smart configuration optimizer for llama.cpp parameters"""
-
-    def __init__(self):
-        self.current_preset = None
-
-    def _generate_cpu_config(
-        self,
-        model_size_mb: float,
-        metadata,
-        architecture: str,
-        layer_count: int,
-        is_moe: bool,
-        expert_count: int,
-    ) -> Dict[str, Any]:
-        """Generate CPU-only configuration with MoE and architecture-specific flags."""
-        cpu_cfg = generate_cpu_config(
-            model_size_mb,
-            architecture,
-            layer_count,
-            metadata.context_length,
-            metadata.vocab_size,
-            metadata.embedding_length,
-            metadata.attention_head_count,
-            debug=None,
-        )
-
-        # Add MoE parameters for CPU-only mode (MoE layers stay on CPU)
-        if is_moe:
-            cpu_cfg["moe_offload_pattern"] = "none"
-            cpu_cfg["moe_offload_custom"] = ""
-            logger.debug("MoE model in CPU-only mode - MoE layers will run on CPU")
-
-        # Add jinja flag if needed (for architectures that require it)
-        if is_moe or architecture in ["glm", "glm4", "qwen3"]:
-            layer_info_for_flags = {
-                "is_moe": is_moe,
-                "expert_count": expert_count,
-                "model_size_mb": model_size_mb,
-                "available_vram_gb": 0,
-                "architecture": architecture,
-            }
-            moe_config = get_architecture_specific_flags(
-                architecture, layer_info_for_flags
-            )
-            if moe_config.get("jinja"):
-                cpu_cfg["jinja"] = True
-
-        return cpu_cfg
-
-    def _apply_moe_optimizations(
-        self,
-        config: Dict[str, Any],
-        metadata,
-        model_size_mb: float,
-        system_resources: SystemResources,
-    ) -> None:
-        """Apply MoE-specific optimizations to configuration."""
-        if not metadata.is_moe:
-            return
-
-        layer_info_for_flags = {
-            "is_moe": metadata.is_moe,
-            "expert_count": metadata.expert_count,
-            "model_size_mb": model_size_mb,
-            "available_vram_gb": system_resources.available_vram_gb,
-            "architecture": metadata.architecture,
-        }
-        moe_config = get_architecture_specific_flags(
-            metadata.architecture, layer_info_for_flags
-        )
-
-        # Set MoE parameters in config
-        if moe_config.get("moe_offload_custom"):
-            config["moe_offload_pattern"] = "custom"
-            config["moe_offload_custom"] = moe_config["moe_offload_custom"]
-        else:
-            config["moe_offload_pattern"] = "none"
-            config["moe_offload_custom"] = ""
-
-        # Set jinja flag if needed
-        if moe_config.get("jinja"):
-            config["jinja"] = True
-
-    async def generate_config(
-        self,
-        model: Model,
-        gpu_info: Dict[str, Any],
-        preset: Optional[str] = None,
-        usage_mode: str = "single_user",
-        speed_quality: Optional[int] = None,
-        use_case: Optional[str] = None,
-        debug: Optional[Dict[str, Any]] = None,
-    ) -> Dict[str, Any]:
-        """Generate optimal configuration based on model and GPU capabilities
-
-        Args:
-            model: The model to configure
-            gpu_info: GPU information dictionary
-            preset: Optional preset name (coding, conversational, long_context) to use as tuning parameters
-            usage_mode: 'single_user' (sequential, peak KV cache) or 'multi_user' (server, typical usage)
-            speed_quality: Speed/quality balance (0-100), where 0 = max speed, 100 = max quality. Default: 50
-            use_case: Optional use case ('chat', 'code', 'creative', 'analysis') for targeted optimization
-        """
-        from backend.presets import get_architecture_and_presets
-
-        try:
-            config = {}
-            # Store preset for later use in generation params
-            self.current_preset = preset
-
-            # Get model metadata
-            model_size_mb = model.file_size / (1024 * 1024) if model.file_size else 0
-            model_name = model.name.lower()
-
-            # Get comprehensive model layer information from unified helper
-            metadata = get_model_metadata(model)
-
-            # Now that get_model_metadata returns dataclass with architecture detection already done
-            layer_count = metadata.layer_count
-            architecture = metadata.architecture
-            context_length = metadata.context_length
-            vocab_size = metadata.vocab_size
-            embedding_length = metadata.embedding_length
-            attention_head_count = metadata.attention_head_count
-            attention_head_count_kv = metadata.attention_head_count_kv
-            is_moe = metadata.is_moe
-            expert_count = metadata.expert_count
-
-            if debug is not None:
-                debug.update(
-                    {
-                        "model_name": model.name,
-                        "model_size_mb": model_size_mb,
-                        "layer_info": metadata.to_dict(),
-                    }
-                )
-
-            # Prepare system resources (GPU capabilities calculated once in SystemResources)
-            cpu_memory = get_cpu_memory_gb()
-            cpu_cores = psutil.cpu_count(logical=False) or 1
-
-            system_resources = SystemResources.from_gpu_info(
-                gpu_info, cpu_memory, cpu_cores
-            )
-
-            # Check flash attention availability using pre-parsed compute capabilities
-            flash_attn_available = (
-                all(cc >= 8.0 for cc in system_resources.compute_capabilities)
-                if system_resources.compute_capabilities
-                else False
-            )
-            system_resources.flash_attn_available = flash_attn_available
-
-            if debug is not None:
-                debug.update(
-                    {
-                        "gpu_count": system_resources.gpu_count,
-                        "total_vram": system_resources.total_vram,
-                        "available_vram_gb": system_resources.available_vram_gb,
-                        "flash_attn_available": flash_attn_available,
-                    }
-                )
-
-            # CPU-only configuration path
-            if not system_resources.gpus:
-                cpu_cfg = self._generate_cpu_config(
-                    model_size_mb,
-                    metadata,
-                    architecture,
-                    layer_count,
-                    is_moe,
-                    expert_count,
-                )
-                return cpu_cfg
-
-            # Select KV cache quantization BEFORE GPU config generation
-            # This affects M_kv which influences context and batch size calculations
-            # Use architecture default context_length for initial selection (will be refined later)
-            kv_cache_config = get_optimal_kv_cache_quant(
-                system_resources.available_vram_gb,
-                context_length,
-                architecture,
-                system_resources.flash_attn_available,
-            )
-            cache_type_k = kv_cache_config.get("cache_type_k", "f16")
-            cache_type_v = kv_cache_config.get("cache_type_v")
-
-            # GPU configuration - pass selected KV cache quantization and usage mode
-            config.update(
-                generate_gpu_config(
-                    model_size_mb,
-                    architecture,
-                    system_resources.gpus,
-                    system_resources.total_vram,
-                    system_resources.gpu_count,
-                    system_resources.nvlink_topology,
-                    layer_count,
-                    context_length,
-                    vocab_size,
-                    embedding_length,
-                    attention_head_count,
-                    attention_head_count_kv=attention_head_count_kv,
-                    compute_capabilities=system_resources.compute_capabilities,
-                    cache_type_k=cache_type_k,
-                    cache_type_v=cache_type_v,
-                    usage_mode=usage_mode,
-                    debug=debug,
-                )
-            )
-
-            # Apply KV cache quantization to config
-            config.update(kv_cache_config)
-
-            # Hybrid consideration: if VRAM is tight, keep KV cache partly on CPU
-            try:
-                # If we have some CPU RAM headroom and low VRAM, prefer no_kv_offload False only when enough VRAM
-                if system_resources.available_vram_gb < (model_size_mb / 1024) * 1.2:
-                    # Signal to avoid KV offload to VRAM when VRAM is tight
-                    config["no_kv_offload"] = True
-                else:
-                    config.setdefault("no_kv_offload", False)
-            except Exception:
-                pass
-
-            # Apply MoE optimizations
-            self._apply_moe_optimizations(
-                config, metadata, model_size_mb, system_resources
-            )
-
-            # Use the computed ctx_size from GPU/CPU config when generating params
-            effective_ctx = int(
-                config.get("ctx_size", context_length) or context_length
-            )
-            if debug is not None:
-                debug["effective_ctx_before_gen_params"] = effective_ctx
-            config.update(build_generation_params(architecture, effective_ctx, None))
-
-            # Apply speed/quality balancing if provided (modifies config in-place)
-            if speed_quality is not None:
-                self._apply_speed_quality_balancing(
-                    config, speed_quality, use_case, metadata, system_resources, debug
-                )
-
-            # Apply preset tuning if provided (modifies config in-place)
-            # Note: preset takes precedence over use_case if both are provided
-            if self.current_preset:
-                apply_preset_tuning(config, self.current_preset)
-            elif use_case:
-                # Apply use_case-specific tuning if no preset
-                self._apply_use_case_tuning(config, use_case)
-
-            # Add server parameters
-            config.update(generate_server_params())
-
-            # Final sanitation and clamping
-            config = sanitize_config(config, system_resources.gpu_count)
-
-            return config
-
-        except Exception as e:
-            raise Exception(f"Failed to generate smart config: {e}")
-
-    def _apply_speed_quality_balancing(
-        self,
-        config: Dict[str, Any],
-        speed_quality: int,
-        use_case: Optional[str],
-        metadata,
-        system_resources,
-        debug: Optional[Dict[str, Any]] = None,
-    ) -> None:
-        """Apply speed/quality balancing to configuration.
-
-        Args:
-            config: Configuration dictionary to modify in-place
-            speed_quality: Speed/quality balance (0-100), where 0 = max speed, 100 = max quality
-            use_case: Optional use case for additional tuning
-            metadata: Model metadata
-            system_resources: SystemResources object
-            debug: Optional debug dictionary
-        """
-        quality_factor = speed_quality / 100.0  # 0.0 = max speed, 1.0 = max quality
-
-        # Context size adjustment
-        # Speed (0-33): reduce context, Balanced (34-66): moderate, Quality (67-100): maximize
-        current_ctx = config.get("ctx_size", 4096)
-        max_context = metadata.context_length
-
-        if speed_quality < 34:
-            # Max speed: reduce context (2048-4096 range)
-            target_ctx = 2048 + int((speed_quality / 34) * 2048)
-        elif speed_quality < 67:
-            # Balanced: moderate context (4096-8192 range)
-            target_ctx = 4096 + int(((speed_quality - 34) / 33) * 4096)
-        else:
-            # Max quality: maximize context (8192-max range)
-            min_quality_ctx = 8192
-            target_ctx = min_quality_ctx + int(
-                ((speed_quality - 67) / 33) * (max_context - min_quality_ctx)
-            )
-
-        # Respect use_case minimums
-        if use_case == "code" and target_ctx < 8192:
-            target_ctx = 8192
-        elif use_case == "analysis" and target_ctx < 16384:
-            target_ctx = 16384
-
-        config["ctx_size"] = min(target_ctx, max_context)
-
-        # Batch size adjustment
-        # Speed-focused: larger batches for throughput
-        # Quality-focused: smaller batches for lower latency per request
-        current_batch = config.get("batch_size", 256)
-        current_ubatch = config.get("ubatch_size", 128)
-
-        if speed_quality < 34:
-            # Max speed: large batches
-            config["batch_size"] = 512 + int((speed_quality / 34) * 256)  # 512-768
-            config["ubatch_size"] = 256 + int((speed_quality / 34) * 128)  # 256-384
-        elif speed_quality < 67:
-            # Balanced: medium batches
-            config["batch_size"] = 384 + int(
-                ((speed_quality - 34) / 33) * 128
-            )  # 384-512
-            config["ubatch_size"] = 192 + int(
-                ((speed_quality - 34) / 33) * 64
-            )  # 192-256
-        else:
-            # Max quality: smaller batches
-            config["batch_size"] = 256 + int(
-                ((speed_quality - 67) / 33) * 128
-            )  # 256-384
-            config["ubatch_size"] = 128 + int(
-                ((speed_quality - 67) / 33) * 64
-            )  # 128-192
-
-        # GPU layers adjustment
-        # Quality factor affects how many layers to offload
-        if config.get("n_gpu_layers", 0) > 0:
-            layer_count = metadata.layer_count
-            base_layers = config["n_gpu_layers"]
-            # Adjust based on quality factor (70-100% of base)
-            adjusted_layers = int(base_layers * (0.7 + (quality_factor * 0.3)))
-            config["n_gpu_layers"] = min(adjusted_layers, layer_count)
-
-        # Parallel processing adjustment
-        # Higher for speed-focused, lower for quality-focused
-        if speed_quality < 50:
-            config["parallel"] = max(1, int(3 - (speed_quality / 50) * 2))  # 3 to 1
-        else:
-            config["parallel"] = 1  # Quality-focused: sequential processing
-
-        # Threads optimization
-        cpu_threads = system_resources.cpu_cores or 4
-        if speed_quality < 50:
-            # Speed-focused: use more threads
-            config["threads"] = cpu_threads
-            config["threads_batch"] = min(cpu_threads, 8)
-        else:
-            # Quality-focused: optimize threads
-            config["threads"] = max(2, int(cpu_threads * 0.8))
-            config["threads_batch"] = max(2, int(cpu_threads * 0.8))
-
-        # Flash Attention: enable for quality-focused configs when available
-        if system_resources.flash_attn_available and quality_factor > 0.6:
-            config["flash_attn"] = True
-            # Flash attention enables V cache quantization
-            if quality_factor < 0.7:
-                config["cache_type_v"] = "q8_0"  # Moderate quantization for balanced
-            else:
-                config["cache_type_v"] = "f16"  # Better quality
-
-        # KV Cache quantization adjustment
-        available_vram_gb = system_resources.available_vram_gb
-        total_vram_gb = (
-            system_resources.total_vram / (1024**3)
-            if system_resources.total_vram
-            else 0
-        )
-
-        if quality_factor < 0.5 and available_vram_gb < total_vram_gb * 0.5:
-            # Low VRAM or speed-focused: use quantization
-            if (
-                config.get("cache_type_k") is None
-                or config.get("cache_type_k") == "f16"
-            ):
-                config["cache_type_k"] = "q8_0"
-            if config.get("flash_attn") and config.get("cache_type_v") is None:
-                config["cache_type_v"] = "q8_0"
-        elif quality_factor > 0.7:
-            # Quality-focused: use full precision
-            config["cache_type_k"] = "f16"
-            if config.get("flash_attn"):
-                config["cache_type_v"] = "f16"
-
-        # Low VRAM mode for tight memory situations
-        if available_vram_gb < total_vram_gb * 0.3 or (
-            quality_factor < 0.4 and available_vram_gb < total_vram_gb * 0.5
-        ):
-            config["low_vram"] = True
-
-        if debug is not None:
-            debug["speed_quality"] = speed_quality
-            debug["quality_factor"] = quality_factor
-            debug["use_case"] = use_case
-            debug["adjusted_ctx_size"] = config["ctx_size"]
-            debug["adjusted_batch_size"] = config["batch_size"]
-
-    def _apply_use_case_tuning(self, config: Dict[str, Any], use_case: str) -> None:
-        """Apply use-case-specific generation parameter tuning.
-
-        Args:
-            config: Configuration dictionary to modify in-place
-            use_case: Use case ('chat', 'code', 'creative', 'analysis')
-        """
-        if use_case == "code":
-            config["temp"] = 0.3
-            config["temperature"] = 0.3
-            config["top_k"] = 30
-            if config.get("ctx_size", 4096) < 8192:
-                config["ctx_size"] = 8192
-        elif use_case == "creative":
-            config["temp"] = 1.2
-            config["temperature"] = 1.2
-            config["top_k"] = 50
-            config["top_p"] = 0.95
-        elif use_case == "analysis":
-            config["temp"] = 0.7
-            config["temperature"] = 0.7
-            if config.get("ctx_size", 4096) < 16384:
-                config["ctx_size"] = 16384
-        elif use_case == "chat":
-            config["temp"] = 0.8
-            config["temperature"] = 0.8
-
-    def estimate_vram_usage(
-        self,
-        model: Model,
-        config: Dict[str, Any],
-        gpu_info: Dict[str, Any],
-        usage_mode: str = "single_user",
-        metadata: Optional[ModelMetadata] = None,
-    ) -> Dict[str, Any]:
-        """Estimate VRAM usage for given configuration using comprehensive model metadata"""
-        return estimate_vram_usage(
-            model, config, gpu_info, metadata=metadata, usage_mode=usage_mode
-        )
-
-    def estimate_ram_usage(
-        self,
-        model: Model,
-        config: Dict[str, Any],
-        usage_mode: str = "single_user",
-        metadata: Optional[ModelMetadata] = None,
-    ) -> Dict[str, Any]:
-        """Estimate RAM usage for given configuration"""
-        return estimate_ram_usage(
-            model, config, metadata=metadata, usage_mode=usage_mode
-        )
diff --git a/backend/smart_auto/architecture_config.py b/backend/smart_auto/architecture_config.py
deleted file mode 100644
index 18e6c19..0000000
--- a/backend/smart_auto/architecture_config.py
+++ /dev/null
@@ -1,103 +0,0 @@
-"""
-Architecture configuration and detection module.
-Consolidates architecture detection and default configuration values.
-"""
-
-from functools import lru_cache
-from .constants import ARCHITECTURE_CONTEXT_DEFAULTS, DEFAULT_CONTEXT_LENGTH
-
-
-@lru_cache(maxsize=128)
-def resolve_architecture(architecture_or_name: str) -> str:
-    """
-    Unified function to resolve architecture from either a model name or architecture string.
-
-    Handles both detection from model names and normalization of GGUF metadata.
-    This replaces the separate detect_architecture_from_name() and normalize_architecture() functions.
-
-    Args:
-        architecture_or_name: Either a model name or architecture string from GGUF metadata
-
-    Returns:
-        Normalized architecture name (e.g., "llama3", "qwen3", etc.)
-    """
-    if not architecture_or_name or not architecture_or_name.strip():
-        return "unknown"
-
-    # Normalize input
-    text = architecture_or_name.lower().strip()
-
-    # Check architectures in order of specificity (most specific first)
-
-    # Qwen architectures
-    if "qwen" in text:
-        if "qwen3" in text or "qwen-3" in text:
-            return "qwen3"
-        if "qwen2" in text or "qwen-2" in text:
-            return "qwen2"
-        return "qwen"
-
-    # Llama architectures (CodeLlama before other Llama variants)
-    if "codellama" in text:
-        return "codellama"
-    if "llama3" in text or "llama-3" in text:
-        return "llama3"
-    if "llama2" in text or "llama-2" in text:
-        return "llama2"
-    if "llama" in text:
-        return "llama"
-
-    # Gemma architectures
-    if "gemma3" in text or "gemma-3" in text:
-        return "gemma3"
-    if "gemma" in text:
-        return "gemma"
-
-    # GLM architectures
-    if "glm-4" in text or "glm4" in text:
-        return "glm4"
-    if "glm" in text or "chatglm" in text:
-        return "glm"
-
-    # DeepSeek architectures
-    if "deepseek" in text:
-        if "v3" in text or "v3.1" in text:
-            return "deepseek-v3"
-        return "deepseek"
-
-    # Other architectures
-    if "mistral" in text:
-        return "mistral"
-    if "phi" in text:
-        return "phi"
-
-    # If text contains something but not recognized, return as generic for model names
-    # or unknown for invalid architecture strings
-    if text and text not in ["unknown", "generic"]:
-        return "generic"
-
-    return "unknown"
-
-
-def get_architecture_default_context(architecture: str) -> int:
-    """
-    Get default context length for an architecture.
-
-    Args:
-        architecture: Normalized architecture name
-
-    Returns:
-        Default context length in tokens
-    """
-    return ARCHITECTURE_CONTEXT_DEFAULTS.get(architecture, DEFAULT_CONTEXT_LENGTH)
-
-
-# Backward compatibility aliases
-def detect_architecture_from_name(model_name: str) -> str:
-    """Deprecated: Use resolve_architecture() instead."""
-    return resolve_architecture(model_name)
-
-
-def normalize_architecture(architecture: str) -> str:
-    """Deprecated: Use resolve_architecture() instead."""
-    return resolve_architecture(architecture)
diff --git a/backend/smart_auto/calculators.py b/backend/smart_auto/calculators.py
deleted file mode 100644
index 10efab8..0000000
--- a/backend/smart_auto/calculators.py
+++ /dev/null
@@ -1,402 +0,0 @@
-"""
-Calculation utilities for smart_auto module.
-Pure functions for batch size, context size, and GPU layer calculations.
-"""
-
-from functools import lru_cache
-from typing import Tuple, Optional
-from .constants import (
-    VRAM_FRAGMENTATION_MARGIN,
-    CONTEXT_SAFETY_MARGIN,
-    LAYERS_PER_GB_SMALL_MODEL,
-    LAYERS_PER_GB_LARGE_MODEL,
-    GPU_LAYER_BUFFER,
-    CONTEXT_RAM_OVERHEAD_GB,
-    MIN_CONTEXT_SIZE,
-    MAX_CONTEXT_SIZE,
-    MIN_BATCH_SIZE,
-    ARCHITECTURE_CPU_BATCH_LIMITS,
-)
-
-
-def calculate_ubatch_size(batch_size: int) -> int:
-    """
-    Calculate optimal ubatch_size from batch_size.
-
-    Unified helper to derive ubatch_size consistently across GPU and CPU modes.
-    """
-    return max(1, min(batch_size, max(1, batch_size // 2)))
-
-
-def calculate_optimal_batch_size_gpu(
-    available_vram_gb: float,
-    model_size_mb: float,
-    context_size: int,
-    embedding_length: int,
-    layer_count: int,
-    cache_type_k: Optional[str] = None,
-    cache_type_v: Optional[str] = None,
-) -> int:
-    """
-    Calculate optimal batch size for GPU based on memory requirements.
-
-    Uses data-driven approach when possible, falls back to VRAM-based estimation.
-    """
-    # Memory requirements per batch item
-    model_memory_gb = model_size_mb / 1024
-
-    # KV cache memory per batch item
-    # Note: KV cache is shared across batch items in continuous batching, but we estimate
-    # as if each item needs its own context window (conservative estimate)
-    if embedding_length > 0 and layer_count > 0:
-        # Use actual quantization bytes-per-value if provided, otherwise default to fp16
-        from .constants import KV_CACHE_QUANT_FACTORS
-
-        quant_factor_k = KV_CACHE_QUANT_FACTORS.get(
-            cache_type_k or "f16", 0.5
-        )  # f16 = 0.5
-        quant_factor_v = KV_CACHE_QUANT_FACTORS.get(
-            cache_type_v or cache_type_k or "f16", quant_factor_k
-        )
-        bytes_per_k = quant_factor_k * 4  # Convert factor to bytes (f32=4, f16=2, etc.)
-        bytes_per_v = quant_factor_v * 4
-        bytes_per_element = (bytes_per_k + bytes_per_v) / 2  # Average for K+V
-        # Conservative estimate: use embedding_length directly (overestimates slightly for GQA)
-        # This is a simplified calculation for batch sizing - precise GQA calculation done in memory_estimator
-        kv_cache_per_item_gb = (
-            context_size * embedding_length * layer_count * bytes_per_element
-        ) / (1024**3)
-    else:
-        # Conservative estimate: 64 bytes per token
-        kv_cache_per_item_gb = context_size * 64 / (1024**3)
-
-    total_per_item_gb = model_memory_gb + kv_cache_per_item_gb
-
-    if total_per_item_gb <= 0:
-        return MIN_BATCH_SIZE
-
-    # Calculate max batch size based on available memory
-    max_batch_size = int(
-        available_vram_gb * VRAM_FRAGMENTATION_MARGIN / total_per_item_gb
-    )
-
-    # Apply reasonable limits based on model size
-    if embedding_length > 2048:  # Large models (7B+)
-        max_batch_size = min(max_batch_size, 512)
-    elif embedding_length > 1024:  # Medium models (3B-7B)
-        max_batch_size = min(max_batch_size, 1024)
-    else:  # Small models (<3B)
-        max_batch_size = min(max_batch_size, 2048)
-
-    return max(MIN_BATCH_SIZE, max_batch_size)
-
-
-def calculate_optimal_batch_size_cpu(
-    available_ram_gb: float, model_size_mb: float, context_size: int, architecture: str
-) -> Tuple[int, int]:
-    """
-    Calculate optimal batch sizes for CPU mode using dict-based architecture profiles.
-
-    Returns:
-        Tuple of (batch_size, ubatch_size)
-    """
-    model_ram_gb = model_size_mb / 1024
-
-    # Calculate available RAM for batching after model and context
-    reserved_ram_gb = model_ram_gb + (context_size / 1000) + CONTEXT_RAM_OVERHEAD_GB
-    available_for_batch = max(0, available_ram_gb - reserved_ram_gb)
-
-    # Estimate batch memory usage (rough: 1MB per batch item)
-    max_batch_size = int(available_for_batch * 1000)  # 1GB = ~1000 batch items
-
-    # Get architecture-specific limits or use defaults
-    limits = ARCHITECTURE_CPU_BATCH_LIMITS.get(
-        architecture, ARCHITECTURE_CPU_BATCH_LIMITS["default"]
-    )
-
-    batch_size = min(limits["max_batch"], max(limits["min_batch"], max_batch_size))
-    ubatch_size = min(limits["max_ubatch"], max(limits["min_ubatch"], batch_size // 2))
-
-    return batch_size, ubatch_size
-
-
-@lru_cache(maxsize=128)
-def calculate_max_context_size_gpu(
-    available_vram_gb: float,
-    model_size_mb: float,
-    layer_count: int,
-    embedding_length: int,
-    attention_head_count: int,
-    attention_head_count_kv: int,
-    cache_type_k: Optional[str] = None,
-    cache_type_v: Optional[str] = None,
-    usage_mode: str = "single_user",
-) -> int:
-    """
-    Calculate maximum context size for GPU based on memory requirements.
-
-    Cached with LRU to avoid redundant calculations for same parameters.
-
-    Returns:
-        Maximum context size in tokens
-    """
-    # Reserve memory for model
-    model_memory_gb = model_size_mb / 1024
-    reserved_memory_gb = model_memory_gb + 1.0  # Model + 1GB overhead
-    available_for_context_gb = max(0, available_vram_gb - reserved_memory_gb)
-
-    if available_for_context_gb <= 0:
-        return MIN_CONTEXT_SIZE
-
-    # Calculate KV cache memory per token based on transformer architecture
-    # GQA-aware formula: M_kv = n_ctx × N_layers × N_head_kv × d_head × (p_a_k + p_a_v)
-    # where d_head = N_embd / N_head
-    # Use actual quantization bytes-per-value instead of hardcoded fp16
-    if embedding_length > 0 and layer_count > 0:
-        # Get actual quantization bytes-per-value
-        from .constants import KV_CACHE_QUANT_FACTORS
-
-        quant_factor_k = KV_CACHE_QUANT_FACTORS.get(
-            cache_type_k or "f16", 0.5
-        )  # f16 = 0.5
-        quant_factor_v = KV_CACHE_QUANT_FACTORS.get(
-            cache_type_v or cache_type_k or "f16", quant_factor_k
-        )
-        bytes_per_k = quant_factor_k * 4  # Convert factor to bytes (f32=4, f16=2, etc.)
-        bytes_per_v = quant_factor_v * 4
-
-        if attention_head_count_kv > 0 and attention_head_count > 0:
-            # GQA-aware calculation
-            d_head = embedding_length / attention_head_count
-            # KV cache per token: K and V cache per layer, each storing N_head_kv heads
-            kv_cache_per_layer_k = attention_head_count_kv * d_head * bytes_per_k
-            kv_cache_per_layer_v = attention_head_count_kv * d_head * bytes_per_v
-            kv_cache_per_token_bytes = (
-                kv_cache_per_layer_k + kv_cache_per_layer_v
-            ) * layer_count
-        else:
-            # Fallback for non-GQA models (MHA: N_head_kv = N_head)
-            kv_cache_per_token_bytes = (
-                layer_count * embedding_length * (bytes_per_k + bytes_per_v)
-            )
-
-        # Apply usage mode factor for multi_user (allows larger context since KV cache is lower)
-        # For max context calculation: n_ctx = available_vram / (kv_cache_per_token * usage_factor)
-        # So: tokens_per_gb = 1GB / (kv_cache_per_token * usage_factor)
-        from .constants import KV_CACHE_SINGLE_USER_FACTOR, KV_CACHE_MULTI_USER_FACTOR
-
-        if usage_mode == "multi_user":
-            # In multi_user mode, KV cache usage is lower (typical usage), so we can fit more context
-            usage_factor = KV_CACHE_MULTI_USER_FACTOR
-            # Calculate tokens per GB: divide by (bytes_per_token * usage_factor)
-            # This gives more tokens since usage_factor < 1.0
-            tokens_per_gb = (
-                (1024**3) / (kv_cache_per_token_bytes * usage_factor)
-                if kv_cache_per_token_bytes > 0
-                else 0
-            )
-        else:
-            # Single user mode: full KV cache (peak usage), standard calculation
-            usage_factor = KV_CACHE_SINGLE_USER_FACTOR
-            tokens_per_gb = (
-                (1024**3) / (kv_cache_per_token_bytes * usage_factor)
-                if kv_cache_per_token_bytes > 0
-                else 0
-            )
-
-        # Calculate max context size with safety margin
-        if tokens_per_gb > 0:
-            max_context_tokens = int(
-                available_for_context_gb * tokens_per_gb * CONTEXT_SAFETY_MARGIN
-            )
-            # Ensure minimum context size
-            return max(MIN_CONTEXT_SIZE, max_context_tokens)
-        else:
-            # Fallback if calculation fails (e.g., kv_cache_per_token_bytes is 0)
-            return MIN_CONTEXT_SIZE
-    else:
-        # Fallback to conservative estimate: ~1000 tokens per GB
-        estimated = int(available_for_context_gb * 1000)
-        return max(MIN_CONTEXT_SIZE, min(MAX_CONTEXT_SIZE, estimated))
-
-
-def calculate_optimal_context_size_gpu(
-    architecture: str,
-    available_vram: int,
-    model_size_mb: float = 0,
-    layer_count: int = 32,
-    embedding_length: int = 0,
-    attention_head_count: int = 0,
-    attention_head_count_kv: int = 0,
-    base_context: Optional[int] = None,
-    cache_type_k: Optional[str] = None,
-    cache_type_v: Optional[str] = None,
-    usage_mode: str = "single_user",
-) -> int:
-    """
-    Calculate optimal context size for GPU based on VRAM and architecture defaults.
-
-    Returns:
-        Optimal context size in tokens
-    """
-    from .architecture_config import get_architecture_default_context
-
-    base_ctx = base_context or get_architecture_default_context(architecture)
-
-    if available_vram == 0:
-        # CPU mode - conservative context
-        return max(MIN_CONTEXT_SIZE, min(base_ctx, 2048))
-
-    # Use data-driven calculation if we have model parameters
-    if model_size_mb > 0 and layer_count > 0 and embedding_length > 0:
-        vram_gb = available_vram / (1024**3)
-        calculated_max = calculate_max_context_size_gpu(
-            vram_gb,
-            model_size_mb,
-            layer_count,
-            embedding_length,
-            attention_head_count,
-            attention_head_count_kv,
-            cache_type_k=cache_type_k,
-            cache_type_v=cache_type_v,
-            usage_mode=usage_mode,
-        )
-        result = min(base_ctx, calculated_max) if calculated_max > 0 else base_ctx
-        return max(MIN_CONTEXT_SIZE, min(result, MAX_CONTEXT_SIZE))
-
-    # Fallback to architecture-based limits if no model data
-    vram_gb = available_vram / (1024**3)
-
-    # Conservative scaling based on VRAM capacity
-    if vram_gb >= 24:  # High-end GPU
-        return max(MIN_CONTEXT_SIZE, min(base_ctx, MAX_CONTEXT_SIZE))
-    elif vram_gb >= 12:  # Mid-range GPU
-        return max(MIN_CONTEXT_SIZE, min(base_ctx, int(base_ctx * 0.75)))
-    elif vram_gb >= 8:  # Lower-end GPU
-        return max(MIN_CONTEXT_SIZE, min(base_ctx, int(base_ctx * 0.5)))
-    else:  # Very limited VRAM
-        return max(MIN_CONTEXT_SIZE, min(base_ctx, 2048))
-
-
-def calculate_optimal_gpu_layers(
-    free_vram_gb: float,
-    model_size_mb: float,
-    total_layers: int,
-    context_size: int = 4096,
-    cache_type_k: Optional[str] = None,
-    cache_type_v: Optional[str] = None,
-    ubatch_size: int = 512,
-    attention_head_count: int = 0,
-    attention_head_count_kv: int = 0,
-    embedding_length: int = 0,
-    layer_count: int = 0,
-    usage_mode: str = "single_user",
-) -> int:
-    """
-    Calculate optimal number of layers to offload to GPU.
-
-    Uses exact M_kv and M_compute calculations according to theoretical model:
-    n_ngl_max = floor((VRAM_available - M_kv - M_compute) / (M_weights_total / N_layers))
-
-    Args:
-        free_vram_gb: Available VRAM in GB
-        model_size_mb: Model size in MB (GGUF file size)
-        total_layers: Total number of layers in model
-        context_size: Context size in tokens (default: 4096)
-        cache_type_k: K cache quantization type (default: f16)
-        cache_type_v: V cache quantization type (default: same as cache_type_k)
-        ubatch_size: Micro-batch size (default: 512)
-        attention_head_count: Number of attention heads (for GQA calculation)
-        attention_head_count_kv: Number of KV attention heads (for GQA calculation)
-        embedding_length: Embedding dimension (for GQA calculation)
-        layer_count: Layer count (alias for total_layers, for compatibility)
-
-    Returns:
-        Number of GPU layers
-    """
-    # Use total_layers if provided, otherwise layer_count
-    actual_layer_count = (
-        total_layers if total_layers > 0 else (layer_count if layer_count > 0 else 0)
-    )
-
-    if actual_layer_count == 0:
-        # Fallback to old heuristic if layer count unknown
-        estimated_layers_per_gb = (
-            LAYERS_PER_GB_SMALL_MODEL
-            if model_size_mb < 1000
-            else LAYERS_PER_GB_LARGE_MODEL
-        )
-        max_layers = int(free_vram_gb * estimated_layers_per_gb * GPU_LAYER_BUFFER)
-        return max_layers
-
-    # Calculate exact M_kv and M_compute
-    free_vram_bytes = free_vram_gb * (1024**3)
-    model_size_bytes = model_size_mb * (1024**2)
-
-    # Calculate M_kv using exact formula
-    from .memory_estimator import calculate_kv_cache_size
-
-    # Use default values if not provided
-    cache_type_k_actual = cache_type_k or "f16"
-    cache_type_v_actual = cache_type_v or cache_type_k_actual
-
-    # If we have architecture parameters, use precise calculation
-    if embedding_length > 0 and attention_head_count > 0:
-        kv_cache_bytes = calculate_kv_cache_size(
-            context_size,
-            1,  # parallel=1 for layer calculation
-            actual_layer_count,
-            embedding_length,
-            attention_head_count,
-            attention_head_count_kv or attention_head_count,
-            cache_type_k_actual,
-            cache_type_v_actual if cache_type_v else None,
-            usage_mode=usage_mode,
-        )
-    else:
-        # Fallback: estimate KV cache size (conservative)
-        # Assume fp16, use embedding_length if available, otherwise estimate
-        if embedding_length > 0:
-            # Simplified estimate: assume MHA (not GQA)
-            bytes_per_token = actual_layer_count * embedding_length * 4  # K+V at fp16
-            kv_cache_bytes = context_size * bytes_per_token
-        else:
-            # Very conservative fallback: ~64 bytes per token per layer
-            kv_cache_bytes = context_size * actual_layer_count * 64
-
-    # Calculate M_compute: Fixed overhead + variable scratch buffer
-    from .constants import COMPUTE_FIXED_OVERHEAD_MB, COMPUTE_SCRATCH_PER_UBATCH_MB
-
-    compute_overhead_mb = COMPUTE_FIXED_OVERHEAD_MB + (
-        ubatch_size * COMPUTE_SCRATCH_PER_UBATCH_MB
-    )
-    compute_overhead_bytes = int(compute_overhead_mb * (1024**2))
-
-    # Formula from theoretical model:
-    # n_ngl_max = floor((VRAM_available - M_kv - M_compute) / (M_weights_total / N_layers))
-    available_for_weights_bytes = (
-        free_vram_bytes - kv_cache_bytes - compute_overhead_bytes
-    )
-
-    if available_for_weights_bytes <= 0:
-        # Not enough VRAM even for M_kv and M_compute
-        return 0
-
-    mb_per_layer = (
-        model_size_bytes / actual_layer_count if actual_layer_count > 0 else 0
-    )
-    if mb_per_layer <= 0:
-        # Fallback if calculation fails
-        estimated_layers_per_gb = (
-            LAYERS_PER_GB_SMALL_MODEL
-            if model_size_mb < 1000
-            else LAYERS_PER_GB_LARGE_MODEL
-        )
-        max_layers = int(free_vram_gb * estimated_layers_per_gb * GPU_LAYER_BUFFER)
-        return min(max_layers, actual_layer_count)
-
-    max_layers = (
-        int(available_for_weights_bytes / mb_per_layer) if mb_per_layer > 0 else 0
-    )
-
-    return min(max_layers, actual_layer_count)
diff --git a/backend/smart_auto/config_builder.py b/backend/smart_auto/config_builder.py
deleted file mode 100644
index 56ac40b..0000000
--- a/backend/smart_auto/config_builder.py
+++ /dev/null
@@ -1,79 +0,0 @@
-"""
-Configuration builder module.
-Handles configuration sanitization, server parameters, and preset tuning.
-"""
-
-from typing import Dict, Any, Optional
-from backend.logging_config import get_logger
-
-logger = get_logger(__name__)
-
-
-def clamp_int(name: str, val: Any, lo: int, hi: int, default: int) -> int:
-    """Helper to clamp integer values."""
-    try:
-        iv = int(val)
-    except (ValueError, TypeError):
-        iv = default
-    return max(lo, min(hi, iv))
-
-
-def generate_server_params() -> Dict[str, Any]:
-    """Generate server-specific parameters"""
-    return {
-        "host": "0.0.0.0",  # Allow external connections
-        "timeout": 300,  # 5 minutes timeout
-    }
-
-
-def sanitize_config(config: Dict[str, Any], gpu_count: int) -> Dict[str, Any]:
-    """Clamp and sanitize final config values to enforce invariants and avoid edge-case crashes."""
-    sanitized = dict(config)
-
-    # Clamp integer values
-    sanitized["ctx_size"] = clamp_int(
-        "ctx_size", sanitized.get("ctx_size", 4096), 512, 262144, 4096
-    )
-    sanitized["batch_size"] = clamp_int(
-        "batch_size", sanitized.get("batch_size", 512), 1, 4096, 512
-    )
-    sanitized["ubatch_size"] = clamp_int(
-        "ubatch_size",
-        sanitized.get("ubatch_size"),
-        1,
-        sanitized.get("batch_size", 512),
-        max(1, sanitized.get("batch_size", 512) // 2),
-    )
-    sanitized["parallel"] = clamp_int(
-        "parallel",
-        sanitized.get("parallel", 1),
-        1,
-        max(1, gpu_count if gpu_count > 0 else 1),
-        1,
-    )
-
-    # Ensure boolean fields are properly typed
-    boolean_fields = ["no_mmap", "mlock", "low_vram", "logits_all", "flash_attn"]
-    sanitized.update({b: bool(sanitized[b]) for b in boolean_fields if b in sanitized})
-
-    return sanitized
-
-
-def apply_preset_tuning(config: Dict[str, Any], preset_name: str) -> None:
-    """
-    Apply preset-specific tuning to configuration parameters.
-
-    Consolidates both generation parameter adjustments and config factor tuning
-    into a single clear function.
-    """
-    if preset_name == "coding":
-        config["temperature"] = 0.7
-        config["repeat_penalty"] = 1.05
-        if "batch_size" in config:
-            config["batch_size"] = max(1, int(config["batch_size"] * 0.8))
-        if "ubatch_size" in config:
-            config["ubatch_size"] = max(1, int(config["ubatch_size"] * 0.8))
-        if "parallel" in config:
-            config["parallel"] = max(1, int(config["parallel"] * 1.2))
-        logger.debug("Applied preset 'coding' tuning")
-    # conversational preset has no changes (factors = 1.0), so skip
diff --git a/backend/smart_auto/constants.py b/backend/smart_auto/constants.py
deleted file mode 100644
index c94dee6..0000000
--- a/backend/smart_auto/constants.py
+++ /dev/null
@@ -1,184 +0,0 @@
-"""
-Constants used across the smart_auto module.
-Centralizes magic numbers and configuration limits.
-"""
-
-from typing import Dict, Any
-
-# ============================================================================
-# Memory optimization factors
-# ============================================================================
-
-KV_CACHE_OPTIMIZATION_FACTOR = 1.0  # Use actual memory (no optimization factor) - memory mapping doesn't reduce peak usage
-
-# Usage mode factors for KV cache estimation
-# Based on theoretical model: single_user accumulates context (peak), multi_user clears context (typical usage)
-KV_CACHE_SINGLE_USER_FACTOR = 1.0  # Peak estimate (full context window)
-KV_CACHE_MULTI_USER_FACTOR = (
-    0.4  # Typical usage (context cleared between requests, ~40% of peak)
-)
-
-MOE_OFFLOAD_ALL_RATIO = 0.3  # 30% of model for all MoE offloaded
-MOE_OFFLOAD_UP_DOWN_RATIO = 0.2  # 20% of model for up/down MoE offloaded
-MOE_OFFLOAD_UP_RATIO = 0.1  # 10% of model for up MoE offloaded
-
-LLAMA_CPP_OVERHEAD_MB = 256  # 256MB overhead for llama.cpp
-
-# Compute buffer constants (M_compute)
-COMPUTE_FIXED_OVERHEAD_MB = (
-    550  # Fixed CUDA overhead (~550MB for CUDA context, cuBLAS workspace, etc.)
-)
-COMPUTE_SCRATCH_PER_UBATCH_MB = (
-    0.5  # Variable scratch buffer per ubatch size (rough estimate)
-)
-
-# VRAM pressure thresholds for MoE offloading
-VRAM_RATIO_VERY_TIGHT = 1.2  # Very tight VRAM - offload all MoE
-VRAM_RATIO_TIGHT = 1.5  # Tight VRAM - offload up/down projections
-VRAM_RATIO_MODERATE = 2.0  # Moderate VRAM - offload only up projection
-
-# ============================================================================
-# KV Cache quantization factors
-# ============================================================================
-
-KV_CACHE_QUANT_FACTORS: Dict[str, float] = {
-    "f32": 1.0,  # Full precision (no reduction)
-    "f16": 0.5,  # Half precision
-    "bf16": 0.5,  # Bfloat16
-    "q8_0": 0.25,  # 8-bit quant
-    "q5_1": 0.156,  # 5-bit high quality
-    "q5_0": 0.156,  # 5-bit
-    "q4_1": 0.125,  # 4-bit high quality
-    "q4_0": 0.125,  # 4-bit
-    "iq4_nl": 0.125,  # 4-bit non-linear
-}
-
-QUANTIZATION_AVERAGE_FACTOR = 0.5  # Average of K and V cache quantization factors
-
-# ============================================================================
-# Architecture context defaults
-# ============================================================================
-
-ARCHITECTURE_CONTEXT_DEFAULTS: Dict[str, int] = {
-    "llama2": 4096,
-    "llama3": 8192,
-    "llama": 4096,
-    "codellama": 16384,
-    "mistral": 32768,
-    "phi": 2048,
-    "glm": 8192,
-    "glm4": 204800,  # 200K context for GLM-4.6
-    "deepseek": 32768,
-    "deepseek-v3": 32768,
-    "qwen": 32768,  # 32K context
-    "qwen2": 32768,  # 32K context
-    "qwen3": 131072,  # 128K context for Qwen3
-    "gemma": 8192,
-    "gemma3": 8192,
-    "generic": 4096,
-}
-
-DEFAULT_CONTEXT_LENGTH = 4096
-
-# ============================================================================
-# Memory calculation defaults
-# ============================================================================
-
-DEFAULT_BYTES_PER_ELEMENT = 2  # Assume fp16 for activations
-BATCH_INTERMEDIATE_FACTOR = 0.08  # 8% factor for intermediate activations
-BATCH_QKV_FACTOR = 0.04  # 4% factor for QKV projections
-BATCH_COMPUTATION_OVERHEAD_KB = 400  # ~400KB per batch item
-BATCH_FALLBACK_MB = 1.5  # 1.5MB per batch item fallback
-
-BATCH_VRAM_OVERHEAD_RATIO = 0.1  # 10% of KV cache VRAM for batch overhead
-BATCH_RAM_OVERHEAD_RATIO = 0.1  # 10% of KV cache RAM for batch overhead
-
-# Layer estimation defaults
-FALLBACK_LAYER_COUNT = 32
-FALLBACK_EMBEDDING_LENGTH = 4096
-FALLBACK_KV_CACHE_PER_TOKEN_BYTES = 60 * (4096 * 2 + 4096 * 2)  # ~960 KB per token
-
-# ============================================================================
-# Context size limits
-# ============================================================================
-
-MIN_CONTEXT_SIZE = 512
-MAX_CONTEXT_SIZE = 262144  # 256K
-MAX_CPU_CONTEXT_SIZE = 8192  # Conservative limit for CPU mode
-
-# ============================================================================
-# Batch size limits
-# ============================================================================
-
-MIN_BATCH_SIZE = 1
-MAX_BATCH_SIZE = 4096
-
-# ============================================================================
-# GPU/VRAM calculation constants
-# ============================================================================
-
-# VRAM safety margins
-VRAM_SAFETY_MARGIN = 0.9  # 90% of available VRAM
-VRAM_FRAGMENTATION_MARGIN = 0.7  # 70% for batch size calculations
-CONTEXT_SAFETY_MARGIN = 0.8  # 80% for context size calculations
-
-# GPU layer estimation
-LAYERS_PER_GB_SMALL_MODEL = 8  # Models < 1GB
-LAYERS_PER_GB_LARGE_MODEL = 4  # Models >= 1GB
-GPU_LAYER_BUFFER = 0.8  # Leave 20% buffer
-
-# ============================================================================
-# CPU calculation constants
-# ============================================================================
-
-# RAM reservation overhead
-MODEL_RAM_OVERHEAD_GB = 2.0  # Overhead for model loading
-CONTEXT_RAM_OVERHEAD_GB = 1.0  # Additional overhead for context
-
-# ============================================================================
-# Architecture-specific configuration profiles
-# ============================================================================
-
-# CPU architecture optimization profiles
-# Maps architecture to dict of optimization settings
-ARCHITECTURE_CPU_PROFILES: Dict[str, Dict[str, Any]] = {
-    "mistral": {
-        "use_mmap": True,
-    },
-    "llama3": {
-        "use_mmap": "dynamic",  # Special flag for conditional mmap
-    },
-    "llama2": {
-        "use_mmap": "dynamic",  # Special flag for conditional mmap
-    },
-    "codellama": {
-        "use_mmap": True,
-        "logits_all": False,
-    },
-    "phi": {
-        "use_mmap": True,
-    },
-}
-
-# CPU batch size limits per architecture
-ARCHITECTURE_CPU_BATCH_LIMITS: Dict[str, Dict[str, int]] = {
-    "mistral": {
-        "max_batch": 2048,
-        "max_ubatch": 1024,
-        "min_batch": 64,
-        "min_ubatch": 32,
-    },
-    "llama3": {"max_batch": 1536, "max_ubatch": 768, "min_batch": 64, "min_ubatch": 32},
-    "codellama": {
-        "max_batch": 1536,
-        "max_ubatch": 768,
-        "min_batch": 64,
-        "min_ubatch": 32,
-    },
-    "default": {
-        "max_batch": 1024,
-        "max_ubatch": 512,
-        "min_batch": 32,
-        "min_ubatch": 16,
-    },
-}
diff --git a/backend/smart_auto/cpu_config.py b/backend/smart_auto/cpu_config.py
deleted file mode 100644
index 9a5ce86..0000000
--- a/backend/smart_auto/cpu_config.py
+++ /dev/null
@@ -1,200 +0,0 @@
-"""
-CPU configuration module.
-Handles all CPU-specific configuration logic for model inference.
-"""
-
-from typing import Dict, Any, Optional, Tuple
-import psutil
-
-from .architecture_config import get_architecture_default_context
-from .memory_estimator import (
-    get_cpu_memory_gb,
-    tokens_per_gb_by_model_size,
-    ctx_tokens_budget_greedy,
-)
-from .calculators import calculate_optimal_batch_size_cpu, calculate_ubatch_size
-from .constants import (
-    MODEL_RAM_OVERHEAD_GB,
-    CONTEXT_RAM_OVERHEAD_GB,
-    MAX_CPU_CONTEXT_SIZE,
-    MIN_CONTEXT_SIZE,
-    ARCHITECTURE_CPU_PROFILES,
-    ARCHITECTURE_CPU_BATCH_LIMITS,
-)
-
-
-def get_optimal_cpu_context_size(
-    architecture: str, available_ram_gb: float, model_size_mb: float
-) -> int:
-    """Calculate optimal context size for CPU-only mode based on available RAM."""
-    base_context = get_architecture_default_context(architecture)
-
-    # Calculate how much RAM we can allocate for context
-    # Reserve space for model + overhead
-    model_ram_gb = model_size_mb / 1024
-    reserved_ram_gb = model_ram_gb + MODEL_RAM_OVERHEAD_GB
-    available_for_context = max(0, available_ram_gb - reserved_ram_gb)
-
-    # Estimate context memory usage (rough: 1MB per 1000 tokens)
-    max_context_tokens = int(available_for_context * 1000)  # 1GB = ~1000 tokens
-
-    # Apply architecture-specific limits
-    if architecture == "mistral":
-        # Mistral can handle very large contexts
-        optimal_context = min(base_context, max_context_tokens)
-    elif architecture in ["llama3", "codellama"]:
-        # Llama3 and CodeLlama have good context handling
-        optimal_context = min(base_context, max_context_tokens)
-    else:
-        # Conservative for other architectures
-        optimal_context = min(base_context, max_context_tokens, MAX_CPU_CONTEXT_SIZE)
-
-    # Ensure minimum context size
-    return max(MIN_CONTEXT_SIZE, optimal_context)
-
-
-def calculate_optimal_batch_sizes(
-    available_ram_gb: float, model_size_mb: float, ctx_size: int, architecture: str
-) -> Tuple[int, int]:
-    """Calculate optimal batch sizes for CPU mode."""
-    return calculate_optimal_batch_size_cpu(
-        available_ram_gb, model_size_mb, ctx_size, architecture
-    )
-
-
-def get_optimal_parallel_cpu(available_ram_gb: float, model_size_mb: float) -> int:
-    """Calculate optimal parallel sequences for CPU mode."""
-    model_ram_gb = model_size_mb / 1024
-
-    # Calculate how many parallel sequences we can run
-    # Each parallel sequence needs roughly 1GB of RAM
-    max_parallel = int(available_ram_gb / (model_ram_gb + 1.0))
-
-    # Apply reasonable limits
-    if available_ram_gb >= 32:  # High RAM system
-        return min(8, max(1, max_parallel))
-    elif available_ram_gb >= 16:  # Mid RAM system
-        return min(4, max(1, max_parallel))
-    else:  # Low RAM system
-        return min(2, max(1, max_parallel))
-
-
-def get_cpu_architecture_optimizations(
-    architecture: str, available_ram_gb: float
-) -> Dict[str, Any]:
-    """Get architecture-specific optimizations for CPU mode using dict-based profiles."""
-    # Get architecture-specific profile, or empty dict if not found
-    profile = ARCHITECTURE_CPU_PROFILES.get(architecture, {})
-    optimizations = dict(profile)  # Copy to avoid mutating the original
-
-    # Handle dynamic mmap setting for llama architectures
-    if optimizations.get("use_mmap") == "dynamic":
-        optimizations["use_mmap"] = available_ram_gb < 16
-
-    # Common CPU optimizations applied to all architectures
-    optimizations.update(
-        {
-            "embedding": False,  # Disable embedding mode for inference
-            "cont_batching": True,  # Enable continuous batching for efficiency
-            "no_kv_offload": True,  # Don't offload KV cache (CPU mode)
-        }
-    )
-
-    return optimizations
-
-
-def generate_cpu_config(
-    model_size_mb: float,
-    architecture: str,
-    layer_count: int = 32,
-    context_length: int = 4096,
-    vocab_size: int = 0,
-    embedding_length: int = 0,
-    attention_head_count: int = 0,
-    debug: Optional[Dict[str, Any]] = None,
-) -> Dict[str, Any]:
-    """Generate CPU-only configuration optimized for available RAM."""
-    # Get system memory info (from centralized helper)
-    total_ram_gb, used_ram_gb, available_ram_gb = get_cpu_memory_gb()
-    if debug is not None:
-        debug.update(
-            {
-                "cpu_total_ram_gb": total_ram_gb,
-                "cpu_available_ram_gb": available_ram_gb,
-            }
-        )
-
-    # Estimate CPU threads (leave some cores free for system)
-    cpu_count_phys = psutil.cpu_count(logical=False) or 1
-    logical_cpu_count = psutil.cpu_count(logical=True) or cpu_count_phys
-    threads = max(1, cpu_count_phys - 1)  # Leave 1 core for system
-    threads_batch = max(
-        1, min(threads, max(1, logical_cpu_count - 2))
-    )  # Guard negatives
-
-    # Calculate optimal context size based on model's max and available RAM (no hard cap)
-    base_ctx = max(512, context_length or 4096)
-    model_gb = max(0.001, model_size_mb / 1024.0)
-    # Tokens per GB heuristic (centralized)
-    tokens_per_gb = tokens_per_gb_by_model_size(model_gb)
-    # Reserve RAM for model + overhead using actual available RAM
-    reserved_ram_gb = model_gb + 2.0
-    available_for_ctx_gb = max(0.0, available_ram_gb - reserved_ram_gb)
-    # Provide a small minimum window so we don't quantize to zero
-    if available_for_ctx_gb <= 0:
-        available_for_ctx_gb = max(0.25, available_ram_gb * 0.1)
-    if debug is not None:
-        debug.update(
-            {
-                "model_gb": model_gb,
-                "tokens_per_gb": tokens_per_gb,
-                "reserved_ram_gb": reserved_ram_gb,
-                "available_for_ctx_gb": available_for_ctx_gb,
-            }
-        )
-    # Initial cap ignoring batch/parallel
-    max_tokens_by_ram = ctx_tokens_budget_greedy(
-        model_gb, available_ram_gb, reserve_overhead_gb=2.0
-    )
-    optimal_ctx_size = max(512, min(base_ctx, max_tokens_by_ram))
-
-    # Calculate optimal batch sizes using centralized function
-    batch_size, ubatch_size = calculate_optimal_batch_size_cpu(
-        available_ram_gb, model_size_mb, optimal_ctx_size, architecture
-    )
-
-    # Adjust ctx_size to account for batch and parallel (ctx * batch * parallel <= tokens_budget)
-    parallel = 1
-    tokens_budget = int(tokens_per_gb * available_for_ctx_gb)
-    if tokens_budget > 0:
-        # Budget ctx tokens directly from available RAM; batch is handled separately
-        safe_ctx = int(tokens_budget)
-        optimal_ctx_size = max(512, min(optimal_ctx_size, safe_ctx))
-    if debug is not None:
-        debug.update(
-            {
-                "tokens_budget": tokens_budget,
-                "batch_size": batch_size,
-                "ubatch_size": ubatch_size,
-                "parallel": parallel,
-                "optimal_ctx_size": optimal_ctx_size,
-            }
-        )
-
-    config = {
-        "threads": threads,
-        "threads_batch": threads_batch,
-        "ctx_size": optimal_ctx_size,
-        "batch_size": batch_size,
-        "ubatch_size": ubatch_size,
-        "parallel": parallel,
-        "no_mmap": False,
-        "mlock": False,
-        "low_vram": False,
-        "logits_all": False,  # Don't compute all logits to save memory
-    }
-
-    # Add architecture-specific optimizations
-    config.update(get_cpu_architecture_optimizations(architecture, available_ram_gb))
-
-    return config
diff --git a/backend/smart_auto/generation_params.py b/backend/smart_auto/generation_params.py
deleted file mode 100644
index 6cdc1b4..0000000
--- a/backend/smart_auto/generation_params.py
+++ /dev/null
@@ -1,87 +0,0 @@
-from typing import Dict, Any, Optional
-
-
-def safe_float(val: Any, default: float = 0.0) -> float:
-    """Safely convert value to float, returning default on failure."""
-    try:
-        return float(val)
-    except (ValueError, TypeError, OverflowError):
-        return default
-
-
-def safe_int(val: Any, default: int = 0) -> int:
-    """Safely convert value to int, returning default on failure."""
-    try:
-        return int(val)
-    except (ValueError, TypeError, OverflowError):
-        return default
-
-
-def clamp_float(val: Any, lo: float, hi: float, default: float) -> float:
-    try:
-        fv = float(val)
-    except Exception:
-        return default
-    return max(lo, min(hi, fv))
-
-
-def clamp_int(val: Any, lo: int, hi: int, default: int) -> int:
-    try:
-        iv = int(val)
-    except Exception:
-        return default
-    return max(lo, min(hi, iv))
-
-
-def build_generation_params(
-    architecture: str, context_length: int, preset_overrides: Dict[str, Any] | None
-) -> Dict[str, Any]:
-    params: Dict[str, Any] = {}
-
-    params.update(
-        {
-            "temperature": 0.8,
-            "top_p": 0.9,
-            "typical_p": 1.0,
-            "min_p": 0.0,
-            "tfs_z": 1.0,
-            "top_k": 40,
-            "repeat_penalty": 1.1,
-            "presence_penalty": 0.0,
-            "frequency_penalty": 0.0,
-            "mirostat": 0,
-            "mirostat_tau": 5.0,
-            "mirostat_eta": 0.1,
-            "ctx_size": max(512, int(context_length or 0)),
-            "stop": [],
-        }
-    )
-
-    if preset_overrides:
-        params.update(preset_overrides)
-
-    params["temp"] = params.get("temperature", params.get("temp", 0.8))
-
-    params["temperature"] = clamp_float(params.get("temperature", 0.8), 0.0, 2.0, 0.8)
-    params["top_p"] = clamp_float(params.get("top_p", 0.9), 0.0, 1.0, 0.9)
-    params["min_p"] = clamp_float(params.get("min_p", 0.0), 0.0, 1.0, 0.0)
-    params["typical_p"] = clamp_float(params.get("typical_p", 1.0), 0.0, 1.0, 1.0)
-    params["tfs_z"] = clamp_float(params.get("tfs_z", 1.0), 0.0, 1.0, 1.0)
-    params["top_k"] = max(0, int(params.get("top_k", 40) or 0))
-    params["repeat_penalty"] = max(0.0, float(params.get("repeat_penalty", 1.1) or 1.1))
-    params["presence_penalty"] = float(params.get("presence_penalty", 0.0) or 0.0)
-    params["frequency_penalty"] = float(params.get("frequency_penalty", 0.0) or 0.0)
-    params["mirostat"] = max(0, min(2, int(params.get("mirostat", 0) or 0)))
-    params["mirostat_tau"] = clamp_float(
-        params.get("mirostat_tau", 5.0), 0.1, 20.0, 5.0
-    )
-    params["mirostat_eta"] = clamp_float(
-        params.get("mirostat_eta", 0.1), 0.01, 2.0, 0.1
-    )
-    params["ctx_size"] = max(
-        512, int(params.get("ctx_size", context_length) or context_length)
-    )
-    if not isinstance(params.get("stop", []), list):
-        params["stop"] = []
-
-    return params
diff --git a/backend/smart_auto/gpu_config.py b/backend/smart_auto/gpu_config.py
deleted file mode 100644
index af6b423..0000000
--- a/backend/smart_auto/gpu_config.py
+++ /dev/null
@@ -1,469 +0,0 @@
-"""
-GPU configuration module.
-Handles all GPU-specific configuration logic including single GPU, multi-GPU, and NVLink topologies.
-"""
-
-from typing import Dict, Any, Optional, List
-import psutil
-
-from .architecture_config import get_architecture_default_context
-from .calculators import (
-    calculate_optimal_batch_size_gpu,
-    calculate_max_context_size_gpu,
-    calculate_optimal_context_size_gpu,
-    calculate_optimal_gpu_layers,
-    calculate_ubatch_size,
-)
-from .constants import (
-    VRAM_FRAGMENTATION_MARGIN,
-    VRAM_SAFETY_MARGIN,
-    MIN_CONTEXT_SIZE,
-    MAX_CONTEXT_SIZE,
-    MIN_BATCH_SIZE,
-    MAX_BATCH_SIZE,
-)
-
-
-def parse_compute_capability(value: str) -> float:
-    """Parse compute capability like '8.0', '7.5' to a float safely."""
-    try:
-        parts = str(value).split(".")
-        major = int(parts[0]) if parts and parts[0].isdigit() else 0
-        minor = int(parts[1]) if len(parts) > 1 and parts[1].isdigit() else 0
-        return major + minor / 10.0
-    except Exception:
-        return 0.0
-
-
-def calculate_optimal_batch_size(
-    available_vram_gb: float,
-    model_size_mb: float,
-    context_size: int,
-    embedding_length: int,
-    layer_count: int,
-    cache_type_k: Optional[str] = None,
-    cache_type_v: Optional[str] = None,
-) -> int:
-    """Calculate optimal batch size based on memory and throughput analysis."""
-    return calculate_optimal_batch_size_gpu(
-        available_vram_gb,
-        model_size_mb,
-        context_size,
-        embedding_length,
-        layer_count,
-        cache_type_k=cache_type_k,
-        cache_type_v=cache_type_v,
-    )
-
-
-def calculate_max_context_size(
-    available_vram_gb: float,
-    model_size_mb: float,
-    layer_count: int,
-    embedding_length: int,
-    attention_head_count: int,
-    attention_head_count_kv: int,
-) -> int:
-    """Calculate maximum context size based on actual memory requirements."""
-    return calculate_max_context_size_gpu(
-        available_vram_gb,
-        model_size_mb,
-        layer_count,
-        embedding_length,
-        attention_head_count,
-        attention_head_count_kv,
-    )
-
-
-def get_optimal_context_size(
-    architecture: str,
-    available_vram: int,
-    model_size_mb: float = 0,
-    layer_count: int = 32,
-    embedding_length: int = 0,
-    attention_head_count: int = 0,
-    attention_head_count_kv: int = 0,
-    cache_type_k: Optional[str] = None,
-    cache_type_v: Optional[str] = None,
-    usage_mode: str = "single_user",
-) -> int:
-    """Calculate optimal context size based on actual memory requirements and architecture."""
-    base_context = get_architecture_default_context(architecture)
-    return calculate_optimal_context_size_gpu(
-        architecture,
-        available_vram,
-        model_size_mb,
-        layer_count,
-        embedding_length,
-        attention_head_count,
-        attention_head_count_kv,
-        base_context,
-        cache_type_k=cache_type_k,
-        cache_type_v=cache_type_v,
-        usage_mode=usage_mode,
-    )
-
-
-def single_gpu_config(
-    model_size_mb: float,
-    architecture: str,
-    gpu: Dict,
-    layer_count: int = 32,
-    embedding_length: int = 0,
-    attention_head_count: int = 0,
-    attention_head_count_kv: int = 0,
-    compute_capability: float = 0.0,
-    context_length: int = 4096,
-    cache_type_k: Optional[str] = None,
-    cache_type_v: Optional[str] = None,
-    usage_mode: str = "single_user",
-) -> Dict[str, Any]:
-    """Configuration for single GPU.
-
-    Args:
-        compute_capability: Pre-parsed compute capability (e.g., 8.0, 7.5).
-                          Use 0.0 if not available.
-    """
-    # Extract frequently accessed values to avoid repeated dict lookups
-    gpu_memory = gpu.get("memory", {})
-    vram_gb = gpu_memory.get("total", 0) / (1024**3)
-    free_vram_gb = gpu_memory.get("free", 0) / (1024**3)
-    gpu_index = gpu.get("index", 0)
-
-    # Use calculator for GPU layer estimation
-    # Use exact M_kv/M_compute calculation with estimated context/batch values
-    # These will be refined later, but using estimates here gives better initial calculation
-    n_gpu_layers = calculate_optimal_gpu_layers(
-        free_vram_gb,
-        model_size_mb,
-        layer_count,
-        context_size=context_length,  # Use architecture default context
-        cache_type_k=cache_type_k,
-        cache_type_v=cache_type_v,
-        ubatch_size=512,  # Reasonable estimate for initial calculation
-        attention_head_count=attention_head_count,
-        attention_head_count_kv=attention_head_count_kv,
-        embedding_length=embedding_length,
-        usage_mode=usage_mode,
-    )
-
-    config = {
-        "n_gpu_layers": n_gpu_layers,
-        "main_gpu": gpu_index,
-        "threads": max(1, (psutil.cpu_count(logical=False) or 2) - 2),
-        "threads_batch": max(1, (psutil.cpu_count(logical=False) or 2) - 2),
-    }
-
-    # Calculate optimal batch sizes based on actual memory requirements
-    # Note: Use architecture default context_length (will be refined later in generate_gpu_config)
-    # Use selected KV cache quantization if provided
-    if embedding_length > 0 and layer_count > 0:
-        # Use data-driven calculation with architecture default context length
-        optimal_batch_size = calculate_optimal_batch_size(
-            free_vram_gb,
-            model_size_mb,
-            context_length,
-            embedding_length,
-            layer_count,
-            cache_type_k=cache_type_k,
-            cache_type_v=cache_type_v,
-        )
-        config["batch_size"] = max(
-            MIN_BATCH_SIZE, min(MAX_BATCH_SIZE, optimal_batch_size)
-        )
-        config["ubatch_size"] = calculate_ubatch_size(config["batch_size"])
-    else:
-        # Fallback to VRAM-based estimation
-        if vram_gb >= 24:  # High-end GPU
-            config["batch_size"] = min(1024, max(256, int(vram_gb * 30)))
-            config["ubatch_size"] = min(512, max(128, int(vram_gb * 15)))
-        elif vram_gb >= 12:  # Mid-range GPU
-            config["batch_size"] = min(512, max(128, int(vram_gb * 25)))
-            config["ubatch_size"] = min(256, max(64, int(vram_gb * 12)))
-        elif vram_gb >= 8:  # Lower-end GPU
-            config["batch_size"] = min(256, max(64, int(vram_gb * 20)))
-            config["ubatch_size"] = min(128, max(32, int(vram_gb * 10)))
-        else:  # Very limited VRAM
-            config["batch_size"] = min(128, max(32, int(vram_gb * 15)))
-            config["ubatch_size"] = min(64, max(16, int(vram_gb * 7)))
-
-    # Enable flash attention for supported GPUs (Ampere and newer: >= 8.0)
-    if compute_capability >= 8.0:
-        config["flash_attn"] = True
-
-    return config
-
-
-def multi_gpu_config(
-    model_size_mb: float,
-    architecture: str,
-    gpus: list,
-    nvlink_topology: Dict,
-    layer_count: int = 32,
-    compute_capabilities: Optional[List[float]] = None,
-) -> Dict[str, Any]:
-    """Configuration for multiple GPUs with NVLink awareness.
-
-    Args:
-        compute_capabilities: Pre-parsed compute capabilities list. If None, will parse from gpus.
-    """
-    config = {
-        "main_gpu": 0,
-        "n_gpu_layers": -1,  # Use all layers
-        "threads": max(1, psutil.cpu_count(logical=False) - 2),
-        "threads_batch": max(1, psutil.cpu_count(logical=False) - 2),
-    }
-
-    # Enable flash attention if all GPUs support it (Ampere and newer: >= 8.0)
-    if compute_capabilities:
-        # Use pre-parsed compute capabilities
-        if all(cc >= 8.0 for cc in compute_capabilities):
-            config["flash_attn"] = True
-    else:
-        # Fallback: parse from gpus if not provided
-        if all(
-            parse_compute_capability(gpu.get("compute_capability", "0.0")) >= 8.0
-            for gpu in gpus
-        ):
-            config["flash_attn"] = True
-
-    # Configure based on NVLink topology
-    strategy = nvlink_topology.get("recommended_strategy", "pcie_only")
-
-    if strategy == "nvlink_unified":
-        # All GPUs connected via NVLink - use unified memory approach
-        config.update(nvlink_unified_config(gpus, nvlink_topology))
-    elif strategy == "nvlink_clustered":
-        # Multiple NVLink clusters - optimize per cluster
-        config.update(nvlink_clustered_config(gpus, nvlink_topology))
-    elif strategy == "nvlink_partial":
-        # Partial NVLink connectivity - hybrid approach
-        config.update(nvlink_partial_config(gpus, nvlink_topology))
-    else:
-        # PCIe only - traditional tensor splitting
-        config.update(pcie_only_config(gpus))
-
-    return config
-
-
-def nvlink_unified_config(gpus: list, nvlink_topology: Dict) -> Dict[str, Any]:
-    """Configuration for unified NVLink cluster."""
-    # With NVLink, we can use more aggressive tensor splitting
-    # Extract memory values once to avoid repeated dict lookups
-    vram_sizes = [gpu.get("memory", {}).get("total", 0) for gpu in gpus]
-    total_vram = sum(vram_sizes)
-    total_vram_gb = total_vram / (1024**3)
-
-    # Pre-calculate ratios as floats, format only at the end
-    tensor_split = [
-        f"{vram / total_vram:.3f}" if total_vram > 0 else "0.000" for vram in vram_sizes
-    ]
-
-    return {
-        "tensor_split": ",".join(tensor_split),
-        "parallel": min(8, len(gpus) * 2),  # Higher parallelism with NVLink
-        "batch_size": min(
-            4096, max(512, int(total_vram_gb * 150))
-        ),  # Larger batches for high VRAM
-        "ubatch_size": min(2048, max(256, int(total_vram_gb * 75))),
-    }
-
-
-def nvlink_clustered_config(gpus: list, nvlink_topology: Dict) -> Dict[str, Any]:
-    """Configuration for multiple NVLink clusters."""
-    # Extract clusters once to avoid repeated dict lookup
-    clusters = nvlink_topology.get("clusters", [])
-
-    if not clusters:
-        return pcie_only_config(gpus)
-
-    # Use the largest cluster for primary processing
-    largest_cluster = max(clusters, key=lambda c: len(c["gpus"]))
-    cluster_gpu_indices = set(largest_cluster["gpus"])
-
-    # Configure tensor split for the largest cluster
-    # Pre-extract all GPU memory values once to avoid repeated dict lookups
-    gpu_memories = [gpu.get("memory", {}) for gpu in gpus]
-    cluster_vram_sizes = [gpu_memories[i].get("total", 0) for i in cluster_gpu_indices]
-    total_vram = sum(cluster_vram_sizes)
-    total_vram_gb = total_vram / (1024**3)
-
-    # Pre-calculate ratios as floats, format only at the end
-    tensor_split_ratios = []
-    for i, gpu_memory in enumerate(gpu_memories):
-        if i in cluster_gpu_indices:
-            ratio = gpu_memory.get("total", 0) / total_vram if total_vram > 0 else 0.0
-            tensor_split_ratios.append(ratio)
-        else:
-            tensor_split_ratios.append(0.0)
-
-    # Format all ratios in a single pass
-    tensor_split = [f"{ratio:.3f}" for ratio in tensor_split_ratios]
-
-    return {
-        "tensor_split": ",".join(tensor_split),
-        "parallel": min(6, len(largest_cluster["gpus"]) * 2),
-        "batch_size": min(3072, max(384, int(total_vram_gb * 120))),
-        "ubatch_size": min(1536, max(192, int(total_vram_gb * 60))),
-    }
-
-
-def nvlink_partial_config(gpus: list, nvlink_topology: Dict) -> Dict[str, Any]:
-    """Configuration for partial NVLink connectivity."""
-    # Use conservative approach for partial NVLink
-    vram_sizes = [gpu.get("memory", {}).get("total", 0) for gpu in gpus]
-    total_vram = sum(vram_sizes)
-    total_vram_gb = total_vram / (1024**3)
-
-    # Pre-calculate ratios as floats, format only at the end
-    tensor_split = [
-        f"{vram / total_vram:.2f}" if total_vram > 0 else "0.00" for vram in vram_sizes
-    ]
-
-    return {
-        "tensor_split": ",".join(tensor_split),
-        "parallel": min(4, len(gpus)),
-        "batch_size": min(2048, max(256, int(total_vram_gb * 100))),
-        "ubatch_size": min(1024, max(128, int(total_vram_gb * 50))),
-    }
-
-
-def pcie_only_config(gpus: list) -> Dict[str, Any]:
-    """Configuration for PCIe-only multi-GPU setup."""
-    # Calculate tensor split based on VRAM
-    vram_sizes = [gpu.get("memory", {}).get("total", 0) for gpu in gpus]
-    total_vram = sum(vram_sizes)
-    total_vram_gb = total_vram / (1024**3)
-
-    # Pre-calculate ratios as floats, format only at the end
-    tensor_split = [
-        f"{vram / total_vram:.2f}" if total_vram > 0 else "0.00" for vram in vram_sizes
-    ]
-
-    return {
-        "tensor_split": ",".join(tensor_split),
-        "parallel": min(2, len(gpus)),  # Conservative parallelism for PCIe
-        "batch_size": min(1024, max(128, int(total_vram_gb * 80))),
-        "ubatch_size": min(512, max(64, int(total_vram_gb * 40))),
-    }
-
-
-def generate_gpu_config(
-    model_size_mb: float,
-    architecture: str,
-    gpus: list,
-    total_vram: int,
-    gpu_count: int,
-    nvlink_topology: Dict,
-    layer_count: int = 32,
-    context_length: int = 4096,
-    vocab_size: int = 0,
-    embedding_length: int = 0,
-    attention_head_count: int = 0,
-    attention_head_count_kv: int = 0,
-    compute_capabilities: Optional[List[float]] = None,
-    cache_type_k: Optional[str] = None,
-    cache_type_v: Optional[str] = None,
-    usage_mode: str = "single_user",
-    debug: Optional[Dict[str, Any]] = None,
-) -> Dict[str, Any]:
-    """Generate GPU-optimized configuration.
-
-    Args:
-        compute_capabilities: Pre-parsed compute capabilities list from SystemResources.
-    """
-    config = {}
-
-    # Calculate optimal GPU layers
-    available_vram = sum(gpu.get("memory", {}).get("free", 0) for gpu in gpus)
-    available_vram_gb = available_vram / (1024**3)
-
-    if gpu_count == 1:
-        # Use pre-parsed compute capability for single GPU
-        gpu_cc = (
-            compute_capabilities[0]
-            if compute_capabilities and len(compute_capabilities) > 0
-            else 0.0
-        )
-        config.update(
-            single_gpu_config(
-                model_size_mb,
-                architecture,
-                gpus[0],
-                layer_count,
-                embedding_length,
-                attention_head_count,
-                attention_head_count_kv,
-                gpu_cc,
-                context_length,
-                cache_type_k=cache_type_k,
-                cache_type_v=cache_type_v,
-                usage_mode=usage_mode,
-            )
-        )
-    else:
-        config.update(
-            multi_gpu_config(
-                model_size_mb,
-                architecture,
-                gpus,
-                nvlink_topology,
-                layer_count,
-                compute_capabilities,
-            )
-        )
-
-    # Context size based on available VRAM and model parameters
-    # Use selected KV cache quantization if provided
-    ctx_size = get_optimal_context_size(
-        architecture,
-        available_vram,
-        model_size_mb,
-        layer_count,
-        embedding_length,
-        attention_head_count,
-        attention_head_count_kv,
-        cache_type_k=cache_type_k,
-        cache_type_v=cache_type_v,
-        usage_mode=usage_mode,
-    )
-    # Clamp GPU ctx size to sane bounds
-    config["ctx_size"] = max(MIN_CONTEXT_SIZE, min(ctx_size, MAX_CONTEXT_SIZE))
-    if debug is not None:
-        debug.update(
-            {
-                "gpu_available_vram_bytes": int(available_vram),
-                "gpu_ctx_size": config["ctx_size"],
-            }
-        )
-
-    # Batch sizes based on actual memory requirements
-    # Use selected KV cache quantization if provided
-    if embedding_length > 0 and layer_count > 0:
-        optimal_batch_size = calculate_optimal_batch_size(
-            available_vram_gb,
-            model_size_mb,
-            config["ctx_size"],
-            embedding_length,
-            layer_count,
-            cache_type_k=cache_type_k,
-            cache_type_v=cache_type_v,
-        )
-        config["batch_size"] = max(
-            MIN_BATCH_SIZE, min(MAX_BATCH_SIZE, optimal_batch_size)
-        )
-        config["ubatch_size"] = calculate_ubatch_size(config["batch_size"])
-    else:
-        # Fallback to size-based estimation
-        config["batch_size"] = min(1024, max(64, int(model_size_mb / 50)))
-        config["ubatch_size"] = min(
-            config["batch_size"], max(16, int(model_size_mb / 100))
-        )
-
-    # Parallel sequences (conservative for multi-GPU)
-    if gpu_count > 1:
-        config["parallel"] = max(1, min(4, gpu_count))
-    else:
-        config["parallel"] = 1
-
-    return config
diff --git a/backend/smart_auto/kv_cache.py b/backend/smart_auto/kv_cache.py
deleted file mode 100644
index ed45efb..0000000
--- a/backend/smart_auto/kv_cache.py
+++ /dev/null
@@ -1,30 +0,0 @@
-from typing import Dict, Any
-
-
-def get_optimal_kv_cache_quant(
-    available_vram_gb: float,
-    context_length: int,
-    architecture: str,
-    flash_attn_available: bool = False,
-) -> Dict[str, Any]:
-    """Determine optimal KV cache quantization to balance memory usage and quality."""
-    if context_length > 32768:
-        cache_type_k = "q5_1" if available_vram_gb > 40 else "q4_1"
-        cache_type_v = cache_type_k if flash_attn_available else None
-        return {"cache_type_k": cache_type_k, "cache_type_v": cache_type_v}
-
-    if context_length > 8192:
-        cache_type_k = "q8_0" if available_vram_gb > 24 else "q4_1"
-        cache_type_v = cache_type_k if flash_attn_available else None
-        return {"cache_type_k": cache_type_k, "cache_type_v": cache_type_v}
-
-    if available_vram_gb > 16:
-        return {
-            "cache_type_k": "f16",
-            "cache_type_v": "f16" if flash_attn_available else None,
-        }
-
-    return {
-        "cache_type_k": "q8_0",
-        "cache_type_v": "q8_0" if flash_attn_available else None,
-    }
diff --git a/backend/smart_auto/memory_estimator.py b/backend/smart_auto/memory_estimator.py
deleted file mode 100644
index 1d10b12..0000000
--- a/backend/smart_auto/memory_estimator.py
+++ /dev/null
@@ -1,478 +0,0 @@
-"""
-Memory estimation module.
-Consolidates RAM and VRAM estimation with shared KV cache calculation logic.
-Also provides CPU memory utilities.
-"""
-
-from typing import Dict, Any, Tuple, Optional
-from functools import lru_cache
-import psutil
-
-from backend.database import Model
-from backend.logging_config import get_logger
-from .model_metadata import get_model_metadata
-from .models import ModelMetadata
-from .constants import (
-    KV_CACHE_QUANT_FACTORS,
-    KV_CACHE_OPTIMIZATION_FACTOR,
-    KV_CACHE_SINGLE_USER_FACTOR,
-    KV_CACHE_MULTI_USER_FACTOR,
-    MODEL_RAM_OVERHEAD_GB,
-    FALLBACK_LAYER_COUNT,
-    FALLBACK_EMBEDDING_LENGTH,
-    FALLBACK_KV_CACHE_PER_TOKEN_BYTES,
-    BATCH_VRAM_OVERHEAD_RATIO,
-    BATCH_RAM_OVERHEAD_RATIO,
-    VRAM_SAFETY_MARGIN,
-    MOE_OFFLOAD_ALL_RATIO,
-    MOE_OFFLOAD_UP_DOWN_RATIO,
-    MOE_OFFLOAD_UP_RATIO,
-    LLAMA_CPP_OVERHEAD_MB,
-    DEFAULT_BYTES_PER_ELEMENT,
-    BATCH_INTERMEDIATE_FACTOR,
-    BATCH_QKV_FACTOR,
-    BATCH_COMPUTATION_OVERHEAD_KB,
-    BATCH_FALLBACK_MB,
-    QUANTIZATION_AVERAGE_FACTOR,
-    COMPUTE_FIXED_OVERHEAD_MB,
-    COMPUTE_SCRATCH_PER_UBATCH_MB,
-)
-
-logger = get_logger(__name__)
-
-
-# CPU memory utilities
-def get_cpu_memory_gb() -> Tuple[float, float, float]:
-    """Return (total_gb, used_gb, available_gb) where available = total - used.
-    Uses actual values, no 60% approximations.
-    """
-    mem = psutil.virtual_memory()
-    total = mem.total / (1024**3)
-    used = mem.used / (1024**3)
-    available = max(0.0, total - used)
-    return total, used, available
-
-
-@lru_cache(maxsize=64)
-def tokens_per_gb_by_model_size(model_size_gb: float) -> int:
-    """Heuristic tokens per GB for KV budget by model size."""
-    if model_size_gb < 2:
-        return 3000
-    if model_size_gb < 6:
-        return 2000
-    if model_size_gb < 12:
-        return 1300
-    return 400
-
-
-def ctx_tokens_budget_greedy(
-    model_size_gb: float, available_cpu_ram_gb: float, reserve_overhead_gb: float = None
-) -> int:
-    """Compute context token budget from CPU RAM after reserving model + overhead.
-    Returns total tokens budget (not divided by batch/parallel).
-    """
-    if reserve_overhead_gb is None:
-        reserve_overhead_gb = MODEL_RAM_OVERHEAD_GB
-    reserved = model_size_gb + max(0.0, reserve_overhead_gb)
-    for_ctx = max(0.0, available_cpu_ram_gb - reserved)
-    tpg = tokens_per_gb_by_model_size(model_size_gb)
-    return max(0, int(for_ctx * tpg))
-
-
-def get_kv_cache_quant_factor(cache_type: str) -> float:
-    """Get memory reduction factor for KV cache quantization."""
-    return KV_CACHE_QUANT_FACTORS.get(cache_type, 1.0)
-
-
-@lru_cache(maxsize=512)
-def calculate_kv_cache_size(
-    ctx_size: int,
-    parallel: int,
-    layer_count: int,
-    embedding_length: int,
-    attention_head_count: int,
-    attention_head_count_kv: int,
-    cache_type_k: str,
-    cache_type_v: Optional[str] = None,
-    usage_mode: str = "single_user",
-) -> int:
-    """
-    Calculate KV cache size in bytes for memory estimation.
-
-    Cached with LRU to avoid redundant calculations for same parameters.
-    Cache size increased to 512 for production workloads.
-
-    Returns:
-        Total KV cache bytes
-    """
-    # Get quantization factors
-    quant_factor_k = get_kv_cache_quant_factor(cache_type_k)
-    quant_factor_v = (
-        get_kv_cache_quant_factor(cache_type_v) if cache_type_v else quant_factor_k
-    )
-
-    if embedding_length > 0 and layer_count > 0:
-        # Calculate bytes per element for K and V cache
-        bytes_per_k = (
-            quant_factor_k * 4
-        )  # Convert factor to actual bytes (f32=4, f16=2, etc.)
-        bytes_per_v = quant_factor_v * 4 if cache_type_v else bytes_per_k
-
-        # GQA-aware KV cache calculation (correct formula from theoretical model)
-        # M_kv = n_ctx × N_layers × N_head_kv × d_head × (p_a_k + p_a_v)
-        # where d_head = N_embd / N_head
-        if attention_head_count_kv > 0 and attention_head_count > 0:
-            # Dimension per head
-            d_head = embedding_length / attention_head_count
-            # KV cache stores N_head_kv heads per layer, each of size d_head
-            kv_cache_per_layer_k = attention_head_count_kv * d_head * bytes_per_k
-            kv_cache_per_layer_v = attention_head_count_kv * d_head * bytes_per_v
-        else:
-            # Fallback for non-GQA models (MHA: N_head_kv = N_head)
-            # In this case, use full embedding dimension
-            kv_cache_per_layer_k = embedding_length * bytes_per_k
-            kv_cache_per_layer_v = embedding_length * bytes_per_v
-
-        # Total per token: (Key + Value) * layers
-        kv_cache_per_token = (kv_cache_per_layer_k + kv_cache_per_layer_v) * layer_count
-    else:
-        # Fallback using constants
-        kv_cache_per_token = FALLBACK_KV_CACHE_PER_TOKEN_BYTES
-
-    # KV cache: ctx_size tokens, each with kv_cache_per_token bytes
-    # Parallel might create multiple context copies, so multiply by parallel
-    # Use actual memory (optimization factor is 1.0 - memory mapping doesn't reduce peak usage)
-    base_kv_cache_bytes = int(
-        ctx_size * kv_cache_per_token * parallel * KV_CACHE_OPTIMIZATION_FACTOR
-    )
-
-    # Apply usage mode factor based on theoretical model:
-    # - single_user: Peak estimate (full context accumulates, full KV cache)
-    # - multi_user: Typical usage (context cleared between requests, lower estimate)
-    if usage_mode == "multi_user":
-        usage_factor = KV_CACHE_MULTI_USER_FACTOR
-    else:  # single_user or default
-        usage_factor = KV_CACHE_SINGLE_USER_FACTOR
-
-    kv_cache_bytes = int(base_kv_cache_bytes * usage_factor)
-
-    return kv_cache_bytes
-
-
-def estimate_vram_usage(
-    model: Model,
-    config: Dict[str, Any],
-    gpu_info: Dict[str, Any],
-    metadata: Optional[ModelMetadata] = None,
-    usage_mode: str = "single_user",
-) -> Dict[str, Any]:
-    """Estimate VRAM usage for given configuration using comprehensive model metadata
-
-    Args:
-        model: The model to estimate for
-        config: Configuration dictionary
-        gpu_info: GPU information dictionary
-        metadata: Optional pre-computed ModelMetadata to avoid redundant calls
-    """
-    try:
-        model_size = model.file_size if model.file_size else 0
-
-        # Extract frequently accessed config values early to avoid repeated dict lookups
-        n_gpu_layers = int(config.get("n_gpu_layers", 0) or 0)
-        ctx_size = int(config.get("ctx_size", 4096) or 4096)
-        parallel = max(1, int(config.get("parallel", 1) or 1))
-        cache_type_k = config.get("cache_type_k", "f16")
-        cache_type_v = config.get("cache_type_v")
-
-        # Use provided metadata or fetch it (cached internally)
-        layer_info = metadata if metadata is not None else get_model_metadata(model)
-        total_layers = max(1, layer_info.layer_count or FALLBACK_LAYER_COUNT)
-        embedding_length = layer_info.embedding_length or 0
-        attention_head_count = layer_info.attention_head_count or 0
-        attention_head_count_kv = layer_info.attention_head_count_kv or 0
-
-        # Layer split between GPU and CPU
-        layer_ratio = min(
-            1.0, max(0.0, (n_gpu_layers / total_layers) if total_layers > 0 else 0.0)
-        )
-        model_vram = int(model_size * layer_ratio)
-        model_ram = max(0, int(model_size - model_vram))
-
-        # Use shared KV cache calculation
-        kv_cache_bytes = calculate_kv_cache_size(
-            ctx_size,
-            parallel,
-            total_layers,
-            embedding_length,
-            attention_head_count,
-            attention_head_count_kv,
-            cache_type_k,
-            cache_type_v,
-            usage_mode=usage_mode,
-        )
-
-        # Determine if KV cache goes to VRAM or RAM
-        # According to theoretical model: when n_gpu_layers > 0, M_kv goes to VRAM by default
-        # The "VRAM Trap": in hybrid mode, M_kv and M_compute both go to VRAM
-        if n_gpu_layers > 0:
-            # In GPU mode (including hybrid), KV cache goes to VRAM
-            kv_cache_vram = kv_cache_bytes
-            kv_cache_ram = 0
-        else:
-            # CPU-only mode: KV cache goes to RAM
-            kv_cache_vram = 0
-            kv_cache_ram = kv_cache_bytes
-
-        # M_compute: Fixed overhead + variable scratch buffer
-        # According to theoretical model: M_compute = M_overhead_fixed + M_scratch_variable(n_ubatch)
-        ubatch_size = config.get("ubatch_size", 512)
-        compute_overhead_mb = COMPUTE_FIXED_OVERHEAD_MB + (
-            ubatch_size * COMPUTE_SCRATCH_PER_UBATCH_MB
-        )
-        compute_overhead_bytes = int(compute_overhead_mb * 1024 * 1024)
-
-        # Allocate M_compute to VRAM if GPU layers > 0 (VRAM Trap)
-        if n_gpu_layers > 0:
-            batch_vram = compute_overhead_bytes
-            batch_ram = 0
-        else:
-            batch_vram = 0
-            batch_ram = compute_overhead_bytes
-
-        estimated_vram = model_vram + kv_cache_vram + batch_vram
-        estimated_ram = model_ram + kv_cache_ram + batch_ram
-
-        # System RAM usage snapshot
-        try:
-            vm = psutil.virtual_memory()
-            system_ram_used = int(vm.used)
-            system_ram_total = int(vm.total)
-        except Exception:
-            system_ram_used = 0
-            system_ram_total = 0
-
-        # VRAM headroom check
-        # Extract gpus list once to avoid repeated dict lookup
-        gpus = gpu_info.get("gpus", [])
-        total_free_vram = sum(g.get("memory", {}).get("free", 0) for g in gpus)
-        fits_in_gpu = (n_gpu_layers == 0) or (
-            estimated_vram <= max(0, total_free_vram * VRAM_SAFETY_MARGIN)
-        )
-
-        memory_mode = "ram_only"
-        if n_gpu_layers > 0:
-            if estimated_ram > 0:
-                memory_mode = "mixed"
-            else:
-                memory_mode = "vram_only"
-
-        return {
-            "memory_mode": memory_mode,
-            # VRAM
-            "estimated_vram": estimated_vram,
-            "model_vram": model_vram,
-            "kv_cache_vram": kv_cache_vram,
-            "batch_vram": batch_vram,
-            # RAM
-            "estimated_ram": estimated_ram,
-            "model_ram": model_ram,
-            "kv_cache_ram": kv_cache_ram,
-            "batch_ram": batch_ram,
-            # System RAM snapshot
-            "system_ram_used": system_ram_used,
-            "system_ram_total": system_ram_total,
-            # Fit flag
-            "fits_in_gpu": fits_in_gpu,
-        }
-    except Exception:
-        try:
-            vm = psutil.virtual_memory()
-            system_ram_used = int(vm.used)
-            system_ram_total = int(vm.total)
-        except Exception:
-            system_ram_used = 0
-            system_ram_total = 0
-        return {
-            "memory_mode": "unknown",
-            "estimated_vram": 0,
-            "model_vram": 0,
-            "kv_cache_vram": 0,
-            "batch_vram": 0,
-            "estimated_ram": 0,
-            "model_ram": 0,
-            "kv_cache_ram": 0,
-            "batch_ram": 0,
-            "system_ram_used": system_ram_used,
-            "system_ram_total": system_ram_total,
-            "fits_in_gpu": True,
-        }
-
-
-def estimate_ram_usage(
-    model: Model,
-    config: Dict[str, Any],
-    metadata: Optional[ModelMetadata] = None,
-    usage_mode: str = "single_user",
-) -> Dict[str, Any]:
-    """Estimate RAM usage for given configuration
-
-    Args:
-        model: The model to estimate for
-        config: Configuration dictionary
-        metadata: Optional pre-computed ModelMetadata to avoid redundant calls
-    """
-    try:
-        model_size = model.file_size if model.file_size else 0
-
-        # Extract frequently accessed config values early to avoid repeated dict lookups
-        n_gpu_layers = config.get("n_gpu_layers", 0)
-        ctx_size = config.get("ctx_size", 4096)
-        batch_size = config.get("batch_size", 512)
-        parallel = config.get("parallel", 1)
-        cache_type_k = config.get("cache_type_k", "f16")
-        cache_type_v = config.get("cache_type_v")
-
-        # Get system RAM info (extract once)
-        vm = psutil.virtual_memory()
-        total_memory = vm.total
-        available_memory = vm.available
-
-        # Use provided metadata or fetch it (cached internally)
-        # Use ModelMetadata dataclass attributes directly
-        layer_info = metadata if metadata is not None else get_model_metadata(model)
-        total_layers = layer_info.layer_count or FALLBACK_LAYER_COUNT
-        embedding_length = layer_info.embedding_length or 0
-        attention_head_count = layer_info.attention_head_count or 0
-        attention_head_count_kv = layer_info.attention_head_count_kv or 0
-        is_moe = layer_info.is_moe
-
-        cpu_layers = total_layers - n_gpu_layers if n_gpu_layers > 0 else total_layers
-
-        if n_gpu_layers > 0:
-            # GPU layers: full model loaded in RAM for GPU transfer
-            model_ram = model_size
-        else:
-            # CPU-only: only CPU layers in RAM
-            layer_ratio = cpu_layers / total_layers if cpu_layers > 0 else 1
-            model_ram = int(model_size * layer_ratio)
-
-        # Enhanced KV cache estimation using model architecture
-        # Use shared KV cache calculation
-        kv_cache_ram = calculate_kv_cache_size(
-            ctx_size,
-            parallel,
-            total_layers,
-            embedding_length,
-            attention_head_count,
-            attention_head_count_kv,
-            cache_type_k,
-            cache_type_v,
-            usage_mode=usage_mode,
-        )
-
-        # MoE models with CPU offloading use RAM for offloaded layers
-        moe_cpu_ram = 0
-        if is_moe and n_gpu_layers > 0:
-            moe_pattern = config.get("moe_offload_custom", "")
-            if moe_pattern:
-                # Estimate RAM usage for offloaded MoE layers
-                if ".*_exps" in moe_pattern:
-                    # All MoE offloaded
-                    moe_cpu_ram = int(model_size * MOE_OFFLOAD_ALL_RATIO)
-                elif "up|down" in moe_pattern:
-                    # Up/Down offloaded
-                    moe_cpu_ram = int(model_size * MOE_OFFLOAD_UP_DOWN_RATIO)
-                elif "_up_" in moe_pattern:
-                    # Only Up offloaded
-                    moe_cpu_ram = int(model_size * MOE_OFFLOAD_UP_RATIO)
-
-        # Batch processing overhead
-        if embedding_length > 0:
-            bytes_per_element = DEFAULT_BYTES_PER_ELEMENT
-
-            # Intermediate activations: batch_size tokens * embedding_length
-            intermediate_ram = int(
-                batch_size
-                * embedding_length
-                * bytes_per_element
-                * BATCH_INTERMEDIATE_FACTOR
-            )
-
-            # QKV projections are also temporary and reused
-            qkv_ram = int(
-                batch_size * 3 * embedding_length * bytes_per_element * BATCH_QKV_FACTOR
-            )
-
-            # Additional buffers are minimal and reused
-            computation_overhead = batch_size * BATCH_COMPUTATION_OVERHEAD_KB * 1024
-
-            batch_ram = intermediate_ram + qkv_ram + computation_overhead
-        else:
-            # Fallback: reduced estimate based on actual usage
-            batch_ram = batch_size * int(BATCH_FALLBACK_MB * 1024 * 1024)
-
-        # Additional overhead for llama.cpp
-        llama_overhead = LLAMA_CPP_OVERHEAD_MB * 1024 * 1024
-
-        total_ram = model_ram + kv_cache_ram + batch_ram + llama_overhead + moe_cpu_ram
-
-        # Check if fits in available RAM
-        fits_in_ram = total_ram <= available_memory
-
-        # Calculate quantization savings
-        quant_factor_k = get_kv_cache_quant_factor(cache_type_k)
-        quant_factor_v = (
-            get_kv_cache_quant_factor(cache_type_v) if cache_type_v else quant_factor_k
-        )
-
-        # Calculate raw KV cache size (for savings calculation) using correct GQA-aware formula
-        if embedding_length > 0 and total_layers > 0:
-            bytes_per_k = quant_factor_k * 4
-            bytes_per_v = quant_factor_v * 4 if cache_type_v else bytes_per_k
-            if attention_head_count_kv > 0 and attention_head_count > 0:
-                # GQA-aware calculation
-                d_head = embedding_length / attention_head_count
-                kv_cache_per_layer_k = attention_head_count_kv * d_head * bytes_per_k
-                kv_cache_per_layer_v = attention_head_count_kv * d_head * bytes_per_v
-            else:
-                # Fallback for non-GQA models
-                kv_cache_per_layer_k = embedding_length * bytes_per_k
-                kv_cache_per_layer_v = embedding_length * bytes_per_v
-            kv_cache_per_token = (
-                kv_cache_per_layer_k + kv_cache_per_layer_v
-            ) * total_layers
-        else:
-            kv_cache_per_token = FALLBACK_KV_CACHE_PER_TOKEN_BYTES
-
-        # Calculate savings (difference between f32 and current quantization)
-        kv_cache_savings = int(
-            ctx_size
-            * parallel
-            * kv_cache_per_token
-            * (
-                1
-                - (
-                    QUANTIZATION_AVERAGE_FACTOR * quant_factor_k
-                    + QUANTIZATION_AVERAGE_FACTOR * quant_factor_v
-                )
-            )
-        )
-
-        return {
-            "estimated_ram": total_ram,
-            "model_ram": model_ram,
-            "kv_cache_ram": kv_cache_ram,
-            "batch_ram": batch_ram,
-            "moe_cpu_ram": moe_cpu_ram,
-            "llama_overhead": llama_overhead,
-            "fits_in_ram": fits_in_ram,
-            "available_ram": available_memory,
-            "total_ram": total_memory,
-            "utilization_percent": (
-                (total_ram / total_memory * 100) if total_memory > 0 else 0
-            ),
-            "kv_cache_savings": kv_cache_savings,
-        }
-
-    except Exception as e:
-        return {"error": str(e), "estimated_ram": 0, "fits_in_ram": False}
diff --git a/backend/smart_auto/model_metadata.py b/backend/smart_auto/model_metadata.py
deleted file mode 100644
index 925dcc3..0000000
--- a/backend/smart_auto/model_metadata.py
+++ /dev/null
@@ -1,106 +0,0 @@
-from typing import Dict, Any
-import os
-from functools import lru_cache
-
-from backend.logging_config import get_logger
-from backend.gguf_reader import get_model_layer_info
-from .architecture_config import resolve_architecture
-from .models import ModelMetadata
-
-logger = get_logger(__name__)
-
-
-@lru_cache(maxsize=256)
-def _get_layer_info_from_file(file_path: str, mtime: float) -> Dict[str, Any]:
-    """
-    Get layer info from GGUF file with LRU caching.
-    Uses mtime as part of cache key for invalidation.
-    """
-    try:
-        return get_model_layer_info(file_path) or {}
-    except Exception as e:
-        logger.warning(f"Failed to read layer info from {file_path}: {e}")
-        return {}
-
-
-@lru_cache(maxsize=64)
-def _estimate_layer_count_cached(model_name: str) -> int:
-    """Cached version of layer count estimation from model name."""
-    if "7b" in model_name or "7B" in model_name:
-        return 32
-    elif "3b" in model_name or "3B" in model_name:
-        return 28
-    elif "1b" in model_name or "1B" in model_name:
-        return 22
-    elif "13b" in model_name or "13B" in model_name:
-        return 40
-    elif "30b" in model_name or "30B" in model_name:
-        return 60
-    elif "65b" in model_name or "65B" in model_name:
-        return 80
-    else:
-        return 32  # Default fallback
-
-
-def get_model_metadata(model) -> ModelMetadata:
-    """
-    Get comprehensive model metadata with caching.
-
-    Uses LRU cache with mtime-based invalidation to prevent redundant file I/O.
-    This is the single source of truth for model layer information.
-    """
-    # Default metadata structure
-    meta: Dict[str, Any] = {
-        "layer_count": 32,
-        "architecture": "unknown",
-        "context_length": 0,
-        "vocab_size": 0,
-        "embedding_length": 0,
-        "attention_head_count": 0,
-        "attention_head_count_kv": 0,
-        "block_count": 0,
-        "is_moe": False,
-        "expert_count": 0,
-        "experts_used_count": 0,
-    }
-
-    try:
-        if model.file_path and os.path.exists(model.file_path):
-            # Use LRU cache with mtime-based invalidation
-            mtime = os.path.getmtime(model.file_path)
-            layer_info = _get_layer_info_from_file(model.file_path, mtime)
-            if layer_info:
-                meta.update(layer_info)
-
-            # Resolve architecture from GGUF metadata
-            raw_architecture = meta.get("architecture", "")
-            normalized = resolve_architecture(raw_architecture)
-            meta["architecture"] = normalized
-
-            if (
-                normalized not in ("unknown", "generic")
-                and raw_architecture != normalized
-            ):
-                logger.debug(
-                    f"Resolved architecture: '{raw_architecture}' -> '{normalized}'"
-                )
-    except Exception as e:
-        logger.warning(
-            f"Failed to read GGUF metadata for model {getattr(model, 'id', 'unknown')}: {e}"
-        )
-
-    # Fallback to name-based detection if architecture is still unknown
-    current_arch = meta.get("architecture", "").strip()
-    if not current_arch or current_arch == "unknown":
-        detected = resolve_architecture(getattr(model, "name", ""))
-        meta["architecture"] = detected
-        if detected not in ("unknown", "generic"):
-            logger.debug(f"Detected architecture from model name: '{detected}'")
-
-    # Fallback to name-based layer count estimation if needed
-    if meta.get("layer_count", 0) == 32 and current_arch == "unknown":
-        model_name = getattr(model, "name", "").lower()
-        meta["layer_count"] = _estimate_layer_count_cached(model_name)
-
-    # Return as ModelMetadata dataclass
-    return ModelMetadata.from_dict(meta)
diff --git a/backend/smart_auto/models.py b/backend/smart_auto/models.py
deleted file mode 100644
index d3dec12..0000000
--- a/backend/smart_auto/models.py
+++ /dev/null
@@ -1,216 +0,0 @@
-"""
-Data models for smart_auto module.
-Provides type-safe data classes to replace dictionary passing throughout the module.
-"""
-
-from dataclasses import dataclass, field
-from typing import Dict, List, Optional, Any, Tuple
-
-
-@dataclass
-class ModelMetadata:
-    """Comprehensive model metadata extracted from GGUF file or name."""
-
-    layer_count: int
-    architecture: str
-    context_length: int
-    vocab_size: int
-    embedding_length: int
-    attention_head_count: int
-    attention_head_count_kv: int
-    block_count: int = 0
-    is_moe: bool = False
-    expert_count: int = 0
-    experts_used_count: int = 0
-    parameter_count: Optional[str] = None  # Formatted as "32B", "36B", etc.
-
-    @classmethod
-    def from_dict(cls, data: Dict[str, Any]) -> "ModelMetadata":
-        """Create ModelMetadata from a dictionary (e.g., from get_model_metadata result)."""
-        return cls(
-            layer_count=data.get("layer_count", 32),
-            architecture=data.get("architecture", "unknown"),
-            context_length=data.get("context_length", 0),
-            vocab_size=data.get("vocab_size", 0),
-            embedding_length=data.get("embedding_length", 0),
-            attention_head_count=data.get("attention_head_count", 0),
-            attention_head_count_kv=data.get("attention_head_count_kv", 0),
-            block_count=data.get("block_count", 0),
-            is_moe=data.get("is_moe", False),
-            expert_count=data.get("expert_count", 0),
-            experts_used_count=data.get("experts_used_count", 0),
-            parameter_count=data.get("parameter_count"),
-        )
-
-    def to_dict(self) -> Dict[str, Any]:
-        """Convert to dictionary for backward compatibility."""
-        return {
-            "layer_count": self.layer_count,
-            "architecture": self.architecture,
-            "context_length": self.context_length,
-            "vocab_size": self.vocab_size,
-            "embedding_length": self.embedding_length,
-            "attention_head_count": self.attention_head_count,
-            "attention_head_count_kv": self.attention_head_count_kv,
-            "block_count": self.block_count,
-            "is_moe": self.is_moe,
-            "expert_count": self.expert_count,
-            "experts_used_count": self.experts_used_count,
-            "parameter_count": self.parameter_count,
-        }
-
-
-@dataclass
-class SystemResources:
-    """System resources information."""
-
-    gpus: List[Dict[str, Any]]
-    total_vram: int
-    available_vram_gb: float
-    gpu_count: int
-    nvlink_topology: Dict[str, Any]
-    cpu_cores: int
-    cpu_memory_gb: Tuple[float, float, float]  # total, used, available
-    flash_attn_available: bool = False
-    compute_capabilities: List[float] = field(
-        default_factory=list
-    )  # Pre-parsed compute capabilities
-
-    @classmethod
-    def from_gpu_info(
-        cls,
-        gpu_info: Dict[str, Any],
-        cpu_memory: Tuple[float, float, float],
-        cpu_cores: int,
-        flash_attn_available: bool = False,
-    ) -> "SystemResources":
-        """Create SystemResources from gpu_info and system data."""
-        gpus = gpu_info.get("gpus", [])
-
-        # Pre-parse compute capabilities to avoid repeated string parsing
-        compute_capabilities = []
-        for gpu in gpus:
-            cc_str = gpu.get("compute_capability", "0.0")
-            try:
-                parts = str(cc_str).split(".")
-                major = int(parts[0]) if parts and parts[0].isdigit() else 0
-                minor = int(parts[1]) if len(parts) > 1 and parts[1].isdigit() else 0
-                compute_capabilities.append(major + minor / 10.0)
-            except Exception:
-                compute_capabilities.append(0.0)
-
-        return cls(
-            gpus=gpus,
-            total_vram=gpu_info.get("total_vram", 0),
-            available_vram_gb=(
-                sum(gpu.get("memory", {}).get("free", 0) for gpu in gpus) / (1024**3)
-                if gpus
-                else 0.0
-            ),
-            gpu_count=gpu_info.get("device_count", 0),
-            nvlink_topology=gpu_info.get("nvlink_topology", {}),
-            cpu_cores=cpu_cores,
-            cpu_memory_gb=cpu_memory,
-            flash_attn_available=flash_attn_available,
-            compute_capabilities=compute_capabilities,
-        )
-
-
-@dataclass
-class GenerationConfig:
-    """Complete generation configuration with type-safe fields."""
-
-    # GPU configuration
-    n_gpu_layers: int = 0
-    main_gpu: int = 0
-    tensor_split: str = ""
-    flash_attn: bool = False
-
-    # Memory and context
-    ctx_size: int = 4096
-    batch_size: int = 512
-    ubatch_size: int = 256
-    parallel: int = 1
-
-    # CPU configuration
-    threads: int = 4
-    threads_batch: int = 4
-
-    # Memory optimization
-    no_mmap: bool = False
-    mlock: bool = False
-    low_vram: bool = False
-    logits_all: bool = False
-    cont_batching: bool = True
-    no_kv_offload: bool = False
-
-    # Generation parameters
-    temperature: float = 0.8
-    temp: float = 0.8
-    top_p: float = 0.9
-    top_k: int = 40
-    typical_p: float = 1.0
-    min_p: float = 0.0
-    tfs_z: float = 1.0
-    repeat_penalty: float = 1.1
-    presence_penalty: float = 0.0
-    frequency_penalty: float = 0.0
-    mirostat: int = 0
-    mirostat_tau: float = 5.0
-    mirostat_eta: float = 0.1
-    n_predict: int = -1
-    stop: List[str] = field(default_factory=list)
-    seed: int = -1
-
-    # KV cache optimization
-    cache_type_k: str = "f16"
-    cache_type_v: Optional[str] = None
-
-    # Architecture-specific
-    rope_freq_base: Optional[float] = None
-    rope_freq_scale: Optional[float] = None
-    rope_scaling: str = ""
-    yarn_ext_factor: float = 1.0
-    yarn_attn_factor: float = 1.0
-
-    # MoE configuration
-    moe_offload_pattern: str = "none"
-    moe_offload_custom: str = ""
-
-    # Special flags
-    embedding: bool = False
-    jinja: bool = False
-
-    # Server parameters
-    host: str = "0.0.0.0"
-    port: int = 0
-    timeout: int = 300
-
-    # Additional fields for backward compatibility
-    yaml: str = ""
-    customArgs: str = ""
-
-    def to_dict(self) -> Dict[str, Any]:
-        """Convert to dictionary for llama-swap integration."""
-        result = {}
-        for key, value in self.__dict__.items():
-            # Skip None values to keep config clean
-            if value is not None and value != []:
-                result[key] = value
-        return result
-
-    @classmethod
-    def from_dict(cls, data: Dict[str, Any]) -> "GenerationConfig":
-        """Create GenerationConfig from dictionary."""
-        # Filter out only fields that exist in the dataclass
-        field_names = {f.name for f in cls.__dataclass_fields__.values()}
-        filtered_data = {k: v for k, v in data.items() if k in field_names}
-        return cls(**filtered_data)
-
-    def update(self, updates: Dict[str, Any]) -> "GenerationConfig":
-        """Create a new config with updates applied."""
-        new_dict = self.to_dict()
-        new_dict.update(updates)
-        # Filter out None values and empty lists
-        new_dict = {k: v for k, v in new_dict.items() if v is not None and v != []}
-        return self.from_dict(new_dict)
diff --git a/backend/smart_auto/moe_handler.py b/backend/smart_auto/moe_handler.py
deleted file mode 100644
index fa44757..0000000
--- a/backend/smart_auto/moe_handler.py
+++ /dev/null
@@ -1,98 +0,0 @@
-"""
-MoE (Mixture of Experts) handling module.
-Handles MoE model offloading patterns and architecture-specific flags.
-"""
-
-from typing import Dict, Any, Tuple
-from backend.logging_config import get_logger
-from .constants import VRAM_RATIO_VERY_TIGHT, VRAM_RATIO_TIGHT, VRAM_RATIO_MODERATE
-
-logger = get_logger(__name__)
-
-
-# MoE offload strategies: (vram_ratio_threshold, pattern)
-MOE_OFFLOAD_STRATEGIES: list[Tuple[float, str]] = [
-    (VRAM_RATIO_VERY_TIGHT, ".ffn_.*_exps.=CPU"),  # Very tight: all MoE offloaded
-    (VRAM_RATIO_TIGHT, ".ffn_(up|down)_exps.=CPU"),  # Tight: up/down offloaded
-    (VRAM_RATIO_MODERATE, ".ffn_(up)_exps.=CPU"),  # Moderate: only up offloaded
-    (float("inf"), ""),  # Ample: no offloading
-]
-
-
-def generate_moe_offload_pattern(
-    architecture: str,
-    available_vram_gb: float,
-    model_size_mb: float,
-    is_moe: bool = False,
-    expert_count: int = 0,
-) -> str:
-    """Generate optimal MoE offloading pattern based on VRAM availability
-
-    Returns regex pattern for the -ot (offload type) parameter to control MoE layer placement
-    """
-    if not is_moe or expert_count == 0:
-        return ""  # No MoE offloading for non-MoE models
-
-    model_size_gb = model_size_mb / 1024
-
-    # Calculate VRAM pressure
-    vram_ratio = available_vram_gb / model_size_gb if model_size_gb > 0 else 1.0
-
-    # Find the appropriate strategy based on VRAM ratio
-    for threshold, pattern in MOE_OFFLOAD_STRATEGIES:
-        if vram_ratio < threshold:
-            return pattern
-
-    return ""  # Fallback: no offloading needed
-
-
-def needs_jinja_template(architecture: str, layer_info: Dict[str, Any]) -> bool:
-    """Determine if architecture requires jinja template."""
-    # GLM architectures always need jinja
-    if architecture in ["glm", "glm4"]:
-        return True
-    # Qwen3 coder variants need jinja
-    if architecture == "qwen3":
-        arch_str = layer_info.get("architecture", "").lower()
-        if "coder" in arch_str:
-            return True
-    return False
-
-
-def get_architecture_specific_flags(
-    architecture: str, layer_info: Dict[str, Any]
-) -> Dict[str, Any]:
-    """Get architecture-specific flags and settings.
-
-    Returns dict with flags like jinja, moe_offload_custom, etc.
-    """
-    flags = {"jinja": False, "moe_offload_custom": ""}
-
-    # Check jinja requirement
-    if needs_jinja_template(architecture, layer_info):
-        flags["jinja"] = True
-        logger.info(f"{architecture} architecture detected - enabling jinja template")
-
-    # Generate MoE offloading pattern if applicable
-    is_moe = layer_info.get("is_moe", False)
-    available_vram_gb = layer_info.get("available_vram_gb", 0)
-
-    if not is_moe or available_vram_gb == 0:
-        if is_moe and available_vram_gb == 0:
-            logger.debug(
-                "MoE model detected but no GPU available - MoE layers will run on CPU"
-            )
-        return flags
-
-    # Generate MoE offload pattern for GPU mode
-    expert_count = layer_info.get("expert_count", 0)
-    model_size_mb = layer_info.get("model_size_mb", 0)
-    moe_pattern = generate_moe_offload_pattern(
-        architecture, available_vram_gb, model_size_mb, is_moe, expert_count
-    )
-
-    if moe_pattern:
-        flags["moe_offload_custom"] = moe_pattern
-        logger.debug(f"Generated MoE offload pattern: {moe_pattern}")
-
-    return flags
diff --git a/backend/smart_auto/optimizer.py b/backend/smart_auto/optimizer.py
deleted file mode 100644
index 8622abf..0000000
--- a/backend/smart_auto/optimizer.py
+++ /dev/null
@@ -1,199 +0,0 @@
-"""
-Joint optimization algorithm for llama.cpp configuration.
-
-Implements the theoretical model algorithm from Section III-B that jointly
-optimizes (n_ngl, n_ctx, ubatch_size) given VRAM and RAM constraints.
-
-Prioritizes "Full Offload" (Max_Speed) regime before falling back to "Hybrid" mode.
-"""
-
-from typing import Dict, Any, Optional, Tuple
-from .memory_estimator import calculate_kv_cache_size
-from .constants import (
-    COMPUTE_FIXED_OVERHEAD_MB,
-    COMPUTE_SCRATCH_PER_UBATCH_MB,
-    MIN_CONTEXT_SIZE,
-    MIN_BATCH_SIZE,
-)
-
-
-def find_optimal_config(
-    model_size_bytes: int,
-    total_layers: int,
-    embedding_length: int,
-    attention_head_count: int,
-    attention_head_count_kv: int,
-    available_vram_bytes: int,
-    available_ram_bytes: int,
-    cache_type_k: str = "f16",
-    cache_type_v: Optional[str] = None,
-    ubatch_size: int = 512,
-    desired_performance: str = "Max_Speed",
-    min_context_size: int = MIN_CONTEXT_SIZE,
-) -> Dict[str, Any]:
-    """
-    Find optimal configuration using joint optimization algorithm.
-
-    Implements the theoretical model algorithm that:
-    1. Prioritizes "Full Offload" (Max_Speed) regime first
-    2. Falls back to "Hybrid" mode if full offload fails
-    3. Maximizes context length (n_ctx) given VRAM constraint
-
-    Args:
-        model_size_bytes: Total model size in bytes (GGUF file size)
-        total_layers: Total number of layers (N_layers)
-        embedding_length: Hidden embedding dimension (N_embd)
-        attention_head_count: Number of attention heads (N_head)
-        attention_head_count_kv: Number of KV attention heads (N_head_kv)
-        available_vram_bytes: Available VRAM in bytes
-        available_ram_bytes: Available RAM in bytes
-        cache_type_k: K cache quantization type
-        cache_type_v: V cache quantization type (default: same as cache_type_k)
-        ubatch_size: Micro-batch size for M_compute calculation
-        desired_performance: 'Max_Speed' or 'Max_Context'
-        min_context_size: Minimum acceptable context size
-
-    Returns:
-        Dictionary with:
-            - mode: "Full_Offload", "Hybrid_Mode", "Full_Offload_Failed", or "Insufficient_Memory"
-            - n_ngl_best: Optimal number of GPU layers
-            - n_ctx_best: Optimal context size
-            - cache_type_k: Selected K cache quantization
-            - cache_type_v: Selected V cache quantization
-            - ubatch_size: Optimal micro-batch size
-    """
-    # Step 1: Calculate model constants
-    N_layers = total_layers
-    M_weights_total = model_size_bytes
-
-    # Calculate KV cache cost per token (C_kv_per_token)
-    # Using GQA-aware formula: M_kv = n_ctx × N_layers × N_head_kv × d_head × (p_a_k + p_a_v)
-    if embedding_length > 0 and attention_head_count > 0:
-        from .constants import KV_CACHE_QUANT_FACTORS
-
-        quant_factor_k = KV_CACHE_QUANT_FACTORS.get(cache_type_k, 0.5)
-        quant_factor_v = KV_CACHE_QUANT_FACTORS.get(
-            cache_type_v or cache_type_k, quant_factor_k
-        )
-        bytes_per_k = quant_factor_k * 4
-        bytes_per_v = quant_factor_v * 4
-
-        if attention_head_count_kv > 0:
-            d_head = embedding_length / attention_head_count
-            kv_cache_per_layer_k = attention_head_count_kv * d_head * bytes_per_k
-            kv_cache_per_layer_v = attention_head_count_kv * d_head * bytes_per_v
-        else:
-            # Fallback for non-GQA models
-            kv_cache_per_layer_k = embedding_length * bytes_per_k
-            kv_cache_per_layer_v = embedding_length * bytes_per_v
-
-        C_kv_per_token = (kv_cache_per_layer_k + kv_cache_per_layer_v) * N_layers
-    else:
-        # Fallback: conservative estimate
-        C_kv_per_token = 1024  # ~1KB per token fallback
-
-    # Calculate M_compute
-    M_compute_bytes = int(
-        (COMPUTE_FIXED_OVERHEAD_MB + (ubatch_size * COMPUTE_SCRATCH_PER_UBATCH_MB))
-        * (1024**2)
-    )
-
-    # Step 2: Try "Full Offload" (Max_Speed) regime first
-    M_weights_vram_full = M_weights_total  # n_ngl = N_layers
-    M_weights_ram_full = 0
-
-    VRAM_fixed_cost_full = M_weights_vram_full + M_compute_bytes
-
-    if (
-        VRAM_fixed_cost_full < available_vram_bytes
-        and M_weights_ram_full < available_ram_bytes
-    ):
-        # Model fits in VRAM. Calculate max context size.
-        VRAM_remaining_for_kv = available_vram_bytes - VRAM_fixed_cost_full
-
-        if C_kv_per_token > 0:
-            n_ctx_candidate = VRAM_remaining_for_kv // C_kv_per_token
-        else:
-            n_ctx_candidate = 0
-
-        if n_ctx_candidate >= min_context_size:
-            return {
-                "mode": "Full_Offload (Max_Speed)",
-                "n_ngl_best": N_layers,
-                "n_ctx_best": n_ctx_candidate,
-                "cache_type_k": cache_type_k,
-                "cache_type_v": cache_type_v,
-                "ubatch_size": ubatch_size,
-            }
-
-    # If full offload failed and user wants Max_Speed only, return failure
-    if desired_performance == "Max_Speed":
-        return {
-            "mode": "Full_Offload_Failed",
-            "n_ngl_best": 0,
-            "n_ctx_best": 0,
-            "cache_type_k": cache_type_k,
-            "cache_type_v": cache_type_v,
-            "ubatch_size": ubatch_size,
-        }
-
-    # Step 3: Try "Hybrid" (Max_Context) regime
-    # Find minimum n_ngl required to fit remaining weights in RAM
-    if M_weights_total > available_ram_bytes:
-        n_ngl_min = max(
-            1, int((M_weights_total - available_ram_bytes) * N_layers / M_weights_total)
-        )
-    else:
-        n_ngl_min = 0
-
-    n_ngl_best = 0
-    n_ctx_best = 0
-
-    # Iterate from full offload down to minimum
-    for n_ngl_candidate in range(N_layers, n_ngl_min - 1, -1):
-        if n_ngl_candidate <= 0:
-            continue
-
-        M_weights_vram_hybrid = (n_ngl_candidate / N_layers) * M_weights_total
-        M_weights_ram_hybrid = M_weights_total - M_weights_vram_hybrid
-        VRAM_fixed_cost_hybrid = M_weights_vram_hybrid + M_compute_bytes
-
-        # Check if this n_ngl is possible (weights + compute must fit in VRAM)
-        if VRAM_fixed_cost_hybrid >= available_vram_bytes:
-            continue  # This n_ngl is too high
-
-        # Check if remaining weights fit in RAM
-        if M_weights_ram_hybrid > available_ram_bytes:
-            continue  # This n_ngl requires too much RAM
-
-        # Calculate max context for this n_ngl
-        VRAM_remaining_for_kv = available_vram_bytes - VRAM_fixed_cost_hybrid
-
-        if C_kv_per_token > 0:
-            n_ctx_candidate = VRAM_remaining_for_kv // C_kv_per_token
-        else:
-            n_ctx_candidate = 0
-
-        # We're looking for the combination that yields the highest n_ctx
-        if n_ctx_candidate > n_ctx_best:
-            n_ctx_best = n_ctx_candidate
-            n_ngl_best = n_ngl_candidate
-
-    if n_ctx_best >= min_context_size:
-        return {
-            "mode": "Hybrid_Mode (Max_Context)",
-            "n_ngl_best": n_ngl_best,
-            "n_ctx_best": n_ctx_best,
-            "cache_type_k": cache_type_k,
-            "cache_type_v": cache_type_v,
-            "ubatch_size": ubatch_size,
-        }
-    else:
-        return {
-            "mode": "Insufficient_Memory",
-            "n_ngl_best": 0,
-            "n_ctx_best": 0,
-            "cache_type_k": cache_type_k,
-            "cache_type_v": cache_type_v,
-            "ubatch_size": ubatch_size,
-        }
diff --git a/backend/smart_auto/recommendations.py b/backend/smart_auto/recommendations.py
deleted file mode 100644
index 3e010fd..0000000
--- a/backend/smart_auto/recommendations.py
+++ /dev/null
@@ -1,372 +0,0 @@
-"""
-Recommendation engine for model configuration parameters.
-Uses smart_auto logic with balanced preset (speed_quality=50, conversational).
-"""
-
-from typing import Dict, Any, Optional
-import asyncio
-from backend.database import Model
-from .architecture_config import resolve_architecture
-
-
-def _create_minimal_model(
-    layer_info: Dict[str, Any], model_name: str = "", file_path: Optional[str] = None
-) -> Model:
-    """Create a minimal Model object from layer info for smart_auto."""
-    model = Model()
-    model.name = model_name or "Unknown"
-    model.file_path = (
-        file_path  # Use provided file_path if available for metadata reading
-    )
-    model.file_size = 0
-    model.huggingface_id = model_name
-    return model
-
-
-def _create_minimal_gpu_info() -> Dict[str, Any]:
-    """Create minimal GPU info for smart_auto (assumes GPU available but will work without)."""
-    return {
-        "gpus": [],
-        "total_vram": 0,
-        "available_vram": 0,
-        "compute_capabilities": [],
-        "nvlink_topology": None,
-    }
-
-
-def _extract_recommendation_from_config(
-    config: Dict[str, Any],
-    key: str,
-    layer_info: Dict[str, Any],
-    recommendation_type: str,
-) -> Dict[str, Any]:
-    """Extract recommendation structure from generated config value."""
-
-    if recommendation_type == "gpu_layers":
-        layer_count = layer_info.get("layer_count", 32)
-        value = config.get("n_gpu_layers", layer_count)
-        # Clamp value to max
-        value = min(value, layer_count)
-        return {
-            "recommended_value": value,
-            "description": f"Recommended {value} layers"
-            + (" (full offload)" if value == layer_count else ""),
-            "balanced_value": layer_count // 2 if layer_count > 0 else 0,
-            "balanced_description": f"{layer_count // 2 if layer_count > 0 else 0} layers (balanced)",
-            "min": 0,
-            "max": layer_count,
-            "ranges": [
-                {"value": 0, "description": "CPU-only mode (slowest, lowest VRAM)"},
-                {
-                    "value": layer_count // 2 if layer_count > 0 else 0,
-                    "description": "Balanced (good performance, moderate VRAM)",
-                },
-                {
-                    "value": layer_count,
-                    "description": "Full offload (fastest, highest VRAM)",
-                },
-            ],
-        }
-
-    elif recommendation_type == "context_size":
-        context_length = layer_info.get("context_length", 131072)
-        value = config.get("ctx_size", context_length)
-        # Clamp value to max
-        value = min(value, context_length)
-        return {
-            "recommended_value": value,
-            "description": f"Recommended {value:,} tokens",
-            "min": 512,
-            "max": context_length,
-            "ranges": [
-                {"min": 512, "max": 2048, "description": "Short conversations"},
-                {"min": 4096, "max": 8192, "description": "Standard conversations"},
-                {
-                    "min": 16384,
-                    "max": context_length,
-                    "description": f"Long documents (max {context_length:,})",
-                },
-            ],
-        }
-
-    elif recommendation_type == "batch_size":
-        value = config.get("batch_size", 512)
-        # Calculate max based on attention heads, clamp to reasonable range
-        attention_heads = layer_info.get("attention_head_count", 32)
-        max_val = min(1024, max(512, attention_heads * 16))
-        # Clamp value to max
-        value = min(value, max_val)
-        return {
-            "recommended_value": value,
-            "description": f"Recommended {value}",
-            "min": 1,
-            "max": max_val,
-            "ranges": [
-                {"min": 1, "max": 128, "description": "Low memory usage"},
-                {"min": 256, "max": 512, "description": "Balanced (recommended)"},
-                {"min": max_val, "max": max_val, "description": "Maximum throughput"},
-            ],
-        }
-
-    elif recommendation_type == "temperature":
-        value = config.get("temperature", config.get("temp", 0.8))
-        # Clamp value to max
-        value = min(value, 2.0)
-        arch = layer_info.get("architecture", "").lower()
-        recommended_str = f"{value:.1f} for balanced conversation"
-
-        if "glm" in arch or "deepseek" in arch:
-            recommended_str = f"{value:.1f} for GLM/DeepSeek models"
-        elif "qwen" in arch:
-            recommended_str = f"{value:.1f} for Qwen models"
-        elif "codellama" in arch:
-            recommended_str = f"{value:.1f} for code generation"
-
-        return {
-            "recommended_value": value,
-            "description": recommended_str,
-            "min": 0.0,
-            "max": 2.0,
-            "ranges": [
-                {
-                    "min": 0.1,
-                    "max": 0.3,
-                    "description": "Code generation, technical tasks",
-                },
-                {
-                    "min": 0.7,
-                    "max": 1.0,
-                    "description": "General conversation (recommended)",
-                },
-                {
-                    "min": 1.5,
-                    "max": 2.0,
-                    "description": "Creative writing, brainstorming",
-                },
-            ],
-        }
-
-    elif recommendation_type == "top_k":
-        value = config.get("top_k", 40)
-        # Clamp value to max
-        value = min(value, 200)
-        arch = layer_info.get("architecture", "").lower()
-        recommended_str = f"{value} for most models"
-
-        if "glm" in arch or "deepseek" in arch:
-            recommended_str = f"{value} for GLM/DeepSeek models"
-
-        return {
-            "recommended_value": value,
-            "description": recommended_str,
-            "min": 0,
-            "max": 200,
-            "ranges": [
-                {"min": 10, "max": 30, "description": "Focused, code-like outputs"},
-                {"min": 40, "max": 50, "description": "Balanced (recommended)"},
-                {
-                    "min": 100,
-                    "max": 200,
-                    "description": "High diversity, creative writing",
-                },
-            ],
-        }
-
-    elif recommendation_type == "top_p":
-        value = config.get("top_p", 0.9)
-        # Clamp value to max
-        value = min(value, 1.0)
-        arch = layer_info.get("architecture", "").lower()
-        recommended_str = f"{value:.2f} for most models"
-
-        if "glm" in arch or "deepseek" in arch:
-            recommended_str = f"{value:.2f} for GLM/DeepSeek models"
-        elif "qwen" in arch:
-            recommended_str = f"{value:.2f} for Qwen models"
-
-        return {
-            "recommended_value": value,
-            "description": recommended_str,
-            "min": 0.0,
-            "max": 1.0,
-            "ranges": [
-                {"min": 0.7, "max": 0.8, "description": "More conservative"},
-                {"min": 0.9, "max": 0.95, "description": "Balanced (recommended)"},
-                {"min": 0.95, "max": 1.0, "description": "Higher diversity"},
-            ],
-        }
-
-    elif recommendation_type == "parallel":
-        value = config.get("parallel", 1)
-        # Clamp value to max
-        value = min(value, 8)
-        attention_heads = layer_info.get("attention_head_count", 32)
-        return {
-            "recommended_value": value,
-            "description": f"Recommended {value} based on {attention_heads} attention heads",
-            "min": 1,
-            "max": 8,
-        }
-
-    # Fallback
-    return {
-        "recommended_value": config.get(key, 0),
-        "description": f"Recommended {config.get(key, 0)}",
-        "min": 0,
-        "max": 100,
-    }
-
-
-async def _generate_balanced_config(
-    model_layer_info: Dict[str, Any],
-    model_name: str = "",
-    file_path: Optional[str] = None,
-) -> Dict[str, Any]:
-    """Generate balanced configuration using smart_auto with speed_quality=50 and conversational preset."""
-    from backend.smart_auto import SmartAutoConfig
-
-    # Create minimal model object with file_path if available
-    model = _create_minimal_model(model_layer_info, model_name, file_path)
-
-    # Create minimal GPU info (will work for CPU-only too)
-    gpu_info = _create_minimal_gpu_info()
-
-    # Create smart_auto config generator
-    smart_config = SmartAutoConfig()
-
-    # Generate config with balanced settings:
-    # - speed_quality=50 (balanced between speed and quality)
-    # - preset="conversational" (balanced preset)
-    # - usage_mode="single_user" (standard usage)
-    config = await smart_config.generate_config(
-        model=model,
-        gpu_info=gpu_info,
-        preset="conversational",  # Balanced preset
-        usage_mode="single_user",
-        speed_quality=50,  # Balanced (50 = equal speed/quality)
-        use_case=None,
-        debug=None,
-    )
-
-    return config
-
-
-async def get_model_recommendations(
-    model_layer_info: Dict[str, Any],
-    model_name: str = "",
-    file_path: Optional[str] = None,
-) -> Dict[str, Any]:
-    """
-    Get all recommendations using smart_auto logic with balanced preset.
-
-    Uses smart_auto's generate_config with:
-    - speed_quality=50 (balanced)
-    - preset="conversational" (balanced preset)
-    - usage_mode="single_user"
-
-    Args:
-        model_layer_info: Layer information dict from GGUF metadata
-        model_name: Optional model name for fallback
-        file_path: Optional file path for metadata reading
-
-    Returns:
-        Dict with all recommendations extracted from smart_auto generated config
-    """
-    try:
-        # Generate balanced config using smart_auto
-        config = await _generate_balanced_config(
-            model_layer_info, model_name, file_path
-        )
-
-        # Extract recommendations from generated config
-        return {
-            "gpu_layers": _extract_recommendation_from_config(
-                config, "n_gpu_layers", model_layer_info, "gpu_layers"
-            ),
-            "context_size": _extract_recommendation_from_config(
-                config, "ctx_size", model_layer_info, "context_size"
-            ),
-            "batch_size": _extract_recommendation_from_config(
-                config, "batch_size", model_layer_info, "batch_size"
-            ),
-            "temperature": _extract_recommendation_from_config(
-                config, "temperature", model_layer_info, "temperature"
-            ),
-            "top_k": _extract_recommendation_from_config(
-                config, "top_k", model_layer_info, "top_k"
-            ),
-            "top_p": _extract_recommendation_from_config(
-                config, "top_p", model_layer_info, "top_p"
-            ),
-            "parallel": _extract_recommendation_from_config(
-                config, "parallel", model_layer_info, "parallel"
-            ),
-        }
-    except Exception as e:
-        # Fallback to basic recommendations if smart_auto fails
-        from backend.logging_config import get_logger
-
-        logger = get_logger(__name__)
-        logger.warning(
-            f"Failed to generate recommendations with smart_auto: {e}. Using fallback."
-        )
-
-        # Return basic fallback recommendations
-        layer_count = model_layer_info.get("layer_count", 32)
-        context_length = model_layer_info.get("context_length", 131072)
-        attention_heads = model_layer_info.get("attention_head_count", 32)
-
-        return {
-            "gpu_layers": {
-                "recommended_value": layer_count,
-                "description": f"Recommended {layer_count} layers (full offload)",
-                "min": 0,
-                "max": layer_count,
-                "ranges": [
-                    {"value": 0, "description": "CPU-only mode"},
-                    {"value": layer_count // 2, "description": "Balanced"},
-                    {"value": layer_count, "description": "Full offload"},
-                ],
-            },
-            "context_size": {
-                "recommended_value": context_length,
-                "description": f"Max {context_length:,} tokens",
-                "min": 512,
-                "max": context_length,
-                "ranges": [],
-            },
-            "batch_size": {
-                "recommended_value": 512,
-                "description": "Recommended 512",
-                "min": 1,
-                "max": 1024,
-                "ranges": [],
-            },
-            "temperature": {
-                "recommended_value": 0.8,
-                "description": "0.8 for balanced conversation",
-                "min": 0.0,
-                "max": 2.0,
-                "ranges": [],
-            },
-            "top_k": {
-                "recommended_value": 40,
-                "description": "40 for most models",
-                "min": 0,
-                "max": 200,
-                "ranges": [],
-            },
-            "top_p": {
-                "recommended_value": 0.9,
-                "description": "0.9 for most models",
-                "min": 0.0,
-                "max": 1.0,
-                "ranges": [],
-            },
-            "parallel": {
-                "recommended_value": min(8, max(1, attention_heads // 4)),
-                "description": f"Recommended based on {attention_heads} attention heads",
-                "min": 1,
-                "max": 8,
-            },
-        }
diff --git a/backend/tests/conftest.py b/backend/tests/conftest.py
new file mode 100644
index 0000000..bd24e8a
--- /dev/null
+++ b/backend/tests/conftest.py
@@ -0,0 +1,8 @@
+"""Pytest configuration and fixtures."""
+import sys
+from pathlib import Path
+
+# Ensure backend is importable when running from repo root
+root = Path(__file__).resolve().parent.parent.parent
+if str(root) not in sys.path:
+    sys.path.insert(0, str(root))
diff --git a/backend/tests/test_app_smoke.py b/backend/tests/test_app_smoke.py
new file mode 100644
index 0000000..c77c4eb
--- /dev/null
+++ b/backend/tests/test_app_smoke.py
@@ -0,0 +1,58 @@
+"""
+Smoke tests to verify the app and key routes work after refactoring.
+Run with: PYTHONPATH=. pytest backend/tests/test_app_smoke.py -v
+(Requires: pip install -r requirements.txt)
+"""
+import pytest
+from fastapi.testclient import TestClient
+
+from backend.main import app
+
+
+@pytest.fixture
+def client():
+    return TestClient(app)
+
+
+def test_app_starts(client):
+    """App should start and respond."""
+    # Root or health-style endpoint may redirect; use API
+    response = client.get("/api/status")
+    assert response.status_code == 200
+    data = response.json()
+    assert "system" in data
+
+
+def test_param_registry_route(client):
+    """Param registry should return basic/advanced params."""
+    response = client.get("/api/models/param-registry")
+    assert response.status_code == 200
+    data = response.json()
+    assert "basic" in data
+    assert "advanced" in data
+    assert isinstance(data["basic"], list)
+    assert isinstance(data["advanced"], list)
+
+
+def test_models_list_route(client):
+    """Models list should return 200 and a list (possibly empty)."""
+    response = client.get("/api/models/")
+    assert response.status_code == 200
+    data = response.json()
+    assert isinstance(data, list)
+
+
+def test_models_list_route_no_trailing_slash(client):
+    """GET /api/models (no trailing slash) should return model list, not param-registry."""
+    response = client.get("/api/models")
+    assert response.status_code == 200
+    data = response.json()
+    assert isinstance(data, list)
+
+
+@pytest.mark.skip(reason="SSE stream never ends; TestClient blocks on full response")
+def test_sse_events_route(client):
+    """SSE events endpoint returns 200 and event-stream content-type."""
+    response = client.get("/api/events")
+    assert response.status_code == 200
+    assert "text/event-stream" in response.headers.get("content-type", "")
diff --git a/backend/tests/test_architecture_profiles.py b/backend/tests/test_architecture_profiles.py
deleted file mode 100644
index a955ce9..0000000
--- a/backend/tests/test_architecture_profiles.py
+++ /dev/null
@@ -1,70 +0,0 @@
-from backend.architecture_profiles import compute_layers_for_architecture
-
-
-def test_glm4moe_profile_uses_block_and_nextn():
-    metadata = {
-        "general.architecture": "glm4moe",
-        "glm4moe.block_count": 47,
-        "glm4moe.nextn_predict_layers": 1,
-    }
-    result = compute_layers_for_architecture(
-        architecture="glm4moe",
-        metadata=metadata,
-        base_block_count=47,
-    )
-    assert result["block_count"] == 47
-    assert result["effective_layer_count"] == 48
-
-
-def test_llama_like_profile_adds_output_head():
-    metadata = {
-        "general.architecture": "llama",
-        "llama.block_count": 32,
-    }
-    result = compute_layers_for_architecture(
-        architecture="llama",
-        metadata=metadata,
-        base_block_count=32,
-    )
-    assert result["block_count"] == 32
-    assert result["effective_layer_count"] == 33
-
-
-def test_qwen_family_profile_adds_output_head():
-    metadata = {
-        "general.architecture": "qwen2",
-        "qwen2.block_count": 28,
-    }
-    result = compute_layers_for_architecture(
-        architecture="qwen2",
-        metadata=metadata,
-        base_block_count=28,
-    )
-    assert result["block_count"] == 28
-    assert result["effective_layer_count"] == 29
-
-
-def test_generic_profile_uses_base_block_count_plus_one():
-    metadata = {
-        "general.architecture": "some-new-arch",
-    }
-    result = compute_layers_for_architecture(
-        architecture="some-new-arch",
-        metadata=metadata,
-        base_block_count=40,
-    )
-    assert result["block_count"] == 40
-    assert result["effective_layer_count"] == 41
-
-
-def test_generic_profile_falls_back_to_32_when_no_block_count():
-    metadata = {
-        "general.architecture": "unknown-arch",
-    }
-    result = compute_layers_for_architecture(
-        architecture="unknown-arch",
-        metadata=metadata,
-        base_block_count=0,
-    )
-    assert result["block_count"] == 0
-    assert result["effective_layer_count"] == 32
diff --git a/backend/unified_monitor.py b/backend/unified_monitor.py
deleted file mode 100644
index 633dfa4..0000000
--- a/backend/unified_monitor.py
+++ /dev/null
@@ -1,716 +0,0 @@
-import asyncio
-import psutil
-import time
-import yaml
-import os
-from collections import deque
-from datetime import datetime
-from typing import Dict, Any, Optional, List
-from sqlalchemy.orm import Session
-
-from backend.websocket_manager import websocket_manager
-from backend.gpu_detector import get_gpu_info
-from backend.llama_swap_client import LlamaSwapClient
-from backend.database import SessionLocal, RunningInstance, Model
-from backend.logging_config import get_logger
-
-try:
-    import pynvml  # type: ignore
-except ImportError:
-    pynvml = None  # type: ignore[assignment]
-
-DEFAULT_PROXY_PORT = 2000
-LMDEPLOY_PORT = 2001
-
-logger = get_logger(__name__)
-
-
-class UnifiedMonitor:
-    """Unified monitoring service with smart llama-swap integration.
-
-    Key insight: The /running endpoint is ALWAYS safe to poll (never returns 503).
-    Only /v1/chat/completions returns 503 during model loading.
-
-    We poll /running frequently to:
-    1. Detect model state changes (loading → running)
-    2. Detect external model starts (via llama-swap UI)
-    3. Sync database with actual llama-swap state
-    """
-
-    def __init__(self):
-        self.is_running = False
-        self.monitor_task: Optional[asyncio.Task] = None
-        self.update_interval = 2.0  # Poll /running every 2 seconds (safe, no 503s)
-
-        # Data storage
-        self.recent_logs = deque(maxlen=100)  # Keep last 100 log entries
-
-        # llama-swap client
-        self.llama_swap_client = LlamaSwapClient()
-
-        # Model mapping cache
-        self.model_mapping = {}
-        self._load_model_mapping()
-
-        # Optional direct WS subscribers (used by routes/unified_monitoring.py)
-        self.subscribers: List[Any] = []
-
-        # Loading state tracking
-        self._models_loading: Dict[str, datetime] = {}  # model_name -> start_time
-        self._loading_timeout = 300  # 5 minutes max loading time
-
-        # Previous model states for change detection
-        self._previous_model_states: Dict[str, str] = {}  # model_name -> state
-
-    def _load_model_mapping(self):
-        """Load model mapping from llama-swap configuration"""
-        try:
-            config_path = "/app/data/llama-swap-config.yaml"
-            if os.path.exists(config_path):
-                with open(config_path, "r") as f:
-                    config = yaml.safe_load(f)
-
-                # Extract model mappings from the config
-                models_config = config.get("models", {})
-                for llama_swap_name, model_config in models_config.items():
-                    cmd = model_config.get("cmd", "")
-                    # Extract the actual model file path from the command
-                    if "--model" in cmd:
-                        parts = cmd.split("--model")
-                        if len(parts) > 1:
-                            model_path = parts[1].strip().split()[0]
-                            # Extract just the filename without path and extension
-                            filename = os.path.basename(model_path).replace(".gguf", "")
-                            self.model_mapping[llama_swap_name] = {
-                                "filename": filename,
-                                "full_path": model_path,
-                            }
-
-                logger.info(
-                    f"Loaded {len(self.model_mapping)} model mappings from llama-swap config"
-                )
-                logger.debug(f"Model mappings: {self.model_mapping}")
-            else:
-                logger.warning(f"llama-swap config not found at {config_path}")
-        except Exception as e:
-            logger.error(f"Failed to load model mapping: {e}")
-
-    def add_log(self, log_event: Dict[str, Any]):
-        """Add a log event to the buffer"""
-        self.recent_logs.append(log_event)
-
-    def mark_model_loading(self, model_name: str):
-        """Mark a model as currently loading"""
-        self._models_loading[model_name] = datetime.utcnow()
-        self.llama_swap_client.mark_model_loading(model_name)
-        logger.info(f"Model '{model_name}' is now loading")
-
-    def mark_model_ready(self, model_name: str):
-        """Mark a model as ready (finished loading)"""
-        if model_name in self._models_loading:
-            load_time = (
-                datetime.utcnow() - self._models_loading[model_name]
-            ).total_seconds()
-            del self._models_loading[model_name]
-            logger.info(
-                f"Model '{model_name}' is now ready (loaded in {load_time:.1f}s)"
-            )
-        self.llama_swap_client.mark_model_ready(model_name)
-
-    def mark_model_stopped(self, model_name: str):
-        """Mark a model as stopped (clear loading state)"""
-        self._models_loading.pop(model_name, None)
-        self.llama_swap_client.clear_loading_state(model_name)
-        logger.debug(f"Model '{model_name}' loading state cleared")
-
-    async def broadcast_model_event(
-        self, event_type: str, model_name: str, details: Dict[str, Any] = None
-    ):
-        """Broadcast a model event immediately (no polling needed).
-
-        This is called when model state changes (start/stop/ready) to push
-        updates to frontend instantly without waiting for the next poll cycle.
-        """
-        event_data = {
-            "type": "model_event",
-            "event": event_type,  # "loading", "ready", "stopped", "error"
-            "model": model_name,
-            "timestamp": datetime.utcnow().isoformat(),
-            "details": details or {},
-        }
-
-        try:
-            await websocket_manager.broadcast(event_data)
-            logger.debug(f"Broadcast model event: {event_type} for {model_name}")
-        except Exception as e:
-            logger.error(f"Failed to broadcast model event: {e}")
-
-    async def trigger_status_update(self):
-        """Trigger an immediate status update broadcast.
-
-        Called after model start/stop to push fresh data to frontend
-        without waiting for the next poll cycle.
-        """
-        try:
-            await self._collect_and_send_unified_data()
-        except Exception as e:
-            logger.error(f"Failed to trigger status update: {e}")
-
-    def get_loading_models(self) -> Dict[str, Any]:
-        """Get currently loading models with their loading times"""
-        now = datetime.utcnow()
-        loading = {}
-        expired = []
-
-        for model_name, start_time in self._models_loading.items():
-            elapsed = (now - start_time).total_seconds()
-            if elapsed > self._loading_timeout:
-                # Model has been loading too long, consider it stuck
-                expired.append(model_name)
-                logger.warning(
-                    f"Model '{model_name}' loading timeout ({elapsed:.0f}s > {self._loading_timeout}s)"
-                )
-            else:
-                loading[model_name] = {
-                    "started_at": start_time.isoformat(),
-                    "elapsed_seconds": elapsed,
-                }
-
-        # Clean up expired loading states
-        for model_name in expired:
-            del self._models_loading[model_name]
-            self.llama_swap_client.clear_loading_state(model_name)
-
-        return loading
-
-    def has_loading_models(self) -> bool:
-        """Check if any models are currently loading"""
-        # Clean up expired first
-        self.get_loading_models()
-        return len(self._models_loading) > 0
-
-    async def add_subscriber(self, websocket):
-        """Accept and register a WebSocket subscriber (minimal implementation)."""
-        try:
-            await websocket.accept()
-        except Exception:
-            return
-        self.subscribers.append(websocket)
-
-    async def remove_subscriber(self, websocket):
-        """Remove a WebSocket subscriber and close if open."""
-        try:
-            if websocket in self.subscribers:
-                self.subscribers.remove(websocket)
-            try:
-                await websocket.close()
-            except Exception:
-                pass
-        except Exception:
-            pass
-
-    async def start_monitoring(self):
-        """Start the unified monitoring background task"""
-        if self.is_running:
-            return
-
-        self.is_running = True
-        self.monitor_task = asyncio.create_task(self._monitor_loop())
-
-        logger.info("Unified monitoring started")
-
-    async def stop_monitoring(self):
-        """Stop the unified monitoring background task"""
-        self.is_running = False
-
-        if self.monitor_task:
-            self.monitor_task.cancel()
-
-        logger.info("Unified monitoring stopped")
-
-    async def _monitor_loop(self):
-        """Main monitoring loop - polls /running endpoint every 2 seconds.
-
-        The /running endpoint is SAFE to poll (never returns 503).
-        This allows us to:
-        - Detect when models finish loading
-        - Detect external model starts (via llama-swap UI)
-        - Keep database in sync with llama-swap
-        """
-        while self.is_running:
-            try:
-                await self._collect_and_send_unified_data()
-                await asyncio.sleep(self.update_interval)
-            except asyncio.CancelledError:
-                break
-            except Exception as e:
-                logger.error(f"Unified monitoring error: {e}")
-                await asyncio.sleep(self.update_interval)
-
-    async def _collect_and_send_unified_data(self):
-        """Collect all monitoring data and send as single unified message"""
-        try:
-            # 1. System metrics
-            cpu_percent = psutil.cpu_percent(interval=0)
-            memory = psutil.virtual_memory()
-            # Use data directory at project root or /app/data for Docker
-            data_dir = "data" if os.path.exists("data") else "/app/data"
-            try:
-                disk = psutil.disk_usage(data_dir)
-            except FileNotFoundError:
-                disk = psutil.disk_usage("/")
-
-            # 2. Running instances from database
-            db = SessionLocal()
-            try:
-                running_instances = db.query(RunningInstance).all()
-                active_instances = []
-                for instance in running_instances:
-                    port = (
-                        LMDEPLOY_PORT
-                        if instance.runtime_type == "lmdeploy"
-                        else DEFAULT_PROXY_PORT
-                    )
-                    active_instances.append(
-                        {
-                            "id": instance.id,
-                            "model_id": instance.model_id,
-                            "port": port,
-                            "runtime_type": instance.runtime_type,
-                            "proxy_model_name": instance.proxy_model_name,
-                            "started_at": (
-                                instance.started_at.isoformat()
-                                if instance.started_at
-                                else None
-                            ),
-                        }
-                    )
-            finally:
-                db.close()
-
-            # 3. Running models from llama-swap /running endpoint
-            # This endpoint is SAFE - it never returns 503, even during model loading
-            # We poll it every 2 seconds to detect:
-            # - Model state changes (loading → running)
-            # - External model starts (via llama-swap UI)
-            enhanced_external_models = []
-            try:
-                external_response = await self.llama_swap_client.get_running_models()
-
-                # Extract the running models array from the response
-                if (
-                    isinstance(external_response, dict)
-                    and "running" in external_response
-                ):
-                    external_models = external_response["running"]
-                elif isinstance(external_response, list):
-                    external_models = external_response
-                else:
-                    external_models = []
-
-                # Process models and detect state changes
-                for model in external_models:
-                    model_name = model.get("model", "")
-                    state = model.get("state", "unknown")
-                    previous_state = self._previous_model_states.get(model_name)
-
-                    # Detect state transitions
-                    if previous_state != state:
-                        logger.info(
-                            f"Model '{model_name}' state changed: {previous_state} → {state}"
-                        )
-
-                        if state == "loading":
-                            # Model started loading
-                            if model_name not in self._models_loading:
-                                self._models_loading[model_name] = datetime.utcnow()
-                            await self.broadcast_model_event("loading", model_name)
-
-                        elif state in ("running", "ready"):
-                            # Model finished loading - broadcast ready event!
-                            if model_name in self._models_loading:
-                                load_time = (
-                                    datetime.utcnow() - self._models_loading[model_name]
-                                ).total_seconds()
-                                logger.info(
-                                    f"Model '{model_name}' ready after {load_time:.1f}s"
-                                )
-                                del self._models_loading[model_name]
-                            await self.broadcast_model_event("ready", model_name)
-
-                        self._previous_model_states[model_name] = state
-
-                    enhanced_model = {
-                        "model": model_name,
-                        "state": state,
-                        "mapping": self.model_mapping.get(model_name, {}),
-                        "is_loading": state == "loading",
-                    }
-                    enhanced_external_models.append(enhanced_model)
-
-                # Detect models that were removed (stopped externally)
-                current_model_names = {m.get("model", "") for m in external_models}
-                for prev_model in list(self._previous_model_states.keys()):
-                    if prev_model not in current_model_names:
-                        logger.info(
-                            f"Model '{prev_model}' stopped (removed from llama-swap)"
-                        )
-                        await self.broadcast_model_event("stopped", prev_model)
-                        del self._previous_model_states[prev_model]
-                        self._models_loading.pop(prev_model, None)
-
-                # Sync database with llama-swap state
-                await self._sync_database_with_external_models(enhanced_external_models)
-
-            except Exception as e:
-                logger.debug(
-                    f"Failed to poll /running (llama-swap may be starting): {e}"
-                )
-
-            # 4. GPU info
-            try:
-                gpu_info = await get_gpu_info()
-                vram_data = None
-                if not gpu_info.get("cpu_only_mode", True):
-                    vram_data = await self._get_vram_data(gpu_info)
-            except Exception as e:
-                logger.error(f"Failed to get GPU info: {e}")
-                gpu_info = {"cpu_only_mode": True, "device_count": 0}
-                vram_data = None
-
-            # 5. Get loading models info
-            loading_models = self.get_loading_models()
-
-            # 6. Create unified monitoring data
-            unified_data = {
-                "type": "unified_monitoring",
-                "timestamp": datetime.utcnow().isoformat(),
-                "system": {
-                    "cpu_percent": cpu_percent,
-                    "memory": {
-                        "total": memory.total,
-                        "available": memory.available,
-                        "percent": memory.percent,
-                        "used": memory.used,
-                        "free": memory.free,
-                        "cached": getattr(memory, "cached", 0),
-                        "buffers": getattr(memory, "buffers", 0),
-                        "swap_total": psutil.swap_memory().total,
-                        "swap_used": psutil.swap_memory().used,
-                    },
-                    "disk": {
-                        "total": disk.total,
-                        "used": disk.used,
-                        "free": disk.free,
-                        "percent": (disk.used / disk.total) * 100,
-                    },
-                },
-                "gpu": {
-                    "cpu_only_mode": gpu_info.get("cpu_only_mode", True),
-                    "device_count": gpu_info.get("device_count", 0),
-                    "total_vram": gpu_info.get("total_vram", 0),
-                    "available_vram": gpu_info.get("available_vram", 0),
-                    "vram_data": vram_data,
-                },
-                "models": {
-                    "running_instances": active_instances,
-                    "loading": loading_models,  # Models currently loading
-                    "has_loading": len(loading_models) > 0,
-                },
-                "proxy_status": {
-                    "enabled": True,
-                    "port": 2000,
-                    "endpoint": "http://localhost:2000/v1/chat/completions",
-                },
-                "logs": list(self.recent_logs)[-20:],  # Last 20 logs
-            }
-
-            # 6. Send unified data to all WebSocket connections
-            logger.debug(f"Broadcasting unified monitoring data: {unified_data}")
-            await websocket_manager.broadcast(unified_data)
-
-        except Exception as e:
-            logger.error(f"Error collecting unified monitoring data: {e}")
-
-    async def _get_vram_data(self, gpu_info: Dict[str, Any]) -> Dict[str, Any]:
-        """Get current VRAM usage data"""
-        if pynvml is None:
-            logger.debug("NVML not available; skipping VRAM detail collection")
-            return {
-                "total": 0,
-                "used": 0,
-                "free": 0,
-                "percent": 0,
-                "gpus": [],
-                "cuda_version": gpu_info.get("cuda_version", "N/A"),
-                "device_count": gpu_info.get("device_count", 0),
-                "timestamp": time.time(),
-            }
-        try:
-            pynvml.nvmlInit()
-
-            device_count = gpu_info.get("device_count", 0)
-            total_vram = 0
-            used_vram = 0
-            gpu_details = []
-
-            for i in range(device_count):
-                handle = pynvml.nvmlDeviceGetHandleByIndex(i)
-                mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
-                utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
-
-                gpu_total = mem_info.total
-                gpu_used = mem_info.used
-                gpu_free = mem_info.free
-
-                total_vram += gpu_total
-                used_vram += gpu_used
-
-                gpu_details.append(
-                    {
-                        "device_id": i,
-                        "total": gpu_total,
-                        "used": gpu_used,
-                        "free": gpu_free,
-                        "utilization": utilization.gpu,
-                        "memory_utilization": utilization.memory,
-                    }
-                )
-
-            return {
-                "total": total_vram,
-                "used": used_vram,
-                "free": total_vram - used_vram,
-                "percent": (used_vram / total_vram * 100) if total_vram > 0 else 0,
-                "gpus": gpu_details,
-                "cuda_version": gpu_info.get("cuda_version", "N/A"),
-                "device_count": gpu_info.get("device_count", 0),
-                "timestamp": time.time(),
-            }
-        except Exception as e:
-            logger.error(f"Failed to get VRAM data: {e}")
-            return {
-                "total": 0,
-                "used": 0,
-                "free": 0,
-                "percent": 0,
-                "gpus": [],
-                "cuda_version": "N/A",
-                "device_count": 0,
-                "timestamp": time.time(),
-            }
-        finally:
-            try:
-                pynvml.nvmlShutdown()
-            except Exception:
-                pass
-
-    async def _sync_database_with_external_models(
-        self, external_models: List[Dict[str, Any]]
-    ):
-        """Sync database RunningInstance records with external running models"""
-        try:
-            db: Session = SessionLocal()
-            try:
-                # Get all current running instances from database
-                current_instances = db.query(RunningInstance).all()
-                llama_cpp_instances = [
-                    instance
-                    for instance in current_instances
-                    if (instance.runtime_type or "llama_cpp") == "llama_cpp"
-                ]
-                current_proxy_names = {
-                    instance.proxy_model_name
-                    for instance in llama_cpp_instances
-                    if instance.proxy_model_name
-                }
-
-                # Get external model names
-                external_names = {model["model"] for model in external_models}
-
-                # Find models that are running externally but not in database
-                missing_in_db = external_names - current_proxy_names
-
-                # Find models that are in database but not running externally
-                missing_in_external = current_proxy_names - external_names
-
-                # Add missing models to database
-                for proxy_name in missing_in_db:
-                    # Find the corresponding model in the database by matching the proxy name
-                    model = self._find_model_by_proxy_name(db, proxy_name)
-                    if model:
-                        # Create a new RunningInstance
-                        new_instance = RunningInstance(
-                            model_id=model.id,
-                            proxy_model_name=proxy_name,
-                            started_at=datetime.utcnow(),
-                            runtime_type="llama_cpp",
-                        )
-                        db.add(new_instance)
-                        logger.info(f"Added missing model '{proxy_name}' to database")
-
-                        # Update model.is_active
-                        model.is_active = True
-
-                # Remove models that are no longer running externally
-                for proxy_name in missing_in_external:
-                    instances_to_remove = (
-                        db.query(RunningInstance)
-                        .filter(
-                            RunningInstance.proxy_model_name == proxy_name,
-                            RunningInstance.runtime_type == "llama_cpp",
-                        )
-                        .all()
-                    )
-
-                    for instance in instances_to_remove:
-                        # Update model.is_active
-                        model = (
-                            db.query(Model)
-                            .filter(Model.id == instance.model_id)
-                            .first()
-                        )
-                        if model:
-                            model.is_active = False
-
-                        # Remove the RunningInstance
-                        db.delete(instance)
-                        logger.info(
-                            f"Removed stopped model '{proxy_name}' from database"
-                        )
-
-                db.commit()
-                logger.debug(
-                    f"Database sync completed. Added: {len(missing_in_db)}, Removed: {len(missing_in_external)}"
-                )
-
-            finally:
-                db.close()
-
-        except Exception as e:
-            logger.error(f"Error syncing database with external models: {e}")
-            import traceback
-
-            logger.error(f"Traceback: {traceback.format_exc()}")
-
-    def _find_model_by_proxy_name(
-        self, db: Session, proxy_name: str
-    ) -> Optional[Model]:
-        """Find a model in the database by matching the proxy name"""
-        try:
-            # Use the stored proxy_name field for direct lookup
-            model = db.query(Model).filter(Model.proxy_name == proxy_name).first()
-
-            if model:
-                return model
-
-            logger.warning(f"Could not find model for proxy name: {proxy_name}")
-            return None
-
-        except Exception as e:
-            logger.error(f"Error finding model by proxy name '{proxy_name}': {e}")
-            return None
-
-    # API methods for HTTP endpoints
-    async def get_system_status(self) -> Dict[str, Any]:
-        """Get current system status (for HTTP API)"""
-        cpu_percent = psutil.cpu_percent(interval=0)
-        memory = psutil.virtual_memory()
-        disk = psutil.disk_usage("/app/data")
-
-        db = SessionLocal()
-        try:
-            running_instances = db.query(RunningInstance).all()
-            active_instances = []
-            for instance in running_instances:
-                port = (
-                    LMDEPLOY_PORT
-                    if instance.runtime_type == "lmdeploy"
-                    else DEFAULT_PROXY_PORT
-                )
-                active_instances.append(
-                    {
-                        "id": instance.id,
-                        "model_id": instance.model_id,
-                        "port": port,
-                        "runtime_type": instance.runtime_type,
-                        "proxy_model_name": instance.proxy_model_name,
-                        "started_at": instance.started_at,
-                    }
-                )
-        finally:
-            db.close()
-
-        return {
-            "system": {
-                "cpu_percent": cpu_percent,
-                "memory": {
-                    "total": memory.total,
-                    "available": memory.available,
-                    "percent": memory.percent,
-                    "used": memory.used,
-                    "free": memory.free,
-                },
-                "disk": {
-                    "total": disk.total,
-                    "used": disk.used,
-                    "free": disk.free,
-                    "percent": (disk.used / disk.total) * 100,
-                },
-            },
-            "running_instances": active_instances,
-            "proxy_status": {
-                "enabled": True,
-                "port": 2000,
-                "endpoint": "http://localhost:2000/v1/chat/completions",
-            },
-            "timestamp": datetime.utcnow().isoformat(),
-        }
-
-    async def get_running_models(self) -> List[Dict[str, Any]]:
-        """Get currently running models from llama-swap"""
-        try:
-            return await self.llama_swap_client.get_running_models()
-        except Exception as e:
-            logger.debug(f"Failed to get running models from llama-swap: {e}")
-            return []
-
-    async def unload_all_models(self) -> Dict[str, Any]:
-        """Unload all models via llama-swap"""
-        try:
-            return await self.llama_swap_client.unload_all_models()
-        except Exception as e:
-            logger.error(f"Failed to unload all models: {e}")
-            return {"error": str(e)}
-
-    async def get_system_health(self) -> Dict[str, Any]:
-        """Get llama-swap and system health status"""
-        try:
-            health_result = await self.llama_swap_client.check_health()
-            llama_swap_healthy = health_result.get("healthy", False)
-            loading_models = health_result.get("loading_models", [])
-        except Exception as e:
-            logger.error(f"Failed to check llama-swap health: {e}")
-            llama_swap_healthy = False
-            loading_models = []
-
-        return {
-            "llama_swap_proxy": "healthy" if llama_swap_healthy else "unhealthy",
-            "monitoring_active": self.is_running,
-            "active_connections": len(websocket_manager.active_connections),
-            "loading_models": loading_models,
-            "has_loading_models": len(loading_models) > 0 or self.has_loading_models(),
-        }
-
-    def get_recent_logs(self, limit: int = 100) -> List[Dict[str, Any]]:
-        """Get recent logs from monitor buffer"""
-        logs = list(self.recent_logs)
-        return logs[-limit:]
-
-    def add_log(self, log_event: Dict[str, Any]):
-        """Add a log event to the buffer"""
-        self.recent_logs.append(log_event)
-
-
-# Global unified monitor instance
-unified_monitor = UnifiedMonitor()
diff --git a/backend/websocket_manager.py b/backend/websocket_manager.py
deleted file mode 100644
index 20960c0..0000000
--- a/backend/websocket_manager.py
+++ /dev/null
@@ -1,192 +0,0 @@
-from fastapi import WebSocket
-from typing import List, Dict, Optional, Callable
-import json
-import asyncio
-import time
-from datetime import datetime
-from backend.logging_config import get_logger
-
-logger = get_logger(__name__)
-
-
-class WebSocketManager:
-    def __init__(self):
-        self.active_connections: List[WebSocket] = []
-        self.subscribers: Dict[str, List[Callable]] = {}
-
-    async def connect(self, websocket: WebSocket):
-        try:
-            await websocket.accept()
-            self.active_connections.append(websocket)
-            logger.info(
-                f"WebSocket connected. Total connections: {len(self.active_connections)}"
-            )
-
-            # Send a test message to verify the connection works
-            await self.send_personal_message(
-                json.dumps(
-                    {
-                        "type": "connection_test",
-                        "message": "WebSocket connection established successfully",
-                        "timestamp": datetime.utcnow().isoformat(),
-                    }
-                ),
-                websocket,
-            )
-
-        except Exception as e:
-            logger.error(f"Error in WebSocketManager.connect: {e}")
-            raise
-
-    def disconnect(self, websocket: WebSocket):
-        if websocket in self.active_connections:
-            self.active_connections.remove(websocket)
-            logger.info(
-                f"WebSocket disconnected. Total connections: {len(self.active_connections)}"
-            )
-
-    async def send_personal_message(self, message: str, websocket: WebSocket):
-        try:
-            await websocket.send_text(message)
-        except:
-            self.disconnect(websocket)
-
-    async def broadcast(self, message: dict):
-        """Broadcast message to all active WebSocket connections"""
-        if not self.active_connections:
-            logger.debug(
-                f"No active WebSocket connections to broadcast message: {message.get('type', 'unknown')}"
-            )
-            return
-
-        message_str = json.dumps(message)
-        logger.debug(
-            f"Broadcasting to {len(self.active_connections)} connections: {message.get('type', 'unknown')}"
-        )
-
-        async def _send(conn):
-            try:
-                await conn.send_text(message_str)
-                return None
-            except Exception as e:
-                return conn
-
-        # Send concurrently and collect failed connections
-        results = await asyncio.gather(
-            *[_send(c) for c in list(self.active_connections)], return_exceptions=False
-        )
-        for failed in results:
-            if isinstance(failed, WebSocket):
-                self.disconnect(failed)
-
-    # Legacy methods for backward compatibility
-    async def send_download_progress(
-        self,
-        task_id: str,
-        progress: int,
-        message: str = "",
-        bytes_downloaded: int = 0,
-        total_bytes: int = 0,
-        speed_mbps: float = 0,
-        eta_seconds: int = 0,
-        filename: str = "",
-        model_format: str = "gguf",
-        files_completed: int = None,
-        files_total: int = None,
-        current_filename: str = None,
-        huggingface_id: str = None,
-    ):
-        await self.broadcast(
-            {
-                "type": "download_progress",
-                "task_id": task_id,
-                "progress": progress,
-                "message": message,
-                "bytes_downloaded": bytes_downloaded,
-                "total_bytes": total_bytes,
-                "speed_mbps": speed_mbps,
-                "eta_seconds": eta_seconds,
-                "filename": filename,
-                "model_format": model_format,
-                "files_completed": files_completed,
-                "files_total": files_total,
-                "current_filename": current_filename or filename,
-                "huggingface_id": huggingface_id,
-                "timestamp": datetime.utcnow().isoformat(),
-            }
-        )
-
-    async def send_build_progress(
-        self,
-        task_id: str,
-        stage: str,
-        progress: int,
-        message: str = "",
-        log_lines: List[str] = None,
-    ):
-        message_data = {
-            "type": "build_progress",
-            "task_id": task_id,
-            "stage": stage,
-            "progress": progress,
-            "message": message,
-            "log_lines": log_lines or [],
-            "timestamp": datetime.utcnow().isoformat(),
-        }
-
-        logger.debug(
-            f"Sending build progress: task_id={task_id}, stage={stage}, progress={progress}, message='{message}', connections={len(self.active_connections)}"
-        )
-        logger.debug(f"Message data: {message_data}")
-        await self.broadcast(message_data)
-
-    async def send_model_status_update(
-        self, model_id: int, status: str, details: dict = None
-    ):
-        await self.broadcast(
-            {
-                "type": "model_status",
-                "model_id": model_id,
-                "status": status,
-                "details": details or {},
-                "timestamp": datetime.utcnow().isoformat(),
-            }
-        )
-
-    async def send_notification(
-        self, title: str, message: str, type: str = "info", actions: List[dict] = None
-    ):
-        await self.broadcast(
-            {
-                "type": "notification",
-                "title": title,
-                "message": message,
-                "notification_type": type,
-                "actions": actions or [],
-                "timestamp": datetime.utcnow().isoformat(),
-            }
-        )
-
-    async def send_lmdeploy_status(self, status: dict):
-        """Broadcast LMDeploy installer status update."""
-        await self.broadcast(
-            {
-                "type": "lmdeploy_status",
-                **status,
-                "timestamp": datetime.utcnow().isoformat(),
-            }
-        )
-
-    async def send_lmdeploy_runtime_log(self, line: str):
-        """Broadcast LMDeploy runtime log line."""
-        await self.broadcast(
-            {
-                "type": "lmdeploy_runtime_log",
-                "line": line,
-                "timestamp": datetime.utcnow().isoformat(),
-            }
-        )
-
-
-# Global WebSocket manager instance
-websocket_manager = WebSocketManager()
diff --git a/docker-compose.cuda.yml b/docker-compose.cuda.yml
index 6e76ef1..57d5cc0 100644
--- a/docker-compose.cuda.yml
+++ b/docker-compose.cuda.yml
@@ -2,17 +2,20 @@ version: '3.8'
 
 services:
   llama-cpp-studio:
-    build: .
+    build:
+      context: .
+      pull: true
     ports:
       - "8080:8080"
       - "2000:2000"
     volumes:
       - ./data:/app/data
-      - ./backend:/app/backend
     environment:
       - CUDA_VISIBLE_DEVICES=all
       - HF_HUB_ENABLE_HF_TRANSFER=1
-      - RELOAD=true
+      - HF_HOME=/app/data/temp/.cache/huggingface
+      - HUGGINGFACE_HUB_CACHE=/app/data/temp/.cache/huggingface/hub
+      - RELOAD=false
       # Uncomment and set your HuggingFace API key to enable model search and download
       # - HUGGINGFACE_API_KEY=your_huggingface_token_here
     # Alternative: Use .env file for environment variables
diff --git a/docker-compose.rocm.yml b/docker-compose.rocm.yml
deleted file mode 100644
index e9c00d2..0000000
--- a/docker-compose.rocm.yml
+++ /dev/null
@@ -1,40 +0,0 @@
-version: '3.8'
-
-services:
-  llama-cpp-studio:
-    build: .
-    image: llama-cpp-studio:rocm
-    ports:
-      - "8080:8080"
-      - "2000:2000"
-    volumes:
-      - ./data:/app/data
-      - ./backend:/app/backend
-      # Mount ROCm devices
-      - /dev/kfd:/dev/kfd
-      - /dev/dri:/dev/dri
-    environment:
-      # Disable CUDA to use ROCm instead
-      - CUDA_VISIBLE_DEVICES=""
-      - RELOAD=true
-      # ROCm environment variables
-      - HSA_OVERRIDE_GFX_VERSION=10.3.0
-      - HIP_VISIBLE_DEVICES=all
-      - ROC_ENABLE_PRE_VEGA=1
-      # Uncomment and set your HuggingFace API key
-      # - HUGGINGFACE_API_KEY=your_huggingface_token_here
-    devices:
-      # AMD GPU access
-      - /dev/kfd:/dev/kfd
-      - /dev/dri:/dev/dri
-    # Enable privileged mode for GPU access
-    privileged: true
-    restart: unless-stopped
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:8080/api/status"]
-      interval: 30s
-      timeout: 10s
-      retries: 3
-    cap_add:
-      - SYS_ADMIN
-    shm_size: '2gb'
diff --git a/docker-compose.vulkan.yml b/docker-compose.vulkan.yml
deleted file mode 100644
index 96324ca..0000000
--- a/docker-compose.vulkan.yml
+++ /dev/null
@@ -1,37 +0,0 @@
-version: '3.8'
-
-services:
-  llama-cpp-studio:
-    build: .
-    image: llama-cpp-studio:vulkan
-    ports:
-      - "8080:8080"
-      - "2000:2000"
-    volumes:
-      - ./data:/app/data
-      - ./backend:/app/backend
-      - /tmp/.X11-unix:/tmp/.X11-unix:rw
-    environment:
-      - DISPLAY=${DISPLAY}
-      - XDG_RUNTIME_DIR=${XDG_RUNTIME_DIR}
-      - HF_HUB_ENABLE_HF_TRANSFER=1
-      # Disable CUDA to use Vulkan instead
-      - CUDA_VISIBLE_DEVICES=""
-      - RELOAD=true
-      # Vulkan device selection (optional)
-      - VK_ICD_FILENAMES=/usr/share/vulkan/icd.d/radeon_icd.x86_64.json
-      # Uncomment and set your HuggingFace API key
-      # - HUGGINGFACE_API_KEY=your_huggingface_token_here
-    devices:
-      # Mount DRI devices for Vulkan access
-      - /dev/dri:/dev/dri
-    # Enable privileged mode for GPU access (required for Vulkan)
-    privileged: true
-    restart: unless-stopped
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:8080/api/status"]
-      interval: 30s
-      timeout: 10s
-      retries: 3
-    cap_add:
-      - SYS_ADMIN
diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh
index f4158fe..ff2c434 100644
--- a/docker-entrypoint.sh
+++ b/docker-entrypoint.sh
@@ -8,18 +8,9 @@ if [ -d "/app/data" ]; then
     # Check if we can write to the data directory
     if [ ! -w "/app/data" ]; then
         echo "WARNING: /app/data directory is not writable by current user ($(id -u))"
-        echo "This will cause database and file write errors."
+        echo "This will cause configuration and model write errors."
         echo "To fix, run on the host: sudo chown -R $(id -u):$(id -g) <volume-path>"
     fi
-    
-    # Check database file specifically
-    if [ -f "/app/data/db.sqlite" ] && [ ! -w "/app/data/db.sqlite" ]; then
-        echo "ERROR: Database file /app/data/db.sqlite exists but is not writable"
-        echo "Current user: $(id -u) ($(whoami))"
-        echo "File owner: $(stat -c '%U:%G (%u:%g)' /app/data/db.sqlite 2>/dev/null || echo 'unknown')"
-        echo "To fix, run on the host: sudo chown $(id -u):$(id -g) <volume-path>/db.sqlite"
-        echo "Or remove the database file to recreate it with correct permissions"
-    fi
 fi
 
 # Source the CUDA environment setup script if it exists
diff --git a/frontend/src/App.vue b/frontend/src/App.vue
index d749d0f..b4b0326 100644
--- a/frontend/src/App.vue
+++ b/frontend/src/App.vue
@@ -1,7 +1,7 @@
 <template>
   <div id="app" class="animate-fade-in">
     <ConfirmDialog />
-    
+    <Toast />
     <div class="layout-wrapper">
       <!-- Header -->
       <AppHeader 
@@ -31,14 +31,13 @@ import { useRouter } from 'vue-router'
 
 // PrimeVue
 import ConfirmDialog from 'primevue/confirmdialog'
+import Toast from 'primevue/toast'
 import { useConfirm } from 'primevue/useconfirm'
-
-// Third-party
-import { toast } from 'vue3-toastify'
+import { useToast } from 'primevue/usetoast'
 
 // Stores
-import { useSystemStore } from '@/stores/system'
-import { useWebSocketStore } from '@/stores/websocket'
+import { useEnginesStore } from '@/stores/engines'
+import { useProgressStore } from '@/stores/progress'
 
 // Composables
 import { useTheme } from '@/composables/useTheme'
@@ -50,20 +49,21 @@ import AppFooter from '@/components/layout/AppFooter.vue'
 
 const router = useRouter()
 const confirm = useConfirm()
-const systemStore = useSystemStore()
-const wsStore = useWebSocketStore()
+const toast = useToast()
+const systemStore = useEnginesStore()
+const progressStore = useProgressStore()
 const { initTheme } = useTheme()
 
 const statusLoading = ref(false)
 
 onMounted(() => {
   initTheme()
-  wsStore.connect()
+  progressStore.connect()
   refreshStatus()
 })
 
 onUnmounted(() => {
-  wsStore.disconnect()
+  progressStore.disconnect()
 })
 
 const refreshStatus = async () => {
@@ -71,7 +71,7 @@ const refreshStatus = async () => {
   try {
     await systemStore.fetchSystemStatus()
   } catch (error) {
-    toast.error('Failed to refresh system status')
+    toast.add({ severity: 'error', summary: 'Failed to refresh system status', detail: error?.message, life: 4000 })
   } finally {
     statusLoading.value = false
   }
diff --git a/frontend/src/components/BuildProgress.vue b/frontend/src/components/BuildProgress.vue
deleted file mode 100644
index 86e9a19..0000000
--- a/frontend/src/components/BuildProgress.vue
+++ /dev/null
@@ -1,375 +0,0 @@
-<template>
-  <div class="build-progress" v-if="builds.length > 0">
-    <div class="build-header">
-      <h3>Active Builds</h3>
-      <Button 
-        icon="pi pi-times" 
-        @click="clearCompleted"
-        severity="secondary"
-        text
-        size="small"
-        :disabled="!hasCompletedBuilds"
-      />
-    </div>
-    
-    <div class="build-list">
-      <div 
-        v-for="build in builds" 
-        :key="build.task_id"
-        class="build-item"
-        :class="{ 'completed': build.progress === 100, 'error': build.error }"
-      >
-        <div class="build-info">
-          <div class="build-title">{{ build.title || 'llama.cpp Build' }}</div>
-          <div class="build-stage">{{ build.stage }}</div>
-          <div v-if="build.message" class="build-message">{{ build.message }}</div>
-        </div>
-        
-        <div class="build-progress-bar">
-          <ProgressBar 
-            :value="build.progress" 
-            :showValue="false"
-            class="progress-bar"
-          />
-          <span class="progress-text">{{ build.progress }}%</span>
-        </div>
-        
-        <div class="build-status">
-          <i 
-            v-if="build.progress === 100" 
-            class="pi pi-check-circle status-icon success"
-          ></i>
-          <i 
-            v-else-if="build.error" 
-            class="pi pi-times-circle status-icon error"
-          ></i>
-          <i 
-            v-else 
-            class="pi pi-spin pi-spinner status-icon in-progress"
-          ></i>
-        </div>
-        
-        <div class="build-actions">
-          <Button 
-            icon="pi pi-eye" 
-            @click="toggleLogs(build)"
-            severity="secondary"
-            text
-            size="small"
-            :label="build.showLogs ? 'Hide Logs' : 'Show Logs'"
-          />
-          <Button 
-            v-if="build.error"
-            icon="pi pi-refresh" 
-            @click="retryBuild(build)"
-            severity="secondary"
-            text
-            size="small"
-          />
-        </div>
-        
-        <!-- Build Logs -->
-        <div v-if="build.showLogs && build.log_lines.length > 0" class="build-logs">
-          <div class="logs-header">
-            <span>Build Logs</span>
-            <Button 
-              icon="pi pi-times" 
-              @click="build.showLogs = false"
-              severity="secondary"
-              text
-              size="small"
-            />
-          </div>
-          <LogViewer 
-            :logs="build.log_lines" 
-            mode="raw"
-            :show-header="false"
-            :compact="true"
-            max-height="200px"
-          />
-        </div>
-      </div>
-    </div>
-  </div>
-</template>
-
-<script setup>
-import { ref, computed, onMounted, onUnmounted } from 'vue'
-import { useWebSocketStore } from '@/stores/websocket'
-import { useSystemStore } from '@/stores/system'
-import { toast } from 'vue3-toastify'
-import Button from 'primevue/button'
-import ProgressBar from 'primevue/progressbar'
-import LogViewer from '@/components/common/LogViewer.vue'
-
-const wsStore = useWebSocketStore()
-const systemStore = useSystemStore()
-
-const builds = ref([])
-const unsubscribe = ref(null)
-const unsubscribeNotifications = ref(null)
-
-const hasCompletedBuilds = computed(() => 
-  builds.value.some(b => b.progress === 100 || b.error)
-)
-
-onMounted(() => {
-  // Subscribe to build progress updates
-  unsubscribe.value = wsStore.subscribeToBuildProgress(handleBuildProgress)
-  
-  // Also subscribe to notifications for completion events
-  unsubscribeNotifications.value = wsStore.subscribeToNotifications(handleNotification)
-})
-
-onUnmounted(() => {
-  if (unsubscribe.value) {
-    unsubscribe.value()
-  }
-  if (unsubscribeNotifications.value) {
-    unsubscribeNotifications.value()
-  }
-})
-
-const handleBuildProgress = (data) => {
-  const existingIndex = builds.value.findIndex(b => b.task_id === data.task_id)
-  
-  if (existingIndex >= 0) {
-    // Update existing build
-    const existing = builds.value[existingIndex]
-    builds.value[existingIndex] = {
-      ...existing,
-      ...data,
-      log_lines: [...(existing.log_lines || []), ...(data.log_lines || [])],
-      showLogs: existing.showLogs || false
-    }
-  } else {
-    // Add new build
-    builds.value.push({
-      ...data,
-      log_lines: data.log_lines || [],
-      showLogs: false,
-      error: false,
-      title: extractBuildTitle(data.message) || 'llama.cpp Build'
-    })
-  }
-  
-  // Show completion notification and refresh versions list
-  if (data.progress === 100) {
-    toast.success(`${extractBuildTitle(data.message) || 'Build'} completed successfully`)
-    // Refresh the versions list to show the newly built version
-    systemStore.fetchLlamaVersions()
-  }
-  
-  // Also refresh versions list on error (in case of partial installations)
-  if (data.stage === 'error') {
-    toast.error(`${extractBuildTitle(data.message) || 'Build'} failed`)
-    // Refresh versions list even on error to ensure UI is up to date
-    systemStore.fetchLlamaVersions()
-  }
-}
-
-const handleNotification = (data) => {
-  // Handle completion notifications from backend
-  if (data.notification_type === 'success' && 
-      (data.title.includes('Build Complete') || data.title.includes('Installation Complete'))) {
-    // Refresh versions list when build/installation completes
-    systemStore.fetchLlamaVersions()
-  }
-  
-  // Also refresh on error notifications
-  if (data.notification_type === 'error' && 
-      (data.title.includes('Build Failed') || data.title.includes('Installation Failed'))) {
-    // Refresh versions list even on error to ensure UI is up to date
-    systemStore.fetchLlamaVersions()
-  }
-}
-
-const extractBuildTitle = (message) => {
-  if (!message) return null
-  
-  // Try to extract build title from message
-  if (message.includes('llama.cpp')) return 'llama.cpp Build'
-  if (message.includes('install')) return 'Installation'
-  if (message.includes('compile')) return 'Compilation'
-  return null
-}
-
-const toggleLogs = (build) => {
-  build.showLogs = !build.showLogs
-}
-
-const clearCompleted = () => {
-  builds.value = builds.value.filter(b => b.progress < 100 && !b.error)
-}
-
-const retryBuild = (build) => {
-  // Remove from list and let the user retry manually
-  const index = builds.value.findIndex(b => b.task_id === build.task_id)
-  if (index >= 0) {
-    builds.value.splice(index, 1)
-  }
-  
-  toast.info('Please try building again from the llama.cpp manager')
-}
-</script>
-
-<style scoped>
-.build-progress {
-  background: var(--gradient-card);
-  border: 1px solid var(--border-primary);
-  border-radius: var(--radius-xl);
-  padding: 1rem;
-  margin-bottom: 1rem;
-  box-shadow: var(--shadow-md);
-}
-
-.build-header {
-  display: flex;
-  justify-content: space-between;
-  align-items: center;
-  margin-bottom: 1rem;
-}
-
-.build-header h3 {
-  margin: 0;
-  color: var(--text-primary);
-  font-size: 1rem;
-  font-weight: 600;
-}
-
-.build-list {
-  display: flex;
-  flex-direction: column;
-  gap: 0.75rem;
-}
-
-.build-item {
-  display: grid;
-  grid-template-columns: 1fr auto auto auto;
-  gap: 1rem;
-  align-items: center;
-  padding: 0.75rem;
-  background: var(--bg-card);
-  border: 1px solid var(--border-primary);
-  border-radius: var(--radius-lg);
-  transition: all 0.2s ease;
-}
-
-.build-item.completed {
-  background: var(--bg-card);
-  border-color: var(--status-success);
-  opacity: 0.9;
-}
-
-.build-item.error {
-  background: var(--bg-card);
-  border-color: var(--status-error);
-  opacity: 0.9;
-}
-
-.build-info {
-  min-width: 0;
-}
-
-.build-title {
-  font-weight: 600;
-  color: var(--text-primary);
-  margin-bottom: 0.25rem;
-}
-
-.build-stage {
-  font-size: 0.875rem;
-  color: var(--text-secondary);
-  margin-bottom: 0.25rem;
-}
-
-.build-message {
-  font-size: 0.75rem;
-  color: var(--text-secondary);
-  font-style: italic;
-}
-
-.build-progress-bar {
-  display: flex;
-  align-items: center;
-  gap: 0.5rem;
-  min-width: 120px;
-}
-
-.progress-bar {
-  flex: 1;
-}
-
-.progress-text {
-  font-size: 0.875rem;
-  font-weight: 600;
-  color: var(--text-primary);
-  min-width: 35px;
-  text-align: right;
-}
-
-.build-status {
-  display: flex;
-  align-items: center;
-  justify-content: center;
-  width: 24px;
-}
-
-.build-actions {
-  display: flex;
-  gap: 0.25rem;
-}
-
-.build-logs {
-  grid-column: 1 / -1;
-  margin-top: 0.75rem;
-  background: var(--bg-tertiary);
-  border: 1px solid var(--border-primary);
-  border-radius: var(--radius-lg);
-  overflow: hidden;
-}
-
-.logs-header {
-  display: flex;
-  justify-content: space-between;
-  align-items: center;
-  padding: 0.5rem 0.75rem;
-  background: var(--bg-surface);
-  border-bottom: 1px solid var(--border-primary);
-  font-size: 0.875rem;
-  font-weight: 600;
-  color: var(--text-primary);
-}
-
-
-.status-icon {
-  font-size: 1.25rem;
-}
-
-.status-icon.success {
-  color: var(--status-success);
-}
-
-.status-icon.error {
-  color: var(--status-error);
-}
-
-.status-icon.in-progress {
-  color: var(--status-info);
-}
-
-@media (max-width: 768px) {
-  .build-item {
-    grid-template-columns: 1fr;
-    gap: 0.5rem;
-  }
-  
-  .build-progress-bar {
-    min-width: auto;
-  }
-  
-  .build-actions {
-    justify-content: flex-start;
-  }
-}
-</style>
diff --git a/frontend/src/components/DownloadProgress.vue b/frontend/src/components/DownloadProgress.vue
deleted file mode 100644
index 7373e54..0000000
--- a/frontend/src/components/DownloadProgress.vue
+++ /dev/null
@@ -1,217 +0,0 @@
-<template>
-  <div v-if="activeDownloads.length > 0" class="download-progress-container">
-    <h3>Download Progress</h3>
-    <div class="download-list">
-      <div 
-        v-for="download in activeDownloads" 
-        :key="download.task_id"
-        class="download-item"
-      >
-        <div class="download-header">
-          <div class="download-info">
-            <div class="download-filename">{{ download.filename }}</div>
-            <div class="download-status">{{ download.message }}</div>
-          </div>
-          <div class="download-actions">
-            <Button 
-              icon="pi pi-times" 
-              size="small" 
-              severity="secondary" 
-              text
-              @click="removeDownload(download.task_id)"
-              :disabled="download.progress < 100"
-            />
-          </div>
-        </div>
-        
-        <div class="progress-container">
-          <ProgressBar 
-            :value="download.progress" 
-            :showValue="false"
-            class="progress-bar"
-          />
-          <div class="progress-text">
-            <span class="progress-percentage">{{ download.progress }}%</span>
-            <span class="progress-size">{{ formatBytes(download.bytes_downloaded) }} / {{ formatBytes(download.total_bytes) }}</span>
-          </div>
-        </div>
-        
-        <div v-if="download.speed_mbps > 0 || download.eta_seconds > 0" class="download-stats">
-          <span v-if="download.speed_mbps > 0" class="speed">
-            <i class="pi pi-download"></i>
-            {{ (download.speed_mbps || 0).toFixed(1) }} MB/s
-          </span>
-          <span v-if="download.eta_seconds > 0" class="eta">
-            <i class="pi pi-clock"></i>
-            {{ formatTime(download.eta_seconds) }}
-          </span>
-        </div>
-      </div>
-    </div>
-  </div>
-</template>
-
-<script setup>
-import { ref, onMounted, onUnmounted } from 'vue'
-import { useWebSocketStore } from '@/stores/websocket'
-import Button from 'primevue/button'
-import ProgressBar from 'primevue/progressbar'
-import { formatBytes } from '@/utils/formatting'
-
-const wsStore = useWebSocketStore()
-const activeDownloads = ref([])
-const unsubscribe = ref(null)
-
-onMounted(() => {
-  // Subscribe to download progress updates
-  unsubscribe.value = wsStore.subscribeToDownloadProgress(handleDownloadProgress)
-})
-
-onUnmounted(() => {
-  if (unsubscribe.value) {
-    unsubscribe.value()
-  }
-})
-
-const handleDownloadProgress = (data) => {
-  const existingIndex = activeDownloads.value.findIndex(d => d.task_id === data.task_id)
-  
-  if (existingIndex >= 0) {
-    // Update existing download
-    activeDownloads.value[existingIndex] = {
-      ...activeDownloads.value[existingIndex],
-      ...data
-    }
-    
-    // Remove completed downloads after a delay
-    if (data.progress >= 100) {
-      setTimeout(() => {
-        removeDownload(data.task_id)
-      }, 3000)
-    }
-  } else {
-    // Add new download
-    activeDownloads.value.push(data)
-  }
-}
-
-const removeDownload = (taskId) => {
-  const index = activeDownloads.value.findIndex(d => d.task_id === taskId)
-  if (index >= 0) {
-    activeDownloads.value.splice(index, 1)
-  }
-}
-
-const formatTime = (seconds) => {
-  if (seconds < 60) {
-    return `${seconds}s`
-  } else if (seconds < 3600) {
-    const minutes = Math.floor(seconds / 60)
-    const remainingSeconds = seconds % 60
-    return `${minutes}m ${remainingSeconds}s`
-  } else {
-    const hours = Math.floor(seconds / 3600)
-    const minutes = Math.floor((seconds % 3600) / 60)
-    return `${hours}h ${minutes}m`
-  }
-}
-</script>
-
-<style scoped>
-.download-progress-container {
-  margin-bottom: var(--spacing-2xl);
-  padding: var(--spacing-xl);
-  background: var(--bg-card);
-  border-radius: var(--radius-xl);
-  border: 1px solid var(--border-primary);
-  box-shadow: var(--shadow-lg);
-}
-
-.download-progress-container h3 {
-  margin: 0 0 var(--spacing-lg) 0;
-  color: var(--text-primary);
-  font-size: 1.1rem;
-  font-weight: 600;
-}
-
-.download-list {
-  display: flex;
-  flex-direction: column;
-  gap: 1rem;
-}
-
-.download-item {
-  padding: var(--spacing-lg);
-  background: var(--bg-surface);
-  border-radius: var(--radius-lg);
-  border: 1px solid var(--border-primary);
-  box-shadow: var(--shadow-sm);
-}
-
-.download-header {
-  display: flex;
-  justify-content: space-between;
-  align-items: flex-start;
-  margin-bottom: 0.75rem;
-}
-
-.download-info {
-  flex: 1;
-}
-
-.download-filename {
-  font-weight: 600;
-  color: var(--text-primary);
-  margin-bottom: 0.25rem;
-}
-
-.download-status {
-  font-size: 0.875rem;
-  color: var(--text-secondary);
-}
-
-.download-actions {
-  margin-left: 1rem;
-}
-
-.progress-container {
-  margin-bottom: 0.5rem;
-}
-
-.progress-bar {
-  margin-bottom: 0.5rem;
-}
-
-.progress-text {
-  display: flex;
-  justify-content: space-between;
-  align-items: center;
-  font-size: 0.875rem;
-}
-
-.progress-percentage {
-  font-weight: 600;
-  color: var(--primary-color);
-}
-
-.progress-size {
-  color: var(--text-secondary);
-}
-
-.download-stats {
-  display: flex;
-  gap: 1rem;
-  font-size: 0.75rem;
-  color: var(--text-secondary);
-}
-
-.speed, .eta {
-  display: flex;
-  align-items: center;
-  gap: 0.25rem;
-}
-
-.speed i, .eta i {
-  font-size: 0.75rem;
-}
-</style>
\ No newline at end of file
diff --git a/frontend/src/components/GgufModelList.vue b/frontend/src/components/GgufModelList.vue
deleted file mode 100644
index 56b3d3b..0000000
--- a/frontend/src/components/GgufModelList.vue
+++ /dev/null
@@ -1,455 +0,0 @@
-<template>
-  <div class="model-grid">
-    <BaseCard
-      v-for="modelGroup in modelGroups"
-      :key="modelGroup.huggingface_id"
-      card-class="model-card"
-    >
-      <template #header>
-        <div class="model-card-header">
-        <div>
-          <div class="model-name">{{ modelGroup.huggingface_id }}</div>
-          <div class="model-tags">
-            <span class="model-tag tag-type">{{ modelGroup.model_type }}</span>
-            <span class="model-tag tag-count">{{ modelGroup.quantizations.length }} quantizations</span>
-            <span class="model-tag tag-pipeline" v-if="modelGroup.is_embedding_model">Embedding</span>
-          </div>
-        </div>
-        <div class="model-status">
-          <span
-            :class="[
-              'status-indicator',
-              hasLoadingQuantization(modelGroup) ? 'status-loading' : (hasRunningQuantization(modelGroup) ? 'status-running' : 'status-stopped'),
-              { 'llama-swap-running': hasLlamaSwapQuantization(modelGroup) }
-            ]"
-          >
-            <i :class="hasLoadingQuantization(modelGroup) ? 'pi pi-spin pi-spinner' : (hasLlamaSwapQuantization(modelGroup) ? 'pi pi-share-alt' : (hasRunningQuantization(modelGroup) ? 'pi pi-play' : 'pi pi-pause'))"></i>
-            {{ getModelStatusText(modelGroup) }}
-          </span>
-        </div>
-      </div>
-
-      </template>
-      
-      <div class="quantization-list">
-        <div
-          v-for="quantization in modelGroup.quantizations"
-          :key="quantization.id"
-          class="quantization-item"
-          :class="{ selected: selectedQuantization[modelGroup.huggingface_id] === quantization.id }"
-        >
-          <div class="quantization-info">
-            <div class="quantization-name">
-              {{ quantization.quantization }}
-              <Button
-                v-if="quantization.is_active && quantization.proxy_name"
-                icon="pi pi-external-link"
-                @click="openUpstreamUrl(quantization.proxy_name)"
-                severity="info"
-                size="small"
-                text
-                class="upstream-link"
-                v-tooltip.top="getUpstreamUrl(quantization.proxy_name)"
-              />
-            </div>
-            <div class="quantization-details">
-              <span class="quantization-size">{{ formatFileSize(quantization.file_size) }}</span>
-              <span
-                v-if="quantization.llama_swap_status === 'loading'"
-                class="quantization-status loading"
-              >
-                <i class="pi pi-spin pi-spinner"></i>
-                Loading...
-              </span>
-              <span
-                v-else-if="quantization.is_active"
-                class="quantization-status running"
-                :class="{ 'llama-swap-running': quantization.llama_swap_status === 'running' }"
-              >
-                <i :class="quantization.llama_swap_status === 'running' ? 'pi pi-share-alt' : 'pi pi-play'"></i>
-                Running
-              </span>
-            </div>
-          </div>
-          <div class="quantization-actions">
-            <Button
-              icon="pi pi-check"
-              @click="emitSelectQuantization(modelGroup.huggingface_id, quantization.id)"
-              :class="{ 'p-button-outlined': selectedQuantization[modelGroup.huggingface_id] !== quantization.id }"
-              size="small"
-              severity="success"
-              text
-            />
-            <Button
-              icon="pi pi-trash"
-              @click="emitDeleteQuantization(quantization)"
-              severity="danger"
-              size="small"
-              text
-            />
-          </div>
-        </div>
-      </div>
-
-      <template #footer>
-        <div class="model-actions">
-          <div class="action-group">
-            <Button
-              v-if="!hasRunningQuantization(modelGroup)"
-              label="Start"
-              icon="pi pi-play"
-              @click="emitStart(modelGroup)"
-              :loading="startingModels[selectedQuantization[modelGroup.huggingface_id]]"
-              :disabled="!selectedQuantization[modelGroup.huggingface_id]"
-              severity="success"
-              size="small"
-            />
-            <Button
-              v-else
-              label="Stop"
-              icon="pi pi-stop"
-              @click="emitStop(modelGroup)"
-              :loading="stoppingModels[getRunningQuantizationId(modelGroup)]"
-              severity="danger"
-              size="small"
-            />
-            <Button
-              label="Configure"
-              icon="pi pi-cog"
-              @click="emitConfigure(modelGroup)"
-              :disabled="!selectedQuantization[modelGroup.huggingface_id]"
-              severity="secondary"
-              size="small"
-              outlined
-            />
-          </div>
-          <div class="action-group">
-            <Button
-              label="Delete All"
-              icon="pi pi-trash"
-              @click="emitDeleteGroup(modelGroup)"
-              severity="danger"
-              size="small"
-              outlined
-            />
-          </div>
-        </div>
-      </template>
-    </BaseCard>
-  </div>
-</template>
-
-<script setup>
-// PrimeVue
-import Button from 'primevue/button'
-
-// Components
-import BaseCard from '@/components/common/BaseCard.vue'
-
-// Utils
-import { formatFileSize } from '@/utils/formatting'
-
-const props = defineProps({
-  modelGroups: {
-    type: Array,
-    default: () => []
-  },
-  selectedQuantization: {
-    type: Object,
-    default: () => ({})
-  },
-  startingModels: {
-    type: Object,
-    default: () => ({})
-  },
-  stoppingModels: {
-    type: Object,
-    default: () => ({})
-  }
-})
-
-const emit = defineEmits([
-  'select-quantization',
-  'start',
-  'stop',
-  'configure',
-  'delete-quantization',
-  'delete-group'
-])
-
-const hasRunningQuantization = (modelGroup) => {
-  return modelGroup.quantizations?.some(q => q.is_active)
-}
-
-const hasLoadingQuantization = (modelGroup) => {
-  return modelGroup.quantizations?.some(q => q.llama_swap_status === 'loading')
-}
-
-const hasLlamaSwapQuantization = (modelGroup) => {
-  return modelGroup.quantizations?.some(q => q.llama_swap_status === 'running')
-}
-
-const getModelStatusText = (modelGroup) => {
-  if (hasLoadingQuantization(modelGroup)) return 'Loading...'
-  if (hasRunningQuantization(modelGroup)) return 'Running'
-  return 'Stopped'
-}
-
-const getRunningQuantizationId = (modelGroup) => {
-  const running = modelGroup.quantizations?.find(q => q.is_active)
-  return running ? running.id : null
-}
-
-const emitSelectQuantization = (huggingfaceId, quantizationId) => {
-  emit('select-quantization', { huggingfaceId, quantizationId })
-}
-
-const emitStart = (modelGroup) => {
-  emit('start', modelGroup)
-}
-
-const emitStop = (modelGroup) => {
-  emit('stop', {
-    modelGroup,
-    quantizationId: getRunningQuantizationId(modelGroup)
-  })
-}
-
-const emitConfigure = (modelGroup) => {
-  emit('configure', modelGroup)
-}
-
-const emitDeleteQuantization = (quantization) => {
-  emit('delete-quantization', quantization)
-}
-
-const emitDeleteGroup = (modelGroup) => {
-  emit('delete-group', modelGroup)
-}
-
-const getUpstreamUrl = (proxyName) => {
-  const host = window.location.hostname
-  const port = '2000'
-  return `http://${host}:${port}/upstream/${proxyName}/`
-}
-
-const openUpstreamUrl = (proxyName) => {
-  const url = getUpstreamUrl(proxyName)
-  window.open(url, '_blank')
-}
-</script>
-
-<style scoped>
-.model-grid {
-  display: grid;
-  grid-template-columns: repeat(auto-fill, minmax(280px, 1fr));
-  gap: var(--spacing-md);
-}
-
-/* Model card styles - using BaseCard with custom enhancements */
-.model-card {
-  position: relative;
-  overflow: visible;
-  backdrop-filter: blur(10px);
-  animation: fadeIn 0.6s ease-out;
-}
-
-.model-card::before {
-  content: '';
-  position: absolute;
-  top: 0;
-  left: 0;
-  right: 0;
-  height: 3px;
-  background: var(--gradient-primary);
-  opacity: 0;
-  transition: opacity var(--transition-normal);
-  z-index: 1;
-}
-
-.model-card:hover {
-  box-shadow: var(--shadow-lg), var(--glow-primary);
-  transform: translateY(-5px) scale(1.02);
-  border-color: var(--accent-cyan);
-}
-
-.model-card:hover::before {
-  opacity: 1;
-}
-
-.model-card-header {
-  display: flex;
-  justify-content: space-between;
-  align-items: flex-start;
-  margin-bottom: var(--spacing-sm);
-}
-
-.model-name {
-  font-weight: 700;
-  color: var(--text-primary);
-  margin-bottom: var(--spacing-sm);
-  font-size: 1.1rem;
-  line-height: 1.3;
-}
-
-.model-tags {
-  display: flex;
-  gap: var(--spacing-xs);
-  flex-wrap: wrap;
-}
-
-.model-tag {
-  font-size: 0.75rem;
-  padding: var(--spacing-xs) var(--spacing-sm);
-  border-radius: var(--radius-sm);
-  background: var(--bg-surface);
-  color: var(--text-secondary);
-  border: 1px solid var(--border-secondary);
-}
-
-.model-tag.tag-type {
-  text-transform: capitalize;
-}
-
-.model-tag.tag-count {
-  background: var(--status-info-soft);
-  color: var(--accent-cyan);
-  border-color: rgba(34, 211, 238, 0.2);
-}
-
-.model-tag.tag-pipeline {
-  background: rgba(59, 130, 246, 0.12);
-  color: var(--accent-blue);
-  border-color: rgba(59, 130, 246, 0.25);
-  text-transform: uppercase;
-  letter-spacing: 0.05em;
-  font-weight: 600;
-}
-
-.model-status {
-  display: flex;
-  align-items: center;
-}
-
-.status-indicator {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-xs);
-  padding: var(--spacing-xs) var(--spacing-sm);
-  border-radius: var(--radius-sm);
-  font-size: 0.75rem;
-  font-weight: 500;
-}
-
-.status-running {
-  background: var(--status-success-soft);
-  color: var(--accent-green);
-  border: 1px solid rgba(16, 185, 129, 0.2);
-}
-
-.status-loading {
-  background: var(--status-warning-soft, rgba(251, 191, 36, 0.1));
-  color: var(--accent-yellow, #fbbf24);
-  border: 1px solid rgba(251, 191, 36, 0.2);
-}
-
-.status-stopped {
-  background: var(--bg-surface);
-  color: var(--text-secondary);
-  border: 1px solid var(--border-secondary);
-}
-
-.quantization-list {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-sm);
-  margin: var(--spacing-sm) 0;
-}
-
-.quantization-item {
-  border: 1px solid var(--border-secondary);
-  border-radius: var(--radius-lg);
-  padding: var(--spacing-sm);
-  background: var(--bg-tertiary);
-  transition: border-color var(--transition-fast), transform var(--transition-fast);
-}
-
-.quantization-item.selected {
-  border-color: var(--accent-cyan);
-  transform: translateY(-2px);
-  box-shadow: var(--shadow-sm);
-}
-
-.quantization-info {
-  display: flex;
-  justify-content: space-between;
-  align-items: center;
-  gap: var(--spacing-sm);
-}
-
-.quantization-name {
-  font-weight: 600;
-  color: var(--text-primary);
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-xs);
-}
-
-.quantization-details {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-xs);
-  font-size: 0.8rem;
-  color: var(--text-secondary);
-}
-
-.quantization-size {
-  font-weight: 500;
-}
-
-.quantization-status.running {
-  display: inline-flex;
-  align-items: center;
-  gap: var(--spacing-xs);
-  padding: var(--spacing-xs) var(--spacing-xs);
-  border-radius: var(--radius-sm);
-  background: var(--status-success-soft);
-  color: var(--accent-green);
-  font-size: 0.75rem;
-}
-
-.quantization-status.loading {
-  display: inline-flex;
-  align-items: center;
-  gap: var(--spacing-xs);
-  padding: var(--spacing-xs) var(--spacing-xs);
-  border-radius: var(--radius-sm);
-  background: var(--status-warning-soft, rgba(251, 191, 36, 0.1));
-  color: var(--accent-yellow, #fbbf24);
-  font-size: 0.75rem;
-}
-
-.quantization-actions {
-  display: flex;
-  gap: var(--spacing-xs);
-  margin-top: var(--spacing-xs);
-}
-
-.upstream-link {
-  padding: 0;
-  height: auto;
-}
-
-.model-actions {
-  display: flex;
-  justify-content: space-between;
-  gap: var(--spacing-sm);
-  margin-top: var(--spacing-md);
-}
-
-.action-group {
-  display: flex;
-  gap: var(--spacing-xs);
-}
-</style>
-
diff --git a/frontend/src/components/SafetensorsModelList.vue b/frontend/src/components/SafetensorsModelList.vue
deleted file mode 100644
index 9696d01..0000000
--- a/frontend/src/components/SafetensorsModelList.vue
+++ /dev/null
@@ -1,1917 +0,0 @@
-<template>
-  <div class="safetensors-card">
-    <div class="card-header">
-      <div>
-        <h2>Safetensors Models</h2>
-        <p class="subtitle">Run Hugging Face safetensors via LMDeploy TurboMind</p>
-      </div>
-      <div class="actions">
-        <Button 
-          icon="pi pi-refresh" 
-          @click="$emit('refresh')" 
-          :loading="loading"
-          severity="secondary"
-          text
-        />
-        <Button 
-          icon="pi pi-sync"
-          @click="refreshStatus"
-          :loading="statusLoading"
-          severity="secondary"
-          text
-          v-tooltip.bottom="'Refresh LMDeploy status'"
-        />
-        <Button 
-          icon="pi pi-database"
-          @click="reloadFromDisk"
-          :loading="reloadingFromDisk"
-          severity="secondary"
-          text
-          v-tooltip.bottom="'Reset database entries and reload all safetensors models from disk'"
-        />
-      </div>
-    </div>
-
-    <div v-if="loading" class="loading-state">
-      <i class="pi pi-spin pi-spinner"></i>
-      <span>Loading safetensors models...</span>
-    </div>
-
-    <div v-if="!lmdeployReady" class="lmdeploy-alert">
-      <div class="alert-icon">
-        <i class="pi pi-exclamation-triangle"></i>
-      </div>
-      <div class="alert-content">
-        <h3>LMDeploy is not installed</h3>
-        <p>
-          Install LMDeploy from the new LMDeploy page before starting safetensors runtimes.
-          The install happens at runtime, keeping the Docker image slim.
-        </p>
-        <Button label="Open LMDeploy Page" icon="pi pi-box" size="small" @click="openLmdeployPage" />
-      </div>
-    </div>
-
-    <div v-if="lmdeployOperation" class="lmdeploy-alert info">
-      <div class="alert-icon">
-        <i class="pi pi-spin pi-spinner"></i>
-      </div>
-      <div class="alert-content">
-        <h3>Installer running</h3>
-        <p>LMDeploy installer is currently {{ lmdeployOperation }}. Runtime controls are disabled until it finishes.</p>
-      </div>
-    </div>
-
-    <div v-else-if="groupedModels.length > 0" class="model-grid">
-      <div 
-        v-for="group in groupedModels" 
-        :key="group.huggingface_id"
-        class="model-card grouped-card"
-      >
-        <div class="model-card-header group-header">
-          <div class="group-header-main">
-            <div class="model-name">{{ group.huggingface_id }}</div>
-            <div class="model-path" v-if="group.metadata?.base_model">{{ group.metadata.base_model }}</div>
-          </div>
-          <div class="group-summary">
-            <span>{{ group.file_count }} {{ group.file_count === 1 ? 'file' : 'files' }}</span>
-            <span class="dot">•</span>
-            <span>{{ formatFileSize(group.total_size) }}</span>
-          </div>
-        </div>
-
-        <div class="group-status-row">
-          <span
-            :class="[
-              'status-indicator',
-              isGroupRunning(group) ? 'status-running' : 'status-stopped'
-            ]"
-          >
-            <i :class="isGroupRunning(group) ? 'pi pi-play' : 'pi pi-pause'"></i>
-            <span>{{ isGroupRunning(group) ? 'Running in LMDeploy' : 'Stopped' }}</span>
-          </span>
-        </div>
-
-        <div class="model-body grouped-body">
-          <div class="file-list plain-file-list">
-            <span 
-              v-for="file in group.files" 
-              :key="file.model_id || file.filename"
-              class="file-name-plain"
-            >
-              {{ file.filename }}
-            </span>
-          </div>
-
-          <div class="group-actions">
-            <div class="action-group">
-              <Button 
-                label="Configure & Run" 
-                icon="pi pi-sliders-h"
-                severity="secondary"
-                size="small"
-                :disabled="!group.files?.length"
-                @click="openGroupConfig(group)"
-                :loading="isGroupConfigLoading(group)"
-              />
-              <Button 
-                v-if="group.files?.length && isGroupRunning(group)"
-                label="Stop" 
-                icon="pi pi-stop"
-                severity="danger"
-                size="small"
-                outlined
-                :loading="isGroupStopping(group)"
-                @click="stopGroupRuntime(group)"
-              />
-            </div>
-            <Button 
-              icon="pi pi-trash"
-              severity="danger"
-              size="small"
-              outlined
-              text
-              :disabled="!group.files?.length"
-              @click="$emit('delete', group)"
-            />
-          </div>
-        </div>
-      </div>
-    </div>
-
-    <div v-else class="empty-state">
-      <i class="pi pi-shield"></i>
-      <h3>No Safetensors Models</h3>
-      <p>Download safetensors files from the Search tab to prepare them for LMDeploy.</p>
-    </div>
-
-    <Dialog 
-      v-model:visible="dialogVisible" 
-      modal 
-      :style="{ width: '1000px', maxWidth: '95vw' }"
-      :breakpoints="{ '1200px': '90vw', '960px': '95vw', '640px': '95vw' }"
-    >
-      <template #header>
-        <div class="dialog-header">
-          <div>
-            <h3>{{ selectedModel?.huggingface_id || 'Configure LMDeploy' }}</h3>
-            <p v-if="selectedModel?.files?.length">
-              {{ selectedModel.files.length }} file{{ selectedModel.files.length !== 1 ? 's' : '' }}
-            </p>
-            <p v-else-if="selectedModel?.filename">{{ selectedModel.filename }}</p>
-          </div>
-          <div class="dialog-header-actions">
-            <Button
-              label="Regenerate Metadata"
-              icon="pi pi-refresh"
-              severity="info"
-              outlined
-              size="small"
-              :loading="metadataRefreshing"
-              :disabled="metadataRefreshing || !selectedModelId"
-              @click="regenerateMetadata"
-              v-tooltip.top="'Refresh model metadata from Hugging Face'"
-            />
-            <Tag 
-              :severity="selectedModelRunning ? 'success' : 'warning'"
-              :value="selectedModelRunning ? 'Running' : 'Stopped'"
-            />
-          </div>
-        </div>
-      </template>
-
-      <div v-if="selectedRuntime">
-        <div class="config-section">
-          <h4>Sequence & Parallelism</h4>
-          <div class="config-grid">
-            <div class="config-field span-2">
-              <label>Session Length (--session-len)</label>
-              <div class="slider-row">
-                <Slider v-model="formState.session_len" :min="1024" :max="sessionLimit" :step="256" />
-                <InputNumber v-model="formState.session_len" :min="1024" :max="sessionLimit" :step="256" inputId="sessionInput" />
-              </div>
-              <small class="field-help">
-                Maximum sequence length for a conversation. Controls the context window size.
-                Base context from metadata:
-                <span v-if="baseContextLength">{{ baseContextLength.toLocaleString() }} tokens</span>
-                <span v-else>unknown</span>.
-                <span v-if="isQwen3 && baseContextLength === 32768" class="qwen3-note">
-                  <strong>Note:</strong> For Qwen3 models, max_position_embeddings (40,960) includes 32,768 tokens for outputs and 8,192 reserved for prompts.
-                  The UI shows the usable output context (32,768) for reference, but you can configure up to the full capacity.
-                  If average context ≤ 32,768, YaRN scaling is not recommended as it may degrade performance.
-                </span>
-                Enable RoPE / YaRN scaling below to multiply the base context (up to {{ MAX_SCALING_FACTOR }}×) when supported.
-                Use the "Regenerate Metadata" button in the dialog header to refresh model metadata.
-              </small>
-            </div>
-            <div class="config-field span-2">
-              <label>RoPE / YaRN Scaling</label>
-              <div class="rope-scaling-controls">
-                <Dropdown
-                  v-model="formState.rope_scaling_mode"
-                  :options="ropeScalingOptions"
-                  optionLabel="label"
-                  optionValue="value"
-                  :disabled="!canUseScaling"
-                  class="rope-mode-dropdown"
-                />
-                <div class="slider-row rope-factor-row">
-                  <Slider
-                    v-model="formState.rope_scaling_factor"
-                    :min="1"
-                    :max="MAX_SCALING_FACTOR"
-                    :step="0.05"
-                    :disabled="!scalingEnabled"
-                  />
-                  <InputNumber
-                    v-model="formState.rope_scaling_factor"
-                    :min="1"
-                    :max="MAX_SCALING_FACTOR"
-                    :step="0.05"
-                    :disabled="!scalingEnabled"
-                    inputId="ropeScalingInput"
-                    mode="decimal"
-                  />
-                </div>
-              </div>
-              <small class="field-help">
-                <template v-if="canUseScaling">
-                  Effective context: {{ effectiveContextLength.toLocaleString() }} tokens
-                  <span v-if="scalingEnabled">
-                    <span v-if="adaptedBaseContext">
-                      ({{ formState.rope_scaling_factor.toFixed(2) }}× {{ adaptedBaseContext.toLocaleString() }} adapted base)
-                    </span>
-                    <span v-else>
-                      ({{ formState.rope_scaling_factor.toFixed(2) }}× {{ sessionLimit.toLocaleString() }} base)
-                    </span>
-                    <span v-if="maxPositionEmbeddings && effectiveContextLength >= maxPositionEmbeddings" class="max-length-warning">
-                      (clamped to max_position_embeddings: {{ maxPositionEmbeddings.toLocaleString() }})
-                    </span>
-                  </span>
-                  <span v-else>(scaling disabled)</span>.
-                  <div v-if="scalingWarning" class="scaling-warning" style="color: orange; margin-top: 0.5rem;">
-                    <i class="pi pi-exclamation-triangle"></i>
-                    <span>{{ scalingWarning }}</span>
-                  </div>
-                  <div v-if="modelMaxLength" class="max-length-info">
-                    Model max length: {{ modelMaxLength.toLocaleString() }} tokens.
-                  </div>
-                  <div v-if="maxPositionEmbeddings" class="max-length-info">
-                    Max position embeddings: {{ maxPositionEmbeddings.toLocaleString() }} tokens.
-                  </div>
-                </template>
-                <template v-else>
-                  <span v-if="maxPositionEmbeddings">
-                    Max position embeddings: {{ maxPositionEmbeddings.toLocaleString() }} tokens.
-                  </span>
-                  <span v-else>
-                    RoPE scaling requires a known base context length. Regenerate metadata if you expect one.
-                  </span>
-                </template>
-              </small>
-            </div>
-            <div class="config-field span-2">
-              <label>HF Rope Scaling Overrides (--hf-overrides.⋯)</label>
-              <div class="hf-overrides-grid">
-                <div class="hf-override-field">
-                  <span class="field-label">rope_scaling.rope_type</span>
-                  <InputText
-                    v-model="formState.hf_override_rope_type"
-                    placeholder="e.g. yarn"
-                    :disabled="!scalingEnabled"
-                  />
-                </div>
-                <div class="hf-override-field">
-                  <span class="field-label">rope_scaling.factor</span>
-                  <InputNumber
-                    v-model="formState.hf_override_rope_factor"
-                    :min="1"
-                    :max="MAX_SCALING_FACTOR"
-                    :step="0.05"
-                    mode="decimal"
-                    :disabled="!scalingEnabled"
-                  />
-                </div>
-                <div class="hf-override-field">
-                  <span class="field-label">rope_scaling.original_max_position_embeddings</span>
-                  <InputNumber
-                    v-model="formState.hf_override_rope_original_max"
-                    :min="0"
-                    :max="SESSION_FALLBACK_LIMIT"
-                    :step="512"
-                    :disabled="!scalingEnabled"
-                  />
-                </div>
-              </div>
-              <small class="field-help">
-                These map directly to individual <code>--hf-overrides.rope_scaling.*</code> flags.
-                Fill them when LMDeploy requires explicit Hugging Face rope overrides for scaling.
-              </small>
-            </div>
-            <div class="config-field">
-              <label>Max Prefill Tokens (--max-prefill-token-num)</label>
-              <InputNumber v-model="formState.max_prefill_token_num" :step="256" />
-              <small class="field-help">Maximum tokens processed per iteration during prefill phase. Higher values increase throughput but use more memory. Default: 8192</small>
-            </div>
-            <div class="config-field">
-              <label>Tensor Parallel (--tp)</label>
-              <InputNumber v-model="formState.tensor_parallel" :min="1" :max="8" :step="1" />
-              <small class="field-help">Number of GPUs for tensor parallelism. Must be a power of 2 (1, 2, 4, 8). Splits model layers across GPUs.</small>
-            </div>
-            <div class="config-field">
-              <label>Max Batch Size (--max-batch-size)</label>
-              <InputNumber v-model="formState.max_batch_size" :min="1" :max="128" :step="1" />
-              <small class="field-help">Maximum number of concurrent requests processed in a single batch. Higher values improve throughput but increase latency.</small>
-            </div>
-          </div>
-        </div>
-
-        <div class="config-section">
-          <h4>Precision & Backend</h4>
-          <div class="config-grid">
-            <div class="config-field">
-              <label>DType (--dtype)</label>
-              <Dropdown v-model="formState.dtype" :options="dtypeOptions" optionLabel="label" optionValue="value" />
-              <small class="field-help">Data type for model weights and activations. Auto selects FP16 for FP32/FP16 models, BF16 for BF16 models. Ignored for quantized models.</small>
-            </div>
-            <div class="config-field">
-              <label>Model Format (--model-format)</label>
-              <Dropdown v-model="formState.model_format" :options="modelFormatOptions" optionLabel="label" optionValue="value" placeholder="Auto-detect" />
-              <small class="field-help">Model quantization format. Leave empty for auto-detection. Required for AWQ, GPTQ, FP8, or MXFP4 quantized models.</small>
-            </div>
-            <div class="config-field">
-              <label>Quant Policy (--quant-policy)</label>
-              <Dropdown v-model="formState.quant_policy" :options="quantPolicyOptions" optionLabel="label" optionValue="value" />
-              <small class="field-help">KV cache quantization: 0 = no quantization, 4 = 4-bit KV cache, 8 = 8-bit KV cache. Reduces memory usage at cost of slight accuracy.</small>
-            </div>
-            <div class="config-field">
-              <label>Communicator (--communicator)</label>
-              <Dropdown v-model="formState.communicator" :options="communicatorOptions" optionLabel="label" optionValue="value" />
-              <small class="field-help">Multi-GPU communication backend. NCCL (recommended) for most setups. CUDA-IPC can be faster for same-node NVLink-connected GPUs.</small>
-            </div>
-          </div>
-        </div>
-
-        <div class="config-section">
-          <h4>Cache & Performance</h4>
-          <div class="config-grid">
-            <div class="config-field">
-              <label>Cache Max Entry (--cache-max-entry-count)</label>
-              <InputNumber v-model="formState.cache_max_entry_count" :min="0.1" :max="1" :step="0.05" mode="decimal" />
-              <small class="field-help">Percentage of free GPU memory used for KV cache (excluding model weights). Higher values allow longer contexts but reduce available memory. Default: 0.8 (80%)</small>
-            </div>
-            <div class="config-field">
-              <label>Cache Block Seq Len (--cache-block-seq-len)</label>
-              <InputNumber v-model="formState.cache_block_seq_len" :min="32" :max="2048" :step="32" />
-              <small class="field-help">Token sequence length per KV cache block. Must be multiple of 32 for compute capability ≥8.0, or 64 otherwise. Default: 64</small>
-            </div>
-            <div class="config-field switch-field">
-              <div class="switch-label-group">
-                <label>Prefix Caching (--enable-prefix-caching)</label>
-                <InputSwitch v-model="formState.enable_prefix_caching" />
-              </div>
-              <small class="field-help">Enable prefix matching and caching. Reuses cached KV for common prompt prefixes, improving performance for repeated prompts.</small>
-            </div>
-            <div class="config-field">
-              <label>Tokens Per Iteration (--num-tokens-per-iter)</label>
-              <InputNumber v-model="formState.num_tokens_per_iter" :min="0" :max="262144" :step="64" />
-              <small class="field-help">Number of tokens processed in a single forward pass. 0 = auto-detect. Higher values increase throughput but use more memory.</small>
-            </div>
-            <div class="config-field">
-              <label>Max Prefill Iterations (--max-prefill-iters)</label>
-              <InputNumber v-model="formState.max_prefill_iters" :min="1" :max="16" :step="1" />
-              <small class="field-help">Maximum number of forward passes during prefill stage. Higher values allow processing longer prompts in fewer iterations. Default: 1</small>
-            </div>
-            <div class="config-field switch-field">
-              <div class="switch-label-group">
-                <label>Enable Metrics (--enable-metrics)</label>
-                <InputSwitch v-model="formState.enable_metrics" />
-              </div>
-              <small class="field-help">Enable performance metrics collection. Provides detailed timing and throughput statistics for monitoring and optimization.</small>
-            </div>
-          </div>
-        </div>
-
-        <div class="config-section">
-          <h4>Server Configuration</h4>
-          <div class="config-grid">
-            <div class="config-field">
-              <label>Model Name (--model-name)</label>
-              <InputText v-model="formState.model_name" placeholder="Optional model identifier" />
-              <small class="field-help">Model name for OpenAI-style /v1/models listing. Leave empty for auto-generated name.</small>
-            </div>
-            <div class="config-field">
-              <label>Log Level (--log-level)</label>
-              <Dropdown v-model="formState.log_level" :options="logLevelOptions" optionLabel="label" optionValue="value" placeholder="None (Default)" />
-              <small class="field-help">Logging verbosity level. Select None to use LMDeploy's default.</small>
-            </div>
-            <div class="config-field">
-              <label>Max Concurrent Requests (--max-concurrent-requests)</label>
-              <InputNumber v-model="formState.max_concurrent_requests" :min="1" :step="1" :allowEmpty="true" />
-              <small class="field-help">Maximum number of concurrent API requests. Leave empty to use default.</small>
-            </div>
-            <div class="config-field">
-              <label>API Keys (--api-keys)</label>
-              <InputText v-model="formState.api_keys" placeholder="key1,key2,key3" />
-              <small class="field-help">Comma-separated list of API keys for authentication. Leave empty to disable.</small>
-            </div>
-            <div class="config-field">
-              <label>Proxy URL (--proxy-url)</label>
-              <InputText v-model="formState.proxy_url" placeholder="http://proxy.example.com" />
-              <small class="field-help">Proxy URL for requests. Leave empty for no proxy.</small>
-            </div>
-            <div class="config-field">
-              <label>Allow Origins (--allow-origins)</label>
-              <InputText v-model="formState.allow_origins" placeholder="origin1,origin2" />
-              <small class="field-help">Comma-separated list of allowed CORS origins. Leave empty for default.</small>
-            </div>
-            <div class="config-field">
-              <label>Allow Methods (--allow-methods)</label>
-              <InputText v-model="formState.allow_methods" placeholder="GET,POST,OPTIONS" />
-              <small class="field-help">Comma-separated list of allowed HTTP methods. Leave empty for default.</small>
-            </div>
-            <div class="config-field">
-              <label>Allow Headers (--allow-headers)</label>
-              <InputText v-model="formState.allow_headers" placeholder="header1,header2" />
-              <small class="field-help">Comma-separated list of allowed CORS headers. Leave empty for default.</small>
-            </div>
-            <div class="config-field">
-              <label>Max Log Length (--max-log-len)</label>
-              <InputNumber v-model="formState.max_log_len" :min="1" :step="1" :allowEmpty="true" />
-              <small class="field-help">Maximum log message length. Leave empty to use default.</small>
-            </div>
-            <div class="config-field switch-field">
-              <div class="switch-label-group">
-                <label>Allow Credentials (--allow-credentials)</label>
-                <InputSwitch v-model="formState.allow_credentials" />
-              </div>
-              <small class="field-help">Allow credentials in CORS requests.</small>
-            </div>
-            <div class="config-field switch-field">
-              <div class="switch-label-group">
-                <label>SSL (--ssl)</label>
-                <InputSwitch v-model="formState.ssl" />
-              </div>
-              <small class="field-help">Enable SSL/TLS for the API server.</small>
-            </div>
-            <div class="config-field switch-field">
-              <div class="switch-label-group">
-                <label>Disable FastAPI Docs (--disable-fastapi-docs)</label>
-                <InputSwitch v-model="formState.disable_fastapi_docs" />
-              </div>
-              <small class="field-help">Disable FastAPI automatic documentation endpoints.</small>
-            </div>
-            <div class="config-field switch-field">
-              <div class="switch-label-group">
-                <label>Allow Terminate by Client (--allow-terminate-by-client)</label>
-                <InputSwitch v-model="formState.allow_terminate_by_client" />
-              </div>
-              <small class="field-help">Allow clients to terminate requests.</small>
-            </div>
-            <div class="config-field switch-field">
-              <div class="switch-label-group">
-                <label>Enable Abort Handling (--enable-abort-handling)</label>
-                <InputSwitch v-model="formState.enable_abort_handling" />
-              </div>
-              <small class="field-help">Enable handling of aborted requests.</small>
-            </div>
-          </div>
-        </div>
-
-        <div class="config-section">
-          <h4>Model Configuration</h4>
-          <div class="config-grid">
-            <div class="config-field">
-              <label>Device (--device)</label>
-              <Dropdown v-model="formState.device" :options="deviceOptions" optionLabel="label" optionValue="value" placeholder="None (Default: cuda)" />
-              <small class="field-help">Target device for model execution. Select None to use default (cuda).</small>
-            </div>
-            <div class="config-field">
-              <label>Chat Template (--chat-template)</label>
-              <InputText v-model="formState.chat_template" placeholder="Path to chat template file" />
-              <small class="field-help">Path to custom chat template file. Leave empty for default.</small>
-            </div>
-            <div class="config-field">
-              <label>Tool Call Parser (--tool-call-parser)</label>
-              <InputText v-model="formState.tool_call_parser" placeholder="Parser name or path" />
-              <small class="field-help">Tool call parser configuration. Leave empty for default.</small>
-            </div>
-            <div class="config-field">
-              <label>Reasoning Parser (--reasoning-parser)</label>
-              <InputText v-model="formState.reasoning_parser" placeholder="Parser name or path" />
-              <small class="field-help">Reasoning parser configuration. Leave empty for default.</small>
-            </div>
-            <div class="config-field">
-              <label>Revision (--revision)</label>
-              <InputText v-model="formState.revision" placeholder="git revision or branch" />
-              <small class="field-help">Model revision/branch to use. Leave empty for default.</small>
-            </div>
-            <div class="config-field">
-              <label>Download Dir (--download-dir)</label>
-              <InputText v-model="formState.download_dir" placeholder="/path/to/downloads" />
-              <small class="field-help">Directory for model downloads. Leave empty for default.</small>
-            </div>
-            <div class="config-field">
-              <label>Adapters (--adapters)</label>
-              <InputText v-model="formState.adapters" placeholder="adapter1,adapter2" />
-              <small class="field-help">Comma-separated list of adapter paths (LoRA, etc.). Leave empty for none.</small>
-            </div>
-            <div class="config-field">
-              <label>Logprobs Mode (--logprobs-mode)</label>
-              <Dropdown v-model="formState.logprobs_mode" :options="logprobsModeOptions" optionLabel="label" optionValue="value" placeholder="None" />
-              <small class="field-help">Log probabilities output mode. Leave empty for None.</small>
-            </div>
-            <div class="config-field switch-field">
-              <div class="switch-label-group">
-                <label>Eager Mode (--eager-mode)</label>
-                <InputSwitch v-model="formState.eager_mode" />
-              </div>
-              <small class="field-help">Enable eager execution mode.</small>
-            </div>
-            <div class="config-field switch-field">
-              <div class="switch-label-group">
-                <label>Disable Vision Encoder (--disable-vision-encoder)</label>
-                <InputSwitch v-model="formState.disable_vision_encoder" />
-              </div>
-              <small class="field-help">Disable vision encoder for vision-language models.</small>
-            </div>
-          </div>
-        </div>
-
-        <div class="config-section">
-          <h4>Vision</h4>
-          <div class="config-grid">
-            <div class="config-field">
-              <label>Vision Max Batch Size (--vision-max-batch-size)</label>
-              <InputNumber v-model="formState.vision_max_batch_size" :min="1" :step="1" :allowEmpty="true" />
-              <small class="field-help">Maximum batch size for vision-related tasks. Default: 1. Leave empty to use default.</small>
-            </div>
-          </div>
-        </div>
-
-        <div class="config-section">
-          <h4>Speculative Decoding</h4>
-          <div class="config-grid">
-            <div class="config-field">
-              <label>Speculative Algorithm (--speculative-algorithm)</label>
-              <Dropdown v-model="formState.speculative_algorithm" :options="speculativeAlgorithmOptions" optionLabel="label" optionValue="value" placeholder="None" />
-              <small class="field-help">Speculative decoding algorithm. Leave empty to disable.</small>
-            </div>
-            <div class="config-field">
-              <label>Speculative Draft Model (--speculative-draft-model)</label>
-              <InputText v-model="formState.speculative_draft_model" placeholder="Path to draft model" />
-              <small class="field-help">Path to draft model for speculative decoding. Required if algorithm is set.</small>
-            </div>
-            <div class="config-field">
-              <label>Speculative Num Draft Tokens (--speculative-num-draft-tokens)</label>
-              <InputNumber v-model="formState.speculative_num_draft_tokens" :min="1" :step="1" :allowEmpty="true" />
-              <small class="field-help">Number of draft tokens for speculative decoding. Leave empty to use default.</small>
-            </div>
-          </div>
-        </div>
-
-        <div class="config-section">
-          <h4>Distributed / Multi-node</h4>
-          <div class="config-grid">
-            <div class="config-field">
-              <label>Data Parallelism (--dp)</label>
-              <InputNumber v-model="formState.dp" :min="1" :step="1" :allowEmpty="true" />
-              <small class="field-help">Data parallelism degree. Leave empty to use default (1).</small>
-            </div>
-            <div class="config-field">
-              <label>Expert Parallelism (--ep)</label>
-              <InputNumber v-model="formState.ep" :min="1" :step="1" :allowEmpty="true" />
-              <small class="field-help">Expert parallelism degree. Leave empty to use default (1).</small>
-            </div>
-            <div class="config-field">
-              <label>Role (--role)</label>
-              <Dropdown v-model="formState.role" :options="roleOptions" optionLabel="label" optionValue="value" placeholder="None (Default: Hybrid)" />
-              <small class="field-help">Node role in distributed setup. Select None to use default (Hybrid).</small>
-            </div>
-            <div class="config-field">
-              <label>Node Rank (--node-rank)</label>
-              <InputNumber v-model="formState.node_rank" :min="0" :step="1" :allowEmpty="true" />
-              <small class="field-help">Rank of this node in distributed setup. Leave empty to use default (0).</small>
-            </div>
-            <div class="config-field">
-              <label>Number of Nodes (--nnodes)</label>
-              <InputNumber v-model="formState.nnodes" :min="1" :step="1" :allowEmpty="true" />
-              <small class="field-help">Total number of nodes. Leave empty to use default (1).</small>
-            </div>
-            <div class="config-field">
-              <label>CP (--cp)</label>
-              <InputNumber v-model="formState.cp" :min="1" :step="1" :allowEmpty="true" />
-              <small class="field-help">Checkpoint parallelism. Leave empty to use default (1).</small>
-            </div>
-            <div class="config-field">
-              <label>Distributed Executor Backend (--distributed-executor-backend)</label>
-              <Dropdown v-model="formState.distributed_executor_backend" :options="distributedExecutorBackendOptions" optionLabel="label" optionValue="value" placeholder="None" />
-              <small class="field-help">Backend for distributed execution. Leave empty for default.</small>
-            </div>
-            <div class="config-field">
-              <label>Migration Backend (--migration-backend)</label>
-              <Dropdown v-model="formState.migration_backend" :options="migrationBackendOptions" optionLabel="label" optionValue="value" placeholder="None" />
-              <small class="field-help">Migration backend for distributed setup. Leave empty for default.</small>
-            </div>
-            <div class="config-field switch-field">
-              <div class="switch-label-group">
-                <label>Enable Microbatch (--enable-microbatch)</label>
-                <InputSwitch v-model="formState.enable_microbatch" />
-              </div>
-              <small class="field-help">Enable microbatch processing.</small>
-            </div>
-            <div class="config-field switch-field">
-              <div class="switch-label-group">
-                <label>Enable EPLB (--enable-eplb)</label>
-                <InputSwitch v-model="formState.enable_eplb" />
-              </div>
-              <small class="field-help">Enable expert parallelism load balancing.</small>
-            </div>
-            <div class="config-field switch-field">
-              <div class="switch-label-group">
-                <label>Enable Return Routed Experts (--enable-return-routed-experts)</label>
-                <InputSwitch v-model="formState.enable_return_routed_experts" />
-              </div>
-              <small class="field-help">Enable return routed experts for MoE models.</small>
-            </div>
-          </div>
-        </div>
-
-        <div class="config-section">
-          <h4>DLLM (Diffusion LLM) - Advanced</h4>
-          <div class="config-grid">
-            <div class="config-field">
-              <label>DLLM Block Length (--dllm-block-length)</label>
-              <InputNumber v-model="formState.dllm_block_length" :min="1" :step="1" :allowEmpty="true" />
-              <small class="field-help">Block length for DLLM. Leave empty to use default.</small>
-            </div>
-            <div class="config-field">
-              <label>DLLM Unmasking Strategy (--dllm-unmasking-strategy)</label>
-              <Dropdown v-model="formState.dllm_unmasking_strategy" :options="dllmUnmaskingStrategyOptions" optionLabel="label" optionValue="value" placeholder="None" />
-              <small class="field-help">Unmasking strategy for DLLM. Leave empty for default.</small>
-            </div>
-            <div class="config-field">
-              <label>DLLM Denoising Steps (--dllm-denoising-steps)</label>
-              <InputNumber v-model="formState.dllm_denoising_steps" :min="1" :step="1" :allowEmpty="true" />
-              <small class="field-help">Number of denoising steps for DLLM. Leave empty to use default.</small>
-            </div>
-            <div class="config-field">
-              <label>DLLM Confidence Threshold (--dllm-confidence-threshold)</label>
-              <InputNumber v-model="formState.dllm_confidence_threshold" :min="0" :max="1" :step="0.01" mode="decimal" :allowEmpty="true" />
-              <small class="field-help">Confidence threshold for DLLM (0.0-1.0). Leave empty to use default.</small>
-            </div>
-          </div>
-        </div>
-
-        <div class="config-section">
-          <h4>Advanced</h4>
-          <div class="config-grid">
-            <div class="config-field span-2">
-              <label>Additional CLI Arguments</label>
-              <InputText v-model="formState.additional_args" placeholder="--custom-flag value" />
-              <small class="field-help">Additional command-line arguments passed directly to LMDeploy. Use for experimental or version-specific flags not exposed in the UI.</small>
-            </div>
-          </div>
-        </div>
-
-        <Divider />
-
-        <div class="metadata-section">
-          <div>
-            <h4>Model Metadata</h4>
-            <ul>
-              <li v-if="metadata.architecture"><strong>Architecture:</strong> {{ metadata.architecture }}</li>
-              <li v-if="metadata.base_model"><strong>Base Model:</strong> {{ metadata.base_model }}</li>
-              <li v-if="metadata.pipeline_tag"><strong>Pipeline:</strong> {{ metadata.pipeline_tag }}</li>
-              <li v-if="metadata.parameters"><strong>Parameters:</strong> {{ metadata.parameters }}</li>
-            </ul>
-          </div>
-          <div v-if="dtypeEntries.length" class="dtype-panel">
-            <h4>Tensor dtypes</h4>
-            <div class="dtype-tags">
-              <Tag 
-                v-for="item in dtypeEntries" 
-                :key="item.label" 
-                :value="`${item.label}: ${item.value.toLocaleString()}`"
-                severity="secondary"
-              />
-            </div>
-          </div>
-        </div>
-      </div>
-      <div v-else class="loading-state">
-        <i class="pi pi-spin pi-spinner"></i>
-        <span>Loading configuration...</span>
-      </div>
-
-      <template #footer>
-        <div class="dialog-footer">
-          <Button 
-            label="Save Config" 
-            icon="pi pi-save"
-            severity="secondary"
-            :loading="savingConfig"
-            @click="saveConfig"
-          />
-          <Button 
-            v-if="selectedModelRunning"
-            label="Stop"
-            icon="pi pi-stop"
-            severity="danger"
-            :loading="dialogStopping"
-            @click="stopRuntime()"
-          />
-          <Button 
-            v-else
-            label="Start LMDeploy"
-            icon="pi pi-play"
-            severity="success"
-            :disabled="!lmdeployReady || !!lmdeployOperation"
-            :loading="dialogStarting"
-            @click="startRuntime"
-          />
-          <Button label="Close" text severity="secondary" @click="dialogVisible = false" />
-        </div>
-      </template>
-    </Dialog>
-  </div>
-</template>
-
-<script setup>
-import { computed, ref, reactive, watch, onMounted } from 'vue'
-import { useRouter } from 'vue-router'
-import Button from 'primevue/button'
-import Dialog from 'primevue/dialog'
-import Slider from 'primevue/slider'
-import InputNumber from 'primevue/inputnumber'
-import InputText from 'primevue/inputtext'
-import InputSwitch from 'primevue/inputswitch'
-import Dropdown from 'primevue/dropdown'
-import Divider from 'primevue/divider'
-import Tag from 'primevue/tag'
-import { toast } from 'vue3-toastify'
-import { useModelStore } from '@/stores/models'
-import axios from 'axios'
-import { formatFileSize, formatDate } from '@/utils/formatting'
-
-const router = useRouter()
-const props = defineProps({
-  models: {
-    type: Array,
-    default: () => []
-  },
-  loading: {
-    type: Boolean,
-    default: false
-  }
-})
-
-defineEmits(['refresh', 'delete'])
-
-const modelStore = useModelStore()
-const dialogVisible = ref(false)
-const selectedModel = ref(null)
-const savingConfig = ref(false)
-const reloadingFromDisk = ref(false)
-
-const dtypeOptions = [
-  { label: 'Auto', value: 'auto' },
-  { label: 'float16', value: 'float16' },
-  { label: 'bfloat16', value: 'bfloat16' }
-]
-const modelFormatOptions = [
-  { label: 'Auto', value: '' },
-  { label: 'HF', value: 'hf' },
-  { label: 'AWQ', value: 'awq' },
-  { label: 'GPTQ', value: 'gptq' },
-  { label: 'FP8', value: 'fp8' },
-  { label: 'MXFP4', value: 'mxfp4' }
-]
-const quantPolicyOptions = [
-  { label: '0 • No kv quant', value: 0 },
-  { label: '4 • 4-bit kv', value: 4 },
-  { label: '8 • 8-bit kv', value: 8 }
-]
-const communicatorOptions = [
-  { label: 'NCCL', value: 'nccl' },
-  { label: 'Native', value: 'native' },
-  { label: 'CUDA IPC', value: 'cuda-ipc' }
-]
-const logLevelOptions = [
-  { label: 'None (Default)', value: null },
-  { label: 'CRITICAL', value: 'CRITICAL' },
-  { label: 'FATAL', value: 'FATAL' },
-  { label: 'ERROR', value: 'ERROR' },
-  { label: 'WARN', value: 'WARN' },
-  { label: 'WARNING', value: 'WARNING' },
-  { label: 'INFO', value: 'INFO' },
-  { label: 'DEBUG', value: 'DEBUG' },
-  { label: 'NOTSET', value: 'NOTSET' }
-]
-const deviceOptions = [
-  { label: 'None (Default: cuda)', value: null },
-  { label: 'CUDA', value: 'cuda' },
-  { label: 'Ascend', value: 'ascend' },
-  { label: 'MACA', value: 'maca' },
-  { label: 'CAMB', value: 'camb' }
-]
-const logprobsModeOptions = [
-  { label: 'None', value: null },
-  { label: 'Raw Logits', value: 'raw_logits' },
-  { label: 'Raw Logprobs', value: 'raw_logprobs' }
-]
-const dllmUnmaskingStrategyOptions = [
-  { label: 'None (Default: low_confidence_dynamic)', value: null },
-  { label: 'Low Confidence Dynamic', value: 'low_confidence_dynamic' },
-  { label: 'Low Confidence Static', value: 'low_confidence_static' },
-  { label: 'Sequential', value: 'sequential' }
-]
-const roleOptions = [
-  { label: 'None (Default: Hybrid)', value: null },
-  { label: 'Hybrid', value: 'Hybrid' },
-  { label: 'Prefill', value: 'Prefill' },
-  { label: 'Decode', value: 'Decode' }
-]
-const migrationBackendOptions = [
-  { label: 'None (Default: DLSlime)', value: null },
-  { label: 'DLSlime', value: 'DLSlime' },
-  { label: 'Mooncake', value: 'Mooncake' }
-]
-const distributedExecutorBackendOptions = [
-  { label: 'None (Auto-select)', value: null },
-  { label: 'Uni', value: 'uni' },
-  { label: 'MP', value: 'mp' },
-  { label: 'Ray', value: 'ray' }
-]
-const speculativeAlgorithmOptions = [
-  { label: 'None (Disabled)', value: null },
-  { label: 'Eagle', value: 'eagle' },
-  { label: 'Eagle3', value: 'eagle3' },
-  { label: 'DeepSeek MTP', value: 'deepseek_mtp' }
-]
-
-const formState = reactive({
-  session_len: 4096,
-  max_prefill_token_num: 8192,
-  tensor_parallel: 1,
-  max_batch_size: 4,
-  dtype: 'auto',
-  cache_max_entry_count: 0.8,
-  cache_block_seq_len: 64,
-  enable_prefix_caching: false,
-  quant_policy: 0,
-  model_format: '',
-  enable_metrics: true,
-  rope_scaling_mode: 'disabled',
-  rope_scaling_factor: 1,
-  hf_override_rope_type: '',
-  hf_override_rope_factor: null,
-  hf_override_rope_original_max: null,
-  num_tokens_per_iter: 0,
-  max_prefill_iters: 1,
-  communicator: 'nccl',
-  model_name: '',
-  // Server configuration
-  allow_origins: '',
-  allow_credentials: false,
-  allow_methods: '',
-  allow_headers: '',
-  proxy_url: '',
-  max_concurrent_requests: null,
-  log_level: null,
-  api_keys: '',
-  ssl: false,
-  max_log_len: null,
-  disable_fastapi_docs: false,
-  allow_terminate_by_client: false,
-  enable_abort_handling: false,
-  // Model configuration
-  chat_template: '',
-  tool_call_parser: '',
-  reasoning_parser: '',
-  revision: '',
-  download_dir: '',
-  adapters: '',
-  device: null,
-  eager_mode: false,
-  disable_vision_encoder: false,
-  logprobs_mode: null,
-  // DLLM parameters
-  dllm_block_length: null,
-  dllm_unmasking_strategy: null,
-  dllm_denoising_steps: null,
-  dllm_confidence_threshold: null,
-  // Distributed/Multi-node parameters
-  dp: null,
-  ep: null,
-  enable_microbatch: false,
-  enable_eplb: false,
-  role: null,
-  migration_backend: null,
-  node_rank: null,
-  nnodes: null,
-  cp: null,
-  enable_return_routed_experts: false,
-  distributed_executor_backend: null,
-  // Vision parameters
-  vision_max_batch_size: null,
-  // Speculative decoding parameters
-  speculative_algorithm: null,
-  speculative_draft_model: null,
-  speculative_num_draft_tokens: null,
-  additional_args: ''
-})
-
-const getEntryModelId = (entry) => entry?.model_id ?? entry?.modelId ?? entry?.id
-
-const groupedModels = computed(() => {
-  if (!Array.isArray(props.models)) return []
-  return [...props.models].sort((a, b) => {
-    const aDate = new Date(a.latest_downloaded_at || 0).getTime()
-    const bDate = new Date(b.latest_downloaded_at || 0).getTime()
-    return bDate - aDate
-  })
-})
-
-const statusLoading = computed(() => modelStore.lmdeployStatusLoading)
-
-const currentInstanceId = computed(() => modelStore.lmdeployStatus?.running_instance?.model_id)
-const installerStatus = computed(() => modelStore.lmdeployStatus?.installer || null)
-const lmdeployReady = computed(() => !!installerStatus.value?.installed)
-const lmdeployOperation = computed(() => installerStatus.value?.operation || null)
-const selectedModelId = computed(() => getEntryModelId(selectedModel.value))
-const selectedRuntime = computed(() => {
-  const id = selectedModelId.value
-  if (!id) return null
-  return modelStore.safetensorsRuntime[id]
-})
-const metadata = computed(() => selectedRuntime.value?.metadata || {})
-
-// Base context length reported by metadata (typically the trained / sliding-window
-// context). This is informative only; users can exceed it via RoPE / YaRN scaling.
-const baseContextLength = computed(() => {
-  const runtime = selectedRuntime.value
-  if (!runtime) return 0
-  return (
-    runtime.max_context_length ||
-    runtime.metadata?.max_context_length ||
-    runtime.metadata?.context_length ||
-    0
-  )
-})
-
-const SESSION_FALLBACK_LIMIT = 256000
-const MAX_SCALING_FACTOR = 4
-const ropeScalingOptions = [
-  { label: 'Disabled', value: 'disabled' },
-  { label: 'YaRN (recommended)', value: 'yarn' },
-  { label: 'Generic scaling', value: 'generic' },
-]
-
-const isQwen3 = computed(() => {
-  const runtime = selectedRuntime.value
-  if (!runtime) return false
-  const config = runtime.metadata?.config || {}
-  const modelType = (config.model_type || '').toLowerCase()
-  const huggingfaceId = (selectedModel.value?.huggingface_id || '').toLowerCase()
-  return modelType.includes('qwen3') || huggingfaceId.includes('qwen3')
-})
-
-// Model max length from tokenizer_config.json (clamps RoPE scaling)
-// Model max length from metadata (required for rope scaling)
-const modelMaxLength = computed(() => {
-  const runtime = selectedRuntime.value
-  if (!runtime) return null
-  return runtime.metadata?.model_max_length || null
-})
-
-// Max position embeddings from config
-const maxPositionEmbeddings = computed(() => {
-  const runtime = selectedRuntime.value
-  if (!runtime) return null
-  const config = runtime.metadata?.config || {}
-  return config.max_position_embeddings || null
-})
-
-// Check if scaling should be available
-const canUseScaling = computed(() => {
-  const baseLimit = Number(baseContextLength.value) || 0
-  if (baseLimit <= 0) return false
-  
-  // Allow scaling if we have base context
-  return true
-})
-
-// Warning if model_max_length is missing
-const scalingWarning = computed(() => {
-  if (!modelMaxLength.value && canUseScaling.value) {
-    return "RoPE scaling is not recommended without model_max_length. Use max_position_embeddings as fallback."
-  }
-  return null
-})
-
-// Adapted base context for scaling (model_max_length / 4 when model_max_length > max_position_embeddings)
-const adaptedBaseContext = computed(() => {
-  const modelMax = modelMaxLength.value
-  const maxPos = maxPositionEmbeddings.value
-  if (modelMax && maxPos && modelMax > maxPos) {
-    // If model_max_length > max_position_embeddings, it means rope scaling can achieve model_max_length
-    // Adapt base context to model_max_length / 4 (allows 4x scaling to reach model_max_length)
-    return Math.floor(modelMax / 4)
-  }
-  return null
-})
-
-const sessionLimit = computed(() => {
-  const baseLimit = Number(baseContextLength.value) || 0
-  if (baseLimit > 0) {
-    return baseLimit
-  }
-  return SESSION_FALLBACK_LIMIT
-})
-const scalingEnabled = computed(() => {
-  const mode = (formState.rope_scaling_mode || '').toLowerCase()
-  return canUseScaling.value && mode !== '' && mode !== 'disabled'
-})
-const effectiveContextLength = computed(() => {
-  const base = Number(formState.session_len) || 0
-  if (base <= 0) {
-    return 0
-  }
-  if (!scalingEnabled.value) {
-    return base
-  }
-  const rawFactor = Number(formState.rope_scaling_factor) || 1
-  const clampedFactor = Math.min(Math.max(rawFactor, 1), MAX_SCALING_FACTOR)
-  let effective = Math.round(base * clampedFactor)
-  // Clamp to model_max_length if available, otherwise max_position_embeddings
-  if (modelMaxLength.value && effective > modelMaxLength.value) {
-    effective = modelMaxLength.value
-  } else if (maxPositionEmbeddings.value && effective > maxPositionEmbeddings.value) {
-    effective = maxPositionEmbeddings.value
-  }
-  return effective
-})
-const metadataRefreshing = computed(() => {
-  const id = selectedModelId.value
-  if (!id) return false
-  return !!modelStore.safetensorsMetadataRefreshing[id]
-})
-const dtypeEntries = computed(() => {
-  const summary = selectedRuntime.value?.tensor_summary?.dtype_counts || {}
-  return Object.entries(summary).map(([label, value]) => ({ label, value }))
-})
-const selectedModelRunning = computed(() => {
-  if (!selectedModelId.value || !currentInstanceId.value) return false
-  return currentInstanceId.value === selectedModelId.value
-})
-
-watch(sessionLimit, (limit) => {
-  const maxLimit = Number(limit) || 0
-  if (!maxLimit) return
-  if (formState.session_len > maxLimit) {
-    formState.session_len = maxLimit
-  }
-})
-
-watch(
-  () => [scalingEnabled.value, sessionLimit.value, adaptedBaseContext.value],
-  ([enabled, limit, adaptedBase]) => {
-    if (!enabled) return
-    // Use adapted base context if available (model_max_length / 4), otherwise use session limit
-    const targetLimit = adaptedBase && adaptedBase >= 1024 ? adaptedBase : Number(limit) || 0
-    if (!targetLimit) return
-    if (formState.session_len !== targetLimit) {
-      formState.session_len = targetLimit
-    }
-    // Auto-set hf_override_rope_original_max to adapted base when scaling is enabled
-    if (adaptedBase && adaptedBase >= 1024) {
-      formState.hf_override_rope_original_max = adaptedBase
-    }
-  }
-)
-
-watch(
-  () => formState.rope_scaling_mode,
-  (mode) => {
-    if (!canUseScaling.value) {
-      if (mode !== 'disabled') {
-        formState.rope_scaling_mode = 'disabled'
-      }
-      if (formState.rope_scaling_factor !== 1) {
-        formState.rope_scaling_factor = 1
-      }
-      return
-    }
-    if (mode && mode !== 'disabled') {
-      if (formState.rope_scaling_factor <= 1) {
-        formState.rope_scaling_factor = Math.min(2, MAX_SCALING_FACTOR)
-      }
-      const limit = Number(sessionLimit.value) || 0
-      if (limit && formState.session_len !== limit) {
-        formState.session_len = limit
-      }
-    } else if (formState.rope_scaling_factor !== 1) {
-      formState.rope_scaling_factor = 1
-    }
-  }
-)
-
-watch(
-  () => formState.rope_scaling_factor,
-  (factor) => {
-    if (factor > MAX_SCALING_FACTOR) {
-      formState.rope_scaling_factor = MAX_SCALING_FACTOR
-    } else if (factor < 1) {
-      formState.rope_scaling_factor = 1
-    }
-  }
-)
-
-
-
-const isConfigLoading = (entry) => {
-  const id = getEntryModelId(entry)
-  return !!modelStore.safetensorsRuntimeLoading[id]
-}
-
-const isGroupConfigLoading = (group) => {
-  if (!group?.files?.length) return false
-  return group.files.some(file => isConfigLoading(file))
-}
-
-const isStopping = (entry) => {
-  const id = getEntryModelId(entry)
-  return !!modelStore.lmdeployStopping[id]
-}
-
-const isGroupStopping = (group) => {
-  if (!group?.files?.length) return false
-  return group.files.some(file => isStopping(file))
-}
-
-const openGroupConfig = (group) => {
-  if (!group) return
-  // Pass the unified group directly - it has all the necessary info
-  openConfig(group)
-}
-
-const stopGroupRuntime = (group) => {
-  if (!group?.files?.length) return
-  // Use the group's model_id to stop the runtime
-  const groupModelId = group?.model_id
-  if (groupModelId) {
-    stopRuntime({ model_id: groupModelId })
-  } else {
-    stopRuntime(group.files[0])
-  }
-}
-
-const dialogStarting = computed(() => {
-  if (!selectedModelId.value) return false
-  return !!modelStore.lmdeployStarting[selectedModelId.value]
-})
-const dialogStopping = computed(() => {
-  if (!selectedModelId.value) return false
-  return !!modelStore.lmdeployStopping[selectedModelId.value]
-})
-
-const refreshStatus = async () => {
-  try {
-    await modelStore.fetchLmdeployStatus()
-  } catch (error) {
-    console.error(error)
-  }
-}
-
-const reloadFromDisk = async () => {
-  if (reloadingFromDisk.value) return
-  
-  const confirmed = confirm(
-    'This will reset all safetensors database entries and reload them from disk storage.\n\n' +
-    'This action cannot be undone. Continue?'
-  )
-  if (!confirmed) return
-  
-  reloadingFromDisk.value = true
-  try {
-    const response = await axios.post('/api/models/safetensors/reload-from-disk')
-    const result = response.data
-    toast.success(
-      `Reloaded ${result.reloaded} safetensors models from disk` +
-      (result.error_count ? ` (${result.error_count} errors)` : '')
-    )
-    if (result.errors && result.errors.length > 0) {
-      console.error('Reload errors:', result.errors)
-    }
-    // Refresh the model list
-    await modelStore.fetchSafetensorsModels()
-  } catch (error) {
-    console.error('Failed to reload safetensors from disk:', error)
-    toast.error(error.response?.data?.detail || 'Failed to reload safetensors from disk')
-  } finally {
-    reloadingFromDisk.value = false
-  }
-}
-
-const regenerateMetadata = async () => {
-  const modelId = selectedModelId.value
-  if (!modelId) return
-  try {
-    await modelStore.regenerateSafetensorsMetadata(modelId)
-    toast.success('Metadata regenerated')
-  } catch (error) {
-    console.error(error)
-    toast.error('Failed to regenerate metadata')
-  }
-}
-
-const openConfig = async (model) => {
-  selectedModel.value = model
-  dialogVisible.value = true
-  const modelId = getEntryModelId(model)
-  try {
-    await modelStore.fetchSafetensorsRuntimeConfig(modelId)
-  } catch (error) {
-    toast.error('Failed to load LMDeploy config')
-  }
-}
-
-const applyRuntimeConfig = (config) => {
-  if (!config) return
-  const normalized = { ...config }
-  if (normalized.context_length && normalized.session_len === undefined) {
-    normalized.session_len = normalized.context_length
-  }
-  if (normalized.max_batch_tokens && normalized.max_prefill_token_num === undefined) {
-    normalized.max_prefill_token_num = normalized.max_batch_tokens
-  }
-  Object.keys(formState).forEach((key) => {
-    if (normalized[key] !== undefined) {
-      // Handle array/string conversion for list fields (store as comma-separated string in formState)
-      if (['allow_origins', 'allow_methods', 'allow_headers', 'api_keys', 'adapters'].includes(key)) {
-        if (Array.isArray(normalized[key])) {
-          formState[key] = normalized[key].join(', ')
-        } else if (typeof normalized[key] === 'string') {
-          formState[key] = normalized[key]
-        } else {
-          formState[key] = ''
-        }
-      } else if (Array.isArray(normalized[key])) {
-        formState[key] = [...normalized[key]]
-      } else {
-        formState[key] = normalized[key]
-      }
-    }
-  })
-  hydrateHfOverrideFields(normalized.hf_overrides)
-}
-
-watch(selectedRuntime, (runtime) => {
-  if (runtime?.config) {
-    applyRuntimeConfig(runtime.config)
-  }
-}, { immediate: true })
-
-function hydrateHfOverrideFields(overrides) {
-  const rope = overrides?.rope_scaling || {}
-  formState.hf_override_rope_type = rope.rope_type || ''
-  const factorCandidate = rope.factor ?? rope.scale ?? null
-  formState.hf_override_rope_factor = factorCandidate !== undefined ? factorCandidate : null
-  const originalMax = rope.original_max_position_embeddings ?? rope.original_max_position_embedding ?? null
-  formState.hf_override_rope_original_max = originalMax !== undefined ? originalMax : null
-}
-
-function buildHfOverrides() {
-  const overrides = {}
-  const rope = {}
-  
-  // If scaling is enabled and we have adapted base context, use it
-  if (scalingEnabled.value && adaptedBaseContext.value && adaptedBaseContext.value >= 1024) {
-    rope.original_max_position_embeddings = adaptedBaseContext.value
-    // Set rope_type if scaling mode is yarn
-    if (formState.rope_scaling_mode === 'yarn') {
-      rope.rope_type = 'yarn'
-    }
-    // Set factor from scaling factor
-    if (formState.rope_scaling_factor && Number(formState.rope_scaling_factor) > 1) {
-      rope.factor = Number(formState.rope_scaling_factor)
-    }
-  } else {
-    // Use manual overrides if provided
-    if (formState.hf_override_rope_type) {
-      rope.rope_type = formState.hf_override_rope_type
-    }
-    if (formState.hf_override_rope_factor && Number(formState.hf_override_rope_factor) > 0) {
-      rope.factor = Number(formState.hf_override_rope_factor)
-    }
-    if (formState.hf_override_rope_original_max && Number(formState.hf_override_rope_original_max) > 0) {
-      rope.original_max_position_embeddings = Number(formState.hf_override_rope_original_max)
-    }
-  }
-  
-  if (Object.keys(rope).length) {
-    overrides.rope_scaling = rope
-  }
-  return overrides
-}
-
-const buildPayload = () => {
-  const payload = {
-    session_len: formState.session_len,
-    max_prefill_token_num: formState.max_prefill_token_num,
-    tensor_parallel: formState.tensor_parallel,
-    max_batch_size: formState.max_batch_size,
-    dtype: formState.dtype,
-    cache_max_entry_count: formState.cache_max_entry_count,
-    cache_block_seq_len: formState.cache_block_seq_len,
-    enable_prefix_caching: formState.enable_prefix_caching,
-    quant_policy: formState.quant_policy,
-    model_format: formState.model_format,
-    hf_overrides: buildHfOverrides(),
-    enable_metrics: formState.enable_metrics,
-    rope_scaling_mode: formState.rope_scaling_mode,
-    rope_scaling_factor: formState.rope_scaling_factor,
-    num_tokens_per_iter: formState.num_tokens_per_iter,
-    max_prefill_iters: formState.max_prefill_iters,
-    communicator: formState.communicator,
-    model_name: formState.model_name || null,
-    // Server configuration
-    allow_origins: formState.allow_origins && formState.allow_origins.length > 0 ? (typeof formState.allow_origins === 'string' ? formState.allow_origins.split(',').map(s => s.trim()).filter(s => s) : formState.allow_origins) : null,
-    allow_credentials: formState.allow_credentials || null,
-    allow_methods: formState.allow_methods && formState.allow_methods.length > 0 ? (typeof formState.allow_methods === 'string' ? formState.allow_methods.split(',').map(s => s.trim()).filter(s => s) : formState.allow_methods) : null,
-    allow_headers: formState.allow_headers && formState.allow_headers.length > 0 ? (typeof formState.allow_headers === 'string' ? formState.allow_headers.split(',').map(s => s.trim()).filter(s => s) : formState.allow_headers) : null,
-    proxy_url: formState.proxy_url || null,
-    max_concurrent_requests: formState.max_concurrent_requests || null,
-    log_level: formState.log_level || null,
-    api_keys: formState.api_keys && formState.api_keys.length > 0 ? (typeof formState.api_keys === 'string' ? formState.api_keys.split(',').map(s => s.trim()).filter(s => s) : formState.api_keys) : null,
-    ssl: formState.ssl || null,
-    max_log_len: formState.max_log_len || null,
-    disable_fastapi_docs: formState.disable_fastapi_docs || null,
-    allow_terminate_by_client: formState.allow_terminate_by_client || null,
-    enable_abort_handling: formState.enable_abort_handling || null,
-    // Model configuration
-    chat_template: formState.chat_template || null,
-    tool_call_parser: formState.tool_call_parser || null,
-    reasoning_parser: formState.reasoning_parser || null,
-    revision: formState.revision || null,
-    download_dir: formState.download_dir || null,
-    adapters: formState.adapters && formState.adapters.length > 0 ? (typeof formState.adapters === 'string' ? formState.adapters.split(',').map(s => s.trim()).filter(s => s) : formState.adapters) : null,
-    device: formState.device || null,
-    eager_mode: formState.eager_mode || null,
-    disable_vision_encoder: formState.disable_vision_encoder || null,
-    logprobs_mode: formState.logprobs_mode || null,
-    // DLLM parameters
-    dllm_block_length: formState.dllm_block_length || null,
-    dllm_unmasking_strategy: formState.dllm_unmasking_strategy || null,
-    dllm_denoising_steps: formState.dllm_denoising_steps || null,
-    dllm_confidence_threshold: formState.dllm_confidence_threshold || null,
-    // Distributed/Multi-node parameters
-    dp: formState.dp || null,
-    ep: formState.ep || null,
-    enable_microbatch: formState.enable_microbatch || null,
-    enable_eplb: formState.enable_eplb || null,
-    role: formState.role || null,
-    migration_backend: formState.migration_backend || null,
-    node_rank: formState.node_rank || null,
-    nnodes: formState.nnodes || null,
-    cp: formState.cp || null,
-    enable_return_routed_experts: formState.enable_return_routed_experts || null,
-    distributed_executor_backend: formState.distributed_executor_backend || null,
-    // Vision parameters
-    vision_max_batch_size: formState.vision_max_batch_size || null,
-    // Speculative decoding parameters
-    speculative_algorithm: formState.speculative_algorithm || null,
-    speculative_draft_model: formState.speculative_draft_model || null,
-    speculative_num_draft_tokens: formState.speculative_num_draft_tokens || null,
-    additional_args: formState.additional_args
-  }
-  
-  // Remove null/undefined values to keep payload clean
-  Object.keys(payload).forEach(key => {
-    if (payload[key] === null || payload[key] === undefined || payload[key] === '') {
-      delete payload[key]
-    }
-  })
-  
-  return payload
-}
-
-const saveConfig = async () => {
-  if (!selectedModelId.value) return
-  savingConfig.value = true
-  try {
-    await modelStore.updateSafetensorsRuntimeConfig(selectedModelId.value, buildPayload())
-    toast.success('LMDeploy config saved')
-  } catch (error) {
-    toast.error('Failed to save config')
-  } finally {
-    savingConfig.value = false
-  }
-}
-
-const startRuntime = async () => {
-  if (!selectedModelId.value) return
-  if (!lmdeployReady.value) {
-    toast.error('Install LMDeploy before starting a runtime')
-    return
-  }
-  if (lmdeployOperation.value) {
-    toast.info('LMDeploy installer is running—wait until it finishes')
-    return
-  }
-  try {
-    await modelStore.startSafetensorsRuntime(selectedModelId.value, buildPayload())
-    toast.success('LMDeploy starting…')
-  } catch (error) {
-    toast.error('Failed to start LMDeploy')
-  }
-}
-
-const stopRuntime = async (entry = null) => {
-  const targetId = entry ? getEntryModelId(entry) : selectedModelId.value
-  if (!targetId) return
-  try {
-    await modelStore.stopSafetensorsRuntime(targetId)
-    toast.success('LMDeploy stopped')
-  } catch (error) {
-    toast.error('Failed to stop LMDeploy')
-  }
-}
-
-const isModelRunning = (model) => {
-  const modelId = getEntryModelId(model)
-  return currentInstanceId.value === modelId
-}
-
-const isGroupRunning = (group) => {
-  // Check if the group's model_id matches the running instance
-  const groupModelId = group?.model_id
-  if (!groupModelId || !currentInstanceId.value) return false
-  return currentInstanceId.value === groupModelId
-}
-
-// Formatting functions are now imported from utils
-
-const openLmdeployPage = () => {
-  router.push('/lmdeploy')
-}
-
-onMounted(() => {
-  refreshStatus()
-})
-</script>
-
-<style scoped>
-.lmdeploy-alert {
-  display: flex;
-  gap: var(--spacing-md);
-  padding: var(--spacing-md);
-  background: var(--status-warning-soft);
-  border: 1px solid rgba(245, 158, 11, 0.3);
-  border-radius: var(--radius-md);
-  margin-bottom: var(--spacing-lg);
-  align-items: center;
-}
-
-.lmdeploy-alert.info {
-  background: var(--status-info-soft);
-  border-color: rgba(34, 211, 238, 0.3);
-}
-
-.alert-icon {
-  font-size: 1.5rem;
-  color: var(--status-warning);
-}
-
-.alert-content h3 {
-  margin: 0 0 var(--spacing-xs) 0;
-}
-
-.alert-content p {
-  margin: 0 0 var(--spacing-sm) 0;
-  color: var(--text-secondary);
-}
-
-.safetensors-card {
-  margin-top: var(--spacing-xl);
-  padding: var(--spacing-lg);
-  border: 1px solid var(--border-primary);
-  border-radius: var(--radius-xl);
-  background: var(--bg-card);
-  box-shadow: var(--shadow-sm);
-}
-
-.card-header {
-  display: flex;
-  justify-content: space-between;
-  align-items: flex-start;
-  margin-bottom: var(--spacing-md);
-  gap: var(--spacing-md);
-}
-
-.card-header h2 {
-  margin: 0;
-  font-weight: 600;
-  color: var(--text-primary);
-}
-
-.subtitle {
-  margin: 4px 0 0;
-  font-size: 0.9rem;
-  color: var(--text-secondary);
-}
-
-.actions {
-  display: flex;
-  gap: var(--spacing-xs);
-}
-
-.loading-state,
-.empty-state {
-  display: flex;
-  flex-direction: column;
-  align-items: center;
-  gap: var(--spacing-sm);
-  padding: var(--spacing-xl);
-  color: var(--text-secondary);
-  text-align: center;
-}
-
-.model-grid {
-  display: grid;
-  grid-template-columns: repeat(auto-fill, minmax(280px, 1fr));
-  gap: var(--spacing-md);
-}
-
-.model-card {
-  border: 1px solid var(--border-secondary);
-  border-radius: var(--radius-xl);
-  padding: var(--spacing-md);
-  background: var(--bg-surface);
-  box-shadow: var(--shadow-sm);
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-sm);
-}
-
-.grouped-card {
-  gap: var(--spacing-md);
-}
-
-.model-card-header {
-  display: flex;
-  justify-content: space-between;
-  align-items: flex-start;
-  gap: var(--spacing-sm);
-}
-
-.model-name {
-  font-weight: 600;
-  color: var(--text-primary);
-  margin-bottom: 2px;
-}
-
-.model-path {
-  font-size: 0.9rem;
-  color: var(--text-secondary);
-}
-
-.group-header {
-  border-bottom: 1px solid var(--border-secondary);
-  padding-bottom: var(--spacing-sm);
-}
-
-.group-summary {
-  display: flex;
-  align-items: center;
-  gap: 6px;
-  font-size: 0.85rem;
-  color: var(--text-secondary);
-}
-
-.group-header-main {
-  display: flex;
-  flex-direction: column;
-  gap: 2px;
-}
-
-.group-status-row {
-  margin-top: var(--spacing-xs);
-  margin-bottom: var(--spacing-sm);
-}
-
-.group-status-row :deep(.status-indicator) {
-  font-size: 0.7rem;
-  padding: 2px 6px;
-}
-
-.grouped-body {
-  padding-top: var(--spacing-sm);
-}
-
-.file-list {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-sm);
-}
-
-.plain-file-list {
-  gap: 2px;
-}
-
-.file-name-plain {
-  font-family: monospace;
-  font-size: 0.85rem;
-  color: var(--text-secondary);
-  word-break: break-all;
-}
-
-.file-row {
-  border: 1px solid var(--border-secondary);
-  border-radius: var(--radius-lg);
-  padding: var(--spacing-sm);
-  background: var(--bg-surface-2, var(--bg-card));
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-sm);
-  cursor: default;
-}
-
-.file-row-header {
-  display: flex;
-  justify-content: space-between;
-  gap: var(--spacing-sm);
-}
-
-.file-name {
-  font-weight: 600;
-  color: var(--text-primary);
-  margin-bottom: 4px;
-}
-
-.model-meta {
-  display: flex;
-  flex-direction: column;
-  gap: 4px;
-  font-size: 0.85rem;
-  color: var(--text-secondary);
-}
-
-.meta-row {
-  display: flex;
-  align-items: center;
-  gap: 6px;
-}
-
-.endpoint {
-  display: flex;
-  align-items: center;
-  gap: 6px;
-  color: var(--accent-cyan);
-  font-size: 0.85rem;
-}
-
-.model-actions {
-  display: flex;
-  justify-content: space-between;
-  align-items: center;
-  gap: var(--spacing-sm);
-  margin-top: var(--spacing-sm);
-}
-
-.group-actions {
-  display: flex;
-  justify-content: space-between;
-  align-items: center;
-  margin-top: var(--spacing-md);
-  gap: var(--spacing-sm);
-  flex-wrap: wrap;
-}
-
-.config-section {
-  border: 1px solid var(--border-secondary);
-  border-radius: var(--radius-xl);
-  padding: var(--spacing-md);
-  background: var(--bg-surface);
-  margin-bottom: var(--spacing-lg);
-}
-
-.config-section h4 {
-  margin: 0 0 var(--spacing-sm);
-  font-weight: 600;
-  color: var(--text-primary);
-}
-
-.action-group {
-  display: flex;
-  gap: var(--spacing-xs);
-}
-
-.dot {
-  opacity: 0.6;
-}
-
-.dialog-header {
-  display: flex;
-  justify-content: space-between;
-  align-items: center;
-  width: 100%;
-  gap: var(--spacing-md);
-}
-
-.dialog-header h3 {
-  margin: 0;
-  font-weight: 600;
-}
-
-.dialog-header p {
-  margin: 2px 0 0;
-  color: var(--text-secondary);
-  font-size: 0.9rem;
-}
-
-.dialog-header-actions {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-sm);
-}
-
-.config-grid {
-  display: grid;
-  grid-template-columns: repeat(auto-fit, minmax(220px, 1fr));
-  gap: var(--spacing-md);
-}
-
-.config-field {
-  display: flex;
-  flex-direction: column;
-  gap: 6px;
-}
-
-.config-field label {
-  font-size: 0.9rem;
-  font-weight: 600;
-  color: var(--text-primary);
-  margin-bottom: 2px;
-}
-
-.field-help {
-  font-size: 0.8rem;
-  color: var(--text-secondary);
-  line-height: 1.4;
-  margin-top: 4px;
-  opacity: 0.85;
-}
-
-.qwen3-note {
-  display: block;
-  margin-top: 6px;
-  padding: 6px 8px;
-  background: var(--bg-secondary);
-  border-left: 3px solid var(--accent-cyan);
-  border-radius: 4px;
-  color: var(--text-primary);
-  opacity: 1;
-}
-
-.switch-label-group {
-  display: flex;
-  align-items: center;
-  justify-content: space-between;
-  width: 100%;
-  gap: var(--spacing-sm);
-}
-
-.switch-field .field-help {
-  margin-top: 6px;
-}
-
-.slider-row {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-sm);
-  width: 100%;
-}
-
-.slider-row :deep(.p-slider) {
-  flex: 1 1 auto;
-  min-width: 200px;
-  max-width: none;
-}
-
-.slider-row :deep(.p-slider .p-slider-range) {
-  background: var(--accent-cyan);
-}
-
-.slider-row :deep(.p-slider .p-slider-handle) {
-  border-color: var(--accent-cyan);
-}
-
-.slider-row :deep(.p-inputnumber) {
-  width: 140px;
-  flex-shrink: 0;
-  min-width: 140px;
-}
-
-.config-field .slider-row ~ * :deep(.p-inputnumber),
-.config-field > :not(.slider-row) :deep(.p-inputnumber) {
-  width: 100%;
-}
-
-.config-field :deep(.p-inputnumber .p-inputnumber-input) {
-  padding: 0.5rem;
-}
-
-.slider-row :deep(.p-inputnumber .p-inputnumber-input) {
-  width: 100%;
-}
-
-.config-field :deep(.p-inputtext) {
-  width: 100%;
-  padding: 0.5rem;
-}
-
-.config-field :deep(.p-dropdown) {
-  width: 100%;
-}
-
-.switch-field {
-  display: flex;
-  flex-direction: column;
-  gap: 6px;
-}
-
-.span-2 {
-  grid-column: span 2;
-}
-
-@media (max-width: 640px) {
-  .span-2 {
-    grid-column: span 1;
-  }
-}
-
-.metadata-section {
-  display: flex;
-  flex-wrap: wrap;
-  gap: var(--spacing-xl);
-  margin-top: var(--spacing-lg);
-}
-
-.metadata-section ul {
-  list-style: none;
-  padding: 0;
-  margin: 0;
-  display: flex;
-  flex-direction: column;
-  gap: 4px;
-  font-size: 0.9rem;
-}
-
-.rope-scaling-controls {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-sm);
-}
-
-.rope-mode-dropdown {
-  width: 100%;
-}
-
-.rope-factor-row {
-  align-items: center;
-}
-
-.dtype-panel {
-  min-width: 220px;
-}
-
-.dtype-tags {
-  display: flex;
-  flex-wrap: wrap;
-  gap: var(--spacing-xs);
-}
-
-.hf-overrides-grid {
-  display: grid;
-  grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
-  gap: var(--spacing-sm);
-}
-
-.hf-override-field {
-  display: flex;
-  flex-direction: column;
-  gap: 6px;
-}
-
-.hf-override-field .field-label {
-  font-size: 0.85rem;
-  color: var(--text-secondary);
-}
-
-.dialog-footer {
-  display: flex;
-  justify-content: flex-end;
-  gap: var(--spacing-sm);
-}
-</style>
diff --git a/frontend/src/components/SliderInput.vue b/frontend/src/components/SliderInput.vue
deleted file mode 100644
index 486b1d8..0000000
--- a/frontend/src/components/SliderInput.vue
+++ /dev/null
@@ -1,477 +0,0 @@
-<template>
-  <div class="slider-input">
-    <div class="slider-container">
-      <div class="slider-track">
-        <div class="slider-fill" :style="{ width: fillPercentage + '%' }"></div>
-        <!-- Markers for preset values -->
-        <div v-if="markers && markers.length > 0" class="slider-markers">
-          <div
-            v-for="(marker, index) in markers"
-            :key="index"
-            class="slider-marker"
-            :class="getMarkerClass(marker)"
-            :style="{ left: getMarkerPosition(marker) + '%' }"
-            :title="marker.label"
-          >
-            <span class="marker-dot"></span>
-            <span v-if="showMarkerLabels" class="marker-label">{{ marker.label }}</span>
-          </div>
-        </div>
-        <!-- Recommended value indicator -->
-        <div
-          v-if="recommended !== null && recommended !== undefined"
-          class="slider-recommended"
-          :style="{ left: getRecommendedPosition() + '%' }"
-          :title="`Recommended: ${formatValue(recommended)}`"
-        >
-          <span class="recommended-dot"></span>
-          <span class="recommended-line"></span>
-        </div>
-        <input
-          type="range"
-          :min="min"
-          :max="max"
-          :step="step"
-          :value="modelValue"
-          @input="updateValue"
-          class="slider"
-          :class="{ 'slider-disabled': disabled, 'near-recommended': isNearRecommended }"
-          :disabled="disabled"
-        />
-      </div>
-      <div class="slider-labels">
-        <span class="min-label">{{ formatValue(min) }}</span>
-        <span class="max-label">{{ formatValue(max) }}</span>
-      </div>
-    </div>
-    <div class="value-display">
-      <input
-        type="number"
-        :min="min"
-        :max="max"
-        :step="step"
-        :value="modelValue"
-        @input="updateValue"
-        class="number-input"
-        :class="{ 'input-disabled': disabled, 'recommended-value': isAtRecommended }"
-        :disabled="disabled"
-        :maxFractionDigits="maxFractionDigits"
-      />
-      <span v-if="isAtRecommended && recommended !== null" class="recommended-badge" title="At recommended value">
-        ✓
-      </span>
-    </div>
-  </div>
-</template>
-
-<script setup>
-import { computed } from 'vue'
-
-const props = defineProps({
-  modelValue: {
-    type: [Number, String],
-    required: true
-  },
-  min: {
-    type: Number,
-    default: 0
-  },
-  max: {
-    type: Number,
-    default: 100
-  },
-  step: {
-    type: Number,
-    default: 1
-  },
-  maxFractionDigits: {
-    type: Number,
-    default: 0
-  },
-  disabled: {
-    type: Boolean,
-    default: false
-  },
-  markers: {
-    type: Array,
-    default: () => []
-  },
-  recommended: {
-    type: Number,
-    default: null
-  },
-  showMarkerLabels: {
-    type: Boolean,
-    default: true
-  }
-})
-
-const emit = defineEmits(['update:modelValue', 'input'])
-
-const updateValue = (event) => {
-  const value = parseFloat(event.target.value)
-  emit('update:modelValue', value)
-  emit('input', value)
-}
-
-const formatValue = (value) => {
-  if (props.maxFractionDigits === 0) {
-    return Math.round(value).toString()
-  }
-  return value.toFixed(props.maxFractionDigits)
-}
-
-const fillPercentage = computed(() => {
-  const range = props.max - props.min
-  const value = parseFloat(props.modelValue) - props.min
-  return Math.min(100, Math.max(0, (value / range) * 100))
-})
-
-const getMarkerPosition = (marker) => {
-  const range = props.max - props.min
-  const value = parseFloat(marker.value) - props.min
-  
-  // Calculate linear position
-  const linearPos = (value / range) * 100
-  
-  // Apply correction: browser's native range slider thumb positioning is non-linear
-  // Browser compensates for thumb width (20px), creating position-dependent offset
-  // Observed: 19.598% needs 1.0205x, 36.842% needs 0.980x, 88.89% needs 0.955x
-  // Quadratic fit: factor = 2.713e-05 * pos² - 0.003889 * pos + 1.0863
-  const correctionFactor = 2.713e-05 * linearPos * linearPos - 0.003889 * linearPos + 1.0863
-  
-  return Math.min(100, Math.max(0, linearPos * correctionFactor))
-}
-
-const getMarkerClass = (marker) => {
-  return marker.color ? `marker-${marker.color}` : ''
-}
-
-const getRecommendedPosition = () => {
-  if (props.recommended === null || props.recommended === undefined) return 0
-  const range = props.max - props.min
-  const value = parseFloat(props.recommended) - props.min
-  
-  // Calculate linear position
-  const linearPos = (value / range) * 100
-  
-  // Apply correction: browser's native range slider thumb positioning is non-linear
-  // Browser compensates for thumb width (20px), creating position-dependent offset
-  // Observed: 19.598% needs 1.0205x, 36.842% needs 0.980x, 88.89% needs 0.955x
-  // Quadratic fit: factor = 2.713e-05 * pos² - 0.003889 * pos + 1.0863
-  const correctionFactor = 2.713e-05 * linearPos * linearPos - 0.003889 * linearPos + 1.0863
-  
-  return Math.min(100, Math.max(0, linearPos * correctionFactor))
-}
-
-const isAtRecommended = computed(() => {
-  if (props.recommended === null || props.recommended === undefined) return false
-  const current = parseFloat(props.modelValue)
-  const rec = parseFloat(props.recommended)
-  const threshold = props.step || 1
-  return Math.abs(current - rec) < threshold
-})
-
-const isNearRecommended = computed(() => {
-  if (props.recommended === null || props.recommended === undefined) return false
-  if (isAtRecommended.value) return true
-  const current = parseFloat(props.modelValue)
-  const rec = parseFloat(props.recommended)
-  const range = props.max - props.min
-  const threshold = range * 0.05 // Within 5% of range
-  return Math.abs(current - rec) < threshold
-})
-</script>
-
-<style scoped>
-.slider-input {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-sm);
-  width: 100%;
-}
-
-.slider-container {
-  position: relative;
-  width: 100%;
-}
-
-.slider-track {
-  position: relative;
-  width: 100%;
-  height: 32px;
-  background: transparent;
-  border-radius: var(--radius-sm);
-  display: flex;
-  align-items: center;
-}
-
-.slider-track::before {
-  content: '';
-  position: absolute;
-  top: 50%;
-  left: 0;
-  right: 0;
-  height: 8px;
-  transform: translateY(-50%);
-  background: var(--bg-secondary);
-  border-radius: var(--radius-sm);
-  border: 2px solid var(--border-secondary);
-  box-shadow: inset 0 2px 4px rgba(0, 0, 0, 0.2);
-  z-index: 0;
-}
-
-.slider-fill {
-  position: absolute;
-  top: 50%;
-  left: 0;
-  height: 8px;
-  transform: translateY(-50%);
-  background: var(--gradient-primary);
-  border-radius: var(--radius-sm);
-  pointer-events: none;
-  z-index: 1;
-  transition: width var(--transition-normal);
-}
-
-.slider {
-  position: absolute;
-  top: 50%;
-  left: 0;
-  width: 100%;
-  height: 8px;
-  transform: translateY(-50%);
-  border-radius: var(--radius-sm);
-  background: transparent;
-  outline: none;
-  -webkit-appearance: none;
-  appearance: none;
-  cursor: pointer;
-  transition: all var(--transition-normal);
-  z-index: 2;
-}
-
-.slider::-webkit-slider-thumb {
-  -webkit-appearance: none;
-  appearance: none;
-  width: 20px;
-  height: 20px;
-  border-radius: 50%;
-  background: var(--gradient-primary);
-  cursor: pointer;
-  border: 2px solid var(--bg-primary);
-  box-shadow: var(--shadow-md);
-  transition: all var(--transition-normal);
-  z-index: 3;
-}
-
-.slider::-webkit-slider-thumb:hover {
-  transform: scale(1.2);
-  box-shadow: var(--shadow-lg);
-}
-
-.slider::-moz-range-thumb {
-  width: 20px;
-  height: 20px;
-  border-radius: 50%;
-  background: var(--gradient-primary);
-  cursor: pointer;
-  border: 2px solid var(--bg-primary);
-  box-shadow: var(--shadow-md);
-  transition: all var(--transition-normal);
-  z-index: 3;
-}
-
-.slider::-moz-range-thumb:hover {
-  transform: scale(1.2);
-  box-shadow: var(--shadow-lg);
-}
-
-.slider::-webkit-slider-track {
-  background: var(--bg-tertiary);
-  height: 8px;
-  border-radius: var(--radius-sm);
-  border: 1px solid var(--border-primary);
-}
-
-.slider::-moz-range-track {
-  background: var(--bg-tertiary);
-  height: 8px;
-  border-radius: var(--radius-sm);
-  border: 1px solid var(--border-primary);
-}
-
-.slider-disabled {
-  opacity: 0.5;
-  cursor: not-allowed;
-}
-
-.slider-disabled::-webkit-slider-thumb {
-  cursor: not-allowed;
-}
-
-.slider-disabled::-moz-range-thumb {
-  cursor: not-allowed;
-}
-
-.slider-labels {
-  display: flex;
-  justify-content: space-between;
-  margin-top: var(--spacing-xs);
-  font-size: 0.75rem;
-  color: var(--text-muted);
-}
-
-.value-display {
-  display: flex;
-  justify-content: center;
-}
-
-.number-input {
-  width: 80px;
-  padding: var(--spacing-xs) var(--spacing-sm);
-  background: var(--bg-surface);
-  border: 1px solid var(--border-primary);
-  border-radius: var(--radius-sm);
-  color: var(--text-primary);
-  text-align: center;
-  font-size: 0.9rem;
-  transition: all var(--transition-normal);
-}
-
-.number-input:focus {
-  outline: none;
-  border-color: var(--accent-cyan);
-  box-shadow: 0 0 0 2px var(--focus-ring);
-}
-
-.input-disabled {
-  opacity: 0.5;
-  cursor: not-allowed;
-}
-
-.slider-markers {
-  position: absolute;
-  top: 10px;
-  left: 0;
-  right: 0;
-  height: 100%;
-  pointer-events: none;
-  z-index: 2;
-}
-
-.slider-marker {
-  position: absolute;
-  top: 50%;
-  transform: translate(-50%, -50%);
-  display: flex;
-  flex-direction: column;
-  align-items: center;
-  gap: 2px;
-}
-
-.marker-dot {
-  width: 6px;
-  height: 6px;
-  border-radius: 50%;
-  background: rgba(255, 255, 255, 0.6);
-  border: 1px solid rgba(255, 255, 255, 0.8);
-  box-shadow: 0 0 4px rgba(0, 0, 0, 0.3);
-}
-
-.marker-label {
-  font-size: 0.65rem;
-  color: var(--text-secondary);
-  white-space: nowrap;
-  margin-top: 4px;
-  font-weight: 500;
-}
-
-.marker-blue .marker-dot {
-  background: #3b82f6;
-  border-color: #60a5fa;
-}
-
-.marker-green .marker-dot {
-  background: #22c55e;
-  border-color: #4ade80;
-}
-
-.marker-purple .marker-dot {
-  background: #a855f7;
-  border-color: #c084fc;
-}
-
-.marker-yellow .marker-dot {
-  background: #f59e0b;
-  border-color: #fbbf24;
-}
-
-.slider-recommended {
-  position: absolute;
-  top: 0;
-  transform: translate(-50%, 0);
-  height: 100%;
-  pointer-events: none;
-  z-index: 3;
-}
-
-.recommended-dot {
-  position: absolute;
-  top: 50%;
-  left: 0;
-  transform: translateY(-50%);
-  width: 10px;
-  height: 10px;
-  border-radius: 50%;
-  background: var(--status-success);
-  border: 2px solid var(--bg-primary);
-  box-shadow: 0 0 8px rgba(16, 185, 129, 0.4), 0 2px 4px rgba(0, 0, 0, 0.2);
-  animation: pulse 2s ease-in-out infinite;
-}
-
-.recommended-line {
-  position: absolute;
-  top: 50%;
-  left: 50%;
-  transform: translate(-50%, -50%);
-  width: 2px;
-  height: 8px;
-  background: var(--status-success);
-  opacity: 0.5;
-}
-
-@keyframes pulse {
-  0%, 100% {
-    opacity: 1;
-    transform: translateY(-50%) scale(1);
-  }
-  50% {
-    opacity: 0.7;
-    transform: translateY(-50%) scale(1.1);
-  }
-}
-
-.slider.near-recommended {
-  opacity: 1;
-}
-
-.slider.near-recommended::-webkit-slider-thumb {
-  box-shadow: 0 0 16px rgba(16, 185, 129, 0.4), 0 4px 8px rgba(0, 0, 0, 0.2);
-}
-
-.recommended-value {
-  border-color: var(--status-success);
-  background: rgba(16, 185, 129, 0.1);
-}
-
-.recommended-badge {
-  position: absolute;
-  right: -24px;
-  top: 50%;
-  transform: translateY(-50%);
-  color: var(--status-success);
-  font-size: 1rem;
-  font-weight: bold;
-}
-</style>
diff --git a/frontend/src/components/common/BaseCard.vue b/frontend/src/components/common/BaseCard.vue
deleted file mode 100644
index 5ece0a3..0000000
--- a/frontend/src/components/common/BaseCard.vue
+++ /dev/null
@@ -1,35 +0,0 @@
-<template>
-  <div class="card" :class="cardClass">
-    <div v-if="$slots.header" class="card-header">
-      <slot name="header"></slot>
-    </div>
-    <div class="card-content">
-      <slot></slot>
-    </div>
-    <div v-if="$slots.footer" class="card-footer">
-      <slot name="footer"></slot>
-    </div>
-  </div>
-</template>
-
-<script setup>
-defineProps({
-  cardClass: {
-    type: String,
-    default: ''
-  }
-})
-</script>
-
-<style scoped>
-.card-content {
-  /* Content area styling handled by global .card class */
-}
-
-.card-footer {
-  margin-top: var(--spacing-lg);
-  padding-top: var(--spacing-md);
-  border-top: 1px solid var(--border-primary);
-}
-</style>
-
diff --git a/frontend/src/components/common/BaseDialog.vue b/frontend/src/components/common/BaseDialog.vue
deleted file mode 100644
index eb2fcc9..0000000
--- a/frontend/src/components/common/BaseDialog.vue
+++ /dev/null
@@ -1,70 +0,0 @@
-<template>
-  <Dialog
-    :visible="visible"
-    :header="header"
-    :modal="modal"
-    :style="dialogStyle"
-    :draggable="draggable"
-    :resizable="resizable"
-    :closable="closable"
-    :class="dialogClass"
-    @update:visible="$emit('update:visible', $event)"
-    @hide="$emit('hide')"
-  >
-    <template v-if="$slots.header" #header>
-      <slot name="header"></slot>
-    </template>
-    
-    <slot></slot>
-    
-    <template v-if="$slots.footer" #footer>
-      <slot name="footer"></slot>
-    </template>
-  </Dialog>
-</template>
-
-<script setup>
-import Dialog from 'primevue/dialog'
-
-defineProps({
-  visible: {
-    type: Boolean,
-    default: false
-  },
-  header: {
-    type: String,
-    default: ''
-  },
-  modal: {
-    type: Boolean,
-    default: true
-  },
-  dialogStyle: {
-    type: Object,
-    default: () => ({ width: '50vw', maxWidth: '600px' })
-  },
-  draggable: {
-    type: Boolean,
-    default: false
-  },
-  resizable: {
-    type: Boolean,
-    default: false
-  },
-  closable: {
-    type: Boolean,
-    default: true
-  },
-  dialogClass: {
-    type: String,
-    default: ''
-  }
-})
-
-defineEmits(['update:visible', 'hide'])
-</script>
-
-<style scoped>
-/* Dialog styling handled by global PrimeVue overrides */
-</style>
-
diff --git a/frontend/src/components/common/BaseFormField.vue b/frontend/src/components/common/BaseFormField.vue
deleted file mode 100644
index 1f3338e..0000000
--- a/frontend/src/components/common/BaseFormField.vue
+++ /dev/null
@@ -1,62 +0,0 @@
-<template>
-  <div class="form-field" :class="fieldClass">
-    <label v-if="label" :for="inputId">
-      {{ label }}
-      <span v-if="required" class="required-indicator">*</span>
-    </label>
-    <slot>
-      <!-- Default slot for input component -->
-    </slot>
-    <small v-if="helpText" class="help-text">{{ helpText }}</small>
-    <small v-if="error" class="error-text">{{ error }}</small>
-  </div>
-</template>
-
-<script setup>
-defineProps({
-  label: {
-    type: String,
-    default: ''
-  },
-  helpText: {
-    type: String,
-    default: ''
-  },
-  error: {
-    type: String,
-    default: ''
-  },
-  required: {
-    type: Boolean,
-    default: false
-  },
-  inputId: {
-    type: String,
-    default: ''
-  },
-  fieldClass: {
-    type: String,
-    default: ''
-  }
-})
-</script>
-
-<style scoped>
-.required-indicator {
-  color: var(--status-error);
-  margin-left: 0.25rem;
-}
-
-.help-text {
-  color: var(--text-secondary);
-  font-size: 0.8rem;
-  margin-top: var(--spacing-xs);
-}
-
-.error-text {
-  color: var(--status-error);
-  font-size: 0.8rem;
-  margin-top: var(--spacing-xs);
-}
-</style>
-
diff --git a/frontend/src/components/common/LogViewer.vue b/frontend/src/components/common/LogViewer.vue
deleted file mode 100644
index 31517c4..0000000
--- a/frontend/src/components/common/LogViewer.vue
+++ /dev/null
@@ -1,451 +0,0 @@
-<template>
-  <div class="log-viewer" :class="{ 'compact': compact, 'no-header': !showHeader }">
-    <div v-if="showHeader" class="log-viewer-header">
-      <div class="log-viewer-controls">
-        <Button 
-          v-if="showAutoScroll"
-          :icon="autoScroll ? 'pi pi-pause' : 'pi pi-play'"
-          @click="autoScroll = !autoScroll"
-          text
-          size="small"
-          :label="autoScroll ? 'Auto-scroll ON' : 'Auto-scroll OFF'"
-        />
-        <Button 
-          v-if="showClear"
-          icon="pi pi-trash" 
-          @click="handleClear"
-          text 
-          size="small"
-          label="Clear"
-        />
-        <span v-if="showCount" class="log-count">{{ logCount }} {{ logCount === 1 ? 'line' : 'lines' }}</span>
-      </div>
-    </div>
-    
-    <div class="log-viewer-content" ref="logContainer">
-      <!-- Structured logs mode (array of objects) -->
-      <template v-if="displayMode === 'structured' && structuredLogs.length > 0">
-        <div 
-          v-for="(log, index) in structuredLogs" 
-          :key="log.id || index"
-          class="log-entry"
-          :class="`log-${log.log_type || 'combined'}`"
-        >
-          <span v-if="log.timestamp" class="log-time">{{ formatTime(log.timestamp) }}</span>
-          <span v-if="log.log_type" class="log-type-badge">{{ log.log_type }}</span>
-          <span class="log-data">{{ log.data || log.line || log }}</span>
-        </div>
-      </template>
-      
-      <!-- Raw logs mode (array of strings or string) -->
-      <template v-else-if="displayMode === 'raw' && rawLogs.length > 0">
-        <pre 
-          v-for="(line, index) in rawLogs" 
-          :key="index"
-          :class="getLogLineClass(line)"
-        >{{ line }}</pre>
-      </template>
-      
-      <!-- Empty state -->
-      <div v-if="isEmpty" class="no-logs">
-        <i class="pi pi-info-circle"></i>
-        <span>{{ emptyMessage || 'No logs available' }}</span>
-      </div>
-    </div>
-  </div>
-</template>
-
-<script setup>
-import { ref, computed, watch, nextTick, onMounted } from 'vue'
-import Button from 'primevue/button'
-
-const props = defineProps({
-  // Accept multiple input formats
-  logs: {
-    type: [Array, String],
-    default: () => []
-  },
-  // Display mode: 'auto' (detect), 'structured' (array of objects), 'raw' (strings)
-  mode: {
-    type: String,
-    default: 'auto'
-  },
-  // Display options
-  showHeader: {
-    type: Boolean,
-    default: true
-  },
-  showAutoScroll: {
-    type: Boolean,
-    default: true
-  },
-  showClear: {
-    type: Boolean,
-    default: true
-  },
-  showCount: {
-    type: Boolean,
-    default: true
-  },
-  compact: {
-    type: Boolean,
-    default: false
-  },
-  maxHeight: {
-    type: String,
-    default: '400px'
-  },
-  emptyMessage: {
-    type: String,
-    default: null
-  }
-})
-
-const emit = defineEmits(['clear'])
-
-const autoScroll = ref(true)
-const logContainer = ref(null)
-
-// Detect display mode
-const displayMode = computed(() => {
-  if (props.mode !== 'auto') {
-    return props.mode
-  }
-  
-  if (!props.logs || (Array.isArray(props.logs) && props.logs.length === 0)) {
-    return 'raw'
-  }
-  
-  // If it's a string, it's raw
-  if (typeof props.logs === 'string') {
-    return 'raw'
-  }
-  
-  // If it's an array, check first element
-  if (Array.isArray(props.logs) && props.logs.length > 0) {
-    const first = props.logs[0]
-    // If first element is an object with 'data' or 'line' property, it's structured
-    if (typeof first === 'object' && (first.data !== undefined || first.line !== undefined)) {
-      return 'structured'
-    }
-    // Otherwise it's raw strings
-    return 'raw'
-  }
-  
-  return 'raw'
-})
-
-// Process logs based on mode
-const structuredLogs = computed(() => {
-  if (displayMode.value !== 'structured') return []
-  if (!Array.isArray(props.logs)) return []
-  return props.logs.filter(Boolean)
-})
-
-const rawLogs = computed(() => {
-  if (displayMode.value !== 'raw') return []
-  
-  if (typeof props.logs === 'string') {
-    return props.logs.split('\n').filter(Boolean)
-  }
-  
-  if (Array.isArray(props.logs)) {
-    return props.logs.filter(Boolean)
-  }
-  
-  return []
-})
-
-const isEmpty = computed(() => {
-  if (displayMode.value === 'structured') {
-    return structuredLogs.value.length === 0
-  }
-  return rawLogs.value.length === 0
-})
-
-const logCount = computed(() => {
-  if (displayMode.value === 'structured') {
-    return structuredLogs.value.length
-  }
-  return rawLogs.value.length
-})
-
-// Auto-scroll functionality
-const scrollToBottom = () => {
-  if (logContainer.value && autoScroll.value) {
-    logContainer.value.scrollTop = logContainer.value.scrollHeight
-  }
-}
-
-// Watch for log changes
-watch(() => props.logs, async () => {
-  if (autoScroll.value && logContainer.value) {
-    await nextTick()
-    scrollToBottom()
-  }
-}, { deep: true })
-
-watch(() => logCount.value, async () => {
-  if (autoScroll.value && logContainer.value) {
-    await nextTick()
-    scrollToBottom()
-  }
-})
-
-// Scroll on mount
-onMounted(() => {
-  if (autoScroll.value && !isEmpty.value) {
-    nextTick(() => scrollToBottom())
-  }
-})
-
-const formatTime = (timestamp) => {
-  if (!timestamp) return ''
-  try {
-    return new Date(timestamp).toLocaleTimeString()
-  } catch {
-    return timestamp
-  }
-}
-
-const getLogLineClass = (line) => {
-  if (!line || typeof line !== 'string') return 'log-normal'
-  
-  const lowerLine = line.toLowerCase()
-  if (lowerLine.includes('error') || lowerLine.includes('failed') || lowerLine.includes('exception')) {
-    return 'log-error'
-  }
-  if (lowerLine.includes('warning') || lowerLine.includes('warn')) {
-    return 'log-warning'
-  }
-  if (lowerLine.includes('info') || lowerLine.includes('success')) {
-    return 'log-info'
-  }
-  return 'log-normal'
-}
-
-const handleClear = () => {
-  emit('clear')
-}
-</script>
-
-<style scoped>
-.log-viewer {
-  background: var(--bg-card);
-  border: 1px solid var(--border-primary);
-  border-radius: var(--radius-xl);
-  overflow: hidden;
-  display: flex;
-  flex-direction: column;
-}
-
-.log-viewer.compact {
-  border-radius: var(--radius-md);
-}
-
-.log-viewer.no-header {
-  border: none;
-  background: transparent;
-}
-
-.log-viewer-header {
-  background: var(--bg-secondary);
-  border-bottom: 1px solid var(--border-primary);
-  padding: var(--spacing-sm) var(--spacing-md);
-}
-
-.log-viewer-controls {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-sm);
-}
-
-.log-count {
-  margin-left: auto;
-  font-size: 0.875rem;
-  color: var(--text-secondary);
-}
-
-.log-viewer-content {
-  overflow-y: auto;
-  font-family: 'Courier New', monospace;
-  font-size: 0.875rem;
-  background: var(--bg-tertiary);
-  color: var(--text-primary);
-  border-radius: var(--radius-md);
-  box-shadow: inset 0 2px 8px rgba(0, 0, 0, 0.3);
-}
-
-.log-viewer:not(.compact) .log-viewer-content {
-  max-height: v-bind(maxHeight);
-}
-
-.log-viewer.compact .log-viewer-content {
-  max-height: 200px;
-  padding: var(--spacing-xs);
-}
-
-/* Structured log entry styles */
-.log-entry {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-sm);
-  padding: var(--spacing-xs) var(--spacing-sm);
-  border-bottom: 1px solid var(--border-primary);
-  transition: all 0.3s ease;
-  position: relative;
-  animation: slideInUp 0.3s ease-out;
-}
-
-.log-entry:hover {
-  background: linear-gradient(90deg, rgba(34, 211, 238, 0.1) 0%, rgba(59, 130, 246, 0.1) 100%);
-  transform: translateX(5px);
-  box-shadow: 0 2px 8px rgba(0, 0, 0, 0.2);
-}
-
-.log-entry::before {
-  content: '';
-  position: absolute;
-  left: 0;
-  top: 0;
-  bottom: 0;
-  width: 3px;
-  background: var(--gradient-primary);
-  opacity: 0;
-  transition: opacity 0.3s ease;
-}
-
-.log-entry:hover::before {
-  opacity: 1;
-}
-
-.log-time {
-  color: #888;
-  font-size: 0.75rem;
-  min-width: 80px;
-  flex-shrink: 0;
-}
-
-.log-type-badge {
-  background: var(--accent-cyan);
-  color: white;
-  padding: var(--spacing-xs) var(--spacing-xs);
-  border-radius: var(--radius-sm);
-  font-size: 0.75rem;
-  min-width: 60px;
-  text-align: center;
-  flex-shrink: 0;
-}
-
-.log-data {
-  flex: 1;
-  word-break: break-word;
-  line-height: 1.4;
-}
-
-/* Raw log pre styles */
-.log-viewer-content pre {
-  margin: 0;
-  padding: var(--spacing-xs) var(--spacing-sm);
-  font-family: 'Courier New', monospace;
-  font-size: 0.875rem;
-  line-height: 1.4;
-  white-space: pre-wrap;
-  word-break: break-word;
-  border-bottom: 1px solid var(--border-primary);
-}
-
-.log-viewer.compact .log-viewer-content pre {
-  padding: 0.125rem var(--spacing-xs);
-  font-size: 0.75rem;
-}
-
-/* Log level classes */
-.log-error {
-  color: var(--status-error);
-  background: var(--status-error-soft);
-  padding: var(--spacing-xs);
-  border-radius: var(--radius-sm);
-  margin: 0.125rem 0;
-}
-
-.log-warning {
-  color: var(--status-warning);
-  background: var(--status-warning-soft);
-  padding: var(--spacing-xs);
-  border-radius: var(--radius-sm);
-  margin: 0.125rem 0;
-}
-
-.log-info {
-  color: var(--status-info);
-  background: var(--status-info-soft);
-  padding: var(--spacing-xs);
-  border-radius: var(--radius-sm);
-  margin: 0.125rem 0;
-}
-
-.log-normal {
-  color: var(--text-secondary);
-}
-
-.no-logs {
-  display: flex;
-  align-items: center;
-  justify-content: center;
-  gap: var(--spacing-sm);
-  padding: var(--spacing-xl);
-  color: var(--text-secondary);
-  font-style: italic;
-}
-
-/* Dark theme specific styles */
-.log-viewer-content {
-  background: #1a1a1a;
-}
-
-.log-entry {
-  border-bottom-color: #333;
-}
-
-.log-entry:hover {
-  background-color: #2a2a2a;
-}
-
-.log-viewer-content pre {
-  border-bottom-color: #333;
-}
-
-/* Light theme adjustments */
-:root[data-theme="light"] .log-viewer-content {
-  background: #f8f9fa;
-  color: #333;
-}
-
-:root[data-theme="light"] .log-entry {
-  border-bottom-color: #dee2e6;
-}
-
-:root[data-theme="light"] .log-entry:hover {
-  background-color: #e9ecef;
-}
-
-:root[data-theme="light"] .log-viewer-content pre {
-  border-bottom-color: #dee2e6;
-}
-
-:root[data-theme="light"] .log-time {
-  color: #6c757d;
-}
-
-@keyframes slideInUp {
-  from {
-    opacity: 0;
-    transform: translateY(10px);
-  }
-  to {
-    opacity: 1;
-    transform: translateY(0);
-  }
-}
-</style>
diff --git a/frontend/src/components/common/ProgressTracker.vue b/frontend/src/components/common/ProgressTracker.vue
new file mode 100644
index 0000000..31fcf45
--- /dev/null
+++ b/frontend/src/components/common/ProgressTracker.vue
@@ -0,0 +1,124 @@
+<template>
+  <div v-if="activeTasks.length > 0" class="progress-tracker">
+    <div
+      v-for="task in activeTasks"
+      :key="task.task_id"
+      class="progress-item"
+      :class="`status-${task.status}`"
+    >
+      <div class="progress-header">
+        <div class="task-info">
+          <i class="pi pi-spin pi-spinner" v-if="task.status === 'running'" />
+          <i class="pi pi-check-circle text-success" v-else-if="task.status === 'completed'" />
+          <i class="pi pi-times-circle text-danger" v-else-if="task.status === 'failed'" />
+          <span class="task-description">{{ task.description }}</span>
+        </div>
+        <span class="progress-percent">{{ Math.round(task.progress) }}%</span>
+      </div>
+      <ProgressBar :value="task.progress" :class="task.status === 'failed' ? 'p-progressbar-danger' : ''" />
+      <small v-if="task.message" class="task-message" :class="task.status === 'failed' ? 'text-danger' : 'text-muted'">
+        {{ task.message }}
+      </small>
+    </div>
+  </div>
+</template>
+
+<script setup>
+import { computed } from 'vue'
+import ProgressBar from 'primevue/progressbar'
+import { useProgressStore } from '@/stores/progress'
+
+const props = defineProps({
+  /** Single type string or array of types to show (e.g. ['build', 'install_release']) */
+  type: {
+    type: [String, Array],
+    default: null,
+  },
+  showCompleted: {
+    type: Boolean,
+    default: false,
+  },
+})
+
+const progressStore = useProgressStore()
+
+const activeTasks = computed(() => {
+  const allTasks = Object.values(progressStore.tasks)
+  const types = props.type == null
+    ? null
+    : Array.isArray(props.type)
+      ? props.type
+      : [props.type]
+  return allTasks.filter((t) => {
+    const typeMatch = !types || types.length === 0 || types.includes(t.type)
+    const statusMatch = t.status === 'running' || (props.showCompleted && t.status === 'completed') || t.status === 'failed'
+    return typeMatch && statusMatch
+  })
+})
+</script>
+
+<style scoped>
+.progress-tracker {
+  display: flex;
+  flex-direction: column;
+  gap: var(--spacing-md, 0.75rem);
+  margin: var(--spacing-md, 0.75rem) 0;
+}
+
+.progress-item {
+  background: var(--bg-surface, #1e2235);
+  border: 1px solid var(--border-primary, #2a2f45);
+  border-radius: var(--radius-md, 0.5rem);
+  padding: var(--spacing-md, 0.75rem);
+  display: flex;
+  flex-direction: column;
+  gap: var(--spacing-sm, 0.5rem);
+}
+
+.progress-item.status-failed {
+  border-color: var(--color-error, #ef4444);
+  background: rgba(239, 68, 68, 0.05);
+}
+
+.progress-header {
+  display: flex;
+  justify-content: space-between;
+  align-items: center;
+  gap: var(--spacing-sm, 0.5rem);
+}
+
+.task-info {
+  display: flex;
+  align-items: center;
+  gap: var(--spacing-sm, 0.5rem);
+  flex: 1;
+  min-width: 0;
+}
+
+.task-description {
+  font-weight: 500;
+  font-size: 0.875rem;
+  overflow: hidden;
+  text-overflow: ellipsis;
+  white-space: nowrap;
+}
+
+.progress-percent {
+  font-size: 0.75rem;
+  font-weight: 600;
+  color: var(--text-secondary, #9ca3af);
+  flex-shrink: 0;
+}
+
+.task-message {
+  font-size: 0.75rem;
+  display: block;
+  overflow: hidden;
+  text-overflow: ellipsis;
+  white-space: nowrap;
+}
+
+.text-success { color: #22c55e; }
+.text-danger  { color: #ef4444; }
+.text-muted   { color: var(--text-secondary, #9ca3af); }
+</style>
diff --git a/frontend/src/components/common/StatusBadge.vue b/frontend/src/components/common/StatusBadge.vue
deleted file mode 100644
index 0ca68fc..0000000
--- a/frontend/src/components/common/StatusBadge.vue
+++ /dev/null
@@ -1,99 +0,0 @@
-<template>
-  <span 
-    class="status-badge" 
-    :class="[`status-${status}`, badgeClass]"
-    :title="tooltip"
-  >
-    <i v-if="icon" :class="icon" class="status-icon"></i>
-    <span v-if="label">{{ label }}</span>
-    <slot></slot>
-  </span>
-</template>
-
-<script setup>
-defineProps({
-  status: {
-    type: String,
-    required: true,
-    validator: (value) => ['success', 'warning', 'error', 'info', 'running', 'stopped', 'downloading'].includes(value)
-  },
-  label: {
-    type: String,
-    default: ''
-  },
-  icon: {
-    type: String,
-    default: ''
-  },
-  tooltip: {
-    type: String,
-    default: ''
-  },
-  badgeClass: {
-    type: String,
-    default: ''
-  }
-})
-</script>
-
-<style scoped>
-.status-badge {
-  display: inline-flex;
-  align-items: center;
-  gap: var(--spacing-xs);
-  padding: var(--spacing-xs) var(--spacing-sm);
-  border-radius: var(--radius-md);
-  font-size: 0.75rem;
-  font-weight: 600;
-  text-transform: uppercase;
-  letter-spacing: 0.05em;
-  white-space: nowrap;
-}
-
-.status-icon {
-  font-size: 0.875rem;
-}
-
-.status-success {
-  background: rgba(16, 185, 129, 0.1);
-  color: var(--status-success);
-  border: 1px solid rgba(16, 185, 129, 0.2);
-}
-
-.status-warning {
-  background: rgba(245, 158, 11, 0.1);
-  color: var(--status-warning);
-  border: 1px solid rgba(245, 158, 11, 0.2);
-}
-
-.status-error {
-  background: rgba(239, 68, 68, 0.1);
-  color: var(--status-error);
-  border: 1px solid rgba(239, 68, 68, 0.2);
-}
-
-.status-info {
-  background: rgba(34, 211, 238, 0.1);
-  color: var(--status-info);
-  border: 1px solid rgba(34, 211, 238, 0.2);
-}
-
-.status-running {
-  background: rgba(16, 185, 129, 0.1);
-  color: var(--status-success);
-  border: 1px solid rgba(16, 185, 129, 0.2);
-}
-
-.status-stopped {
-  background: rgba(148, 163, 184, 0.1);
-  color: var(--text-muted);
-  border: 1px solid rgba(148, 163, 184, 0.2);
-}
-
-.status-downloading {
-  background: rgba(34, 211, 238, 0.1);
-  color: var(--accent-cyan);
-  border: 1px solid rgba(34, 211, 238, 0.2);
-}
-</style>
-
diff --git a/frontend/src/components/config/AdvancedSection.vue b/frontend/src/components/config/AdvancedSection.vue
deleted file mode 100644
index db8e08d..0000000
--- a/frontend/src/components/config/AdvancedSection.vue
+++ /dev/null
@@ -1,81 +0,0 @@
-<template>
-  <div class="advanced-section">
-    <div class="tab-section">
-      <h4 class="tab-section-title">
-        <i class="pi pi-wrench"></i>
-        RoPE & YARN Settings
-      </h4>
-      <ConfigField label="RoPE Freq Base" help-text="RoPE frequency base">
-        <template #input>
-          <InputNumber v-model="config.rope_freq_base" :min="0" :max="100000" />
-        </template>
-      </ConfigField>
-      <ConfigField label="RoPE Freq Scale" help-text="RoPE frequency scale">
-        <template #input>
-          <InputNumber v-model="config.rope_freq_scale" :min="0" :max="100" :step="0.1" :maxFractionDigits="1" />
-        </template>
-      </ConfigField>
-      <ConfigField label="YARN Ext Factor" help-text="YARN extension factor">
-        <template #input>
-          <InputNumber v-model="config.yarn_ext_factor" :min="0" :max="100" :step="0.1" :maxFractionDigits="1" />
-        </template>
-      </ConfigField>
-      <ConfigField label="YARN Attn Factor" help-text="YARN attention factor">
-        <template #input>
-          <InputNumber v-model="config.yarn_attn_factor" :min="0" :max="100" :step="0.1" :maxFractionDigits="1" />
-        </template>
-      </ConfigField>
-      <ConfigField label="RoPE Scaling" help-text="RoPE scaling type">
-        <template #input>
-          <InputText v-model="config.rope_scaling" placeholder="linear, yarn" />
-        </template>
-      </ConfigField>
-      <ConfigField label="YAML Config" help-text="Extra YAML config" full-width>
-        <template #input>
-          <Textarea v-model="config.yaml" rows="3" placeholder="Additional YAML configuration" />
-        </template>
-      </ConfigField>
-    </div>
-  </div>
-</template>
-
-<script setup>
-// PrimeVue
-import InputNumber from 'primevue/inputnumber'
-import InputText from 'primevue/inputtext'
-import Textarea from 'primevue/textarea'
-
-// Components
-import ConfigField from '@/components/config/ConfigField.vue'
-
-const props = defineProps({
-  config: {
-    type: Object,
-    required: true
-  }
-})
-</script>
-
-<style scoped>
-.advanced-section {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-lg);
-}
-
-.tab-section {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-md);
-}
-
-.tab-section-title {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-sm);
-  margin: 0 0 var(--spacing-md) 0;
-  font-size: 1rem;
-  font-weight: 600;
-  color: var(--text-primary);
-}
-</style>
diff --git a/frontend/src/components/config/AdvancedSettingsSection.vue b/frontend/src/components/config/AdvancedSettingsSection.vue
deleted file mode 100644
index ee33f05..0000000
--- a/frontend/src/components/config/AdvancedSettingsSection.vue
+++ /dev/null
@@ -1,83 +0,0 @@
-<template>
-  <div class="tab-content">
-    <div class="tab-section">
-      <h4 class="tab-section-title">
-        <i class="pi pi-wrench"></i>
-        RoPE & YARN Settings
-      </h4>
-      <ConfigField label="RoPE Freq Base" help-text="RoPE frequency base">
-        <template #input>
-          <InputNumber v-model="config.rope_freq_base" :min="0" :max="100000" />
-        </template>
-      </ConfigField>
-      <ConfigField label="RoPE Freq Scale" help-text="RoPE frequency scale">
-        <template #input>
-          <InputNumber v-model="config.rope_freq_scale" :min="0" :max="100" :step="0.1" :maxFractionDigits="1" />
-        </template>
-      </ConfigField>
-      <ConfigField label="YARN Ext Factor" help-text="YARN extension factor">
-        <template #input>
-          <InputNumber v-model="config.yarn_ext_factor" :min="0" :max="100" :step="0.1" :maxFractionDigits="1" />
-        </template>
-      </ConfigField>
-      <ConfigField label="YARN Attn Factor" help-text="YARN attention factor">
-        <template #input>
-          <InputNumber v-model="config.yarn_attn_factor" :min="0" :max="100" :step="0.1" :maxFractionDigits="1" />
-        </template>
-      </ConfigField>
-      <ConfigField label="RoPE Scaling" help-text="RoPE scaling type">
-        <template #input>
-          <InputText v-model="config.rope_scaling" placeholder="linear, yarn" />
-        </template>
-      </ConfigField>
-      <ConfigField label="YAML Config" help-text="Extra YAML config" full-width>
-        <template #input>
-          <Textarea v-model="config.yaml" rows="3" placeholder="Additional YAML configuration" />
-        </template>
-      </ConfigField>
-    </div>
-  </div>
-</template>
-
-<script setup>
-import ConfigField from './ConfigField.vue'
-import InputNumber from 'primevue/inputnumber'
-import InputText from 'primevue/inputtext'
-import Textarea from 'primevue/textarea'
-
-defineProps({
-  config: {
-    type: Object,
-    required: true
-  }
-})
-</script>
-
-<style scoped>
-.tab-content {
-  display: flex;
-  flex-direction: column;
-  gap: 1.5rem;
-}
-
-.tab-section {
-  display: flex;
-  flex-direction: column;
-  gap: 1rem;
-}
-
-.tab-section-title {
-  display: flex;
-  align-items: center;
-  gap: 0.5rem;
-  margin: 0 0 0.75rem 0;
-  color: var(--text-primary);
-  font-size: 1.1rem;
-  font-weight: 600;
-}
-
-.tab-section-title i {
-  color: var(--accent-cyan);
-}
-</style>
-
diff --git a/frontend/src/components/config/ConfigChangePreview.vue b/frontend/src/components/config/ConfigChangePreview.vue
deleted file mode 100644
index d438316..0000000
--- a/frontend/src/components/config/ConfigChangePreview.vue
+++ /dev/null
@@ -1,329 +0,0 @@
-<template>
-  <Dialog 
-    :visible="visible" 
-    @update:visible="$emit('update:visible', $event)"
-    :modal="true"
-    :style="{ width: '600px' }"
-    :header="`Preview ${type} Changes`"
-    :dismissableMask="true"
-    class="config-preview-dialog"
-    aria-label="Configuration change preview"
-    @touchstart="handleTouchStart"
-    @touchmove="handleTouchMove"
-    @touchend="handleTouchEnd"
-  >
-    <div class="preview-content">
-      <div class="preview-header">
-        <i class="pi pi-info-circle" aria-hidden="true"></i>
-        <p>{{ type === 'preset' ? `${presetName} preset will change:` : 'Smart Auto will change:' }}</p>
-      </div>
-
-      <div class="changes-list">
-        <div 
-          v-for="change in changes" 
-          :key="change.field"
-          class="change-item"
-          :class="{ 'has-impact': change.impact }"
-        >
-          <div class="change-field">
-            <strong>{{ change.field }}</strong>
-          </div>
-          <div class="change-values">
-            <span class="value-before">{{ formatValue(change.before) }}</span>
-            <i class="pi pi-arrow-right" aria-hidden="true"></i>
-            <span class="value-after">{{ formatValue(change.after) }}</span>
-          </div>
-          <div v-if="change.description" class="change-description">
-            {{ change.description }}
-          </div>
-        </div>
-      </div>
-
-      <div v-if="impact" class="impact-preview">
-        <h4>Expected Impact:</h4>
-        <div class="impact-items">
-          <div v-if="impact.performance" class="impact-item performance">
-            <i class="pi pi-chart-line" aria-hidden="true"></i>
-            <span>{{ impact.performance }}</span>
-          </div>
-          <div v-if="impact.vram" class="impact-item vram">
-            <i class="pi pi-memory" aria-hidden="true"></i>
-            <span>{{ impact.vram }}</span>
-          </div>
-          <div v-if="impact.ram" class="impact-item ram">
-            <i class="pi pi-server" aria-hidden="true"></i>
-            <span>{{ impact.ram }}</span>
-          </div>
-        </div>
-      </div>
-
-      <div class="preview-warning" v-if="hasWarnings">
-        <i class="pi pi-exclamation-triangle" aria-hidden="true"></i>
-        <p>Some changes may affect memory usage. Review the impact above.</p>
-      </div>
-    </div>
-
-    <template #footer>
-      <Button 
-        label="Cancel" 
-        icon="pi pi-times" 
-        @click="$emit('cancel')"
-        severity="secondary"
-        outlined
-        aria-label="Cancel configuration changes"
-      />
-      <Button 
-        label="Apply Changes" 
-        icon="pi pi-check" 
-        @click="$emit('apply')"
-        :loading="applying"
-        aria-label="Apply configuration changes"
-      />
-    </template>
-  </Dialog>
-</template>
-
-<script setup>
-import { computed, ref } from 'vue'
-import Dialog from 'primevue/dialog'
-import Button from 'primevue/button'
-
-const props = defineProps({
-  visible: {
-    type: Boolean,
-    default: false
-  },
-  type: {
-    type: String,
-    default: 'smart-auto' // 'smart-auto' or 'preset'
-  },
-  presetName: {
-    type: String,
-    default: ''
-  },
-  changes: {
-    type: Array,
-    default: () => []
-  },
-  impact: {
-    type: Object,
-    default: null
-  },
-  applying: {
-    type: Boolean,
-    default: false
-  }
-})
-
-const emit = defineEmits(['update:visible', 'apply', 'cancel'])
-
-// Touch gesture handling for swipe to dismiss
-const touchStartX = ref(0)
-const touchStartY = ref(0)
-const touchThreshold = 50
-
-const handleTouchStart = (e) => {
-  if (e.touches && e.touches.length > 0) {
-    touchStartX.value = e.touches[0].clientX
-    touchStartY.value = e.touches[0].clientY
-  }
-}
-
-const handleTouchMove = (e) => {
-  if (e.touches && e.touches.length > 0) {
-    const deltaX = e.touches[0].clientX - touchStartX.value
-    const deltaY = e.touches[0].clientY - touchStartY.value
-    
-    if (deltaY > touchThreshold && Math.abs(deltaX) < Math.abs(deltaY)) {
-      e.preventDefault()
-    }
-  }
-}
-
-const handleTouchEnd = (e) => {
-  if (e.changedTouches && e.changedTouches.length > 0) {
-    const deltaX = e.changedTouches[0].clientX - touchStartX.value
-    const deltaY = e.changedTouches[0].clientY - touchStartY.value
-    
-    if (deltaY > touchThreshold && Math.abs(deltaX) < Math.abs(deltaY)) {
-      emit('cancel')
-    }
-  }
-  
-  touchStartX.value = 0
-  touchStartY.value = 0
-}
-
-const formatValue = (value) => {
-  if (value === null || value === undefined) return 'Not set'
-  if (typeof value === 'boolean') return value ? 'Enabled' : 'Disabled'
-  if (typeof value === 'number') {
-    if (value >= 1000 && value < 1000000) return `${(value / 1000).toFixed(1)}K`
-    if (value >= 1000000) return `${(value / 1000000).toFixed(1)}M`
-    return value.toString()
-  }
-  return value.toString()
-}
-
-const hasWarnings = computed(() => {
-  return props.impact && (props.impact.vram || props.impact.ram)
-})
-</script>
-
-<style scoped>
-.preview-content {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-lg);
-}
-
-.preview-header {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-md);
-  padding: var(--spacing-md);
-  background: rgba(34, 211, 238, 0.1);
-  border: 1px solid rgba(34, 211, 238, 0.2);
-  border-radius: var(--radius-md);
-}
-
-.preview-header i {
-  font-size: 1.5rem;
-  color: var(--accent-cyan);
-}
-
-.preview-header p {
-  margin: 0;
-  font-size: 1rem;
-  color: var(--text-primary);
-  font-weight: 500;
-}
-
-.changes-list {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-md);
-  max-height: 400px;
-  overflow-y: auto;
-}
-
-.change-item {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-xs);
-  padding: var(--spacing-md);
-  background: var(--bg-surface);
-  border: 1px solid var(--border-primary);
-  border-radius: var(--radius-md);
-  transition: all var(--transition-normal);
-}
-
-.change-item.has-impact {
-  border-left: 3px solid var(--accent-cyan);
-}
-
-.change-field {
-  font-weight: 600;
-  color: var(--text-primary);
-  font-size: 0.95rem;
-}
-
-.change-values {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-sm);
-  font-size: 0.9rem;
-}
-
-.value-before {
-  color: var(--text-secondary);
-  text-decoration: line-through;
-}
-
-.value-after {
-  color: var(--accent-cyan);
-  font-weight: 600;
-}
-
-.change-values i {
-  color: var(--text-secondary);
-  font-size: 0.8rem;
-}
-
-.change-description {
-  font-size: 0.85rem;
-  color: var(--text-secondary);
-  font-style: italic;
-  margin-top: var(--spacing-xs);
-}
-
-.impact-preview {
-  padding: var(--spacing-md);
-  background: var(--bg-surface);
-  border: 1px solid var(--border-primary);
-  border-radius: var(--radius-md);
-}
-
-.impact-preview h4 {
-  margin: 0 0 var(--spacing-md) 0;
-  font-size: 1rem;
-  font-weight: 600;
-  color: var(--text-primary);
-}
-
-.impact-items {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-sm);
-}
-
-.impact-item {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-sm);
-  padding: var(--spacing-sm);
-  border-radius: var(--radius-sm);
-}
-
-.impact-item.performance {
-  background: rgba(34, 197, 94, 0.1);
-  color: #22c55e;
-}
-
-.impact-item.vram {
-  background: rgba(245, 158, 11, 0.1);
-  color: #f59e0b;
-}
-
-.impact-item.ram {
-  background: rgba(59, 130, 246, 0.1);
-  color: var(--accent-blue);
-}
-
-.impact-item i {
-  font-size: 1.1rem;
-}
-
-.preview-warning {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-sm);
-  padding: var(--spacing-md);
-  background: rgba(245, 158, 11, 0.1);
-  border: 1px solid rgba(245, 158, 11, 0.3);
-  border-radius: var(--radius-md);
-}
-
-.preview-warning i {
-  color: #f59e0b;
-  font-size: 1.2rem;
-  flex-shrink: 0;
-}
-
-.preview-warning p {
-  margin: 0;
-  color: var(--text-primary);
-  font-size: 0.9rem;
-}
-</style>
-
diff --git a/frontend/src/components/config/ConfigField.vue b/frontend/src/components/config/ConfigField.vue
deleted file mode 100644
index 832a5c9..0000000
--- a/frontend/src/components/config/ConfigField.vue
+++ /dev/null
@@ -1,91 +0,0 @@
-<template>
-  <div class="config-field" :class="{ 'full-width': fullWidth }">
-    <div v-if="label" class="label-wrapper">
-      <label :for="fieldId" :id="`label-${fieldId}`">{{ label }}</label>
-      <SettingsTooltip 
-        v-if="tooltip"
-        :title="label"
-        :description="tooltip.description || ''"
-        :when-to-adjust="tooltip.whenToAdjust"
-        :tradeoffs="tooltip.tradeoffs || []"
-        :recommended="tooltip.recommended"
-        :ranges="tooltip.ranges || []"
-      />
-    </div>
-    <div :aria-labelledby="label ? `label-${fieldId}` : undefined" :aria-describedby="getAriaDescribedBy()">
-      <slot name="input"></slot>
-    </div>
-    <small v-if="helpText" :id="`help-${fieldId}`" class="help-text">{{ helpText }}</small>
-    <slot name="validation"></slot>
-  </div>
-</template>
-
-<script setup>
-import { computed } from 'vue'
-import SettingsTooltip from './SettingsTooltip.vue'
-
-const props = defineProps({
-  label: {
-    type: String,
-    default: null
-  },
-  helpText: {
-    type: String,
-    default: null
-  },
-  fullWidth: {
-    type: Boolean,
-    default: false
-  },
-  tooltip: {
-    type: Object,
-    default: null
-  },
-  fieldId: {
-    type: String,
-    default: () => `field-${Math.random().toString(36).substr(2, 9)}`
-  }
-})
-
-const getAriaDescribedBy = () => {
-  const ids = []
-  if (props.helpText) {
-    ids.push(`help-${props.fieldId}`)
-  }
-  return ids.length > 0 ? ids.join(' ') : undefined
-}
-</script>
-
-<style scoped>
-.config-field {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-sm);
-  min-width: 0;
-  width: 100%;
-}
-
-.config-field.full-width {
-  grid-column: 1 / -1;
-}
-
-.label-wrapper {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-xs);
-}
-
-.config-field label {
-  font-weight: 500;
-  color: var(--text-primary);
-  font-size: 0.9rem;
-  margin: 0;
-}
-
-.config-field small {
-  color: var(--text-secondary);
-  font-size: 0.75rem;
-  line-height: 1.3;
-}
-</style>
-
diff --git a/frontend/src/components/config/ConfigSection.vue b/frontend/src/components/config/ConfigSection.vue
deleted file mode 100644
index 5262a8d..0000000
--- a/frontend/src/components/config/ConfigSection.vue
+++ /dev/null
@@ -1,145 +0,0 @@
-<template>
-  <details 
-    class="config-section" 
-    :open="expanded" 
-    v-bind="$attrs" 
-    :aria-label="`${title} configuration section`"
-    @toggle.capture.stop.prevent
-    @click.capture.stop
-  >
-    <summary 
-      class="section-title" 
-      @click.stop.prevent="$emit('toggle')"
-      @mousedown.stop
-      :aria-expanded="expanded"
-      :aria-controls="`section-${title.toLowerCase().replace(/\s+/g, '-')}`"
-    >
-      <div class="title-left">
-        <i :class="icon" aria-hidden="true"></i>
-        <span>{{ title }}</span>
-      </div>
-      <span v-if="badge" class="section-badge" :class="badgeClass" :aria-label="`${badge} section`">{{ badge }}</span>
-    </summary>
-    <div 
-      v-show="expanded"
-      class="section-grid" 
-      :id="`section-${title.toLowerCase().replace(/\s+/g, '-')}`" 
-      :aria-hidden="!expanded"
-    >
-      <slot></slot>
-    </div>
-  </details>
-</template>
-
-<script setup>
-defineProps({
-  title: {
-    type: String,
-    required: true
-  },
-  icon: {
-    type: String,
-    required: true
-  },
-  expanded: {
-    type: Boolean,
-    default: false
-  },
-  badge: {
-    type: String,
-    default: null
-  },
-  badgeClass: {
-    type: String,
-    default: ''
-  }
-})
-
-defineEmits(['toggle'])
-
-// Expose all attributes to allow data-section to be passed through
-defineOptions({
-  inheritAttrs: false
-})
-</script>
-
-<style scoped>
-.config-section {
-  background: var(--gradient-card);
-  border: 1px solid var(--border-primary);
-  border-radius: var(--radius-xl);
-  box-shadow: var(--shadow-md);
-  transition: all var(--transition-normal);
-  position: relative;
-  overflow: hidden;
-  backdrop-filter: blur(10px);
-  animation: fadeIn 0.6s ease-out;
-}
-
-.config-section > summary {
-  cursor: pointer;
-  padding: var(--spacing-xl);
-  list-style: none;
-  position: relative;
-  display: block;
-}
-
-.config-section > summary::-webkit-details-marker {
-  display: none !important;
-}
-
-.config-section > summary::marker {
-  display: none !important;
-}
-
-.config-section[open] > summary {
-  border-bottom: 1px solid var(--border-primary);
-}
-
-.section-title {
-  display: flex;
-  align-items: center;
-  justify-content: space-between;
-  margin: 0;
-  color: var(--text-primary);
-  font-size: 1.1rem;
-  font-weight: 600;
-  user-select: none;
-}
-
-.title-left {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-sm);
-}
-
-.section-badge {
-  display: inline-block;
-  padding: var(--spacing-xs) var(--spacing-sm);
-  border-radius: var(--radius-sm);
-  font-size: 0.75rem;
-  font-weight: 600;
-  letter-spacing: 0.5px;
-  text-transform: uppercase;
-}
-
-.section-badge.essential-badge {
-  background: rgba(34, 197, 94, 0.15);
-  color: #22c55e;
-}
-
-.section-badge.advanced-badge {
-  background: rgba(245, 158, 11, 0.15);
-  color: #f59e0b;
-}
-
-.section-grid {
-  display: grid;
-  grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
-  gap: var(--spacing-lg);
-  width: 100%;
-  min-width: 0;
-  padding: var(--spacing-xl);
-}
-</style>
-
diff --git a/frontend/src/components/config/ConfigWarnings.vue b/frontend/src/components/config/ConfigWarnings.vue
deleted file mode 100644
index 2065a37..0000000
--- a/frontend/src/components/config/ConfigWarnings.vue
+++ /dev/null
@@ -1,90 +0,0 @@
-<template>
-  <div v-if="warnings.length > 0" class="config-warnings">
-    <h3 class="section-title">
-      <i class="pi pi-exclamation-triangle"></i>
-      Configuration Warnings
-    </h3>
-    <div class="warnings-list">
-      <div 
-        v-for="warning in warnings" 
-        :key="warning.field" 
-        class="warning-item" 
-        :class="warning.type"
-      >
-        <i :class="warning.type === 'error' ? 'pi pi-times-circle' : 'pi pi-exclamation-triangle'"></i>
-        <span>{{ warning.message }}</span>
-      </div>
-    </div>
-  </div>
-</template>
-
-<script setup>
-defineProps({
-  warnings: {
-    type: Array,
-    default: () => []
-  }
-})
-</script>
-
-<style scoped>
-.config-warnings {
-  margin: var(--spacing-lg) 0;
-  padding: var(--spacing-lg);
-  background: var(--bg-surface);
-  border: 1px solid var(--border-primary);
-  border-radius: var(--radius-lg);
-}
-
-.section-title {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-sm);
-  margin: 0 0 var(--spacing-md) 0;
-  font-size: 1.1rem;
-  font-weight: 600;
-  color: var(--text-primary);
-}
-
-.section-title i {
-  color: var(--status-warning);
-}
-
-.warnings-list {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-sm);
-}
-
-.warning-item {
-  display: flex;
-  align-items: flex-start;
-  gap: var(--spacing-sm);
-  padding: var(--spacing-sm);
-  border-radius: var(--radius-md);
-  font-size: 0.875rem;
-  line-height: 1.5;
-}
-
-.warning-item.error {
-  background: var(--status-error-soft);
-  color: var(--status-error);
-  border-left: 3px solid var(--status-error);
-}
-
-.warning-item.warning {
-  background: var(--status-warning-soft);
-  color: var(--status-warning);
-  border-left: 3px solid var(--status-warning);
-}
-
-.warning-item i {
-  font-size: 1rem;
-  flex-shrink: 0;
-  margin-top: 2px;
-}
-
-.warning-item span {
-  flex: 1;
-}
-</style>
diff --git a/frontend/src/components/config/ConfigWizard.vue b/frontend/src/components/config/ConfigWizard.vue
deleted file mode 100644
index 8448f89..0000000
--- a/frontend/src/components/config/ConfigWizard.vue
+++ /dev/null
@@ -1,764 +0,0 @@
-<template>
-  <Dialog 
-    :visible="visible" 
-    modal 
-    :closable="true"
-    :dismissableMask="true"
-    :draggable="false"
-    class="config-wizard-dialog"
-    @update:visible="visible = $event"
-    @hide="$emit('close')"
-    @touchstart="handleTouchStart"
-    @touchmove="handleTouchMove"
-    @touchend="handleTouchEnd"
-  >
-    <template #header>
-      <div class="wizard-header">
-        <i class="pi pi-magic"></i>
-        <h2>Configuration Wizard</h2>
-      </div>
-    </template>
-
-    <div class="wizard-content">
-      <div class="wizard-steps">
-        <div 
-          v-for="(step, index) in steps" 
-          :key="index"
-          class="wizard-step"
-          :class="{ active: currentStep === index, completed: currentStep > index }"
-        >
-          <div class="step-number">{{ index + 1 }}</div>
-          <div class="step-label">{{ step.label }}</div>
-        </div>
-      </div>
-
-      <div class="wizard-step-content">
-        <!-- Step 1: Use Case Selection -->
-        <div v-if="currentStep === 0" class="step-panel">
-          <h3>What are you using this model for?</h3>
-          <p class="step-description">Select your primary use case to get optimized settings</p>
-          
-          <div class="use-case-grid">
-            <div 
-              v-for="useCase in useCases" 
-              :key="useCase.id"
-              class="use-case-card"
-              :class="{ active: wizardData.useCase === useCase.id }"
-              @click="wizardData.useCase = useCase.id"
-            >
-              <div class="use-case-icon">{{ useCase.icon }}</div>
-              <h4>{{ useCase.title }}</h4>
-              <p>{{ useCase.description }}</p>
-            </div>
-          </div>
-        </div>
-
-        <!-- Step 2: Resource Allocation -->
-        <div v-if="currentStep === 1" class="step-panel">
-          <h3>Resource Allocation</h3>
-          <p class="step-description">Balance between speed and quality based on your hardware</p>
-          
-          <div class="resource-section">
-            <div class="hardware-info">
-              <i class="pi pi-desktop"></i>
-              <div>
-                <strong>Detected Hardware</strong>
-                <p v-if="gpuInfo">
-                  {{ gpuInfo.name || `GPU ${gpuInfo.device_count || 0}` }}
-                  <span v-if="gpuInfo.total_vram">({{ formatFileSize(gpuInfo.total_vram) }} VRAM)</span>
-                </p>
-                <p v-else>CPU-only mode</p>
-              </div>
-            </div>
-            
-            <div class="speed-quality-slider">
-              <label>Speed ←→ Quality</label>
-              <SliderInput 
-                v-model="wizardData.speedQuality" 
-                :min="0" 
-                :max="100" 
-                :step="1"
-                :markers="[
-                  { value: 0, label: 'Max Speed', color: 'blue' },
-                  { value: 50, label: 'Balanced', color: 'green' },
-                  { value: 100, label: 'Max Quality', color: 'purple' }
-                ]"
-              />
-              <div class="slider-description">
-                <template v-if="wizardData.speedQuality < 34">
-                  <strong>Max Speed Mode</strong>
-                  <p>Lower context size, larger batches, optimized GPU layers for maximum throughput</p>
-                </template>
-                <template v-else-if="wizardData.speedQuality < 67">
-                  <strong>Balanced Mode</strong>
-                  <p>Optimal balance between speed and quality with moderate context and batch sizes</p>
-                </template>
-                <template v-else>
-                  <strong>Max Quality Mode</strong>
-                  <p>Higher context size, better quantization, full GPU offloading for maximum quality</p>
-                </template>
-              </div>
-            </div>
-          </div>
-        </div>
-
-        <!-- Step 3: Review & Preview -->
-        <div v-if="currentStep === 2" class="step-panel">
-          <h3>Review Generated Configuration</h3>
-          <p class="step-description">Review the settings we've generated for you. You can fine-tune them later.</p>
-          
-          <div class="config-preview">
-            <div class="preview-header">
-              <div class="preview-summary">
-                <h4>Configuration Summary</h4>
-                <div class="summary-badges">
-                  <span class="badge">Use Case: {{ getUseCaseTitle(wizardData.useCase) }}</span>
-                  <span class="badge">Mode: {{ wizardData.speedQuality < 50 ? 'Speed' : 'Quality' }}</span>
-                </div>
-              </div>
-            </div>
-            
-            <div class="preview-settings">
-              <div class="preview-item" v-for="(value, key) in generatedConfig" :key="key">
-                <span class="setting-label">{{ formatSettingName(key) }}:</span>
-                <span class="setting-value">{{ formatSettingValue(value) }}</span>
-              </div>
-            </div>
-            
-            <div class="preview-impact">
-              <div class="impact-item">
-                <i class="pi pi-tachometer-alt"></i>
-                <span>Estimated Performance: {{ estimatedPerformance }}</span>
-              </div>
-              <div class="impact-item">
-                <i class="pi pi-memory"></i>
-                <span>VRAM Usage: {{ estimatedVramUsage }}</span>
-              </div>
-            </div>
-          </div>
-        </div>
-      </div>
-    </div>
-
-    <template #footer>
-      <div class="wizard-footer">
-        <Button 
-          v-if="currentStep > 0"
-          label="Back" 
-          icon="pi pi-arrow-left" 
-          @click="currentStep--"
-          text
-        />
-        <Button 
-          v-if="currentStep < 2"
-          label="Next" 
-          icon="pi pi-arrow-right" 
-          iconPos="right"
-          @click="currentStep++"
-          :disabled="!canProceed"
-        />
-        <Button 
-          v-if="currentStep === 2"
-          label="Apply & Start" 
-          icon="pi pi-check" 
-          @click="applyConfig"
-          severity="success"
-          :loading="applying"
-        />
-        <Button 
-          v-if="currentStep === 2"
-          label="Advanced Mode" 
-          icon="pi pi-sliders-h" 
-          @click="goToAdvanced"
-          severity="secondary"
-          outlined
-        />
-      </div>
-    </template>
-  </Dialog>
-</template>
-
-<script setup>
-import { ref, computed, watch } from 'vue'
-import Dialog from 'primevue/dialog'
-import Button from 'primevue/button'
-import SliderInput from '@/components/SliderInput.vue'
-import { formatFileSize } from '@/utils/formatting'
-
-const props = defineProps({
-  modelVisible: {
-    type: Boolean,
-    default: false
-  },
-  model: {
-    type: Object,
-    default: null
-  },
-  gpuInfo: {
-    type: Object,
-    default: () => ({})
-  },
-  modelLayerInfo: {
-    type: Object,
-    default: null
-  }
-})
-
-const emit = defineEmits(['close', 'apply-config', 'go-to-advanced'])
-
-const visible = computed({
-  get: () => props.modelVisible,
-  set: (val) => {
-    if (!val) emit('close')
-  }
-})
-
-const currentStep = ref(0)
-const applying = ref(false)
-
-const steps = [
-  { label: 'Use Case' },
-  { label: 'Resources' },
-  { label: 'Review' }
-]
-
-// Touch gesture handling for swipe to dismiss
-const touchStartX = ref(0)
-const touchStartY = ref(0)
-const touchThreshold = 50 // Minimum swipe distance
-
-const handleTouchStart = (e) => {
-  if (e.touches && e.touches.length > 0) {
-    touchStartX.value = e.touches[0].clientX
-    touchStartY.value = e.touches[0].clientY
-  }
-}
-
-const handleTouchMove = (e) => {
-  // Prevent default to allow swipe detection
-  if (e.touches && e.touches.length > 0) {
-    const deltaX = e.touches[0].clientX - touchStartX.value
-    const deltaY = e.touches[0].clientY - touchStartY.value
-    
-    // If swiping down significantly, allow dismiss
-    if (deltaY > touchThreshold && Math.abs(deltaX) < Math.abs(deltaY)) {
-      e.preventDefault()
-    }
-  }
-}
-
-const handleTouchEnd = (e) => {
-  if (e.changedTouches && e.changedTouches.length > 0) {
-    const deltaX = e.changedTouches[0].clientX - touchStartX.value
-    const deltaY = e.changedTouches[0].clientY - touchStartY.value
-    
-    // Swipe down to dismiss
-    if (deltaY > touchThreshold && Math.abs(deltaX) < Math.abs(deltaY)) {
-      emit('close')
-    }
-  }
-  
-  touchStartX.value = 0
-  touchStartY.value = 0
-}
-
-const useCases = [
-  {
-    id: 'chat',
-    icon: '💬',
-    title: 'Chat/Conversation',
-    description: 'Natural conversations and Q&A',
-    preset: 'conversational'
-  },
-  {
-    id: 'code',
-    icon: '💻',
-    title: 'Code Generation',
-    description: 'Code completion and generation',
-    preset: 'coding'
-  },
-  {
-    id: 'creative',
-    icon: '✍️',
-    title: 'Creative Writing',
-    description: 'Stories, articles, creative content',
-    preset: 'creative'
-  },
-  {
-    id: 'analysis',
-    icon: '🔍',
-    title: 'Analysis/Research',
-    description: 'Document analysis and research',
-    preset: 'conversational'
-  }
-]
-
-const wizardData = ref({
-  useCase: null,
-  speedQuality: 50 // 0-100 scale
-})
-
-const canProceed = computed(() => {
-  if (currentStep.value === 0) {
-    return wizardData.value.useCase !== null
-  }
-  return true
-})
-
-// Configuration is now generated by the backend API
-// This computed property is kept for preview/estimation purposes only
-const generatedConfig = computed(() => {
-  // Return a minimal preview config based on selections
-  // Actual config will come from backend API
-  const useCase = useCases.find(uc => uc.id === wizardData.value.useCase)
-  const speedQuality = wizardData.value.speedQuality || 50
-  
-  return {
-    // Placeholder values for preview only
-    ctx_size: useCase?.id === 'analysis' ? 16384 : useCase?.id === 'code' ? 8192 : 4096,
-    batch_size: speedQuality < 34 ? 512 : speedQuality < 67 ? 384 : 256,
-    temp: useCase?.id === 'code' ? 0.3 : useCase?.id === 'creative' ? 1.2 : 0.8,
-    temperature: useCase?.id === 'code' ? 0.3 : useCase?.id === 'creative' ? 1.2 : 0.8,
-    n_gpu_layers: props.gpuInfo?.device_count > 0 ? (props.modelLayerInfo?.layer_count || 32) : 0
-  }
-})
-
-const estimatedPerformance = computed(() => {
-  const speedQuality = wizardData.value.speedQuality || 50
-  
-  // Rough estimate based on speed/quality setting
-  // Actual performance will depend on backend-generated config
-  if (speedQuality < 34) {
-    return 'Very Fast (~60+ tok/s)'
-  } else if (speedQuality < 67) {
-    return 'Fast (~40-50 tok/s)'
-  } else {
-    return 'Moderate (~25-35 tok/s)'
-  }
-})
-
-const estimatedVramUsage = computed(() => {
-  if (!props.gpuInfo?.total_vram) return 'N/A'
-  const hasGPU = props.gpuInfo?.device_count > 0
-  if (!hasGPU) return 'CPU-only (0 VRAM)'
-  
-  // Rough estimate - actual VRAM will be calculated by backend
-  const speedQuality = wizardData.value.speedQuality || 50
-  const qualityFactor = speedQuality / 100
-  
-  // Estimate based on quality factor (quality uses more VRAM)
-  const baseEstimate = props.gpuInfo.total_vram * (0.6 + (qualityFactor * 0.3)) // 60-90% of total VRAM
-  const percentage = Math.round(((baseEstimate / props.gpuInfo.total_vram) * 100))
-  
-  return `~${formatFileSize(baseEstimate)} (${percentage}%) / ${formatFileSize(props.gpuInfo.total_vram)}`
-})
-
-const getUseCaseTitle = (useCaseId) => {
-  const useCase = useCases.find(uc => uc.id === useCaseId)
-  return useCase?.title || 'Unknown'
-}
-
-const formatSettingName = (key) => {
-  const names = {
-    n_gpu_layers: 'GPU Layers',
-    ctx_size: 'Context Size',
-    batch_size: 'Batch Size',
-    temp: 'Temperature',
-    top_k: 'Top-K',
-    top_p: 'Top-P',
-    repeat_penalty: 'Repeat Penalty'
-  }
-  return names[key] || key
-}
-
-const formatSettingValue = (value) => {
-  if (typeof value === 'number') {
-    if (value >= 1000) return value.toLocaleString()
-    if (value < 1) return value.toFixed(2)
-    return Math.round(value)
-  }
-  return value
-}
-
-// formatFileSize is now imported from @/utils/formatting
-
-const applyConfig = async () => {
-  applying.value = true
-  try {
-    // Call backend Smart Auto API instead of generating config locally
-    if (!props.model?.id) {
-      console.error('No model ID available')
-      return
-    }
-    
-    const params = new URLSearchParams()
-    if (wizardData.value.speedQuality !== undefined) {
-      params.append('speed_quality', wizardData.value.speedQuality.toString())
-    }
-    if (wizardData.value.useCase) {
-      params.append('use_case', wizardData.value.useCase)
-    }
-    // Default to single_user usage mode for wizard
-    params.append('usage_mode', 'single_user')
-    
-    const url = `/api/models/${props.model.id}/smart-auto${params.toString() ? '?' + params.toString() : ''}`
-    const response = await fetch(url, {
-      method: 'POST',
-      headers: { 'Content-Type': 'application/json' }
-    })
-    
-    if (!response.ok) {
-      const error = await response.json().catch(() => ({ detail: 'Failed to generate configuration' }))
-      throw new Error(error.detail || 'Failed to generate configuration')
-    }
-    
-    const backendConfig = await response.json()
-    
-    // Emit the backend-generated configuration
-    emit('apply-config', backendConfig)
-    visible.value = false
-  } catch (error) {
-    console.error('Error generating configuration:', error)
-    // Fall back to local generation if API fails (graceful degradation)
-    emit('apply-config', generatedConfig.value)
-    visible.value = false
-  } finally {
-    applying.value = false
-  }
-}
-
-const goToAdvanced = () => {
-  emit('go-to-advanced')
-  visible.value = false
-}
-
-// Reset wizard when closed
-watch(visible, (newVal) => {
-  if (!newVal) {
-    currentStep.value = 0
-    wizardData.value = { useCase: null, speedQuality: 50 }
-  }
-})
-</script>
-
-<style scoped>
-.config-wizard-dialog {
-  max-width: 800px;
-}
-
-.wizard-header {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-md);
-}
-
-.wizard-header i {
-  font-size: 2rem;
-  color: var(--accent-cyan);
-}
-
-.wizard-header h2 {
-  margin: 0;
-  font-size: 1.75rem;
-  background: linear-gradient(135deg, #22d3ee, #3b82f6);
-  -webkit-background-clip: text;
-  -webkit-text-fill-color: transparent;
-  background-clip: text;
-}
-
-.wizard-content {
-  padding: var(--spacing-lg) 0;
-}
-
-.wizard-steps {
-  display: flex;
-  justify-content: center;
-  align-items: center;
-  gap: var(--spacing-md);
-  margin-bottom: var(--spacing-xl);
-  padding: var(--spacing-lg) 0;
-}
-
-.wizard-step {
-  display: flex;
-  flex-direction: column;
-  align-items: center;
-  gap: var(--spacing-sm);
-  position: relative;
-  flex: 1;
-  max-width: 150px;
-}
-
-.wizard-step:not(:last-child)::after {
-  content: '';
-  position: absolute;
-  top: 20px;
-  left: calc(50% + 30px);
-  width: calc(100% - 60px);
-  height: 2px;
-  background: var(--border-primary);
-  z-index: 0;
-}
-
-.wizard-step.completed:not(:last-child)::after {
-  background: var(--accent-cyan);
-}
-
-.step-number {
-  width: 40px;
-  height: 40px;
-  border-radius: 50%;
-  display: flex;
-  align-items: center;
-  justify-content: center;
-  background: var(--bg-surface);
-  border: 2px solid var(--border-primary);
-  color: var(--text-secondary);
-  font-weight: 600;
-  font-size: 1.1rem;
-  position: relative;
-  z-index: 1;
-  transition: all var(--transition-normal);
-}
-
-.wizard-step.active .step-number {
-  background: var(--accent-cyan);
-  border-color: var(--accent-cyan);
-  color: white;
-  transform: scale(1.1);
-  box-shadow: 0 0 0 4px rgba(34, 211, 238, 0.2);
-}
-
-.wizard-step.completed .step-number {
-  background: var(--accent-green);
-  border-color: var(--accent-green);
-  color: white;
-}
-
-.step-label {
-  font-size: 0.9rem;
-  color: var(--text-secondary);
-  font-weight: 500;
-  text-align: center;
-}
-
-.wizard-step.active .step-label {
-  color: var(--accent-cyan);
-  font-weight: 600;
-}
-
-.wizard-step-content {
-  min-height: 400px;
-}
-
-.step-panel {
-  padding: var(--spacing-lg);
-}
-
-.step-panel h3 {
-  margin: 0 0 var(--spacing-sm) 0;
-  font-size: 1.5rem;
-  color: var(--text-primary);
-}
-
-.step-description {
-  margin: 0 0 var(--spacing-xl) 0;
-  color: var(--text-secondary);
-  font-size: 0.95rem;
-}
-
-.use-case-grid {
-  display: grid;
-  grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
-  gap: var(--spacing-lg);
-}
-
-.use-case-card {
-  background: var(--bg-surface);
-  border: 2px solid var(--border-primary);
-  border-radius: var(--radius-lg);
-  padding: var(--spacing-lg);
-  cursor: pointer;
-  transition: all var(--transition-normal);
-  text-align: center;
-}
-
-.use-case-card:hover {
-  border-color: var(--accent-cyan);
-  transform: translateY(-2px);
-  box-shadow: var(--shadow-md);
-}
-
-.use-case-card.active {
-  border-color: var(--accent-primary);
-  background: rgba(34, 211, 238, 0.1);
-  box-shadow: 0 0 0 3px rgba(34, 211, 238, 0.2);
-}
-
-.use-case-icon {
-  font-size: 3rem;
-  margin-bottom: var(--spacing-sm);
-}
-
-.use-case-card h4 {
-  margin: 0 0 var(--spacing-xs) 0;
-  font-size: 1.1rem;
-  color: var(--text-primary);
-}
-
-.use-case-card p {
-  margin: 0;
-  font-size: 0.875rem;
-  color: var(--text-secondary);
-  line-height: 1.4;
-}
-
-.resource-section {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-xl);
-}
-
-.hardware-info {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-md);
-  padding: var(--spacing-lg);
-  background: var(--bg-surface);
-  border-radius: var(--radius-lg);
-  border: 1px solid var(--border-primary);
-}
-
-.hardware-info i {
-  font-size: 2rem;
-  color: var(--accent-cyan);
-}
-
-.hardware-info strong {
-  display: block;
-  margin-bottom: var(--spacing-xs);
-  color: var(--text-primary);
-}
-
-.hardware-info p {
-  margin: 0;
-  color: var(--text-secondary);
-  font-size: 0.9rem;
-}
-
-.speed-quality-slider {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-md);
-}
-
-.speed-quality-slider label {
-  font-weight: 600;
-  color: var(--text-primary);
-}
-
-.slider-description {
-  padding: var(--spacing-md);
-  background: var(--bg-surface);
-  border-radius: var(--radius-md);
-  color: var(--text-secondary);
-  font-size: 0.9rem;
-  text-align: center;
-}
-
-.slider-description strong {
-  display: block;
-  color: var(--text-primary);
-  font-size: 1rem;
-  margin-bottom: var(--spacing-xs);
-}
-
-.slider-description p {
-  margin: var(--spacing-xs) 0 0 0;
-  color: var(--text-secondary);
-  font-size: 0.875rem;
-}
-
-.config-preview {
-  background: var(--bg-surface);
-  border: 1px solid var(--border-primary);
-  border-radius: var(--radius-lg);
-  padding: var(--spacing-lg);
-}
-
-.preview-header {
-  margin-bottom: var(--spacing-lg);
-  padding-bottom: var(--spacing-lg);
-  border-bottom: 1px solid var(--border-primary);
-}
-
-.preview-header h4 {
-  margin: 0 0 var(--spacing-md) 0;
-  color: var(--text-primary);
-}
-
-.summary-badges {
-  display: flex;
-  gap: var(--spacing-sm);
-  flex-wrap: wrap;
-}
-
-.badge {
-  padding: var(--spacing-xs) var(--spacing-sm);
-  background: rgba(34, 211, 238, 0.15);
-  color: var(--accent-cyan);
-  border-radius: var(--radius-sm);
-  font-size: 0.875rem;
-  font-weight: 600;
-}
-
-.preview-settings {
-  display: grid;
-  grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
-  gap: var(--spacing-md);
-  margin-bottom: var(--spacing-lg);
-}
-
-.preview-item {
-  display: flex;
-  justify-content: space-between;
-  padding: var(--spacing-sm);
-  background: var(--bg-primary);
-  border-radius: var(--radius-sm);
-}
-
-.setting-label {
-  font-weight: 500;
-  color: var(--text-secondary);
-}
-
-.setting-value {
-  font-weight: 600;
-  color: var(--text-primary);
-}
-
-.preview-impact {
-  display: flex;
-  gap: var(--spacing-lg);
-  padding-top: var(--spacing-lg);
-  border-top: 1px solid var(--border-primary);
-}
-
-.impact-item {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-sm);
-  color: var(--text-primary);
-}
-
-.impact-item i {
-  color: var(--accent-cyan);
-}
-
-.wizard-footer {
-  display: flex;
-  justify-content: space-between;
-  gap: var(--spacing-md);
-}
-</style>
-
diff --git a/frontend/src/components/config/ContextParamsSection.vue b/frontend/src/components/config/ContextParamsSection.vue
deleted file mode 100644
index 211cc6f..0000000
--- a/frontend/src/components/config/ContextParamsSection.vue
+++ /dev/null
@@ -1,163 +0,0 @@
-<template>
-  <div class="tab-content">
-    <div class="tab-section">
-      <h4 class="tab-section-title">
-        <i class="pi pi-memory"></i>
-        Context & Memory
-      </h4>
-      <ConfigField 
-        label="Context Size" 
-        :tooltip="contextSizeTooltip"
-        :help-text="`Max context length (max: ${maxContextSize.toLocaleString()})`"
-      >
-        <template #input>
-          <SliderInput 
-            v-model="config.ctx_size" 
-            :min="512" 
-            :max="maxContextSize" 
-            :recommended="recommendedContextSize" 
-            @input="$emit('update-vram-estimate')" 
-          />
-        </template>
-        <template #validation>
-          <div v-if="contextSizeValidation" class="inline-validation" :class="contextSizeValidation.type">
-            <i class="pi pi-exclamation-triangle"></i>
-            <span>{{ contextSizeValidation.message }}</span>
-          </div>
-        </template>
-      </ConfigField>
-      <ConfigField 
-        label="Batch Size" 
-        :tooltip="batchSizeTooltip"
-        :help-text="`Parallel tokens (max: ${maxBatchSize})`"
-      >
-        <template #input>
-          <SliderInput 
-            v-model="config.batch_size" 
-            :min="1" 
-            :max="maxBatchSize" 
-            :recommended="recommendedBatchSize" 
-            @input="$emit('update-vram-estimate')" 
-          />
-        </template>
-        <template #validation>
-          <div v-if="batchSizeValidation" class="inline-validation" :class="batchSizeValidation.type">
-            <i class="pi pi-exclamation-triangle"></i>
-            <span>{{ batchSizeValidation.message }}</span>
-          </div>
-        </template>
-      </ConfigField>
-      <ConfigField label="U-Batch Size" :help-text="`Unified batch (max: ${maxBatchSize})`">
-        <template #input>
-          <SliderInput v-model="config.ubatch_size" :min="1" :max="maxBatchSize" />
-        </template>
-      </ConfigField>
-      <ConfigField label="No Memory Map" help-text="Disable mmap">
-        <template #input>
-          <Checkbox v-model="config.no_mmap" binary />
-        </template>
-      </ConfigField>
-      <ConfigField label="Mlock" help-text="Lock model in RAM (prevent swapping)">
-        <template #input>
-          <Checkbox v-model="config.mlock" binary />
-        </template>
-      </ConfigField>
-    </div>
-  </div>
-</template>
-
-<script setup>
-import ConfigField from './ConfigField.vue'
-import SliderInput from '@/components/SliderInput.vue'
-import Checkbox from 'primevue/checkbox'
-
-defineProps({
-  config: {
-    type: Object,
-    required: true
-  },
-  maxContextSize: {
-    type: Number,
-    default: 131072
-  },
-  maxBatchSize: {
-    type: Number,
-    default: 512
-  },
-  recommendedContextSize: {
-    type: Number,
-    default: null
-  },
-  recommendedBatchSize: {
-    type: Number,
-    default: null
-  },
-  contextSizeValidation: {
-    type: Object,
-    default: null
-  },
-  batchSizeValidation: {
-    type: Object,
-    default: null
-  },
-  contextSizeTooltip: {
-    type: String,
-    default: ''
-  },
-  batchSizeTooltip: {
-    type: String,
-    default: ''
-  }
-})
-
-defineEmits(['update-vram-estimate'])
-</script>
-
-<style scoped>
-.tab-content {
-  display: flex;
-  flex-direction: column;
-  gap: 1.5rem;
-}
-
-.tab-section {
-  display: flex;
-  flex-direction: column;
-  gap: 1rem;
-}
-
-.tab-section-title {
-  display: flex;
-  align-items: center;
-  gap: 0.5rem;
-  margin: 0 0 0.75rem 0;
-  color: var(--text-primary);
-  font-size: 1.1rem;
-  font-weight: 600;
-}
-
-.tab-section-title i {
-  color: var(--accent-cyan);
-}
-
-.inline-validation {
-  display: flex;
-  align-items: center;
-  gap: 0.5rem;
-  margin-top: 0.5rem;
-  font-size: 0.875rem;
-}
-
-.inline-validation.error {
-  color: var(--status-error);
-}
-
-.inline-validation.warning {
-  color: var(--status-warning);
-}
-
-.inline-validation.success {
-  color: var(--status-success);
-}
-</style>
-
diff --git a/frontend/src/components/config/CustomArgsSection.vue b/frontend/src/components/config/CustomArgsSection.vue
deleted file mode 100644
index b6661d8..0000000
--- a/frontend/src/components/config/CustomArgsSection.vue
+++ /dev/null
@@ -1,32 +0,0 @@
-<template>
-  <div class="custom-args-section">
-    <ConfigField label="Custom Arguments" help-text="Additional command-line arguments" full-width>
-      <template #input>
-        <Textarea v-model="config.customArgs" rows="4" placeholder="Enter custom llama.cpp arguments..." />
-      </template>
-    </ConfigField>
-  </div>
-</template>
-
-<script setup>
-// PrimeVue
-import Textarea from 'primevue/textarea'
-
-// Components
-import ConfigField from '@/components/config/ConfigField.vue'
-
-const props = defineProps({
-  config: {
-    type: Object,
-    required: true
-  }
-})
-</script>
-
-<style scoped>
-.custom-args-section {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-md);
-}
-</style>
diff --git a/frontend/src/components/config/EmptyState.vue b/frontend/src/components/config/EmptyState.vue
deleted file mode 100644
index f70186d..0000000
--- a/frontend/src/components/config/EmptyState.vue
+++ /dev/null
@@ -1,145 +0,0 @@
-<template>
-  <div v-if="visible" class="empty-state">
-    <div class="empty-state-content">
-      <div class="empty-icon" aria-hidden="true">{{ icon }}</div>
-      <h3 class="empty-title">{{ title }}</h3>
-      <p class="empty-description">{{ description }}</p>
-      
-      <div class="empty-actions">
-        <slot name="actions">
-          <Button 
-            v-if="showSmartAuto"
-            label="Use Smart Auto" 
-            icon="pi pi-bolt"
-            @click="$emit('smart-auto')"
-            severity="info"
-            size="large"
-            class="empty-action-primary"
-            aria-label="Use Smart Auto to automatically configure settings"
-          />
-          <Button 
-            v-if="showPresets"
-            label="Choose Preset" 
-            icon="pi pi-sliders-h"
-            @click="$emit('presets')"
-            severity="secondary"
-            outlined
-            size="large"
-            aria-label="Choose a preset configuration"
-          />
-          <Button 
-            label="Manual Setup" 
-            icon="pi pi-cog"
-            @click="$emit('manual')"
-            text
-            size="large"
-            aria-label="Configure settings manually"
-          />
-        </slot>
-      </div>
-    </div>
-  </div>
-</template>
-
-<script setup>
-import Button from 'primevue/button'
-
-const props = defineProps({
-  visible: {
-    type: Boolean,
-    default: false
-  },
-  icon: {
-    type: String,
-    default: '🎯'
-  },
-  title: {
-    type: String,
-    default: 'Configure Your First Model'
-  },
-  description: {
-    type: String,
-    default: 'Start with a preset or let Smart Auto optimize for you'
-  },
-  showSmartAuto: {
-    type: Boolean,
-    default: true
-  },
-  showPresets: {
-    type: Boolean,
-    default: true
-  }
-})
-
-defineEmits(['smart-auto', 'presets', 'manual'])
-</script>
-
-<style scoped>
-.empty-state {
-  display: flex;
-  align-items: center;
-  justify-content: center;
-  min-height: 400px;
-  padding: var(--spacing-xl);
-  margin: var(--spacing-xl) 0;
-}
-
-.empty-state-content {
-  text-align: center;
-  max-width: 500px;
-  width: 100%;
-}
-
-.empty-icon {
-  font-size: 4rem;
-  line-height: 1;
-  margin-bottom: var(--spacing-lg);
-  animation: float 3s ease-in-out infinite;
-}
-
-@keyframes float {
-  0%, 100% {
-    transform: translateY(0);
-  }
-  50% {
-    transform: translateY(-10px);
-  }
-}
-
-.empty-title {
-  margin: 0 0 var(--spacing-md) 0;
-  font-size: 1.75rem;
-  font-weight: 700;
-  color: var(--text-primary);
-  background: linear-gradient(135deg, #22d3ee, #3b82f6);
-  -webkit-background-clip: text;
-  -webkit-text-fill-color: transparent;
-  background-clip: text;
-}
-
-.empty-description {
-  margin: 0 0 var(--spacing-xl) 0;
-  font-size: 1rem;
-  color: var(--text-secondary);
-  line-height: 1.6;
-}
-
-.empty-actions {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-md);
-  align-items: center;
-}
-
-.empty-action-primary {
-  min-width: 200px;
-}
-
-@media (min-width: 600px) {
-  .empty-actions {
-    flex-direction: row;
-    justify-content: center;
-  }
-}
-</style>
-
diff --git a/frontend/src/components/config/EssentialSettingsSection.vue b/frontend/src/components/config/EssentialSettingsSection.vue
deleted file mode 100644
index 13580b4..0000000
--- a/frontend/src/components/config/EssentialSettingsSection.vue
+++ /dev/null
@@ -1,182 +0,0 @@
-<template>
-  <div class="essential-settings-section">
-    <div class="tab-section">
-      <h4 class="tab-section-title">
-        <i class="pi pi-microchip"></i>
-        Model Loading
-      </h4>
-      <ConfigField 
-        v-if="!systemStore.gpuInfo.cpu_only_mode" 
-        label="GPU Layers" 
-        :tooltip="gpuLayersTooltip"
-        :help-text="`Layers offloaded to GPU (max: ${maxGpuLayers})`"
-      >
-        <template #input>
-          <SliderInput 
-            v-model="config.n_gpu_layers" 
-            :min="0" 
-            :max="maxGpuLayers" 
-            :recommended="recommendedGpuLayers" 
-            :disabled="!gpuAvailable"
-            @input="$emit('update-vram-estimate')" 
-          />
-        </template>
-        <template #validation>
-          <div v-if="gpuLayersValidation" class="inline-validation" :class="gpuLayersValidation.type">
-            <i :class="gpuLayersValidation.type === 'error' ? 'pi pi-times-circle' : 'pi pi-check-circle'"></i>
-            <span>{{ gpuLayersValidation.message }}</span>
-          </div>
-        </template>
-      </ConfigField>
-      <ConfigField v-if="!systemStore.gpuInfo.cpu_only_mode" label="Main GPU" help-text="Primary GPU">
-        <template #input>
-          <Dropdown 
-            v-model="config.main_gpu" 
-            :options="gpuOptions" 
-            optionLabel="label"
-            optionValue="value"
-            placeholder="Select GPU" 
-            :disabled="!gpuAvailable" 
-          />
-        </template>
-      </ConfigField>
-      <ConfigField v-if="!systemStore.gpuInfo.cpu_only_mode" label="Tensor Split" help-text="Multi-GPU ratios">
-        <template #input>
-          <InputText 
-            v-model="config.tensor_split" 
-            placeholder="0.5,0.5" 
-            :disabled="!gpuAvailable" 
-          />
-        </template>
-      </ConfigField>
-      <ConfigField v-if="!systemStore.gpuInfo.cpu_only_mode" label="Split Mode" help-text="How to split model across multiple GPUs">
-        <template #input>
-          <Dropdown 
-            v-model="config.split_mode" 
-            :options="splitModeOptions" 
-            optionLabel="label"
-            optionValue="value"
-            placeholder="Select split mode" 
-            :disabled="!gpuAvailable" 
-          />
-        </template>
-      </ConfigField>
-      <ConfigField label="CPU Threads" help-text="CPU threads for computation">
-        <template #input>gi
-          <SliderInput 
-            v-model="config.threads" 
-            :min="1" 
-            :max="systemStore.gpuInfo.cpu_threads" 
-          />
-        </template>
-      </ConfigField>
-    </div>
-  </div>
-</template>
-
-<script setup>
-// PrimeVue
-import Dropdown from 'primevue/dropdown'
-import InputText from 'primevue/inputtext'
-
-// Components
-import ConfigField from '@/components/config/ConfigField.vue'
-import SliderInput from '@/components/SliderInput.vue'
-
-// Stores
-import { useSystemStore } from '@/stores/system'
-import { computed } from 'vue'
-
-const props = defineProps({
-  config: {
-    type: Object,
-    required: true
-  },
-  maxGpuLayers: {
-    type: Number,
-    required: true
-  },
-  recommendedGpuLayers: {
-    type: Number,
-    default: null
-  },
-  gpuLayersTooltip: {
-    type: String,
-    default: ''
-  },
-  gpuLayersValidation: {
-    type: Object,
-    default: null
-  },
-  gpuAvailable: {
-    type: Boolean,
-    default: true
-  }
-})
-
-const emit = defineEmits(['update-vram-estimate'])
-
-const systemStore = useSystemStore()
-
-// GPU options
-const gpuOptions = computed(() => {
-  return Array.from({ length: systemStore.gpuInfo.device_count }, (_, i) => ({
-    label: `GPU ${i}`,
-    value: i
-  }))
-})
-
-// Split mode options
-// Note: "graph" mode is supported by ik_llama.cpp fork
-// Backend validation will handle if binary doesn't support it
-const splitModeOptions = [
-  { label: 'None (single GPU)', value: 'none' },
-  { label: 'Layer (default)', value: 'layer' },
-  { label: 'Row', value: 'row' },
-  { label: 'Graph (ik_llama.cpp)', value: 'graph' }
-]
-</script>
-
-<style scoped>
-.essential-settings-section {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-lg);
-}
-
-.tab-section {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-md);
-}
-
-.tab-section-title {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-sm);
-  margin: 0 0 var(--spacing-md) 0;
-  font-size: 1rem;
-  font-weight: 600;
-  color: var(--text-primary);
-}
-
-.inline-validation {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-xs);
-  margin-top: var(--spacing-xs);
-  font-size: 0.875rem;
-}
-
-.inline-validation.warning {
-  color: var(--status-warning);
-}
-
-.inline-validation.error {
-  color: var(--status-error);
-}
-
-.inline-validation.success {
-  color: var(--status-success);
-}
-</style>
diff --git a/frontend/src/components/config/GenerationParamsSection.vue b/frontend/src/components/config/GenerationParamsSection.vue
deleted file mode 100644
index 445d9bc..0000000
--- a/frontend/src/components/config/GenerationParamsSection.vue
+++ /dev/null
@@ -1,244 +0,0 @@
-<template>
-  <div class="tab-content">
-    <div class="tab-section">
-      <h4 class="tab-section-title">
-        <i class="pi pi-cog"></i>
-        Sampling Parameters
-      </h4>
-      <ConfigField label="Max Predict" help-text="Max tokens (-1=unlimited)">
-        <template #input>
-          <InputNumber v-model="config.n_predict" :min="-1" :max="2048" />
-        </template>
-      </ConfigField>
-      <ConfigField 
-        label="Temperature" 
-        :tooltip="temperatureTooltip"
-        :help-text="getTemperatureTooltip()"
-      >
-        <template #input>
-          <SliderInput 
-            v-model="config.temp" 
-            :min="0.1" 
-            :max="2.0" 
-            :step="0.1" 
-            :maxFractionDigits="1"
-            :markers="[
-              { value: 0.3, label: 'Code', color: 'blue' },
-              { value: 0.8, label: 'Chat', color: 'green' },
-              { value: 1.5, label: 'Creative', color: 'purple' }
-            ]"
-            :recommended="recommendedTemperature"
-          />
-        </template>
-      </ConfigField>
-      <ConfigField 
-        label="Top-K" 
-        :tooltip="topKTooltip"
-        :help-text="getTopKTooltip()"
-      >
-        <template #input>
-          <SliderInput 
-            v-model="config.top_k" 
-            :min="1" 
-            :max="maxTopK"
-            :recommended="recommendedTopK"
-          />
-        </template>
-      </ConfigField>
-      <ConfigField 
-        label="Top-P" 
-        :tooltip="topPTooltip"
-        :help-text="getTopPTooltip()"
-      >
-        <template #input>
-          <SliderInput 
-            v-model="config.top_p" 
-            :min="0.1" 
-            :max="1.0" 
-            :step="0.1" 
-            :maxFractionDigits="1"
-            :recommended="recommendedTopP"
-          />
-        </template>
-      </ConfigField>
-      <ConfigField 
-        label="Repeat Penalty" 
-        :tooltip="repeatPenaltyTooltip"
-        :help-text="getRepeatPenaltyTooltip()"
-      >
-        <template #input>
-          <SliderInput 
-            v-model="config.repeat_penalty" 
-            :min="0.5" 
-            :max="2.0" 
-            :step="0.05"
-            :maxFractionDigits="2"
-            :recommended="null"
-          />
-        </template>
-      </ConfigField>
-    </div>
-    <div class="tab-section">
-      <h4 class="tab-section-title">
-        <i class="pi pi-sliders-h"></i>
-        Advanced Generation Options
-      </h4>
-      <ConfigField v-if="isMinPSupported" label="Min-P">
-        <template #input>
-          <SliderInput v-model="config.min_p" :min="0.0" :max="1.0" :step="0.05" :maxFractionDigits="2" />
-        </template>
-      </ConfigField>
-      <ConfigField v-if="isTypicalPSupported" label="Typical-P">
-        <template #input>
-          <SliderInput v-model="config.typical_p" :min="0.0" :max="1.0" :step="0.05" :maxFractionDigits="2" />
-        </template>
-      </ConfigField>
-      <ConfigField v-if="isTfsZSupported" label="TFS-Z">
-        <template #input>
-          <SliderInput v-model="config.tfs_z" :min="0.0" :max="1.0" :step="0.05" :maxFractionDigits="2" />
-        </template>
-      </ConfigField>
-      <ConfigField v-if="isPresencePenaltySupported" label="Presence Penalty">
-        <template #input>
-          <SliderInput v-model="config.presence_penalty" :min="0.0" :max="2.0" :step="0.1"
-            :maxFractionDigits="1" />
-        </template>
-      </ConfigField>
-      <ConfigField v-if="isFrequencyPenaltySupported" label="Frequency Penalty">
-        <template #input>
-          <SliderInput v-model="config.frequency_penalty" :min="0.0" :max="2.0" :step="0.1"
-            :maxFractionDigits="1" />
-        </template>
-      </ConfigField>
-      <ConfigField label="Mirostat Mode">
-        <template #input>
-          <Dropdown v-model="config.mirostat"
-            :options="[{ label: 'Off (0)', value: 0 }, { label: 'Mirostat (1)', value: 1 }, { label: 'Mirostat 2.0 (2)', value: 2 }]"
-            optionLabel="label" optionValue="value" />
-        </template>
-      </ConfigField>
-      <ConfigField v-if="config.mirostat > 0" label="Mirostat Eta">
-        <template #input>
-          <InputNumber v-model="config.mirostat_eta" :min="0" :max="10" :step="0.1" />
-        </template>
-      </ConfigField>
-      <ConfigField v-if="config.mirostat > 0" label="Mirostat Tau">
-        <template #input>
-          <InputNumber v-model="config.mirostat_tau" :min="0" :max="10" :step="0.1" />
-        </template>
-      </ConfigField>
-    </div>
-  </div>
-</template>
-
-<script setup>
-import { computed } from 'vue'
-import ConfigField from './ConfigField.vue'
-import SliderInput from '@/components/SliderInput.vue'
-import InputNumber from 'primevue/inputnumber'
-import Dropdown from 'primevue/dropdown'
-
-const props = defineProps({
-  config: {
-    type: Object,
-    required: true
-  },
-  maxTopK: {
-    type: Number,
-    default: 200
-  },
-  recommendedTemperature: {
-    type: Number,
-    default: null
-  },
-  recommendedTopK: {
-    type: Number,
-    default: null
-  },
-  recommendedTopP: {
-    type: Number,
-    default: null
-  },
-  isMinPSupported: {
-    type: Boolean,
-    default: false
-  },
-  isTypicalPSupported: {
-    type: Boolean,
-    default: false
-  },
-  isTfsZSupported: {
-    type: Boolean,
-    default: false
-  },
-  isPresencePenaltySupported: {
-    type: Boolean,
-    default: false
-  },
-  isFrequencyPenaltySupported: {
-    type: Boolean,
-    default: false
-  },
-  temperatureTooltip: {
-    type: String,
-    default: ''
-  },
-  topKTooltip: {
-    type: String,
-    default: ''
-  },
-  topPTooltip: {
-    type: String,
-    default: ''
-  },
-  repeatPenaltyTooltip: {
-    type: String,
-    default: ''
-  }
-})
-
-const getTemperatureTooltip = () => {
-  return props.temperatureTooltip || 'Controls randomness. Lower = more deterministic, Higher = more creative'
-}
-
-const getTopKTooltip = () => {
-  return props.topKTooltip || 'Consider top K tokens. Lower = more focused, Higher = more diverse'
-}
-
-const getTopPTooltip = () => {
-  return props.topPTooltip || 'Nucleus sampling. Lower = more focused, Higher = more diverse'
-}
-
-const getRepeatPenaltyTooltip = () => {
-  return props.repeatPenaltyTooltip || 'Penalize repetition. Higher = less repetition'
-}
-</script>
-
-<style scoped>
-.tab-content {
-  display: flex;
-  flex-direction: column;
-  gap: 1.5rem;
-}
-
-.tab-section {
-  display: flex;
-  flex-direction: column;
-  gap: 1rem;
-}
-
-.tab-section-title {
-  display: flex;
-  align-items: center;
-  gap: 0.5rem;
-  margin: 0 0 0.75rem 0;
-  color: var(--text-primary);
-  font-size: 1.1rem;
-  font-weight: 600;
-}
-
-.tab-section-title i {
-  color: var(--accent-cyan);
-}
-</style>
-
diff --git a/frontend/src/components/config/MemoryMonitor.vue b/frontend/src/components/config/MemoryMonitor.vue
deleted file mode 100644
index 2606aa4..0000000
--- a/frontend/src/components/config/MemoryMonitor.vue
+++ /dev/null
@@ -1,439 +0,0 @@
-<template>
-  <div class="memory-status-card" :class="statusClass" role="region" :aria-label="`${title} memory status`">
-    <div class="memory-card-header">
-      <div class="memory-status-icon">
-        <i v-if="status === 'good'" class="pi pi-check-circle"></i>
-        <i v-else-if="status === 'warning'" class="pi pi-exclamation-triangle"></i>
-        <i v-else class="pi pi-times-circle"></i>
-      </div>
-      <div class="memory-card-title">
-        <h4>{{ title }}</h4>
-        <span class="memory-status-badge">{{ statusText }}</span>
-      </div>
-    </div>
-    
-    <div class="memory-status-content" v-if="!loading">
-      <div class="memory-usage-display">
-        <div class="usage-item" v-if="currentValue !== null">
-          <span class="usage-label">Current:</span>
-          <span class="usage-value">{{ formatFileSize(currentValue) }}</span>
-        </div>
-        <div class="usage-item" v-if="estimatedValue !== null">
-          <span class="usage-label">+ Model:</span>
-          <span class="usage-value">{{ formatFileSize(estimatedValue) }}</span>
-        </div>
-        <div class="usage-item total">
-          <span class="usage-label">= Total:</span>
-          <span class="usage-value">{{ formatFileSize(totalValue) }}</span>
-          <span class="usage-fraction">/ {{ formatFileSize(totalCapacity) }}</span>
-        </div>
-      </div>
-      
-      <div class="memory-progress-bar">
-        <div class="stacked-bar" :class="statusClass">
-          <div class="bar-current" :style="{ width: currentPercent + '%' }"></div>
-          <div class="bar-additional"
-            :style="{ width: additionalPercent + '%', left: currentPercent + '%' }"></div>
-        </div>
-        <div class="progress-label">{{ progressText }}</div>
-      </div>
-      
-      <div class="memory-message" :class="statusClass">
-        {{ statusMessage }}
-      </div>
-    </div>
-    
-    <div v-else class="memory-loading">
-      <i class="pi pi-spin pi-spinner"></i>
-      <span>Loading data...</span>
-    </div>
-  </div>
-</template>
-
-<script setup>
-import { computed } from 'vue'
-import { formatFileSize } from '@/utils/formatting'
-
-const props = defineProps({
-  title: {
-    type: String,
-    required: true
-  },
-  currentValue: {
-    type: Number,
-    default: null
-  },
-  estimatedValue: {
-    type: Number,
-    default: null
-  },
-  totalCapacity: {
-    type: Number,
-    required: true
-  },
-  loading: {
-    type: Boolean,
-    default: false
-  }
-})
-
-// formatFileSize is now imported from @/utils/formatting
-
-const totalValue = computed(() => {
-  const current = props.currentValue || 0
-  const estimated = props.estimatedValue || 0
-  return current + estimated
-})
-
-const status = computed(() => {
-  if (props.loading) return 'unknown'
-  const usagePercent = props.totalCapacity > 0 
-    ? (totalValue.value / props.totalCapacity) * 100
-    : 0
-  
-  if (usagePercent < 70) return 'good'
-  if (usagePercent < 90) return 'warning'
-  return 'critical'
-})
-
-const statusClass = computed(() => {
-  const s = status.value
-  if (s === 'good') return 'status-good'
-  if (s === 'warning') return 'status-warning'
-  if (s === 'critical') return 'status-critical'
-  return 'status-unknown'
-})
-
-const statusText = computed(() => {
-  const s = status.value
-  if (s === 'good') return 'Fits Comfortably'
-  if (s === 'warning') return 'Tight Fit'
-  if (s === 'critical') return 'Won\'t Fit'
-  return 'Unknown'
-})
-
-const currentPercent = computed(() => {
-  const total = props.totalCapacity || 1
-  const current = props.currentValue || 0
-  return Math.min(100, Math.max(0, Math.round((current / total) * 100)))
-})
-
-const additionalPercent = computed(() => {
-  const total = props.totalCapacity || 1
-  const additional = props.estimatedValue || 0
-  const pct = Math.round((additional / total) * 100)
-  return Math.max(0, Math.min(100 - currentPercent.value, pct))
-})
-
-const progressText = computed(() => {
-  return `${currentPercent.value}% used + ${additionalPercent.value}% est • ${formatFileSize(totalValue.value)} total`
-})
-
-const statusMessage = computed(() => {
-  const s = status.value
-  if (s === 'good') {
-    const available = props.totalCapacity - totalValue.value
-    return `✅ Fits Comfortably - ${formatFileSize(available)} buffer remaining`
-  }
-  if (s === 'warning') {
-    return '⚠️ Usage is high - consider optimizing configuration'
-  }
-  if (s === 'critical') {
-    return '❌ Usage exceeds capacity - configuration will not work'
-  }
-  return 'Loading information...'
-})
-</script>
-
-<style scoped>
-.memory-status-card {
-  background: var(--gradient-card);
-  border: 2px solid var(--border-primary);
-  border-radius: var(--radius-xl);
-  padding: var(--spacing-xl);
-  transition: all var(--transition-normal);
-  position: relative;
-  overflow: hidden;
-}
-
-.memory-status-card::before {
-  content: '';
-  position: absolute;
-  top: 0;
-  left: 0;
-  right: 0;
-  height: 4px;
-  background: var(--border-primary);
-  transition: all var(--transition-normal);
-}
-
-.memory-status-card.status-good::before {
-  background: linear-gradient(
-    90deg, 
-    var(--status-success), 
-    color-mix(in srgb, var(--status-success) 70%, var(--bg-primary))
-  );
-}
-
-.memory-status-card.status-warning::before {
-  background: linear-gradient(
-    90deg, 
-    var(--status-warning), 
-    color-mix(in srgb, var(--status-warning) 70%, var(--bg-primary))
-  );
-}
-
-.memory-status-card.status-critical::before {
-  background: linear-gradient(
-    90deg, 
-    var(--status-error), 
-    color-mix(in srgb, var(--status-error) 70%, var(--bg-primary))
-  );
-}
-
-.memory-card-header {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-md);
-  margin-bottom: var(--spacing-lg);
-}
-
-.memory-status-icon {
-  width: 48px;
-  height: 48px;
-  display: flex;
-  align-items: center;
-  justify-content: center;
-  border-radius: var(--radius-lg);
-  font-size: 1.5rem;
-  flex-shrink: 0;
-}
-
-.memory-status-card.status-good .memory-status-icon {
-  background: var(--status-success-soft);
-  color: var(--status-success);
-}
-
-.memory-status-card.status-warning .memory-status-icon {
-  background: var(--status-warning-soft);
-  color: var(--status-warning);
-}
-
-.memory-status-card.status-critical .memory-status-icon {
-  background: var(--status-error-soft);
-  color: var(--status-error);
-}
-
-.memory-card-title {
-  flex: 1;
-}
-
-.memory-card-title h4 {
-  margin: 0 0 var(--spacing-xs) 0;
-  font-size: 1.25rem;
-  font-weight: 600;
-  color: var(--text-primary);
-}
-
-.memory-status-badge {
-  display: inline-block;
-  padding: var(--spacing-xs) var(--spacing-sm);
-  border-radius: var(--radius-sm);
-  font-size: 0.875rem;
-  font-weight: 600;
-  letter-spacing: 0.5px;
-}
-
-.memory-status-card.status-good .memory-status-badge {
-  background: var(--status-success-soft);
-  color: var(--status-success);
-}
-
-.memory-status-card.status-warning .memory-status-badge {
-  background: var(--status-warning-soft);
-  color: var(--status-warning);
-}
-
-.memory-status-card.status-critical .memory-status-badge {
-  background: var(--status-error-soft);
-  color: var(--status-error);
-}
-
-.memory-status-content {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-md);
-}
-
-.memory-usage-display {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-sm);
-  padding: var(--spacing-md);
-  background: var(--bg-surface);
-  border-radius: var(--radius-md);
-  min-width: 0;
-  box-sizing: border-box;
-}
-
-.usage-item {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-sm);
-  font-size: 0.9rem;
-  min-width: 0;
-  flex-wrap: wrap;
-}
-
-.usage-value {
-  flex-shrink: 0;
-  white-space: nowrap;
-}
-
-.usage-fraction {
-  flex-shrink: 0;
-  white-space: nowrap;
-}
-
-.usage-item.total {
-  font-weight: 600;
-  padding-top: var(--spacing-sm);
-  border-top: 1px solid var(--border-primary);
-  font-size: 1rem;
-}
-
-.usage-label {
-  color: var(--text-secondary);
-  min-width: 80px;
-}
-
-.usage-value {
-  color: var(--text-primary);
-  font-weight: 500;
-  flex: 1;
-}
-
-.usage-item.total .usage-value {
-  font-weight: 700;
-  color: var(--accent-cyan);
-}
-
-.usage-fraction {
-  color: var(--text-secondary);
-  font-size: 0.875rem;
-}
-
-.memory-progress-bar {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-sm);
-}
-
-.progress-label {
-  font-size: 0.875rem;
-  color: var(--text-secondary);
-  text-align: center;
-}
-
-.memory-message {
-  padding: var(--spacing-md);
-  border-radius: var(--radius-md);
-  font-size: 0.9rem;
-  line-height: 1.5;
-  font-weight: 500;
-}
-
-.memory-message.status-good {
-  background: var(--status-success-soft);
-  color: var(--status-success);
-  border: 1px solid color-mix(in srgb, var(--status-success) 40%, transparent);
-}
-
-.memory-message.status-warning {
-  background: var(--status-warning-soft);
-  color: var(--status-warning);
-  border: 1px solid color-mix(in srgb, var(--status-warning) 40%, transparent);
-}
-
-.memory-message.status-critical {
-  background: var(--status-error-soft);
-  color: var(--status-error);
-  border: 1px solid color-mix(in srgb, var(--status-error) 40%, transparent);
-}
-
-.memory-loading {
-  display: flex;
-  flex-direction: column;
-  align-items: center;
-  justify-content: center;
-  gap: var(--spacing-md);
-  padding: var(--spacing-xl);
-  color: var(--text-secondary);
-}
-
-.memory-loading i {
-  font-size: 2rem;
-  color: var(--accent-primary);
-  animation: spin 1s linear infinite;
-}
-
-.stacked-bar {
-  position: relative;
-  width: 100%;
-  height: 10px;
-  background: var(--bg-secondary);
-  border-radius: 5px;
-  overflow: hidden;
-}
-
-.stacked-bar .bar-current {
-  position: absolute;
-  left: 0;
-  top: 0;
-  bottom: 0;
-  background: var(--status-warning);
-}
-
-.stacked-bar .bar-additional {
-  position: absolute;
-  top: 0;
-  bottom: 0;
-  background: linear-gradient(90deg, var(--accent-blue), var(--accent-cyan));
-  opacity: 0.8;
-  transform-origin: left;
-}
-
-.stacked-bar.status-good .bar-current {
-  background: var(--status-success);
-}
-
-.stacked-bar.status-warning .bar-current {
-  background: var(--status-warning);
-}
-
-.stacked-bar.status-critical .bar-current {
-  background: var(--status-error);
-}
-
-.stacked-bar.status-good {
-  box-shadow: inset 0 0 0 1px color-mix(in srgb, var(--status-success) 45%, transparent);
-}
-
-.stacked-bar.status-warning {
-  box-shadow: inset 0 0 0 1px color-mix(in srgb, var(--status-warning) 45%, transparent);
-}
-
-.stacked-bar.status-critical {
-  box-shadow: inset 0 0 0 1px color-mix(in srgb, var(--status-error) 45%, transparent);
-}
-
-@keyframes spin {
-  from {
-    transform: rotate(0deg);
-  }
-  to {
-    transform: rotate(360deg);
-  }
-}
-</style>
-
diff --git a/frontend/src/components/config/MemoryParamsSection.vue b/frontend/src/components/config/MemoryParamsSection.vue
deleted file mode 100644
index 59d600a..0000000
--- a/frontend/src/components/config/MemoryParamsSection.vue
+++ /dev/null
@@ -1,148 +0,0 @@
-<template>
-  <div class="tab-content">
-    <div class="tab-section">
-      <h4 class="tab-section-title">
-        <i class="pi pi-microchip"></i>
-        Model Loading
-      </h4>
-      <ConfigField 
-        v-if="!cpuOnlyMode" 
-        label="GPU Layers" 
-        :tooltip="gpuLayersTooltip"
-        :help-text="`Layers offloaded to GPU (max: ${maxGpuLayers})`"
-      >
-        <template #input>
-          <SliderInput 
-            v-model="config.n_gpu_layers" 
-            :min="0" 
-            :max="maxGpuLayers" 
-            :recommended="recommendedGpuLayers" 
-            :disabled="!gpuAvailable"
-            @input="$emit('update-vram-estimate')" 
-          />
-        </template>
-        <template #validation>
-          <div v-if="gpuLayersValidation" class="inline-validation" :class="gpuLayersValidation.type">
-            <i :class="gpuLayersValidation.type === 'error' ? 'pi pi-times-circle' : 'pi pi-check-circle'"></i>
-            <span>{{ gpuLayersValidation.message }}</span>
-          </div>
-        </template>
-      </ConfigField>
-      <ConfigField v-if="!cpuOnlyMode" label="Main GPU" help-text="Primary GPU">
-        <template #input>
-          <Dropdown 
-            v-model="config.main_gpu" 
-            :options="gpuOptions" 
-            optionLabel="label" 
-            optionValue="value"
-            placeholder="Select GPU" 
-            :disabled="!gpuAvailable" 
-          />
-        </template>
-      </ConfigField>
-      <ConfigField v-if="!cpuOnlyMode" label="Tensor Split" help-text="Multi-GPU ratios">
-        <template #input>
-          <InputText v-model="config.tensor_split" placeholder="0.5,0.5" :disabled="!gpuAvailable" />
-        </template>
-      </ConfigField>
-      <ConfigField label="CPU Threads" help-text="CPU threads for computation">
-        <template #input>
-          <SliderInput v-model="config.threads" :min="1" :max="cpuThreads" />
-        </template>
-      </ConfigField>
-    </div>
-  </div>
-</template>
-
-<script setup>
-import ConfigField from './ConfigField.vue'
-import SliderInput from '@/components/SliderInput.vue'
-import InputText from 'primevue/inputtext'
-import Dropdown from 'primevue/dropdown'
-
-defineProps({
-  config: {
-    type: Object,
-    required: true
-  },
-  cpuOnlyMode: {
-    type: Boolean,
-    default: false
-  },
-  gpuAvailable: {
-    type: Boolean,
-    default: true
-  },
-  maxGpuLayers: {
-    type: Number,
-    default: 32
-  },
-  recommendedGpuLayers: {
-    type: Number,
-    default: null
-  },
-  cpuThreads: {
-    type: Number,
-    default: 8
-  },
-  gpuOptions: {
-    type: Array,
-    default: () => []
-  },
-  gpuLayersValidation: {
-    type: Object,
-    default: null
-  },
-  gpuLayersTooltip: {
-    type: String,
-    default: ''
-  }
-})
-
-defineEmits(['update-vram-estimate'])
-</script>
-
-<style scoped>
-.tab-content {
-  display: flex;
-  flex-direction: column;
-  gap: 1.5rem;
-}
-
-.tab-section {
-  display: flex;
-  flex-direction: column;
-  gap: 1rem;
-}
-
-.tab-section-title {
-  display: flex;
-  align-items: center;
-  gap: 0.5rem;
-  margin: 0 0 0.75rem 0;
-  color: var(--text-primary);
-  font-size: 1.1rem;
-  font-weight: 600;
-}
-
-.tab-section-title i {
-  color: var(--accent-cyan);
-}
-
-.inline-validation {
-  display: flex;
-  align-items: center;
-  gap: 0.5rem;
-  margin-top: 0.5rem;
-  font-size: 0.875rem;
-}
-
-.inline-validation.error {
-  color: var(--status-error);
-}
-
-.inline-validation.success {
-  color: var(--status-success);
-}
-</style>
-
diff --git a/frontend/src/components/config/ModelInfoSection.vue b/frontend/src/components/config/ModelInfoSection.vue
deleted file mode 100644
index 37e0c0d..0000000
--- a/frontend/src/components/config/ModelInfoSection.vue
+++ /dev/null
@@ -1,177 +0,0 @@
-<template>
-  <div class="model-info-section">
-    <div class="model-info">
-      <h2 class="card-title">{{ model?.huggingface_id || model?.name || 'Model Configuration' }}</h2>
-      <div class="model-tags">
-        <span class="model-tag tag-size">{{ formatFileSize(model?.file_size) }}</span>
-        <span class="model-tag tag-quantization">{{ model?.quantization }}</span>
-        <span class="model-tag tag-type">{{ model?.model_type }}</span>
-        <span v-if="modelLayerInfo?.architecture && modelLayerInfo.architecture !== model?.model_type"
-          class="model-tag tag-architecture">
-          {{ modelLayerInfo.architecture }}
-        </span>
-        <span v-if="modelLayerInfo?.layer_count" class="model-tag tag-layers">
-          {{ modelLayerInfo.layer_count }} layers
-        </span>
-        <span v-if="isEmbeddingModel" class="model-tag tag-pipeline">Embedding</span>
-      </div>
-      <div class="embedding-notice" v-if="isEmbeddingModel">
-        <i class="pi pi-database"></i>
-        <div>
-          <strong>Embedding model detected</strong>
-          <p>This model automatically exposes the /v1/embeddings endpoint via llama.cpp.</p>
-        </div>
-      </div>
-    </div>
-    <div class="header-actions">
-      <div class="action-buttons">
-        <slot name="actions">
-          <Button 
-            label="Quick Start" 
-            icon="pi pi-bolt" 
-            size="small"
-            severity="info"
-            outlined
-            @click="$emit('quick-start')"
-            v-tooltip="'Choose a preset, use the wizard, or let Smart Auto optimize for you'"
-          />
-          <Button 
-            icon="pi pi-refresh" 
-            @click="$emit('regenerate-info')" 
-            :loading="regeneratingInfo"
-            severity="secondary" 
-            size="small" 
-            outlined
-            v-tooltip="'Regenerate model information from GGUF metadata'" 
-          />
-          <Button 
-            label="Save Config" 
-            icon="pi pi-save" 
-            @click="$emit('save-config')" 
-            :loading="saveLoading"
-            severity="success" 
-            size="small" 
-          />
-        </slot>
-      </div>
-    </div>
-  </div>
-</template>
-
-<script setup>
-import { computed } from 'vue'
-import { formatFileSize } from '@/utils/formatting'
-import Button from 'primevue/button'
-
-const props = defineProps({
-  model: {
-    type: Object,
-    default: null
-  },
-  modelLayerInfo: {
-    type: Object,
-    default: null
-  },
-  hasHfMetadata: {
-    type: Boolean,
-    default: false
-  },
-  hfMetadata: {
-    type: Object,
-    default: null
-  },
-  hfMetadataLoading: {
-    type: Boolean,
-    default: false
-  },
-  regeneratingInfo: {
-    type: Boolean,
-    default: false
-  },
-  saveLoading: {
-    type: Boolean,
-    default: false
-  }
-})
-
-defineEmits(['quick-start', 'regenerate-info', 'save-config'])
-
-const isEmbeddingModel = computed(() => {
-  return props.model?.pipeline_tag === 'feature-extraction' || 
-         props.hfMetadata?.pipeline_tag === 'feature-extraction' ||
-         props.model?.model_type?.toLowerCase().includes('embedding')
-})
-</script>
-
-<style scoped>
-.model-info-section {
-  display: flex;
-  justify-content: space-between;
-  align-items: flex-start;
-  gap: 1rem;
-}
-
-.model-info {
-  flex: 1;
-}
-
-.model-tags {
-  display: flex;
-  flex-wrap: wrap;
-  gap: 0.5rem;
-  margin: 0.75rem 0;
-}
-
-.embedding-notice {
-  display: flex;
-  gap: 0.75rem;
-  padding: 1rem;
-  background: var(--status-info-soft);
-  border-radius: var(--radius-md);
-  border-left: 3px solid var(--accent-blue);
-  margin-top: 1rem;
-}
-
-.embedding-notice i {
-  font-size: 1.5rem;
-  color: var(--accent-blue);
-}
-
-.embedding-notice strong {
-  display: block;
-  margin-bottom: 0.25rem;
-  color: var(--text-primary);
-}
-
-.embedding-notice p {
-  margin: 0;
-  font-size: 0.875rem;
-  color: var(--text-secondary);
-}
-
-.header-actions {
-  display: flex;
-  align-items: center;
-}
-
-.action-buttons {
-  display: flex;
-  gap: 0.5rem;
-  flex-wrap: wrap;
-}
-
-@media (max-width: 768px) {
-  .model-info-section {
-    flex-direction: column;
-  }
-  
-  .action-buttons {
-    width: 100%;
-  }
-  
-  .action-buttons .p-button {
-    flex: 1;
-  }
-}
-</style>
-
diff --git a/frontend/src/components/config/OnboardingTour.vue b/frontend/src/components/config/OnboardingTour.vue
deleted file mode 100644
index 335f3d8..0000000
--- a/frontend/src/components/config/OnboardingTour.vue
+++ /dev/null
@@ -1,397 +0,0 @@
-<template>
-  <div v-if="visible && currentStep !== null" class="onboarding-tour">
-    <div class="tour-overlay" @click="skipTour"></div>
-    <div 
-      class="tour-tooltip" 
-      :style="tooltipStyle"
-      role="dialog"
-      aria-label="Onboarding tour"
-      aria-live="polite"
-    >
-      <div class="tour-header">
-        <div class="tour-step-indicator">
-          <span class="step-current">{{ currentStep + 1 }}</span>
-          <span class="step-separator">/</span>
-          <span class="step-total">{{ steps.length }}</span>
-        </div>
-        <Button 
-          icon="pi pi-times" 
-          @click="skipTour"
-          size="small"
-          text
-          rounded
-          severity="secondary"
-          aria-label="Skip tour"
-          class="tour-close"
-        />
-      </div>
-      
-      <div class="tour-content">
-        <h3 class="tour-title">{{ currentStepData.title }}</h3>
-        <p class="tour-description">{{ currentStepData.content }}</p>
-        
-        <div v-if="currentStepData.target" class="tour-target-hint">
-          <i class="pi pi-arrow-down" aria-hidden="true"></i>
-          <span>See highlighted area below</span>
-        </div>
-      </div>
-      
-      <div class="tour-footer">
-        <Button 
-          v-if="currentStep > 0"
-          label="Previous" 
-          icon="pi pi-arrow-left"
-          @click="previousStep"
-          size="small"
-          text
-          severity="secondary"
-          aria-label="Go to previous step"
-        />
-        <div class="tour-actions-right">
-          <Button 
-            v-if="currentStep < steps.length - 1"
-            label="Skip" 
-            @click="skipTour"
-            size="small"
-            text
-            severity="secondary"
-            aria-label="Skip remaining steps"
-          />
-          <Button 
-            v-if="currentStep < steps.length - 1"
-            label="Next" 
-            icon="pi pi-arrow-right"
-            icon-pos="right"
-            @click="nextStep"
-            aria-label="Go to next step"
-          />
-          <Button 
-            v-else
-            label="Get Started" 
-            icon="pi pi-check"
-            icon-pos="right"
-            @click="completeTour"
-            severity="success"
-            aria-label="Complete tour"
-          />
-        </div>
-      </div>
-    </div>
-  </div>
-</template>
-
-<script setup>
-import { ref, computed, watch, onMounted, onUnmounted, nextTick } from 'vue'
-import Button from 'primevue/button'
-
-const props = defineProps({
-  visible: {
-    type: Boolean,
-    default: false
-  },
-  steps: {
-    type: Array,
-    required: true
-  }
-})
-
-const emit = defineEmits(['complete', 'skip', 'update:visible'])
-
-const currentStep = ref(null)
-const tooltipStyle = ref({})
-const targetElements = ref([])
-
-const currentStepData = computed(() => {
-  if (currentStep.value === null || !props.steps[currentStep.value]) return null
-  return props.steps[currentStep.value]
-})
-
-watch(() => props.visible, async (newVal) => {
-  if (newVal) {
-    currentStep.value = 0
-    await nextTick()
-    await updateTooltipPosition()
-  } else {
-    currentStep.value = null
-    removeHighlights()
-  }
-})
-
-watch(currentStep, async () => {
-  if (currentStep.value !== null) {
-    await nextTick()
-    await updateTooltipPosition()
-  }
-})
-
-const updateTooltipPosition = async () => {
-  if (!currentStepData.value?.target) {
-    tooltipStyle.value = {
-      position: 'fixed',
-      top: '50%',
-      left: '50%',
-      transform: 'translate(-50%, -50%)',
-      zIndex: 10000
-    }
-    removeHighlights()
-    return
-  }
-
-  const targetSelector = currentStepData.value.target
-  const targetElement = document.querySelector(targetSelector)
-  
-  if (!targetElement) {
-    console.warn(`Target element not found: ${targetSelector}`)
-    return
-  }
-
-  // Highlight target element
-  highlightTarget(targetElement)
-
-  // Calculate tooltip position
-  const rect = targetElement.getBoundingClientRect()
-  const tooltipWidth = 400
-  const tooltipHeight = 200
-  const spacing = 20
-
-  let top = rect.bottom + spacing
-  let left = rect.left + (rect.width / 2) - (tooltipWidth / 2)
-
-  // Adjust if tooltip goes off screen
-  if (left < spacing) left = spacing
-  if (left + tooltipWidth > window.innerWidth - spacing) {
-    left = window.innerWidth - tooltipWidth - spacing
-  }
-
-  // If tooltip would go below viewport, show above target
-  if (top + tooltipHeight > window.innerHeight - spacing) {
-    top = rect.top - tooltipHeight - spacing
-  }
-
-  // If still doesn't fit, center on screen
-  if (top < spacing) {
-    top = (window.innerHeight - tooltipHeight) / 2
-    left = (window.innerWidth - tooltipWidth) / 2
-  }
-
-  tooltipStyle.value = {
-    position: 'fixed',
-    top: `${top}px`,
-    left: `${left}px`,
-    zIndex: 10000
-  }
-}
-
-const highlightTarget = (element) => {
-  removeHighlights()
-  
-  element.classList.add('tour-highlight')
-  targetElements.value.push(element)
-  
-  // Scroll element into view if needed
-  element.scrollIntoView({ behavior: 'smooth', block: 'center', inline: 'nearest' })
-}
-
-const removeHighlights = () => {
-  targetElements.value.forEach(el => {
-    el.classList.remove('tour-highlight')
-  })
-  targetElements.value = []
-}
-
-const nextStep = () => {
-  if (currentStep.value < props.steps.length - 1) {
-    currentStep.value++
-  } else {
-    completeTour()
-  }
-}
-
-const previousStep = () => {
-  if (currentStep.value > 0) {
-    currentStep.value--
-  }
-}
-
-const skipTour = () => {
-  emit('skip')
-  emit('update:visible', false)
-  removeHighlights()
-}
-
-const completeTour = () => {
-  emit('complete')
-  emit('update:visible', false)
-  removeHighlights()
-  
-  // Store completion in localStorage
-  localStorage.setItem('model-config-onboarding-completed', 'true')
-}
-
-// Handle window resize
-const handleResize = () => {
-  if (props.visible && currentStep.value !== null) {
-    updateTooltipPosition()
-  }
-}
-
-onMounted(() => {
-  window.addEventListener('resize', handleResize)
-  window.addEventListener('scroll', handleResize, true)
-})
-
-onUnmounted(() => {
-  window.removeEventListener('resize', handleResize)
-  window.removeEventListener('scroll', handleResize, true)
-  removeHighlights()
-})
-
-// Expose removeHighlights for cleanup
-defineExpose({
-  removeHighlights
-})
-</script>
-
-<style scoped>
-.onboarding-tour {
-  position: fixed;
-  top: 0;
-  left: 0;
-  right: 0;
-  bottom: 0;
-  z-index: 9999;
-}
-
-.tour-overlay {
-  position: absolute;
-  top: 0;
-  left: 0;
-  right: 0;
-  bottom: 0;
-  background: rgba(0, 0, 0, 0.7);
-  backdrop-filter: blur(4px);
-}
-
-.tour-tooltip {
-  position: fixed;
-  width: 400px;
-  max-width: calc(100vw - 40px);
-  background: var(--bg-card);
-  border: 2px solid var(--accent-cyan);
-  border-radius: var(--radius-xl);
-  box-shadow: 0 8px 32px rgba(0, 0, 0, 0.5), 0 0 0 4px rgba(34, 211, 238, 0.2);
-  padding: var(--spacing-lg);
-  animation: fadeInScale 0.3s ease-out;
-}
-
-@keyframes fadeInScale {
-  from {
-    opacity: 0;
-    transform: scale(0.9);
-  }
-  to {
-    opacity: 1;
-    transform: scale(1);
-  }
-}
-
-.tour-header {
-  display: flex;
-  justify-content: space-between;
-  align-items: center;
-  margin-bottom: var(--spacing-md);
-  padding-bottom: var(--spacing-md);
-  border-bottom: 1px solid var(--border-primary);
-}
-
-.tour-step-indicator {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-xs);
-  font-size: 0.9rem;
-  font-weight: 600;
-  color: var(--text-secondary);
-}
-
-.step-current {
-  color: var(--accent-cyan);
-  font-size: 1.1rem;
-}
-
-.tour-close {
-  padding: var(--spacing-xs);
-}
-
-.tour-content {
-  margin-bottom: var(--spacing-lg);
-}
-
-.tour-title {
-  margin: 0 0 var(--spacing-sm) 0;
-  font-size: 1.25rem;
-  font-weight: 600;
-  color: var(--text-primary);
-}
-
-.tour-description {
-  margin: 0;
-  font-size: 0.95rem;
-  color: var(--text-secondary);
-  line-height: 1.5;
-}
-
-.tour-target-hint {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-xs);
-  margin-top: var(--spacing-md);
-  padding: var(--spacing-sm);
-  background: rgba(34, 211, 238, 0.1);
-  border-radius: var(--radius-sm);
-  font-size: 0.85rem;
-  color: var(--accent-cyan);
-}
-
-.tour-target-hint i {
-  animation: bounce 1s ease-in-out infinite;
-}
-
-@keyframes bounce {
-  0%, 100% {
-    transform: translateY(0);
-  }
-  50% {
-    transform: translateY(4px);
-  }
-}
-
-.tour-footer {
-  display: flex;
-  justify-content: space-between;
-  align-items: center;
-  gap: var(--spacing-md);
-}
-
-.tour-actions-right {
-  display: flex;
-  gap: var(--spacing-sm);
-  margin-left: auto;
-}
-
-/* Global style for highlighting target elements - using deep selector */
-:deep(.tour-highlight) {
-  position: relative;
-  z-index: 10000 !important;
-  box-shadow: 0 0 0 4px rgba(34, 211, 238, 0.5), 0 0 20px rgba(34, 211, 238, 0.3) !important;
-  border-radius: var(--radius-md);
-  animation: pulseHighlight 2s ease-in-out infinite;
-}
-
-@media (max-width: 600px) {
-  .tour-tooltip {
-    width: calc(100vw - 20px);
-    max-width: none;
-  }
-}
-</style>
-
diff --git a/frontend/src/components/config/PerformanceSection.vue b/frontend/src/components/config/PerformanceSection.vue
deleted file mode 100644
index fc1e38e..0000000
--- a/frontend/src/components/config/PerformanceSection.vue
+++ /dev/null
@@ -1,393 +0,0 @@
-<template>
-  <div class="performance-section">
-    <div class="tab-section">
-      <h4 class="tab-section-title">
-        <i class="pi pi-tachometer"></i>
-        Performance Tuning
-      </h4>
-      <ConfigField label="Batch Threads" help-text="Threads for batch processing">
-        <template #input>
-          <SliderInput v-model="config.threads_batch" :min="1" :max="systemStore.gpuInfo.cpu_threads" />
-        </template>
-      </ConfigField>
-      <ConfigField label="Parallel" :help-text="`Parallel processing (max: ${maxParallel})`">
-        <template #input>
-          <SliderInput v-model="config.parallel" :min="1" :max="maxParallel" />
-        </template>
-        <template #validation>
-          <div v-if="parallelValidation" class="inline-validation" :class="parallelValidation.type">
-            <i class="pi pi-exclamation-triangle"></i>
-            <span>{{ parallelValidation.message }}</span>
-          </div>
-        </template>
-      </ConfigField>
-      <ConfigField v-if="!systemStore.gpuInfo.cpu_only_mode" label="Flash Attention" 
-                  help-text="Enable flash attn (enables V cache quantization)">
-        <template #input>
-          <Checkbox v-model="config.flash_attn" binary :disabled="!gpuAvailable" />
-        </template>
-      </ConfigField>
-      <ConfigField v-if="!systemStore.gpuInfo.cpu_only_mode" label="Low VRAM" 
-                  help-text="Optimize for low VRAM usage">
-        <template #input>
-          <Checkbox v-model="config.low_vram" binary :disabled="!gpuAvailable" />
-        </template>
-      </ConfigField>
-      <ConfigField label="Continuous Batching" help-text="Enable continuous/dynamic batching">
-        <template #input>
-          <Checkbox v-model="config.cont_batching" binary />
-        </template>
-      </ConfigField>
-      <ConfigField label="No KV Offload" help-text="Disable KV cache offloading">
-        <template #input>
-          <Checkbox v-model="config.no_kv_offload" binary />
-        </template>
-      </ConfigField>
-      <ConfigField label="Logits All" help-text="Return logits for all tokens">
-        <template #input>
-          <Checkbox v-model="config.logits_all" binary />
-        </template>
-      </ConfigField>
-      <ConfigField label="Embedding Mode" help-text="Enable embedding generation mode">
-        <template #input>
-          <Checkbox v-model="config.embedding" binary :disabled="isEmbeddingModel" />
-        </template>
-      </ConfigField>
-    </div>
-    
-    <div class="tab-section">
-      <h4 class="tab-section-title">
-        <i class="pi pi-database"></i>
-        KV Cache Optimization
-      </h4>
-      <div v-if="!config.flash_attn && (config.cache_type_v && config.cache_type_v !== 'f16')"
-        class="flash-attention-warning">
-        <i class="pi pi-exclamation-triangle"></i>
-        <div class="warning-content">
-          <strong>Flash Attention Required</strong>
-          <p>V cache quantization requires llama.cpp compiled with Flash Attention support (flag:
-            -DGGML_CUDA_FA_ALL_QUANTS=ON). Recompile your llama.cpp version or disable V cache quantization.</p>
-        </div>
-      </div>
-      <ConfigField label="K Cache Type" help-text="Key cache quantization (reduces memory usage)">
-        <template #input>
-          <Dropdown v-model="config.cache_type_k" :options="kvCacheOptions" optionLabel="label"
-            optionValue="value" placeholder="Select K cache type" />
-        </template>
-      </ConfigField>
-      <ConfigField 
-        v-if="config.flash_attn && !systemStore.gpuInfo.cpu_only_mode && isCacheTypeVSupported"
-        label="V Cache Type" 
-        help-text="Value cache quantization (requires Flash Attention)"
-      >
-        <template #input>
-          <Dropdown v-model="config.cache_type_v" :options="kvCacheOptions" optionLabel="label"
-            optionValue="value" placeholder="Select V cache type" />
-        </template>
-      </ConfigField>
-    </div>
-    
-    <div v-if="modelLayerInfo?.is_moe" class="tab-section">
-      <h4 class="tab-section-title">
-        <i class="pi pi-sitemap"></i>
-        MoE Expert Offloading
-      </h4>
-      <ConfigField label="Offload Pattern" help-text="Control which MoE layers go to CPU/GPU">
-        <template #input>
-          <Dropdown v-model="config.moe_offload_pattern" :options="moeOffloadPatterns" optionLabel="label"
-            optionValue="value" @change="handleMoEPatternChange" />
-        </template>
-      </ConfigField>
-      <ConfigField v-if="config.moe_offload_pattern === 'n_layers'" label="Number of Layers" 
-                  help-text="Number of MoE layers to offload to CPU (--n-cpu-moe)">
-        <template #input>
-          <InputNumber 
-            v-model="config.n_cpu_moe" 
-            :min="1" 
-            :max="modelLayerInfo?.layer_count || 32"
-            placeholder="e.g., 5"
-          />
-        </template>
-      </ConfigField>
-      <ConfigField v-if="config.moe_offload_pattern === 'cpu'" label="CPU MoE" 
-                  help-text="Enable --cpu-moe flag (all MoE to CPU)">
-        <template #input>
-          <Checkbox v-model="config.cpu_moe" binary />
-        </template>
-      </ConfigField>
-      <ConfigField label="Custom Offload Pattern" full-width
-                  help-text="Advanced regex pattern for --override-tensor parameter">
-        <template #input>
-          <InputText 
-            v-model="config.moe_offload_custom" 
-            placeholder="e.g., exps=CPU or .ffn_.*_exps.=CPU"
-            :disabled="config.moe_offload_pattern !== 'custom' && config.moe_offload_pattern !== 'none'"
-          />
-        </template>
-        <template #help>
-          <div class="pattern-help">
-            <small v-if="config.moe_offload_pattern !== 'custom'">
-              Pattern is auto-generated from selection above. Select "Custom pattern" to edit manually.
-            </small>
-            <small v-else>
-              Enter a regex pattern matching tensor names. Use <code>=CPU</code> or <code>=CUDA0</code> to specify target.
-            </small>
-          </div>
-        </template>
-      </ConfigField>
-      <ConfigField label="Expert Info">
-        <template #input>
-          <div class="expert-info">
-            <span>{{ modelLayerInfo.expert_count }} experts</span>
-            <span>·</span>
-            <span>{{ modelLayerInfo.experts_used_count }} active per token</span>
-          </div>
-        </template>
-      </ConfigField>
-    </div>
-  </div>
-</template>
-
-<script setup>
-// Vue
-import { computed } from 'vue'
-
-// PrimeVue
-import Checkbox from 'primevue/checkbox'
-import Dropdown from 'primevue/dropdown'
-import InputText from 'primevue/inputtext'
-import InputNumber from 'primevue/inputnumber'
-
-// Components
-import ConfigField from '@/components/config/ConfigField.vue'
-import SliderInput from '@/components/SliderInput.vue'
-
-// Stores
-import { useSystemStore } from '@/stores/system'
-
-const props = defineProps({
-  config: {
-    type: Object,
-    required: true
-  },
-  modelLayerInfo: {
-    type: Object,
-    default: null
-  },
-  maxParallel: {
-    type: Number,
-    required: true
-  },
-  parallelValidation: {
-    type: Object,
-    default: null
-  },
-  gpuAvailable: {
-    type: Boolean,
-    default: true
-  },
-  isEmbeddingModel: {
-    type: Boolean,
-    default: false
-  },
-  isCacheTypeVSupported: {
-    type: Boolean,
-    default: false
-  }
-})
-
-const systemStore = useSystemStore()
-
-// KV cache options
-const kvCacheOptions = [
-  { label: 'No quantization (use llama.cpp default)', value: null },
-  { label: 'FP32 (full precision)', value: 'f32' },
-  { label: 'FP16 (half precision)', value: 'f16' },
-  { label: 'BF16 (bfloat16)', value: 'bf16' },
-  { label: 'Q8_0 (8-bit)', value: 'q8_0' },
-  { label: 'Q5_1 (5-bit high quality)', value: 'q5_1' },
-  { label: 'Q5_0 (5-bit)', value: 'q5_0' },
-  { label: 'Q4_1 (4-bit high quality)', value: 'q4_1' },
-  { label: 'Q4_0 (4-bit)', value: 'q4_0' },
-  { label: 'IQ4_NL (4-bit non-linear)', value: 'iq4_nl' }
-]
-
-// MoE offload patterns
-const moeOffloadPatterns = [
-  { label: 'None', value: 'none' },
-  { label: 'All MoE to CPU (--cpu-moe)', value: 'cpu' },
-  { label: 'N layers to CPU (--n-cpu-moe)', value: 'n_layers' },
-  { label: 'All MoE experts to CPU', value: 'all' },
-  { label: 'Up/Down projections to CPU', value: 'up_down' },
-  { label: 'Up projection only to CPU', value: 'up' },
-  { label: 'Down projection only to CPU', value: 'down' },
-  { label: 'Gate to CPU', value: 'gate' },
-  { label: 'Experts only to CPU (keep gate/up/down on GPU)', value: 'experts_only' },
-  { label: 'First half layers to CPU', value: 'first_half' },
-  { label: 'Last half layers to CPU', value: 'last_half' },
-  { label: 'Every other layer to CPU', value: 'alternating' },
-  { label: 'Custom pattern', value: 'custom' }
-]
-
-// Handle MoE pattern change
-const handleMoEPatternChange = () => {
-  // Automatically set the custom pattern based on selection
-  switch (props.config.moe_offload_pattern) {
-    case 'none':
-      props.config.moe_offload_custom = ''
-      props.config.cpu_moe = false
-      props.config.n_cpu_moe = null
-      break
-    case 'cpu':
-      // Use direct --cpu-moe flag
-      props.config.cpu_moe = true
-      props.config.n_cpu_moe = null
-      props.config.moe_offload_custom = ''
-      break
-    case 'n_layers':
-      // Use --n-cpu-moe flag, initialize with a default if not set
-      if (!props.config.n_cpu_moe) {
-        props.config.n_cpu_moe = Math.floor((props.modelLayerInfo?.layer_count || 32) / 2)
-      }
-      props.config.cpu_moe = false
-      props.config.moe_offload_custom = ''
-      break
-    case 'all':
-      // All MoE experts to CPU
-      props.config.moe_offload_custom = 'exps=CPU'
-      break
-    case 'up_down':
-      // Up and down projections to CPU
-      props.config.moe_offload_custom = '.ffn_(up|down)_exps.=CPU'
-      break
-    case 'up':
-      // Up projection only to CPU
-      props.config.moe_offload_custom = '.ffn_(up)_exps.=CPU'
-      break
-    case 'down':
-      // Down projection only to CPU
-      props.config.moe_offload_custom = '.ffn_(down)_exps.=CPU'
-      break
-    case 'gate':
-      // Gate to CPU
-      props.config.moe_offload_custom = '.ffn_.*_gate.=CPU'
-      break
-    case 'experts_only':
-      // Experts only, keep gate/up/down on GPU
-      props.config.moe_offload_custom = 'blk\\.\\d+\\.ffn_.*_exps\\.\\d+=CPU'
-      break
-    case 'first_half':
-      // First half of layers to CPU
-      props.config.moe_offload_custom = 'blk\\.(0|1|2|3|4|5|6|7|8|9|10|11|12|13|14|15)\\.ffn_.*_exps.=CPU'
-      break
-    case 'last_half':
-      // Last half of layers to CPU
-      props.config.moe_offload_custom = 'blk\\.(16|17|18|19|20|21|22|23|24|25|26|27|28|29|30|31)\\.ffn_.*_exps.=CPU'
-      break
-    case 'alternating':
-      // Every other layer to CPU
-      props.config.moe_offload_custom = 'blk\\.(0|2|4|6|8|10|12|14|16|18|20|22|24|26|28|30)\\.ffn_.*_exps.=CPU'
-      break
-    case 'custom':
-      // User will input custom pattern - don't overwrite existing value
-      if (!props.config.moe_offload_custom) {
-        props.config.moe_offload_custom = ''
-      }
-      break
-    default:
-      props.config.moe_offload_custom = ''
-  }
-}
-</script>
-
-<style scoped>
-.performance-section {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-lg);
-}
-
-.tab-section {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-md);
-}
-
-.tab-section-title {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-sm);
-  margin: 0 0 var(--spacing-md) 0;
-  font-size: 1rem;
-  font-weight: 600;
-  color: var(--text-primary);
-}
-
-.flash-attention-warning {
-  display: flex;
-  gap: var(--spacing-sm);
-  padding: var(--spacing-md);
-  background: var(--status-warning-soft);
-  border: 1px solid var(--status-warning);
-  border-radius: var(--radius-md);
-  color: var(--status-warning);
-  margin-bottom: var(--spacing-md);
-}
-
-.flash-attention-warning i {
-  font-size: 1.25rem;
-  flex-shrink: 0;
-}
-
-.warning-content {
-  flex: 1;
-}
-
-.warning-content strong {
-  display: block;
-  margin-bottom: var(--spacing-xs);
-}
-
-.warning-content p {
-  margin: 0;
-  font-size: 0.875rem;
-  line-height: 1.5;
-}
-
-.expert-info {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-sm);
-  color: var(--text-secondary);
-  font-size: 0.875rem;
-}
-
-.inline-validation {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-xs);
-  margin-top: var(--spacing-xs);
-  font-size: 0.875rem;
-}
-
-.inline-validation.warning {
-  color: var(--status-warning);
-}
-
-.inline-validation.error {
-  color: var(--status-error);
-}
-
-.pattern-help {
-  margin-top: 0.25rem;
-}
-
-.pattern-help code {
-  background: var(--gradient-surface);
-  padding: 0.125rem 0.25rem;
-  border-radius: var(--radius-sm);
-  font-family: monospace;
-  font-size: 0.85em;
-  color: var(--accent-cyan);
-}
-</style>
diff --git a/frontend/src/components/config/QuickStartModal.vue b/frontend/src/components/config/QuickStartModal.vue
deleted file mode 100644
index 0965d4a..0000000
--- a/frontend/src/components/config/QuickStartModal.vue
+++ /dev/null
@@ -1,352 +0,0 @@
-<template>
-  <Dialog 
-    :visible="visible" 
-    modal 
-    :closable="true"
-    :dismissableMask="true"
-    :draggable="false"
-    class="quick-start-modal"
-    @update:visible="$emit('update:visible', $event)"
-    @hide="$emit('update:visible', false)"
-  >
-    <template #header>
-      <div class="quick-start-modal-header">
-        <div class="quick-start-icon">🚀</div>
-        <div>
-          <h3>Quick Start</h3>
-          <p>Choose a preset, use the wizard, or let Smart Auto optimize for you</p>
-        </div>
-      </div>
-    </template>
-
-    <div class="quick-start-content">
-      <div class="preset-cards">
-        <div 
-          class="preset-card wizard-card" 
-          role="button"
-          tabindex="0"
-          aria-label="Configuration Wizard - Guided 3-step setup for new users"
-          @click="handleWizardClick"
-          @keydown.enter="handleWizardClick"
-          @keydown.space.prevent="handleWizardClick"
-        >
-          <div class="preset-icon" aria-hidden="true">✨</div>
-          <div class="preset-info">
-            <h4>Configuration Wizard</h4>
-            <p>Guided 3-step setup for new users</p>
-          </div>
-        </div>
-        <div 
-          class="preset-card" 
-          role="button"
-          tabindex="0"
-          aria-label="Coding preset - Low temperature, high precision for code generation"
-          @click="handlePresetClick('coding')"
-          @keydown.enter="handlePresetClick('coding')"
-          @keydown.space.prevent="handlePresetClick('coding')"
-        >
-          <div class="preset-icon" aria-hidden="true">💻</div>
-          <div class="preset-info">
-            <h4>Coding</h4>
-            <p>Low temperature, high precision for code generation</p>
-          </div>
-        </div>
-        <div 
-          class="preset-card" 
-          role="button"
-          tabindex="0"
-          aria-label="Chat preset - Balanced settings for natural conversation"
-          @click="handlePresetClick('conversational')"
-          @keydown.enter="handlePresetClick('conversational')"
-          @keydown.space.prevent="handlePresetClick('conversational')"
-        >
-          <div class="preset-icon" aria-hidden="true">💬</div>
-          <div class="preset-info">
-            <h4>Chat</h4>
-            <p>Balanced settings for natural conversation</p>
-          </div>
-        </div>
-      </div>
-      
-      <div class="smart-auto-section">
-        <div class="smart-auto-header">
-          <i class="pi pi-bolt"></i>
-          <h4>Smart Auto Configuration</h4>
-        </div>
-        <p class="smart-auto-description">Automatically optimize settings based on your hardware and use case</p>
-        
-        <div class="usage-mode-selector" role="radiogroup" aria-label="Usage mode selection">
-          <div 
-            class="radio-option" 
-            role="radio"
-            :aria-checked="localUsageMode === 'single_user'"
-            tabindex="0"
-            aria-label="Single User mode - Sequential requests, maximum context"
-            @click="localUsageMode = 'single_user'"
-            @keydown.enter="localUsageMode = 'single_user'"
-            @keydown.space.prevent="localUsageMode = 'single_user'"
-            :class="{ active: localUsageMode === 'single_user' }"
-          >
-            <i class="pi pi-user" aria-hidden="true"></i>
-            <div>
-              <strong>Single User</strong>
-              <small>Sequential requests, maximum context</small>
-            </div>
-          </div>
-          <div 
-            class="radio-option" 
-            role="radio"
-            :aria-checked="localUsageMode === 'multi_user'"
-            tabindex="0"
-            aria-label="Multi User Server mode - Parallel requests, optimized batching"
-            @click="localUsageMode = 'multi_user'"
-            @keydown.enter="localUsageMode = 'multi_user'"
-            @keydown.space.prevent="localUsageMode = 'multi_user'"
-            :class="{ active: localUsageMode === 'multi_user' }"
-          >
-            <i class="pi pi-users" aria-hidden="true"></i>
-            <div>
-              <strong>Multi User Server</strong>
-              <small>Parallel requests, optimized batching</small>
-            </div>
-          </div>
-        </div>
-        
-        <Button 
-          label="Generate Optimal Config" 
-          icon="pi pi-bolt" 
-          @click="handleSmartAuto"
-          :loading="autoConfigLoading" 
-          size="large" 
-          class="smart-auto-button"
-          aria-label="Generate optimal configuration automatically based on hardware and use case"
-        />
-      </div>
-    </div>
-    
-    <template #footer>
-      <Button label="Close" icon="pi pi-times" @click="$emit('update:visible', false)" severity="secondary" />
-    </template>
-  </Dialog>
-</template>
-
-<script setup>
-// Vue
-import { ref, watch } from 'vue'
-
-// PrimeVue
-import Dialog from 'primevue/dialog'
-import Button from 'primevue/button'
-
-const props = defineProps({
-  visible: {
-    type: Boolean,
-    default: false
-  },
-  autoConfigLoading: {
-    type: Boolean,
-    default: false
-  },
-  smartAutoUsageMode: {
-    type: String,
-    default: 'single_user'
-  }
-})
-
-const emit = defineEmits(['update:visible', 'wizard', 'preset', 'smart-auto'])
-
-const localUsageMode = ref(props.smartAutoUsageMode)
-
-watch(() => props.smartAutoUsageMode, (newVal) => {
-  localUsageMode.value = newVal
-})
-
-watch(() => props.visible, (newVal) => {
-  if (newVal) {
-    localUsageMode.value = props.smartAutoUsageMode
-  }
-})
-
-const handleWizardClick = () => {
-  emit('wizard')
-  emit('update:visible', false)
-}
-
-const handlePresetClick = (preset) => {
-  emit('preset', preset)
-  emit('update:visible', false)
-}
-
-const handleSmartAuto = () => {
-  emit('smart-auto', localUsageMode.value)
-}
-</script>
-
-<style scoped>
-.quick-start-modal-header {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-md);
-}
-
-.quick-start-icon {
-  font-size: 2rem;
-  line-height: 1;
-}
-
-.quick-start-modal-header h3 {
-  margin: 0 0 var(--spacing-xs) 0;
-  font-size: 1.5rem;
-  font-weight: 700;
-  color: var(--text-primary);
-}
-
-.quick-start-modal-header p {
-  margin: 0;
-  color: var(--text-secondary);
-  font-size: 0.875rem;
-}
-
-.quick-start-content {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-xl);
-}
-
-.preset-cards {
-  display: grid;
-  grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
-  gap: var(--spacing-md);
-}
-
-.preset-card {
-  display: flex;
-  flex-direction: column;
-  align-items: center;
-  gap: var(--spacing-sm);
-  padding: var(--spacing-lg);
-  background: var(--bg-surface);
-  border: 2px solid var(--border-primary);
-  border-radius: var(--radius-lg);
-  cursor: pointer;
-  transition: all var(--transition-normal);
-  text-align: center;
-}
-
-.preset-card:hover {
-  border-color: var(--accent-cyan);
-  transform: translateY(-2px);
-  box-shadow: var(--shadow-md);
-}
-
-.preset-card:focus {
-  outline: 2px solid var(--accent-cyan);
-  outline-offset: 2px;
-}
-
-.preset-icon {
-  font-size: 2.5rem;
-  line-height: 1;
-}
-
-.preset-info h4 {
-  margin: 0 0 var(--spacing-xs) 0;
-  font-size: 1.1rem;
-  font-weight: 600;
-  color: var(--text-primary);
-}
-
-.preset-info p {
-  margin: 0;
-  font-size: 0.875rem;
-  color: var(--text-secondary);
-}
-
-.smart-auto-section {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-md);
-  padding: var(--spacing-lg);
-  background: var(--bg-surface);
-  border-radius: var(--radius-lg);
-  border: 1px solid var(--border-primary);
-}
-
-.smart-auto-header {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-sm);
-}
-
-.smart-auto-header i {
-  font-size: 1.5rem;
-  color: var(--accent-cyan);
-}
-
-.smart-auto-header h4 {
-  margin: 0;
-  font-size: 1.25rem;
-  font-weight: 600;
-  color: var(--text-primary);
-}
-
-.smart-auto-description {
-  margin: 0;
-  color: var(--text-secondary);
-  font-size: 0.875rem;
-}
-
-.usage-mode-selector {
-  display: flex;
-  gap: var(--spacing-md);
-}
-
-.radio-option {
-  flex: 1;
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-sm);
-  padding: var(--spacing-md);
-  background: var(--bg-card);
-  border: 2px solid var(--border-primary);
-  border-radius: var(--radius-md);
-  cursor: pointer;
-  transition: all var(--transition-normal);
-}
-
-.radio-option:hover {
-  border-color: var(--accent-cyan);
-}
-
-.radio-option.active {
-  border-color: var(--accent-cyan);
-  background: var(--bg-surface);
-}
-
-.radio-option:focus {
-  outline: 2px solid var(--accent-cyan);
-  outline-offset: 2px;
-}
-
-.radio-option i {
-  font-size: 1.5rem;
-  color: var(--accent-cyan);
-}
-
-.radio-option strong {
-  display: block;
-  margin-bottom: var(--spacing-xs);
-  color: var(--text-primary);
-  font-weight: 600;
-}
-
-.radio-option small {
-  display: block;
-  color: var(--text-secondary);
-  font-size: 0.875rem;
-}
-
-.smart-auto-button {
-  width: 100%;
-}
-</style>
diff --git a/frontend/src/components/config/SettingsTooltip.vue b/frontend/src/components/config/SettingsTooltip.vue
deleted file mode 100644
index c669174..0000000
--- a/frontend/src/components/config/SettingsTooltip.vue
+++ /dev/null
@@ -1,129 +0,0 @@
-<template>
-  <span class="settings-tooltip-wrapper">
-    <i 
-      class="pi pi-info-circle tooltip-icon" 
-      v-tooltip.top.right="{ value: tooltipContent, escape: false, fitContent: true }"
-    ></i>
-  </span>
-</template>
-
-<script setup>
-import { computed } from 'vue'
-
-const props = defineProps({
-  title: {
-    type: String,
-    required: true
-  },
-  description: {
-    type: String,
-    required: true
-  },
-  whenToAdjust: {
-    type: String,
-    default: null
-  },
-  tradeoffs: {
-    type: Array,
-    default: () => []
-  },
-  recommended: {
-    type: String,
-    default: null
-  },
-  ranges: {
-    type: Array,
-    default: () => []
-  }
-})
-
-const tooltipContent = computed(() => {
-  let content = `<div style="max-width: 380px; text-align: left; padding: 0;">`
-  content += `<strong style="font-size: 1rem; font-weight: 600; color: var(--accent-cyan); display: block; margin-bottom: 0.5rem;">${props.title}</strong>`
-  content += `<div style="margin-bottom: 0.75rem; line-height: 1.5;">`
-  content += `<strong style="font-weight: 600;">What it does:</strong> ${props.description}`
-  content += `</div>`
-  
-  if (props.whenToAdjust) {
-    content += `<div style="margin-bottom: 0.75rem; line-height: 1.5;">`
-    content += `<strong style="font-weight: 600;">When to adjust:</strong> ${props.whenToAdjust}`
-    content += `</div>`
-  }
-  
-  if (props.tradeoffs && props.tradeoffs.length > 0) {
-    content += `<div style="margin-bottom: 0.75rem;">`
-    content += `<strong style="font-weight: 600; display: block; margin-bottom: 0.25rem;">Trade-offs:</strong>`
-    content += `<ul style="margin: 0.25rem 0; padding-left: 1.25rem; list-style: disc; line-height: 1.5;">`
-    props.tradeoffs.forEach(tradeoff => {
-      content += `<li style="margin: 0.125rem 0;">${tradeoff}</li>`
-    })
-    content += `</ul></div>`
-  }
-  
-  if (props.ranges && props.ranges.length > 0) {
-    content += `<div style="margin-bottom: 0.75rem;">`
-    content += `<strong style="font-weight: 600; display: block; margin-bottom: 0.25rem;">Recommended ranges:</strong>`
-    content += `<ul style="margin: 0.25rem 0; padding-left: 1.25rem; list-style: disc; line-height: 1.5;">`
-    props.ranges.forEach(range => {
-      content += `<li style="margin: 0.125rem 0;">${range}</li>`
-    })
-    content += `</ul></div>`
-  }
-  
-  if (props.recommended) {
-    content += `<div style="margin-top: 0.5rem; padding-top: 0.75rem; border-top: 1px solid var(--border-primary); line-height: 1.5;">`
-    content += `<strong style="font-weight: 600; color: var(--accent-green);">For this model:</strong> ${props.recommended}`
-    content += `</div>`
-  }
-  
-  content += `</div>`
-  return content
-})
-</script>
-
-<style scoped>
-.settings-tooltip-wrapper {
-  display: inline-flex;
-  align-items: center;
-  margin-left: var(--spacing-xs);
-}
-
-.tooltip-icon {
-  color: var(--accent-cyan);
-  font-size: 0.9rem;
-  cursor: help;
-  transition: all var(--transition-normal);
-}
-
-.tooltip-icon:hover {
-  color: var(--accent-primary);
-  transform: scale(1.1);
-}
-</style>
-
-<style>
-/* Override tooltip positioning to prevent overflow */
-.p-tooltip {
-  max-width: min(380px, calc(100vw - 20px)) !important;
-  word-wrap: break-word !important;
-  overflow-wrap: break-word !important;
-}
-
-/* Ensure tooltips stay within viewport */
-.p-tooltip-top {
-  margin-top: 0.5rem !important;
-}
-
-.p-tooltip-right {
-  margin-left: 0.5rem !important;
-}
-
-.p-tooltip-left {
-  margin-right: 0.5rem !important;
-}
-
-.p-tooltip-bottom {
-  margin-bottom: 0.5rem !important;
-}
-</style>
-
diff --git a/frontend/src/components/layout/AppFooter.vue b/frontend/src/components/layout/AppFooter.vue
index b08db07..ca3a9db 100644
--- a/frontend/src/components/layout/AppFooter.vue
+++ b/frontend/src/components/layout/AppFooter.vue
@@ -1,47 +1,34 @@
-<template>
-  <footer class="layout-footer">
-    <div class="footer-content">
-      <span>llama.cpp Studio v1.0.0</span>
-      <div class="connection-status">
-        <i :class="connectionStatus.icon" :style="{ color: connectionStatus.color }"></i>
-        <span>{{ connectionStatus.label }}</span>
-      </div>
-    </div>
-  </footer>
-</template>
-
-<script setup>
-import { computed } from 'vue'
-import { useWebSocketStore } from '@/stores/websocket'
-
-const wsStore = useWebSocketStore()
-
-const connectionStatus = computed(() => {
-  if (wsStore.connectionStatus === 'connected') {
-    return {
-      icon: 'pi pi-check-circle',
-      color: 'var(--status-success)',
-      label: 'Connected'
-    }
-  }
-
-  if (wsStore.connectionStatus === 'reconnecting') {
-    return {
-      icon: 'pi pi-spin pi-spinner',
-      color: 'var(--status-warning)',
-      label: 'Reconnecting...'
-    }
-  }
-
-  return {
-    icon: 'pi pi-times-circle',
-    color: 'var(--status-error)',
-    label: 'Disconnected'
-  }
-})
-</script>
-
-<style scoped>
-/* Footer styles are in global _base.css */
-</style>
-
+<template>
+  <footer class="layout-footer">
+    <div class="footer-content">
+      <span>llama.cpp Studio v{{ appVersion }}</span>
+      <div class="live-status" :title="progressStore.isConnected ? 'Live updates (SSE)' : 'Reconnecting…'">
+        <i
+          v-if="progressStore.isConnected"
+          class="pi pi-check-circle"
+          style="color: var(--status-success)"
+          aria-hidden="true"
+        />
+        <i
+          v-else
+          class="pi pi-clock"
+          style="color: var(--status-warning)"
+          aria-hidden="true"
+        />
+        <span>{{ progressStore.isConnected ? 'Live' : 'Reconnecting…' }}</span>
+      </div>
+    </div>
+  </footer>
+</template>
+
+<script setup>
+import { useProgressStore } from '@/stores/progress'
+
+const progressStore = useProgressStore()
+const appVersion = typeof __APP_VERSION__ !== 'undefined' ? __APP_VERSION__ : '1.0.0'
+</script>
+
+<style scoped>
+/* Footer styles are in global _base.css */
+</style>
+
diff --git a/frontend/src/components/layout/AppNavigation.vue b/frontend/src/components/layout/AppNavigation.vue
index 3f6b932..3f52c84 100644
--- a/frontend/src/components/layout/AppNavigation.vue
+++ b/frontend/src/components/layout/AppNavigation.vue
@@ -1,73 +1,73 @@
-<template>
-  <nav class="layout-nav animate-slide-in-up">
-    <div class="nav-content">
-      <Button 
-        label="Models" 
-        icon="pi pi-database"
-        :class="{ 'p-button-outlined': $route.name !== 'models' }"
-        @click="$router.push('/models')"
-        class="nav-button"
-      />
-      <Button 
-        label="Search" 
-        icon="pi pi-search"
-        :class="{ 'p-button-outlined': $route.name !== 'search' }"
-        @click="$router.push('/search')"
-        class="nav-button"
-      />
-      <Button 
-        label="System" 
-        icon="pi pi-desktop"
-        :class="{ 'p-button-outlined': $route.name !== 'system' }"
-        @click="$router.push('/system')"
-        class="nav-button"
-      />
-    </div>
-  </nav>
-</template>
-
-<script setup>
-import { useRouter, useRoute } from 'vue-router'
-import Button from 'primevue/button'
-
-const router = useRouter()
-const $route = useRoute()
-</script>
-
-<style scoped>
-/* Navigation button styling */
-.nav-content .p-button {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-sm);
-}
-
-.nav-content .p-button .p-button-icon {
-  margin-right: var(--spacing-sm);
-  transition: transform var(--transition-normal);
-}
-
-.nav-content .p-button:hover .p-button-icon {
-  transform: scale(1.1) rotate(5deg);
-}
-
-.nav-content .p-button:not(.p-button-outlined) {
-  background: var(--gradient-primary);
-  color: white;
-  border: none;
-  box-shadow: var(--shadow-md), var(--glow-primary);
-}
-
-.nav-content .p-button:not(.p-button-outlined):hover {
-  transform: translateY(-2px);
-  box-shadow: var(--shadow-lg), var(--glow-primary);
-}
-
-.nav-content .p-button.p-button-outlined:hover {
-  background: var(--gradient-primary);
-  color: white;
-  border-color: var(--accent-cyan);
-  transform: translateY(-2px);
-}
-</style>
-
+<template>
+  <nav class="layout-nav animate-slide-in-up">
+    <div class="nav-content">
+      <Button 
+        label="Models" 
+        icon="pi pi-database"
+        :class="{ 'p-button-outlined': $route.name !== 'models' }"
+        @click="$router.push('/models')"
+        class="nav-button"
+      />
+      <Button 
+        label="Search" 
+        icon="pi pi-search"
+        :class="{ 'p-button-outlined': $route.name !== 'search' }"
+        @click="$router.push('/search')"
+        class="nav-button"
+      />
+      <Button 
+        label="Engines" 
+        icon="pi pi-cog"
+        :class="{ 'p-button-outlined': $route.name !== 'engines' }"
+        @click="$router.push('/engines')"
+        class="nav-button"
+      />
+    </div>
+  </nav>
+</template>
+
+<script setup>
+import { useRouter, useRoute } from 'vue-router'
+import Button from 'primevue/button'
+
+const router = useRouter()
+const $route = useRoute()
+</script>
+
+<style scoped>
+/* Navigation button styling */
+.nav-content .p-button {
+  display: flex;
+  align-items: center;
+  gap: var(--spacing-sm);
+}
+
+.nav-content .p-button .p-button-icon {
+  margin-right: var(--spacing-sm);
+  transition: transform var(--transition-normal);
+}
+
+.nav-content .p-button:hover .p-button-icon {
+  transform: scale(1.1) rotate(5deg);
+}
+
+.nav-content .p-button:not(.p-button-outlined) {
+  background: var(--gradient-primary);
+  color: white;
+  border: none;
+  box-shadow: var(--shadow-md), var(--glow-primary);
+}
+
+.nav-content .p-button:not(.p-button-outlined):hover {
+  transform: translateY(-2px);
+  box-shadow: var(--shadow-lg), var(--glow-primary);
+}
+
+.nav-content .p-button.p-button-outlined:hover {
+  background: var(--gradient-primary);
+  color: white;
+  border-color: var(--accent-cyan);
+  transform: translateY(-2px);
+}
+</style>
+
diff --git a/frontend/src/components/system/CudaInstaller.vue b/frontend/src/components/system/CudaInstaller.vue
deleted file mode 100644
index a2d2bf1..0000000
--- a/frontend/src/components/system/CudaInstaller.vue
+++ /dev/null
@@ -1,534 +0,0 @@
-<template>
-  <div class="cuda-installer">
-    <div class="card-header">
-      <h3>CUDA Toolkit Manager</h3>
-      <Button 
-        icon="pi pi-refresh" 
-        @click="refreshStatus"
-        :loading="loading"
-        severity="secondary"
-        text
-        v-tooltip="'Refresh CUDA status'"
-      />
-    </div>
-
-    <!-- Current Installation Status -->
-    <div v-if="cudaStatus" class="status-section">
-      <div class="status-card" :class="{ 'installed': cudaStatus.installed }">
-        <div class="status-header">
-          <i :class="cudaStatus.installed ? 'pi pi-check-circle' : 'pi pi-times-circle'" 
-             :style="{ color: cudaStatus.installed ? 'var(--green-500)' : 'var(--red-500)' }"></i>
-          <h4>{{ cudaStatus.installed ? 'CUDA Installed' : 'CUDA Not Installed' }}</h4>
-        </div>
-        <div v-if="cudaStatus.installed" class="status-details">
-          <div class="detail-row">
-            <span class="detail-label">Current Version:</span>
-            <span class="detail-value">{{ cudaStatus.version }}</span>
-          </div>
-          <div v-if="cudaStatus.cuda_path" class="detail-row">
-            <span class="detail-label">Installation Path:</span>
-            <span class="detail-value">{{ cudaStatus.cuda_path }}</span>
-          </div>
-          <div v-if="cudaStatus.installed_at" class="detail-row">
-            <span class="detail-label">Installed:</span>
-            <span class="detail-value">{{ formatDate(cudaStatus.installed_at) }}</span>
-          </div>
-        </div>
-        <div v-else class="status-details">
-          <p>CUDA Toolkit is required for CUDA-enabled builds. Install a version below to enable CUDA support.</p>
-        </div>
-      </div>
-    </div>
-
-    <!-- Installed Versions List -->
-    <div v-if="cudaStatus?.installed_versions && cudaStatus.installed_versions.length > 0" class="installed-versions">
-      <h4>Installed Versions</h4>
-      <div class="version-list">
-        <div 
-          v-for="installed in cudaStatus.installed_versions" 
-          :key="installed.version"
-          class="version-card"
-          :class="{ 'current': installed.is_current }"
-        >
-          <div class="version-header">
-            <div class="version-info">
-              <span class="version-label">CUDA {{ installed.version }}</span>
-              <Tag v-if="installed.is_current" value="Current" severity="success" />
-            </div>
-            <Button
-              icon="pi pi-trash"
-              label="Uninstall"
-              severity="danger"
-              outlined
-              size="small"
-              @click="confirmUninstall(installed)"
-              :disabled="installing || cudaStatus?.operation"
-            />
-          </div>
-          <div class="version-details">
-            <div class="detail-row">
-              <span class="detail-label">Path:</span>
-              <span class="detail-value">{{ installed.path }}</span>
-            </div>
-            <div v-if="installed.installed_at" class="detail-row">
-              <span class="detail-label">Installed:</span>
-              <span class="detail-value">{{ formatDate(installed.installed_at) }}</span>
-            </div>
-          </div>
-        </div>
-      </div>
-    </div>
-
-    <!-- Installation Section -->
-    <div class="install-section">
-      <h4>Install CUDA Toolkit</h4>
-      <div class="install-form">
-        <div class="form-group">
-          <label>CUDA Version</label>
-          <Dropdown
-            v-model="selectedCudaVersion"
-            :options="availableVersions"
-            placeholder="Select CUDA version"
-            class="version-select"
-          />
-        </div>
-        <div class="form-info">
-          <p><strong>Platform:</strong> {{ cudaStatus?.platform?.[0] || 'Unknown' }} / {{ cudaStatus?.platform?.[1] || 'Unknown' }}</p>
-          <p v-if="cudaStatus?.platform?.[0] === 'linux'" class="info-text">
-            Linux installation may require appropriate permissions. The installer will run in silent mode.
-          </p>
-        </div>
-      </div>
-
-      <!-- Installation Progress -->
-      <div v-if="cudaInstallProgress" class="install-progress">
-        <ProgressBar :value="cudaInstallProgress.progress || 0" />
-        <p class="progress-message">{{ cudaInstallProgress.message || 'Preparing installation...' }}</p>
-      </div>
-
-      <!-- Installation Logs -->
-      <div v-if="cudaInstallLogs.length > 0" class="install-logs">
-        <div class="logs-header">
-          <h5>Installation Logs</h5>
-          <Button
-            icon="pi pi-times"
-            @click="cudaInstallLogs = []"
-            severity="secondary"
-            text
-            size="small"
-          />
-        </div>
-        <LogViewer 
-          :logs="cudaInstallLogs" 
-          mode="raw"
-          :show-header="false"
-          @clear="cudaInstallLogs = []"
-        />
-      </div>
-
-      <!-- Action Buttons -->
-      <div class="action-buttons">
-        <Button
-          v-if="!installing && !cudaStatus?.operation"
-          label="Install CUDA" 
-          icon="pi pi-download"
-          @click="handleInstall"
-          :disabled="!selectedCudaVersion"
-          severity="success"
-        />
-        <Button
-          v-if="installing || cudaStatus?.operation"
-          label="Installing..."
-          icon="pi pi-spin pi-spinner"
-          disabled
-        />
-      </div>
-    </div>
-
-    <!-- Operation Status -->
-    <div v-if="cudaStatus?.operation" class="operation-status">
-      <Message severity="info" :closable="false">
-        <template #messageicon>
-          <i class="pi pi-info-circle"></i>
-        </template>
-        Operation in progress: {{ cudaStatus.operation }}
-        <span v-if="cudaStatus.operation_started_at">
-          (Started: {{ formatDate(cudaStatus.operation_started_at) }})
-        </span>
-      </Message>
-    </div>
-
-    <!-- Error Message -->
-    <div v-if="cudaStatus?.last_error" class="error-message">
-      <Message severity="error" :closable="true" @close="cudaStatus.last_error = null">
-        <template #messageicon>
-          <i class="pi pi-exclamation-triangle"></i>
-        </template>
-        {{ cudaStatus.last_error }}
-      </Message>
-    </div>
-  </div>
-</template>
-
-<script setup>
-import { ref, computed, onMounted, onUnmounted } from 'vue'
-import LogViewer from '@/components/common/LogViewer.vue'
-import { useSystemStore } from '@/stores/system'
-import { useWebSocketStore } from '@/stores/websocket'
-import { useConfirm } from 'primevue/useconfirm'
-import { toast } from 'vue3-toastify'
-import Button from 'primevue/button'
-import Dropdown from 'primevue/dropdown'
-import ProgressBar from 'primevue/progressbar'
-import Tag from 'primevue/tag'
-import Message from 'primevue/message'
-
-const systemStore = useSystemStore()
-const wsStore = useWebSocketStore()
-const confirm = useConfirm()
-
-const cudaStatus = ref(null)
-const selectedCudaVersion = ref('12.9')
-const installing = ref(false)
-const loading = ref(false)
-const cudaInstallProgress = ref(null)
-const cudaInstallLogs = ref([])
-
-const availableVersions = computed(() => {
-  return cudaStatus.value?.available_versions || ['13.0', '12.9', '12.8', '12.7', '12.6', '12.5', '12.4', '12.3', '12.2', '12.1', '12.0', '11.9', '11.8']
-})
-
-const formatDate = (dateString) => {
-  if (!dateString) return 'Unknown'
-  try {
-    return new Date(dateString).toLocaleString()
-  } catch {
-    return dateString
-  }
-}
-
-const refreshStatus = async () => {
-  loading.value = true
-  try {
-    cudaStatus.value = await systemStore.getCudaStatus()
-  } catch (error) {
-    toast.error('Failed to fetch CUDA status')
-  } finally {
-    loading.value = false
-  }
-}
-
-const handleInstall = async () => {
-  if (!selectedCudaVersion.value) {
-    toast.error('Please select a CUDA version')
-    return
-  }
-
-  installing.value = true
-  cudaInstallProgress.value = null
-  cudaInstallLogs.value = []
-
-  try {
-    await systemStore.installCuda(selectedCudaVersion.value)
-    toast.success(`CUDA ${selectedCudaVersion.value} installation started`)
-  } catch (error) {
-    const errorMsg = error.response?.data?.detail || error.message || 'Failed to start CUDA installation'
-    toast.error(errorMsg)
-    installing.value = false
-  }
-}
-
-const confirmUninstall = (installed) => {
-  confirm.require({
-    message: `Are you sure you want to uninstall CUDA ${installed.version}? This will remove the installation directory and cannot be undone.`,
-    header: 'Uninstall CUDA Toolkit',
-    icon: 'pi pi-exclamation-triangle',
-    rejectLabel: 'Cancel',
-    acceptLabel: 'Uninstall',
-    accept: async () => {
-      try {
-        await systemStore.uninstallCuda(installed.version)
-        toast.success(`CUDA ${installed.version} uninstallation started`)
-        await refreshStatus()
-      } catch (error) {
-        const errorMsg = error.response?.data?.detail || error.message || 'Failed to start CUDA uninstallation'
-        toast.error(errorMsg)
-      }
-    }
-  })
-}
-
-// WebSocket handlers
-const handleCudaInstallStatus = (data) => {
-  if (data.status === 'completed') {
-    installing.value = false
-    cudaInstallProgress.value = { progress: 100, message: 'Installation completed!' }
-    toast.success(data.message || 'CUDA installation completed')
-    refreshStatus()
-  } else if (data.status === 'failed') {
-    installing.value = false
-    toast.error(data.message || 'CUDA installation failed')
-  }
-}
-
-const handleCudaInstallProgress = (data) => {
-  cudaInstallProgress.value = {
-    progress: data.progress || 0,
-    message: data.message || '',
-    stage: data.stage || 'unknown'
-  }
-}
-
-const handleCudaInstallLog = (data) => {
-  cudaInstallLogs.value.push(data.line)
-  if (cudaInstallLogs.value.length > 100) {
-    cudaInstallLogs.value = cudaInstallLogs.value.slice(-100)
-  }
-}
-
-const unsubscribeStatus = ref(null)
-const unsubscribeProgress = ref(null)
-const unsubscribeLog = ref(null)
-
-onMounted(async () => {
-  await refreshStatus()
-  
-  // Subscribe to CUDA installation events
-  unsubscribeStatus.value = wsStore.subscribe('cuda_install_status', handleCudaInstallStatus)
-  unsubscribeProgress.value = wsStore.subscribe('cuda_install_progress', handleCudaInstallProgress)
-  unsubscribeLog.value = wsStore.subscribe('cuda_install_log', handleCudaInstallLog)
-})
-
-onUnmounted(() => {
-  if (unsubscribeStatus.value) unsubscribeStatus.value()
-  if (unsubscribeProgress.value) unsubscribeProgress.value()
-  if (unsubscribeLog.value) unsubscribeLog.value()
-})
-</script>
-
-<style scoped>
-.cuda-installer {
-  background: var(--gradient-card);
-  border: 1px solid var(--border-primary);
-  border-radius: var(--radius-xl);
-  padding: var(--spacing-xl);
-  margin-bottom: 2rem;
-}
-
-.card-header {
-  display: flex;
-  justify-content: space-between;
-  align-items: center;
-  margin-bottom: var(--spacing-lg);
-}
-
-.card-header h3 {
-  margin: 0;
-  color: var(--text-primary);
-  font-weight: 700;
-  font-size: 1.3rem;
-}
-
-.status-section {
-  margin-bottom: var(--spacing-xl);
-}
-
-.status-card {
-  background: var(--bg-surface);
-  border: 1px solid var(--border-primary);
-  border-radius: var(--radius-lg);
-  padding: var(--spacing-lg);
-}
-
-.status-card.installed {
-  border-color: var(--green-500);
-  background: var(--green-50);
-}
-
-.status-header {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-md);
-  margin-bottom: var(--spacing-md);
-}
-
-.status-header i {
-  font-size: 1.5rem;
-}
-
-.status-header h4 {
-  margin: 0;
-  color: var(--text-primary);
-  font-weight: 700;
-}
-
-.status-details {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-sm);
-}
-
-.detail-row {
-  display: flex;
-  justify-content: space-between;
-  align-items: center;
-}
-
-.detail-label {
-  font-weight: 600;
-  color: var(--text-secondary);
-  font-size: 0.9rem;
-}
-
-.detail-value {
-  font-weight: 600;
-  color: var(--text-primary);
-  font-family: 'Courier New', monospace;
-  background: var(--bg-card);
-  padding: var(--spacing-xs) var(--spacing-sm);
-  border-radius: var(--radius-sm);
-  border: 1px solid var(--border-primary);
-  font-size: 0.875rem;
-}
-
-.installed-versions {
-  margin-bottom: var(--spacing-xl);
-}
-
-.installed-versions h4 {
-  margin-bottom: var(--spacing-md);
-  color: var(--text-primary);
-  font-weight: 700;
-}
-
-.version-list {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-md);
-}
-
-.version-card {
-  background: var(--bg-surface);
-  border: 1px solid var(--border-primary);
-  border-radius: var(--radius-lg);
-  padding: var(--spacing-lg);
-}
-
-.version-card.current {
-  border-color: var(--green-500);
-  background: var(--green-50);
-}
-
-.version-header {
-  display: flex;
-  justify-content: space-between;
-  align-items: center;
-  margin-bottom: var(--spacing-md);
-}
-
-.version-info {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-sm);
-}
-
-.version-label {
-  font-weight: 700;
-  color: var(--text-primary);
-  font-size: 1.1rem;
-}
-
-.version-details {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-xs);
-}
-
-.install-section h4 {
-  margin-bottom: var(--spacing-md);
-  color: var(--text-primary);
-  font-weight: 700;
-}
-
-.install-form {
-  margin-bottom: var(--spacing-lg);
-}
-
-.form-group {
-  margin-bottom: var(--spacing-md);
-}
-
-.form-group label {
-  display: block;
-  margin-bottom: var(--spacing-xs);
-  font-weight: 600;
-  color: var(--text-primary);
-}
-
-.version-select {
-  width: 100%;
-}
-
-.form-info {
-  margin-top: var(--spacing-md);
-  padding: var(--spacing-md);
-  background: var(--bg-surface);
-  border-radius: var(--radius-md);
-  border: 1px solid var(--border-primary);
-}
-
-.form-info p {
-  margin: var(--spacing-xs) 0;
-  color: var(--text-secondary);
-  font-size: 0.9rem;
-}
-
-.info-text {
-  color: var(--text-secondary);
-  font-size: 0.875rem;
-  font-style: italic;
-}
-
-.install-progress {
-  margin-bottom: var(--spacing-lg);
-}
-
-.progress-message {
-  margin-top: var(--spacing-sm);
-  color: var(--text-secondary);
-  font-size: 0.9rem;
-}
-
-.install-logs {
-  margin-bottom: var(--spacing-lg);
-  background: var(--bg-surface);
-  border: 1px solid var(--border-primary);
-  border-radius: var(--radius-md);
-  padding: var(--spacing-md);
-}
-
-.logs-header {
-  display: flex;
-  justify-content: space-between;
-  align-items: center;
-  margin-bottom: var(--spacing-sm);
-}
-
-.logs-header h5 {
-  margin: 0;
-  color: var(--text-primary);
-  font-weight: 700;
-}
-
-
-.action-buttons {
-  display: flex;
-  gap: var(--spacing-md);
-}
-
-.operation-status,
-.error-message {
-  margin-top: var(--spacing-lg);
-}
-</style>
-
diff --git a/frontend/src/components/system/LMDeployTab.vue b/frontend/src/components/system/LMDeployTab.vue
deleted file mode 100644
index a921b44..0000000
--- a/frontend/src/components/system/LMDeployTab.vue
+++ /dev/null
@@ -1,347 +0,0 @@
-<template>
-  <div class="lmdeploy-page">
-    <section class="card">
-      <header class="card-header">
-        <div>
-          <h2>LMDeploy Installer</h2>
-          <p class="card-subtitle">Install or remove LMDeploy inside the running container without rebuilding the image.</p>
-        </div>
-        <Tag
-          :value="installed ? 'Installed' : 'Not Installed'"
-          :severity="installed ? 'success' : 'warning'"
-        />
-      </header>
-
-      <div class="status-grid">
-        <div>
-          <label>Status</label>
-          <div class="status-value">
-            <i
-              :class="[
-                'pi',
-                operationInProgress ? 'pi-spin pi-spinner text-warning' : installed ? 'pi-check-circle text-success' : 'pi-info-circle text-muted'
-              ]"
-            ></i>
-            <span>
-              {{ operationInProgress ? `Running ${status?.operation}…` : installed ? 'Ready' : 'Install required' }}
-            </span>
-          </div>
-          <small v-if="status?.operation_started_at">
-            Started at {{ formatDate(status.operation_started_at) }}
-          </small>
-        </div>
-        <div>
-          <label>Version</label>
-          <div class="status-value monospace">
-            {{ status?.version || 'Unknown' }}
-          </div>
-        </div>
-        <div>
-          <label>Binary Path</label>
-          <div class="status-value monospace truncate">
-            {{ status?.binary_path || 'Not found' }}
-          </div>
-        </div>
-        <div>
-          <label>Virtual Environment</label>
-          <div class="status-value monospace truncate">
-            {{ status?.venv_path || 'Not created' }}
-          </div>
-        </div>
-        <div>
-          <label>Last Error</label>
-          <div class="status-value error-text">
-            {{ status?.last_error || 'None' }}
-          </div>
-        </div>
-      </div>
-
-      <div class="card-actions">
-        <Button
-          label="Install LMDeploy"
-          icon="pi pi-download"
-          severity="success"
-          :loading="installing"
-          :disabled="operationInProgress || installed"
-          @click="startInstall"
-        />
-        <Button
-          label="Remove LMDeploy"
-          icon="pi pi-trash"
-          severity="danger"
-          outlined
-          :loading="removing"
-          :disabled="operationInProgress || !installed"
-          @click="startRemoval"
-        />
-        <Button
-          label="Refresh"
-          icon="pi pi-refresh"
-          severity="secondary"
-          text
-          :loading="statusLoading"
-          @click="refresh"
-        />
-      </div>
-
-      <p class="helper-text">
-        Need to run safetensors models? Install LMDeploy here, then start runtimes from the Safetensors panel.
-      </p>
-    </section>
-
-    <section class="card">
-      <header class="card-header">
-        <div>
-          <h3>Installer Logs</h3>
-          <p class="card-subtitle">Newest lines first. Use this to monitor pip progress if an install is running.</p>
-        </div>
-        <Button
-          icon="pi pi-refresh"
-          severity="secondary"
-          text
-          :loading="logLoading"
-          @click="refreshLogs"
-        />
-      </header>
-      <LogViewer v-if="logContent" :logs="parsedLogLines" mode="structured" />
-      <div v-else class="empty-log">
-        <i class="pi pi-info-circle"></i>
-        <p>No LMDeploy installer logs yet.</p>
-      </div>
-    </section>
-
-    <section class="card">
-      <header class="card-header">
-        <div>
-          <h3>Runtime Logs</h3>
-          <p class="card-subtitle">Logs from running LMDeploy server instances. Use this to monitor model serving activity.</p>
-        </div>
-        <Button
-          icon="pi pi-refresh"
-          severity="secondary"
-          text
-          :loading="runtimeLogLoading"
-          @click="refreshRuntimeLogs"
-        />
-      </header>
-      <LogViewer v-if="runtimeLogContent" :logs="parsedRuntimeLogLines" mode="structured" />
-      <div v-else class="empty-log">
-        <i class="pi pi-info-circle"></i>
-        <p>No LMDeploy runtime logs yet. Start a model to see logs here.</p>
-      </div>
-    </section>
-  </div>
-</template>
-
-<script setup>
-import { computed, onMounted, onUnmounted } from 'vue'
-import { toast } from 'vue3-toastify'
-import Button from 'primevue/button'
-import Tag from 'primevue/tag'
-
-import LogViewer from '@/components/common/LogViewer.vue'
-import { useLmdeployStore } from '@/stores/lmdeploy'
-import { formatDate } from '@/utils/formatting'
-
-const store = useLmdeployStore()
-
-const status = computed(() => store.status)
-const installed = computed(() => !!status.value?.installed)
-const operationInProgress = computed(() => !!status.value?.operation)
-const installing = computed(() => store.installing || status.value?.operation === 'install')
-const removing = computed(() => store.removing || status.value?.operation === 'remove')
-const statusLoading = computed(() => store.loading)
-const logLoading = computed(() => store.logLoading)
-const runtimeLogLoading = computed(() => store.runtimeLogLoading)
-const logContent = computed(() => store.logs || '')
-const runtimeLogContent = computed(() => store.runtimeLogs || '')
-const parsedLogLines = computed(() =>
-  (store.logs || '')
-    .split('\n')
-    .filter(Boolean)
-    .map((line, index) => {
-      const match = line.match(/^\[(.*?)\]\s*(.*)$/)
-      const timestamp = match ? match[1] : ''
-      const data = match ? match[2] : line
-      return {
-        timestamp,
-        log_type: 'install',
-        data,
-        id: `${timestamp || 'log'}-${index}`
-      }
-    })
-)
-const parsedRuntimeLogLines = computed(() =>
-  (store.runtimeLogs || '')
-    .split('\n')
-    .filter(Boolean)
-    .map((line, index) => {
-      // Try to parse timestamp from various log formats
-      const timestampMatch = line.match(/^(\d{4}-\d{2}-\d{2}[\sT]\d{2}:\d{2}:\d{2}[.\d]*Z?)\s*(.*)$/)
-      const bracketMatch = line.match(/^\[(.*?)\]\s*(.*)$/)
-      const timestamp = timestampMatch ? timestampMatch[1] : (bracketMatch ? bracketMatch[1] : '')
-      const data = timestampMatch ? timestampMatch[2] : (bracketMatch ? bracketMatch[2] : line)
-      return {
-        timestamp,
-        log_type: 'runtime',
-        data,
-        id: `runtime-${timestamp || 'log'}-${index}`
-      }
-    })
-)
-
-const refresh = async () => {
-  try {
-    await Promise.all([store.fetchStatus(), store.fetchLogs(), store.fetchRuntimeLogs()])
-  } catch (error) {
-    toast.error('Failed to refresh LMDeploy status')
-  }
-}
-
-const refreshLogs = async () => {
-  try {
-    await store.fetchLogs(16384)
-  } catch (error) {
-    toast.error('Failed to refresh LMDeploy logs')
-  }
-}
-
-const refreshRuntimeLogs = async () => {
-  try {
-    await store.fetchRuntimeLogs(16384)
-  } catch (error) {
-    toast.error('Failed to refresh LMDeploy runtime logs')
-  }
-}
-
-const startInstall = async () => {
-  try {
-    await store.install()
-    toast.success('LMDeploy installation started')
-  } catch (error) {
-    toast.error(error?.response?.data?.detail || 'Failed to start installation')
-  }
-}
-
-const startRemoval = async () => {
-  try {
-    await store.remove()
-    toast.success('LMDeploy removal started')
-  } catch (error) {
-    toast.error(error?.response?.data?.detail || 'Failed to start removal')
-  }
-}
-
-// formatDate is now imported from @/utils/formatting
-
-onMounted(() => {
-  refresh()
-  store.startWebSocketSubscriptions()
-})
-
-onUnmounted(() => {
-  store.stopWebSocketSubscriptions()
-})
-</script>
-
-<style scoped>
-.lmdeploy-page {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-xl);
-}
-
-.card {
-  background: var(--bg-card);
-  border-radius: var(--radius-xl);
-  padding: var(--spacing-xl);
-  box-shadow: var(--shadow-lg);
-  border: 1px solid var(--border-primary);
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-lg);
-}
-
-.card-header {
-  display: flex;
-  justify-content: space-between;
-  align-items: center;
-  gap: var(--spacing-lg);
-}
-
-.card-subtitle {
-  margin: 0;
-  color: var(--text-secondary);
-  font-size: 0.95rem;
-}
-
-.status-grid {
-  display: grid;
-  grid-template-columns: repeat(auto-fit, minmax(220px, 1fr));
-  gap: var(--spacing-lg);
-}
-
-.status-grid label {
-  display: block;
-  font-size: 0.75rem;
-  text-transform: uppercase;
-  letter-spacing: 0.08em;
-  color: var(--text-secondary);
-  margin-bottom: var(--spacing-xs);
-}
-
-.status-value {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-sm);
-}
-
-.monospace {
-  font-family: 'JetBrains Mono', 'Fira Code', monospace;
-}
-
-.truncate {
-  max-width: 320px;
-  white-space: nowrap;
-  overflow: hidden;
-  text-overflow: ellipsis;
-}
-
-.error-text {
-  color: var(--status-error);
-}
-
-.card-actions {
-  display: flex;
-  flex-wrap: wrap;
-  gap: var(--spacing-md);
-}
-
-.helper-text {
-  margin: 0;
-  font-size: 0.9rem;
-  color: var(--text-secondary);
-}
-
-.empty-log {
-  display: flex;
-  flex-direction: column;
-  align-items: center;
-  gap: var(--spacing-sm);
-  color: var(--text-secondary);
-  padding: var(--spacing-xl);
-}
-
-.text-success {
-  color: var(--status-success);
-}
-
-.text-warning {
-  color: var(--status-warning);
-}
-
-.text-muted {
-  color: var(--text-secondary);
-}
-</style>
-
diff --git a/frontend/src/components/system/LlamaCppManager/BuildDialog.vue b/frontend/src/components/system/LlamaCppManager/BuildDialog.vue
deleted file mode 100644
index 2d0d465..0000000
--- a/frontend/src/components/system/LlamaCppManager/BuildDialog.vue
+++ /dev/null
@@ -1,715 +0,0 @@
-<template>
-  <BaseDialog
-    :visible="visible"
-    header="Build from Source"
-    :modal="true"
-    :dialog-style="{ width: '70vw', maxWidth: '900px' }"
-    :draggable="false"
-    :resizable="false"
-    dialog-class="build-dialog"
-    @update:visible="$emit('update:visible', $event)"
-  >
-    <div class="build-form">
-      <div class="dialog-section">
-        <h4 class="section-title">Source Information</h4>
-        <div class="form-row">
-          <div class="form-field">
-            <label>Repository Source *</label>
-            <Dropdown 
-              v-model="buildForm.repositorySource"
-              :options="repositorySourceOptions"
-              optionLabel="label"
-              optionValue="value"
-              placeholder="Select repository"
-            />
-            <small>Choose the repository to build from</small>
-          </div>
-        </div>
-        <div class="form-field full-width">
-          <label>Commit SHA or Branch *</label>
-          <InputText 
-            v-model="buildForm.commitSha"
-            placeholder="master/main or commit hash"
-          />
-          <small>Default: {{ defaultBranch }} (latest stable)</small>
-        </div>
-        
-        <div class="form-field full-width">
-          <label>Build Name Suffix (Optional)</label>
-          <InputText 
-            v-model="buildForm.versionSuffix"
-            placeholder="e.g., test-build, production"
-          />
-          <small>Custom suffix for version name. If empty, timestamp will be used.</small>
-        </div>
-        
-        <div v-if="previewVersionName" class="version-preview">
-          <i class="pi pi-info-circle"></i>
-          <span>Version name: <strong>{{ previewVersionName }}</strong></span>
-        </div>
-        
-        <div class="form-field full-width">
-          <label>Patches (Optional)</label>
-          <Textarea 
-            v-model="buildForm.patches"
-            rows="3"
-            placeholder="One patch URL per line"
-          />
-          <small>GitHub PR URLs or raw patch file URLs</small>
-        </div>
-      </div>
-      
-      <div class="dialog-section">
-        <h4 class="section-title">Build Configuration</h4>
-        <div class="form-row">
-          <div class="form-field">
-            <label>Build Type</label>
-            <Dropdown 
-              v-model="buildForm.buildType"
-              :options="['Release', 'Debug', 'RelWithDebInfo']"
-              placeholder="Select build type"
-            />
-            <small>Release recommended for production</small>
-          </div>
-        </div>
-      </div>
-      
-      <div class="dialog-section">
-        <h4 class="section-title">GPU Backends</h4>
-        <div class="checkbox-group">
-          <div class="checkbox-item">
-            <Checkbox 
-              v-model="buildForm.enableCuda" 
-              :binary="true"
-            />
-            <div class="checkbox-label">
-              <span>CUDA</span>
-              <small class="capability-info" :class="getCapabilityClass(buildCapabilities?.cuda)">
-                Enables NVIDIA GPU acceleration (requires driver + CUDA runtime). {{ buildCapabilities?.cuda?.reason || 'Not detected' }}
-              </small>
-              <div v-if="buildForm.enableCuda && !cudaStatus?.installed" class="cuda-install-prompt">
-                <i class="pi pi-exclamation-triangle" style="color: var(--status-warning);"></i>
-                <span>CUDA Toolkit not detected. </span>
-                <Button 
-                  label="Install CUDA" 
-                  icon="pi pi-download"
-                  size="small"
-                  severity="warning"
-                  text
-                  @click="$emit('show-cuda-install')"
-                />
-              </div>
-            </div>
-          </div>
-          
-          <div class="checkbox-item">
-            <Checkbox 
-              v-model="buildForm.enableVulkan" 
-              :binary="true"
-            />
-            <div class="checkbox-label">
-              <span>Vulkan</span>
-              <small class="capability-info" :class="getCapabilityClass(buildCapabilities?.vulkan)">
-                Cross-vendor GPU backend for AMD/Intel/NVIDIA. {{ buildCapabilities?.vulkan?.reason || 'Not detected' }}
-              </small>
-            </div>
-          </div>
-          
-          <div class="checkbox-item">
-            <Checkbox 
-              v-model="buildForm.enableMetal" 
-              :binary="true"
-            />
-            <div class="checkbox-label">
-              <span>Metal</span>
-              <small class="capability-info" :class="getCapabilityClass(buildCapabilities?.metal)">
-                Apple Silicon/AMD Metal backend for macOS builds. {{ buildCapabilities?.metal?.reason || 'Not detected' }}
-              </small>
-            </div>
-          </div>
-          
-          <div class="checkbox-item">
-            <Checkbox 
-              v-model="buildForm.enableOpenBLAS" 
-              :binary="true"
-            />
-            <div class="checkbox-label">
-              <span>OpenBLAS</span>
-              <small class="capability-info" :class="getCapabilityClass(buildCapabilities?.openblas)">
-                Uses CPU BLAS kernels (OpenBLAS backend). {{ buildCapabilities?.openblas?.reason || 'Not detected' }}
-              </small>
-            </div>
-          </div>
-          
-          <div class="checkbox-item">
-            <Checkbox 
-              v-model="buildForm.enableFlashAttention" 
-              :binary="true"
-            />
-            <div class="checkbox-label">
-              <span>Flash Attention (FA)</span>
-              <small class="capability-info" :class="buildForm.enableCuda ? 'text-blue-500' : 'text-gray-500'">
-                CUDA-only: compiles FlashAttention kernels needed for KV-cache quantization
-              </small>
-            </div>
-          </div>
-        </div>
-      </div>
-      
-      <div class="dialog-section">
-        <h4 class="section-title">Build Artifacts</h4>
-        <div class="checkbox-group">
-          <div class="checkbox-item">
-            <Checkbox 
-              v-model="buildForm.buildExamples" 
-              :binary="true"
-              :disabled="buildForm.repositorySource === 'ik_llama.cpp'"
-            />
-            <div class="checkbox-label">
-              <span>Examples</span>
-              <small class="capability-info text-gray-500">
-                Compiles example apps (benchmarking, embedding demos, playground)
-              </small>
-              <div v-if="buildForm.repositorySource === 'ik_llama.cpp'" class="ik-llama-warning">
-                <i class="pi pi-exclamation-triangle"></i>
-                <span><strong>Required for ik_llama.cpp:</strong> Examples must be enabled to build the server (server is in examples directory)</span>
-              </div>
-            </div>
-          </div>
-
-          <div class="checkbox-item">
-            <Checkbox 
-              v-model="buildForm.buildTests" 
-              :binary="true"
-            />
-            <div class="checkbox-label">
-              <span>Test Suite</span>
-              <small class="capability-info text-gray-500">
-                Adds CTest targets; useful for CI or verifying new toolchains
-              </small>
-            </div>
-          </div>
-        </div>
-        <small class="option-note">
-          Core binaries (`llama-server`, CLI tooling, shared libraries) are always built to keep the Studio API fully functional.
-        </small>
-      </div>
-
-      <div class="dialog-section">
-        <h4 class="section-title">CPU &amp; Link Options</h4>
-        <div class="checkbox-group">
-          <div class="checkbox-item">
-            <Checkbox 
-              v-model="buildForm.enableCpuAllVariants" 
-              :binary="true"
-            />
-            <div class="checkbox-label">
-              <span>CPU All Variants</span>
-              <small class="capability-info" :class="buildForm.enableBackendDl ? 'text-blue-500' : 'text-gray-500'">
-                Compiles every CPU ISA variant (AVX, AVX2, AVX512, etc.); requires backend loader
-              </small>
-            </div>
-          </div>
-
-          <div class="checkbox-item">
-            <Checkbox 
-              v-model="buildForm.enableNative" 
-              :binary="true"
-            />
-            <div class="checkbox-label">
-              <span>Native Optimizations</span>
-              <small class="capability-info text-gray-500">
-                Enables `-march=native` style tuning; disable to produce broadly portable binaries
-              </small>
-            </div>
-          </div>
-
-          <div class="checkbox-item">
-            <Checkbox 
-              v-model="buildForm.enableLto" 
-              :binary="true"
-            />
-            <div class="checkbox-label">
-              <span>Link Time Optimization (LTO)</span>
-              <small class="capability-info text-gray-500">
-                Turns on LTO / thin-LTO, shrinking binaries and improving throughput (longer link step)
-              </small>
-            </div>
-          </div>
-        </div>
-      </div>
-      
-      <Accordion class="advanced-accordion">
-        <AccordionTab header="Advanced Options">
-          <div class="form-field">
-            <label>Custom CMake Arguments</label>
-            <InputText 
-              v-model="buildForm.customCmakeArgs"
-              placeholder='-DLLAMA_BUILD_TESTS=OFF -DGGML_LTO=ON'
-            />
-            <small>Additional CMake flags</small>
-          </div>
-          <div class="form-row">
-            <div class="form-field">
-              <label>CFLAGS</label>
-              <InputText v-model="buildForm.cflags" placeholder="-O3 -march=native"/>
-            </div>
-            <div class="form-field">
-              <label>CXXFLAGS</label>
-              <InputText v-model="buildForm.cxxflags" placeholder="-O3 -march=native"/>
-            </div>
-          </div>
-        </AccordionTab>
-      </Accordion>
-    </div>
-    
-    <template #footer>
-      <Button 
-        label="Cancel" 
-        icon="pi pi-times" 
-        @click="$emit('update:visible', false)"
-        severity="secondary"
-        text
-      />
-      <Button 
-        label="Build" 
-        icon="pi pi-code" 
-        @click="handleBuild"
-        :loading="building"
-      />
-    </template>
-  </BaseDialog>
-</template>
-
-<script setup>
-import { ref, watch, computed, onMounted } from 'vue'
-import { useSystemStore } from '@/stores/system'
-import { toast } from 'vue3-toastify'
-import Button from 'primevue/button'
-import InputText from 'primevue/inputtext'
-import Textarea from 'primevue/textarea'
-import Dropdown from 'primevue/dropdown'
-import Checkbox from 'primevue/checkbox'
-import Accordion from 'primevue/accordion'
-import AccordionTab from 'primevue/accordiontab'
-import BaseDialog from '@/components/common/BaseDialog.vue'
-
-const props = defineProps({
-  visible: {
-    type: Boolean,
-    default: false
-  },
-  buildCapabilities: {
-    type: Object,
-    default: null
-  },
-  cudaStatus: {
-    type: Object,
-    default: null
-  }
-})
-
-const emit = defineEmits(['update:visible', 'build', 'show-cuda-install'])
-
-const systemStore = useSystemStore()
-
-const building = ref(false)
-
-const buildForm = ref({
-  repositorySource: 'llama.cpp',
-  commitSha: 'master', // Default: master for llama.cpp, main for ik_llama.cpp
-  versionSuffix: '',
-  patches: '',
-  buildType: 'Release',
-  enableCuda: false,
-  enableVulkan: false,
-  enableMetal: false,
-  enableOpenBLAS: false,
-  enableFlashAttention: false,
-  buildCommon: true,
-  buildTests: false,
-  buildTools: true,
-  buildExamples: false,
-  buildServer: true,
-  installTools: true,
-  enableBackendDl: false,
-  enableCpuAllVariants: false,
-  enableLto: false,
-  enableNative: true,
-  customCmakeArgs: '',
-  cflags: '',
-  cxxflags: ''
-})
-
-const repositorySourceOptions = [
-  { label: 'llama.cpp (Official)', value: 'llama.cpp' },
-  { label: 'ik_llama.cpp (Fork)', value: 'ik_llama.cpp' }
-]
-
-const defaultBranch = computed(() => {
-  return buildForm.value.repositorySource === 'ik_llama.cpp' ? 'main' : 'master'
-})
-
-const previewVersionName = computed(() => {
-  if (!buildForm.value.commitSha) return null
-  const commitShort = buildForm.value.commitSha.substring(0, 8)
-  if (buildForm.value.versionSuffix) {
-    return `source-${commitShort}-${buildForm.value.versionSuffix}`
-  }
-  return `source-${commitShort}-{timestamp}`
-})
-
-// Update commit SHA when repository source changes
-watch(() => buildForm.value.repositorySource, (newSource) => {
-  // Only update if user hasn't manually changed the commit SHA from default
-  const currentDefault = newSource === 'ik_llama.cpp' ? 'main' : 'master'
-  if (buildForm.value.commitSha === 'master' || buildForm.value.commitSha === 'main') {
-    buildForm.value.commitSha = currentDefault
-  }
-  // Auto-enable buildExamples for ik_llama.cpp (required for server build)
-  if (newSource === 'ik_llama.cpp') {
-    buildForm.value.buildExamples = true
-  }
-})
-
-const getCapabilityClass = (capability) => {
-  if (!capability) return 'text-gray-500'
-  return capability.available ? 'text-green-500' : 'text-gray-500'
-}
-
-const handleBuild = async () => {
-  building.value = true
-  
-  try {
-    const patches = buildForm.value.patches
-      .split('\n')
-      .map(line => line.trim())
-      .filter(line => line)
-    
-    const buildConfig = {
-      build_type: buildForm.value.buildType || 'Release',
-      enable_cuda: buildForm.value.enableCuda || false,
-      enable_vulkan: buildForm.value.enableVulkan || false,
-      enable_metal: buildForm.value.enableMetal || false,
-      enable_openblas: buildForm.value.enableOpenBLAS || false,
-      enable_flash_attention: buildForm.value.enableFlashAttention || false,
-      build_common: buildForm.value.buildCommon,
-      build_tests: buildForm.value.buildTests,
-      build_tools: buildForm.value.buildTools,
-      build_examples: buildForm.value.buildExamples,
-      build_server: buildForm.value.buildServer,
-      install_tools: buildForm.value.installTools,
-      enable_backend_dl: buildForm.value.enableBackendDl,
-      enable_cpu_all_variants: buildForm.value.enableCpuAllVariants,
-      enable_lto: buildForm.value.enableLto,
-      enable_native: buildForm.value.enableNative,
-      custom_cmake_args: buildForm.value.customCmakeArgs || '',
-      cflags: buildForm.value.cflags || '',
-      cxxflags: buildForm.value.cxxflags || ''
-    }
-    
-    await systemStore.buildSource(
-      buildForm.value.commitSha, 
-      patches, 
-      buildConfig,
-      buildForm.value.repositorySource,
-      buildForm.value.versionSuffix || null
-    )
-    emit('build', { 
-      commitSha: buildForm.value.commitSha, 
-      patches, 
-      buildConfig,
-      repositorySource: buildForm.value.repositorySource,
-      versionSuffix: buildForm.value.versionSuffix
-    })
-    emit('update:visible', false)
-    toast.success('Build started successfully')
-  } catch (error) {
-    toast.error('Failed to start build from source')
-  } finally {
-    building.value = false
-  }
-}
-
-watch(() => props.buildCapabilities, (capabilities) => {
-  if (capabilities && props.visible) {
-    buildForm.value.enableCuda = capabilities.cuda?.recommended || false
-    buildForm.value.enableVulkan = capabilities.vulkan?.recommended || false
-    buildForm.value.enableMetal = capabilities.metal?.recommended || false
-    buildForm.value.enableOpenBLAS = capabilities.openblas?.recommended || false
-  }
-}, { immediate: true })
-
-watch(() => props.visible, (newVisible) => {
-  if (newVisible && props.buildCapabilities) {
-    buildForm.value.enableCuda = props.buildCapabilities.cuda?.recommended || false
-    buildForm.value.enableVulkan = props.buildCapabilities.vulkan?.recommended || false
-    buildForm.value.enableMetal = props.buildCapabilities.metal?.recommended || false
-    buildForm.value.enableOpenBLAS = props.buildCapabilities.openblas?.recommended || false
-    // Ensure buildExamples is enabled for ik_llama.cpp
-    if (buildForm.value.repositorySource === 'ik_llama.cpp') {
-      buildForm.value.buildExamples = true
-    }
-  }
-})
-
-watch(
-  () => buildForm.value.enableCpuAllVariants,
-  (value) => {
-    if (value) {
-      buildForm.value.enableBackendDl = true
-    }
-  }
-)
-
-watch(
-  () => buildForm.value.enableBackendDl,
-  (value) => {
-    if (!value && buildForm.value.enableCpuAllVariants) {
-      buildForm.value.enableCpuAllVariants = false
-    }
-  }
-)
-</script>
-
-<style scoped>
-.build-dialog :deep(.p-dialog-content) {
-  padding: 2rem 1.5rem !important;
-  overflow-y: auto !important;
-  max-height: calc(80vh - 120px) !important;
-  background: transparent !important;
-}
-
-.build-form {
-  display: flex;
-  flex-direction: column;
-  gap: 1.5rem;
-}
-
-.dialog-section {
-  display: flex;
-  flex-direction: column;
-  gap: 1rem;
-  padding: 1.5rem;
-  background: var(--gradient-card);
-  border: 1px solid var(--border-primary);
-  border-radius: var(--radius-xl);
-  transition: all var(--transition-normal);
-  position: relative;
-  overflow: hidden;
-  backdrop-filter: blur(10px);
-  animation: fadeIn 0.6s ease-out;
-}
-
-.dialog-section::before {
-  content: '';
-  position: absolute;
-  top: 0;
-  left: 0;
-  right: 0;
-  height: 3px;
-  background: var(--gradient-primary);
-  opacity: 0;
-  transition: opacity var(--transition-normal);
-}
-
-.dialog-section:hover {
-  border-color: var(--accent-cyan);
-  box-shadow: var(--shadow-lg), var(--glow-primary);
-  transform: translateY(-2px);
-}
-
-.dialog-section:hover::before {
-  opacity: 1;
-}
-
-.section-title {
-  margin: 0 0 0.75rem 0;
-  color: var(--text-primary);
-  font-size: 1.2rem;
-  font-weight: 700;
-  display: flex;
-  align-items: center;
-  gap: 0.5rem;
-}
-
-.section-title::before {
-  content: '';
-  width: 4px;
-  height: 1.5rem;
-  background: var(--gradient-primary);
-  border-radius: 2px;
-}
-
-.form-row {
-  display: grid;
-  grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
-  gap: 1rem;
-}
-
-.form-field {
-  display: flex;
-  flex-direction: column;
-  gap: 0.5rem;
-}
-
-.form-field.full-width {
-  grid-column: 1 / -1;
-}
-
-.form-field label {
-  font-weight: 600;
-  color: var(--text-primary);
-  font-size: 0.9rem;
-}
-
-.form-field small {
-  color: var(--text-secondary);
-  font-size: 0.8rem;
-  margin-top: 0.25rem;
-}
-
-.advanced-accordion {
-  margin-top: 0.5rem;
-}
-
-.checkbox-group {
-  display: flex;
-  flex-direction: column;
-  gap: 0.75rem;
-}
-
-.checkbox-item {
-  display: flex;
-  align-items: flex-start;
-  gap: 0.75rem;
-  padding: 1rem;
-  background: var(--gradient-surface);
-  border-radius: var(--radius-lg);
-  border: 1px solid var(--border-primary);
-  transition: all var(--transition-normal);
-  position: relative;
-  overflow: hidden;
-}
-
-.checkbox-item::before {
-  content: '';
-  position: absolute;
-  top: 0;
-  left: 0;
-  right: 0;
-  height: 2px;
-  background: var(--gradient-primary);
-  opacity: 0;
-  transition: opacity var(--transition-normal);
-}
-
-.checkbox-item:hover {
-  background: var(--gradient-card);
-  border-color: var(--accent-cyan);
-  transform: translateX(4px);
-  box-shadow: var(--shadow-md);
-}
-
-.checkbox-item:hover::before {
-  opacity: 1;
-}
-
-.checkbox-label {
-  display: flex;
-  flex-direction: column;
-  gap: 0.25rem;
-  flex: 1;
-}
-
-.checkbox-label span {
-  font-weight: 600;
-  color: var(--text-primary);
-  font-size: 0.9rem;
-}
-
-.cuda-install-prompt {
-  display: flex;
-  align-items: center;
-  gap: 0.5rem;
-  margin-top: 0.5rem;
-  padding: 0.5rem;
-  background: var(--status-warning-soft);
-  border-radius: var(--radius-md);
-  font-size: 0.875rem;
-}
-
-.capability-info {
-  color: var(--text-secondary);
-  font-size: 0.8rem;
-  font-weight: 400;
-  line-height: 1.4;
-}
-
-.capability-info.text-green-500 {
-  color: var(--status-success);
-}
-
-.capability-info.text-gray-500 {
-  color: var(--text-secondary);
-}
-
-.capability-info.text-blue-500 {
-  color: var(--accent-blue);
-}
-
-.option-note {
-  display: block;
-  margin-top: 0.5rem;
-  color: var(--text-secondary);
-  font-size: 0.8rem;
-}
-
-.version-preview {
-  display: flex;
-  align-items: center;
-  gap: 0.5rem;
-  padding: 0.75rem;
-  background: var(--gradient-surface);
-  border-radius: var(--radius-md);
-  border: 1px solid var(--border-primary);
-  margin-top: 0.5rem;
-  font-size: 0.875rem;
-  color: var(--text-primary);
-}
-
-.version-preview i {
-  color: var(--accent-blue);
-}
-
-.version-preview strong {
-  color: var(--accent-cyan);
-  font-family: monospace;
-}
-
-.ik-llama-warning {
-  display: flex;
-  align-items: flex-start;
-  gap: 0.5rem;
-  margin-top: 0.5rem;
-  padding: 0.75rem;
-  background: var(--status-warning-soft);
-  border: 1px solid var(--status-warning);
-  border-radius: var(--radius-md);
-  font-size: 0.875rem;
-  color: var(--text-primary);
-  line-height: 1.4;
-}
-
-.ik-llama-warning i {
-  color: var(--status-warning);
-  margin-top: 0.1rem;
-  flex-shrink: 0;
-}
-
-.ik-llama-warning strong {
-  color: var(--status-warning);
-  font-weight: 600;
-}
-</style>
-
diff --git a/frontend/src/components/system/LlamaCppManager/CudaInstallDialog.vue b/frontend/src/components/system/LlamaCppManager/CudaInstallDialog.vue
deleted file mode 100644
index 19a3781..0000000
--- a/frontend/src/components/system/LlamaCppManager/CudaInstallDialog.vue
+++ /dev/null
@@ -1,274 +0,0 @@
-<template>
-  <BaseDialog
-    :visible="visible"
-    header="Install CUDA Toolkit"
-    :modal="true"
-    :dialog-style="{ width: '60vw', maxWidth: '700px' }"
-    :draggable="false"
-    :resizable="false"
-    @update:visible="$emit('update:visible', $event)"
-  >
-    <div class="cuda-install-dialog">
-      <div v-if="cudaStatus" class="cuda-status-section">
-        <div class="status-card" :class="{ 'installed': cudaStatus.installed }">
-          <div class="status-header">
-            <i :class="cudaStatus.installed ? 'pi pi-check-circle' : 'pi pi-times-circle'" 
-               :style="{ color: cudaStatus.installed ? 'var(--status-success)' : 'var(--status-error)' }"></i>
-            <h4>{{ cudaStatus.installed ? 'CUDA Installed' : 'CUDA Not Installed' }}</h4>
-          </div>
-          <div v-if="cudaStatus.installed" class="status-details">
-            <p><strong>Version:</strong> {{ cudaStatus.version }}</p>
-            <p v-if="cudaStatus.cuda_path"><strong>Path:</strong> {{ cudaStatus.cuda_path }}</p>
-            <p v-if="cudaStatus.installed_at"><strong>Installed:</strong> {{ formatDate(cudaStatus.installed_at) }}</p>
-          </div>
-          <div v-else class="status-details">
-            <p>CUDA Toolkit is required for CUDA-enabled builds.</p>
-          </div>
-        </div>
-      </div>
-
-      <div v-if="!cudaStatus?.installed" class="install-section">
-        <div class="form-field">
-          <label>CUDA Version</label>
-          <Dropdown 
-            v-model="selectedCudaVersion"
-            :options="cudaStatus?.available_versions || ['13.0', '12.9', '12.8', '12.7', '12.6', '12.5', '12.4', '12.3', '12.2', '12.1', '12.0', '11.9', '11.8']"
-            placeholder="Select CUDA version"
-          />
-          <small>Recommended: 13.0 (latest stable)</small>
-        </div>
-
-        <div class="platform-info">
-          <p><strong>Platform:</strong> {{ cudaStatus?.platform?.[0] || 'Unknown' }} / {{ cudaStatus?.platform?.[1] || 'Unknown' }}</p>
-          <p class="warning-text">
-            <i class="pi pi-info-circle"></i>
-            <span v-if="cudaStatus?.platform?.[0] === 'linux'">
-              Linux installation may require sudo privileges. If installation fails, you may need to install CUDA manually.
-            </span>
-            <span v-else>
-              The installer will run in silent mode. Please ensure no other CUDA installations are in progress.
-            </span>
-          </p>
-        </div>
-
-        <div v-if="cudaInstallProgress" class="install-progress">
-          <ProgressBar :value="cudaInstallProgress.progress || 0" />
-          <p class="progress-message">{{ cudaInstallProgress.message || 'Preparing installation...' }}</p>
-        </div>
-
-        <div v-if="cudaInstallLogs.length > 0" class="install-logs">
-          <h5>Installation Logs</h5>
-          <LogViewer 
-            :logs="cudaInstallLogs" 
-            mode="raw"
-            :show-header="false"
-            :compact="true"
-            max-height="300px"
-          />
-        </div>
-      </div>
-    </div>
-
-    <template #footer>
-      <Button 
-        label="Close" 
-        icon="pi pi-times" 
-        @click="$emit('update:visible', false)"
-        severity="secondary"
-        text
-      />
-      <Button 
-        v-if="!cudaStatus?.installed && !installing"
-        label="Install CUDA" 
-        icon="pi pi-download"
-        @click="handleInstall"
-        :disabled="!selectedCudaVersion || cudaStatus?.operation"
-        :loading="installing"
-      />
-    </template>
-  </BaseDialog>
-</template>
-
-<script setup>
-import { ref, watch } from 'vue'
-import LogViewer from '@/components/common/LogViewer.vue'
-import { useSystemStore } from '@/stores/system'
-import { toast } from 'vue3-toastify'
-import { formatDate } from '@/utils/formatting'
-import Button from 'primevue/button'
-import Dropdown from 'primevue/dropdown'
-import ProgressBar from 'primevue/progressbar'
-import BaseDialog from '@/components/common/BaseDialog.vue'
-
-const props = defineProps({
-  visible: {
-    type: Boolean,
-    default: false
-  },
-  cudaStatus: {
-    type: Object,
-    default: null
-  },
-  cudaInstallProgress: {
-    type: Object,
-    default: null
-  },
-  cudaInstallLogs: {
-    type: Array,
-    default: () => []
-  }
-})
-
-const emit = defineEmits(['update:visible', 'install', 'status-update', 'progress', 'log'])
-
-const systemStore = useSystemStore()
-
-const selectedCudaVersion = ref('13.0')
-const installing = ref(false)
-
-const handleInstall = async () => {
-  if (!selectedCudaVersion.value) {
-    toast.error('Please select a CUDA version')
-    return
-  }
-
-  installing.value = true
-  
-  try {
-    await systemStore.installCuda(selectedCudaVersion.value)
-    toast.success(`CUDA ${selectedCudaVersion.value} installation started`)
-    emit('install', selectedCudaVersion.value)
-  } catch (error) {
-    const errorMsg = error.response?.data?.detail || error.message || 'Failed to start CUDA installation'
-    toast.error(errorMsg)
-  } finally {
-    installing.value = false
-  }
-}
-
-watch(() => props.cudaStatus, (status) => {
-  if (status?.installed) {
-    installing.value = false
-  }
-}, { deep: true })
-</script>
-
-<style scoped>
-.cuda-install-dialog {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-lg);
-}
-
-.cuda-status-section {
-  margin-bottom: var(--spacing-md);
-}
-
-.status-card {
-  border: 1px solid var(--border-primary);
-  border-radius: var(--radius-lg);
-  padding: var(--spacing-lg);
-  background: var(--bg-card);
-}
-
-.status-card.installed {
-  border-color: var(--status-success);
-  background: var(--status-success-soft);
-}
-
-.status-header {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-md);
-  margin-bottom: var(--spacing-md);
-}
-
-.status-header i {
-  font-size: 1.5rem;
-}
-
-.status-header h4 {
-  margin: 0;
-  font-size: 1.125rem;
-}
-
-.status-details {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-sm);
-}
-
-.status-details p {
-  margin: 0;
-  font-size: 0.875rem;
-}
-
-.install-section {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-md);
-}
-
-.form-field {
-  display: flex;
-  flex-direction: column;
-  gap: 0.5rem;
-}
-
-.form-field label {
-  font-weight: 600;
-  color: var(--text-primary);
-  font-size: 0.9rem;
-}
-
-.form-field small {
-  color: var(--text-secondary);
-  font-size: 0.8rem;
-}
-
-.platform-info {
-  padding: var(--spacing-md);
-  background: var(--status-info-soft);
-  border-radius: var(--radius-md);
-  border-left: 3px solid var(--accent-blue);
-}
-
-.platform-info p {
-  margin: 0.25rem 0;
-  font-size: 0.875rem;
-}
-
-.warning-text {
-  display: flex;
-  align-items: flex-start;
-  gap: var(--spacing-sm);
-  color: var(--text-secondary);
-}
-
-.warning-text i {
-  color: var(--accent-blue);
-  margin-top: 0.125rem;
-}
-
-.install-progress {
-  margin: var(--spacing-md) 0;
-}
-
-.progress-message {
-  margin-top: var(--spacing-sm);
-  font-size: 0.875rem;
-  color: var(--text-secondary);
-}
-
-.install-logs {
-  margin-top: var(--spacing-md);
-}
-
-.install-logs h5 {
-  margin: 0 0 var(--spacing-sm) 0;
-  font-size: 0.875rem;
-  font-weight: 600;
-}
-
-</style>
-
diff --git a/frontend/src/components/system/LlamaCppManager/ReleaseDialog.vue b/frontend/src/components/system/LlamaCppManager/ReleaseDialog.vue
deleted file mode 100644
index 264729e..0000000
--- a/frontend/src/components/system/LlamaCppManager/ReleaseDialog.vue
+++ /dev/null
@@ -1,419 +0,0 @@
-<template>
-  <BaseDialog
-    :visible="visible"
-    :header="selectedReleaseTag ? `Install Release ${selectedReleaseTag}` : 'Install Release'"
-    :modal="true"
-    :dialog-style="{ width: '60vw', maxWidth: '750px' }"
-    :draggable="false"
-    :resizable="false"
-    dialog-class="release-dialog"
-    @update:visible="handleVisibleChange"
-    @hide="handleHide"
-  >
-    <div class="release-dialog-body">
-      <div v-if="loading" class="release-assets-loading">
-        <ProgressSpinner style="width: 48px; height: 48px" strokeWidth="4" />
-        <span>Loading release artifacts…</span>
-      </div>
-      
-      <div v-else-if="error" class="release-assets-error">
-        <i class="pi pi-exclamation-triangle"></i>
-        <p>{{ error }}</p>
-        <div v-if="skippedAssets.length" class="skipped-artifacts">
-          <h5>Filtered Out</h5>
-          <ul>
-            <li v-for="asset in skippedAssets" :key="asset.id || asset.name">
-              <span class="skipped-name">{{ asset.name }}</span>
-              <span class="skipped-reason">{{ asset.compatibility_reason || 'Incompatible with container' }}</span>
-            </li>
-          </ul>
-        </div>
-      </div>
-      
-      <div v-else class="release-asset-list">
-        <div 
-          v-for="asset in assets" 
-          :key="asset.id" 
-          :class="['release-asset-option', { selected: selectedAssetId === asset.id }]"
-          @click="selectedAssetId = asset.id"
-        >
-          <div class="asset-option-header">
-            <RadioButton 
-              :inputId="`release-asset-${asset.id}`"
-              :value="asset.id"
-              v-model="selectedAssetId"
-            />
-            <label :for="`release-asset-${asset.id}`" class="asset-label">
-              <span class="asset-name">{{ asset.name }}</span>
-            </label>
-            <span class="asset-size">{{ formatBytes(asset.size) }}</span>
-          </div>
-          <div v-if="asset.features && asset.features.length" class="asset-features">
-            <Tag 
-              v-for="feature in asset.features"
-              :key="feature"
-              severity="info"
-              class="asset-feature-tag"
-            >
-              {{ feature }}
-            </Tag>
-          </div>
-          <div class="asset-meta">
-            <span class="archive-type">{{ (asset.archive_type || '').toUpperCase() }}</span>
-            <span 
-              v-if="asset.download_count !== undefined && asset.download_count !== null" 
-              class="download-count"
-            >
-              {{ asset.download_count }} downloads
-            </span>
-          </div>
-        </div>
-        
-        <div v-if="skippedAssets.length" class="skipped-artifacts">
-          <h5>Filtered Out</h5>
-          <ul>
-            <li v-for="asset in skippedAssets" :key="asset.id || asset.name">
-              <span class="skipped-name">{{ asset.name }}</span>
-              <span class="skipped-reason">{{ asset.compatibility_reason || 'Incompatible with container' }}</span>
-            </li>
-          </ul>
-        </div>
-      </div>
-    </div>
-    
-    <template #footer>
-      <Button 
-        label="Cancel" 
-        icon="pi pi-times" 
-        @click="handleCancel"
-        severity="secondary"
-        text
-      />
-      <Button 
-        label="Install" 
-        icon="pi pi-download" 
-        @click="handleInstall"
-        :loading="installing"
-        :disabled="!selectedAssetId || assets.length === 0 || loading"
-      />
-    </template>
-  </BaseDialog>
-</template>
-
-<script setup>
-import { ref, computed, watch } from 'vue'
-import { useSystemStore } from '@/stores/system'
-import { toast } from 'vue3-toastify'
-import { formatBytes } from '@/utils/formatting'
-import Button from 'primevue/button'
-import Tag from 'primevue/tag'
-import RadioButton from 'primevue/radiobutton'
-import ProgressSpinner from 'primevue/progressspinner'
-import BaseDialog from '@/components/common/BaseDialog.vue'
-
-const props = defineProps({
-  visible: {
-    type: Boolean,
-    default: false
-  },
-  releaseTag: {
-    type: String,
-    default: null
-  }
-})
-
-const emit = defineEmits(['update:visible', 'installed', 'hide'])
-
-const systemStore = useSystemStore()
-
-const loading = ref(false)
-const error = ref(null)
-const assets = ref([])
-const skippedAssets = ref([])
-const selectedAssetId = ref(null)
-const installing = ref(false)
-
-const selectedAsset = computed(() => {
-  return assets.value.find(asset => asset.id === selectedAssetId.value) || null
-})
-
-const resetState = () => {
-  assets.value = []
-  skippedAssets.value = []
-  error.value = null
-  loading.value = false
-  selectedAssetId.value = null
-}
-
-const loadReleaseAssets = async (tagName) => {
-  if (!tagName) return
-  
-  resetState()
-  loading.value = true
-  
-  try {
-    const data = await systemStore.fetchReleaseAssets(tagName)
-    assets.value = (data?.assets || []).map(asset => ({
-      ...asset,
-      id: asset.id !== undefined && asset.id !== null ? Number(asset.id) : asset.id
-    }))
-    skippedAssets.value = (data?.skipped_assets || []).map(asset => ({
-      ...asset,
-      id: asset.id !== undefined && asset.id !== null ? Number(asset.id) : asset.id
-    }))
-    
-    if (assets.value.length === 0) {
-      error.value = 'No compatible artifacts were found for this release in the current container.'
-    } else {
-      const defaultId = data?.default_asset_id
-      if (defaultId !== undefined && defaultId !== null) {
-        selectedAssetId.value = Number(defaultId)
-      } else {
-        selectedAssetId.value = assets.value[0]?.id ?? null
-      }
-    }
-  } catch (err) {
-    if (err.response?.data?.detail) {
-      error.value = err.response.data.detail
-    } else if (err.message) {
-      error.value = err.message
-    } else {
-      error.value = 'Failed to load release artifacts.'
-    }
-  } finally {
-    loading.value = false
-  }
-}
-
-const handleVisibleChange = (value) => {
-  emit('update:visible', value)
-  if (value && props.releaseTag) {
-    loadReleaseAssets(props.releaseTag)
-  }
-}
-
-const handleHide = () => {
-  resetState()
-  emit('hide')
-}
-
-const handleCancel = () => {
-  emit('update:visible', false)
-}
-
-const handleInstall = async () => {
-  const tagName = props.releaseTag
-  const asset = selectedAsset.value
-  
-  if (!tagName) return
-  if (!asset) {
-    toast.error('Please select an artifact to install.')
-    return
-  }
-  
-  installing.value = true
-  let installSucceeded = false
-  
-  try {
-    await systemStore.installRelease(tagName, asset.id)
-    installSucceeded = true
-    const assetLabel = asset.name ? ` (${asset.name})` : ''
-    toast.success(`Installing release ${tagName}${assetLabel}`)
-    await systemStore.fetchLlamaVersions()
-    emit('installed', { tagName, asset })
-  } catch (err) {
-    let errorMessage = 'Failed to install release'
-    let detail = err.response?.data?.detail
-    if (typeof detail === 'string') {
-      const trimmedDetail = detail.startsWith('400:') ? detail.substring(4).trim() : detail
-      if (trimmedDetail.toLowerCase().includes('version already installed')) {
-        errorMessage = 'That release artifact is already installed. Select a different artifact or remove the existing installation first.'
-      } else if (trimmedDetail.length > 0) {
-        errorMessage = trimmedDetail
-      }
-    } else if (detail) {
-      errorMessage = detail
-    } else if (err.message) {
-      errorMessage = err.message
-    }
-    
-    toast.error(errorMessage)
-  } finally {
-    installing.value = false
-    if (installSucceeded) {
-      emit('update:visible', false)
-      resetState()
-    }
-  }
-}
-
-watch(() => props.releaseTag, (newTag) => {
-  if (props.visible && newTag) {
-    loadReleaseAssets(newTag)
-  }
-})
-
-watch(() => props.visible, (newVisible) => {
-  if (newVisible && props.releaseTag) {
-    loadReleaseAssets(props.releaseTag)
-  } else if (!newVisible) {
-    resetState()
-  }
-})
-</script>
-
-<style scoped>
-.release-dialog :deep(.p-dialog-content) {
-  padding-top: 0;
-}
-
-.release-dialog-body {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-lg);
-}
-
-.release-assets-loading {
-  display: flex;
-  align-items: center;
-  justify-content: center;
-  gap: var(--spacing-md);
-  padding: var(--spacing-2xl) 0;
-  color: var(--text-secondary);
-}
-
-.release-assets-error {
-  text-align: center;
-  padding: var(--spacing-xl) var(--spacing-lg);
-  color: var(--text-secondary);
-}
-
-.release-assets-error i {
-  font-size: 1.5rem;
-  color: var(--status-warning);
-  margin-bottom: var(--spacing-sm);
-  display: block;
-}
-
-.release-assets-error p {
-  margin: 0;
-}
-
-.release-asset-list {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-md);
-  max-height: 60vh;
-  overflow-y: auto;
-  padding-right: 0.5rem;
-}
-
-.release-asset-option {
-  border: 1px solid var(--border-primary);
-  border-radius: var(--radius-lg);
-  padding: var(--spacing-lg);
-  background: var(--bg-card);
-  transition: border-color var(--transition-fast), box-shadow var(--transition-fast), transform var(--transition-fast);
-  cursor: pointer;
-}
-
-.release-asset-option:hover {
-  border-color: var(--primary-color);
-  box-shadow: var(--shadow-sm);
-  transform: translateY(-1px);
-}
-
-.release-asset-option.selected {
-  border-color: var(--primary-color);
-  box-shadow: var(--shadow-sm);
-}
-
-.asset-option-header {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-md);
-  justify-content: space-between;
-}
-
-.asset-label {
-  flex: 1;
-  display: flex;
-  flex-direction: column;
-  gap: 0.25rem;
-}
-
-.asset-name {
-  font-weight: 600;
-  color: var(--text-primary);
-  word-break: break-word;
-}
-
-.asset-size {
-  font-size: 0.85rem;
-  color: var(--text-secondary);
-  white-space: nowrap;
-}
-
-.asset-features {
-  display: flex;
-  flex-wrap: wrap;
-  gap: 0.5rem;
-  margin-top: var(--spacing-md);
-}
-
-.asset-feature-tag {
-  font-size: 0.75rem;
-  font-weight: 600;
-}
-
-.asset-meta {
-  margin-top: var(--spacing-md);
-  display: flex;
-  gap: var(--spacing-lg);
-  font-size: 0.8rem;
-  color: var(--text-secondary);
-}
-
-.archive-type {
-  text-transform: uppercase;
-  letter-spacing: 0.05em;
-}
-
-.download-count {
-  color: var(--text-secondary);
-}
-
-.skipped-artifacts {
-  margin-top: var(--spacing-lg);
-  border-top: 1px solid var(--border-primary);
-  padding-top: var(--spacing-md);
-  text-align: left;
-}
-
-.skipped-artifacts h5 {
-  margin: 0 0 var(--spacing-sm);
-  font-size: 0.9rem;
-  font-weight: 600;
-  color: var(--text-secondary);
-}
-
-.skipped-artifacts ul {
-  margin: 0;
-  padding-left: 1rem;
-  display: flex;
-  flex-direction: column;
-  gap: 0.4rem;
-}
-
-.skipped-name {
-  font-weight: 600;
-  color: var(--text-secondary);
-}
-
-.skipped-reason {
-  display: block;
-  font-size: 0.8rem;
-  color: var(--text-secondary);
-}
-</style>
-
diff --git a/frontend/src/components/system/LlamaCppManager/UpdateInfo.vue b/frontend/src/components/system/LlamaCppManager/UpdateInfo.vue
deleted file mode 100644
index ddf817d..0000000
--- a/frontend/src/components/system/LlamaCppManager/UpdateInfo.vue
+++ /dev/null
@@ -1,135 +0,0 @@
-<template>
-  <div v-if="updateInfo" class="update-info">
-    <h3>Available Updates</h3>
-    <div class="update-cards">
-      <div class="update-card">
-        <div class="update-header">
-          <h4>Latest Release</h4>
-          <Tag :value="updateInfo.latest_release?.tag_name || 'N/A'" severity="info" />
-        </div>
-        <p v-if="updateInfo.latest_release">
-          Published: {{ formatDate(updateInfo.latest_release.published_at) }}
-        </p>
-        <Button 
-          label="Install Release"
-          icon="pi pi-download"
-          @click="$emit('install-release', updateInfo.latest_release.tag_name)"
-          :loading="installingRelease"
-          :disabled="!updateInfo.latest_release"
-        />
-      </div>
-      
-      <div class="update-card">
-        <div class="update-header">
-          <h4>Latest Source</h4>
-          <Tag :value="updateInfo.latest_commit?.sha?.substring(0, 8) || 'N/A'" severity="success" />
-        </div>
-        <p v-if="updateInfo.latest_commit">
-          {{ updateInfo.latest_commit.message }}
-        </p>
-        <Button 
-          label="Build from Source"
-          icon="pi pi-code"
-          @click="$emit('build-source')"
-          :loading="buildingSource"
-        />
-      </div>
-    </div>
-  </div>
-</template>
-
-<script setup>
-import { formatDate } from '@/utils/formatting'
-import Button from 'primevue/button'
-import Tag from 'primevue/tag'
-
-defineProps({
-  updateInfo: {
-    type: Object,
-    default: null
-  },
-  installingRelease: {
-    type: Boolean,
-    default: false
-  },
-  buildingSource: {
-    type: Boolean,
-    default: false
-  }
-})
-
-defineEmits(['install-release', 'build-source'])
-</script>
-
-<style scoped>
-.update-info {
-  margin-bottom: 2rem;
-}
-
-.update-cards {
-  display: grid;
-  grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
-  gap: 1rem;
-  margin-top: 1rem;
-}
-
-.update-card {
-  background: var(--gradient-card);
-  border: 1px solid var(--border-primary);
-  border-radius: var(--radius-xl);
-  padding: var(--spacing-xl);
-  box-shadow: var(--shadow-md);
-  transition: all var(--transition-normal);
-  position: relative;
-  overflow: hidden;
-}
-
-.update-card::before {
-  content: '';
-  position: absolute;
-  top: 0;
-  left: 0;
-  right: 0;
-  height: 3px;
-  background: var(--gradient-primary);
-  opacity: 0;
-  transition: opacity var(--transition-normal);
-}
-
-.update-card:hover {
-  transform: translateY(-3px);
-  box-shadow: var(--shadow-lg);
-}
-
-.update-card:hover::before {
-  opacity: 1;
-}
-
-.update-header {
-  display: flex;
-  justify-content: space-between;
-  align-items: center;
-  margin-bottom: 0.5rem;
-}
-
-.update-header h4 {
-  margin: 0;
-  color: var(--text-primary);
-  font-weight: 700;
-  font-size: 1.1rem;
-}
-
-.update-card p {
-  margin: var(--spacing-md) 0 var(--spacing-lg);
-  font-size: 0.9rem;
-  color: var(--text-secondary);
-  line-height: 1.5;
-}
-
-@media (max-width: 768px) {
-  .update-cards {
-    grid-template-columns: 1fr;
-  }
-}
-</style>
-
diff --git a/frontend/src/components/system/LlamaCppManager/VersionCard.vue b/frontend/src/components/system/LlamaCppManager/VersionCard.vue
deleted file mode 100644
index 8e65aa2..0000000
--- a/frontend/src/components/system/LlamaCppManager/VersionCard.vue
+++ /dev/null
@@ -1,250 +0,0 @@
-<template>
-  <div 
-    class="version-card"
-    :class="{ 'active-version': version.is_active }"
-  >
-    <div class="version-header">
-      <div class="version-info">
-        <div class="version-title">
-          <h4>{{ version.version }}</h4>
-          <div v-if="version.is_active" class="active-indicators">
-            <Tag 
-              value="ACTIVE" 
-              severity="success"
-              class="active-badge"
-            />
-            <i class="pi pi-check-circle active-icon"></i>
-          </div>
-        </div>
-        <div class="version-meta">
-          <Tag 
-            :value="version.install_type" 
-            :severity="getInstallTypeSeverity(version.install_type)"
-          />
-          <Tag 
-            v-if="version.repository_source && version.repository_source !== 'llama.cpp'"
-            :value="version.repository_source" 
-            severity="warning"
-            class="repository-badge"
-          />
-          <span class="install-date">
-            Installed: {{ formatDate(version.installed_at) }}
-          </span>
-        </div>
-      </div>
-      <div class="version-actions">
-        <Button 
-          v-if="!version.is_active"
-          icon="pi pi-check"
-          @click="$emit('activate', version.id)"
-          severity="success"
-          size="small"
-          text
-          :loading="activating === version.id"
-        />
-        <Button 
-          icon="pi pi-trash"
-          severity="danger"
-          outlined
-          @click="$emit('delete', version)"
-          :disabled="version.is_active"
-        />
-      </div>
-    </div>
-    
-    <div v-if="version.source_commit" class="version-details">
-      <p><strong>Commit:</strong> {{ version.source_commit }}</p>
-    </div>
-    
-    <div v-if="version.patches && version.patches.length > 0" class="version-patches">
-      <p><strong>Patches Applied:</strong></p>
-      <ul>
-        <li v-for="patch in version.patches" :key="patch">
-          <a :href="patch" target="_blank">{{ patch }}</a>
-        </li>
-      </ul>
-    </div>
-  </div>
-</template>
-
-<script setup>
-import { formatDate } from '@/utils/formatting'
-import Button from 'primevue/button'
-import Tag from 'primevue/tag'
-
-defineProps({
-  version: {
-    type: Object,
-    required: true
-  },
-  activating: {
-    type: [String, Number],
-    default: null
-  }
-})
-
-defineEmits(['activate', 'delete'])
-
-const getInstallTypeSeverity = (type) => {
-  switch (type) {
-    case 'release': return 'info'
-    case 'source': return 'success'
-    case 'patched': return 'warning'
-    default: return 'secondary'
-  }
-}
-</script>
-
-<style scoped>
-.version-card {
-  background: var(--gradient-card);
-  border: 1px solid var(--border-primary);
-  border-radius: var(--radius-xl);
-  padding: var(--spacing-xl);
-  box-shadow: var(--shadow-md);
-  transition: all var(--transition-normal);
-  position: relative;
-  overflow: hidden;
-  backdrop-filter: blur(10px);
-  animation: fadeIn 0.6s ease-out;
-}
-
-.version-card::before {
-  content: '';
-  position: absolute;
-  top: 0;
-  left: 0;
-  right: 0;
-  height: 3px;
-  background: var(--gradient-primary);
-  opacity: 0;
-  transition: opacity var(--transition-normal);
-}
-
-.version-card:hover {
-  transform: translateY(-5px) scale(1.02);
-  box-shadow: var(--shadow-lg), var(--glow-primary);
-}
-
-.version-card:hover::before {
-  opacity: 1;
-}
-
-.version-card.active-version {
-  background: var(--gradient-card);
-  border: 2px solid var(--status-success);
-  box-shadow: var(--shadow-lg), var(--glow-success);
-}
-
-.version-card.active-version::before {
-  background: var(--gradient-success);
-  opacity: 1;
-}
-
-.version-card.active-version:hover {
-  box-shadow: var(--shadow-xl), var(--glow-success);
-}
-
-.version-header {
-  display: flex;
-  justify-content: space-between;
-  align-items: flex-start;
-  margin-bottom: 0.5rem;
-}
-
-.version-title {
-  display: flex;
-  align-items: center;
-  gap: 0.75rem;
-  margin-bottom: 0.5rem;
-}
-
-.version-title h4 {
-  margin: 0;
-  color: var(--text-primary);
-  font-weight: 700;
-  font-size: 1.2rem;
-}
-
-.active-indicators {
-  display: flex;
-  align-items: center;
-  gap: 0.5rem;
-}
-
-.active-badge {
-  font-size: 0.75rem;
-  font-weight: 600;
-  animation: pulse 2s infinite;
-}
-
-.active-icon {
-  color: var(--status-success);
-  font-size: 1.25rem;
-  animation: pulse 2s infinite;
-}
-
-@keyframes pulse {
-  0%, 100% {
-    opacity: 1;
-  }
-  50% {
-    opacity: 0.7;
-  }
-}
-
-.version-meta {
-  display: flex;
-  align-items: center;
-  gap: 1rem;
-}
-
-.install-date {
-  font-size: 0.9rem;
-  color: var(--text-secondary);
-  font-weight: 500;
-}
-
-.version-details,
-.version-patches {
-  margin-top: var(--spacing-md);
-  font-size: 0.9rem;
-  color: var(--text-secondary);
-  line-height: 1.5;
-}
-
-.version-patches ul {
-  margin: 0.5rem 0 0 1rem;
-}
-
-.version-patches a {
-  color: var(--accent-cyan);
-  text-decoration: none;
-  font-weight: 500;
-}
-
-.version-patches a:hover {
-  text-decoration: underline;
-  color: var(--accent-blue);
-}
-
-.version-actions {
-  display: flex;
-  gap: var(--spacing-sm);
-  align-items: center;
-}
-
-@media (max-width: 768px) {
-  .version-header {
-    flex-direction: column;
-    gap: 1rem;
-  }
-  
-  .version-meta {
-    flex-direction: column;
-    align-items: flex-start;
-    gap: 0.5rem;
-  }
-}
-</style>
-
diff --git a/frontend/src/components/system/LlamaCppManager/VersionList.vue b/frontend/src/components/system/LlamaCppManager/VersionList.vue
deleted file mode 100644
index 3958328..0000000
--- a/frontend/src/components/system/LlamaCppManager/VersionList.vue
+++ /dev/null
@@ -1,52 +0,0 @@
-<template>
-  <div class="installed-versions">
-    <h3>Installed Versions</h3>
-    <div v-if="versions.length === 0" class="empty-state">
-      <i class="pi pi-code" style="font-size: 3rem; color: var(--text-secondary);"></i>
-      <h4>No Versions Installed</h4>
-      <p>Install a release or build from source to get started.</p>
-    </div>
-    
-    <div v-else class="version-list">
-      <VersionCard
-        v-for="version in versions"
-        :key="version.id"
-        :version="version"
-        :activating="activating"
-        @activate="$emit('activate', $event)"
-        @delete="$emit('delete', $event)"
-      />
-    </div>
-  </div>
-</template>
-
-<script setup>
-import VersionCard from './VersionCard.vue'
-
-defineProps({
-  versions: {
-    type: Array,
-    default: () => []
-  },
-  activating: {
-    type: [String, Number],
-    default: null
-  }
-})
-
-defineEmits(['activate', 'delete'])
-</script>
-
-<style scoped>
-.installed-versions {
-  margin-top: 2rem;
-}
-
-.version-list {
-  display: flex;
-  flex-direction: column;
-  gap: 1rem;
-  margin-top: 1rem;
-}
-</style>
-
diff --git a/frontend/src/components/system/LlamaCppTab.vue b/frontend/src/components/system/LlamaCppTab.vue
deleted file mode 100644
index 018e933..0000000
--- a/frontend/src/components/system/LlamaCppTab.vue
+++ /dev/null
@@ -1,7 +0,0 @@
-<template>
-  <LlamaCppManager />
-</template>
-
-<script setup>
-import LlamaCppManager from '@/views/LlamaCppManager.vue'
-</script>
diff --git a/frontend/src/components/system/SystemTab.vue b/frontend/src/components/system/SystemTab.vue
deleted file mode 100644
index d876c46..0000000
--- a/frontend/src/components/system/SystemTab.vue
+++ /dev/null
@@ -1,748 +0,0 @@
-<template>
-  <div class="system-status">
-    <div class="card">
-      <div class="card-header">
-        <h2 class="card-title">System Status</h2>
-        <div class="header-actions">
-          <div class="connection-info">
-            <div class="live-indicator" v-if="wsStore.isConnected">
-              <i class="pi pi-circle-fill" style="color: #22d3ee; font-size: 0.5rem;"></i>
-              <span>Live</span>
-            </div>
-            <div class="connection-status" v-else>
-              <i class="pi pi-circle" style="color: #ef4444; font-size: 0.5rem;"></i>
-              <span>{{ wsStore.connectionStatus }}</span>
-            </div>
-          </div>
-          <Button 
-            icon="pi pi-refresh" 
-            @click="refreshStatus"
-            :loading="systemStore.loading"
-            severity="secondary"
-            text
-          />
-        </div>
-      </div>
-
-      <!-- System Overview -->
-      <div class="system-overview">
-        <div class="overview-grid">
-          <div class="overview-card">
-            <div class="overview-header">
-              <span style="color: #22d3ee; font-size: 1.75rem; font-weight: bold;">🖥️</span>
-              <h3>CPU</h3>
-            </div>
-            <div class="overview-content">
-              <div class="metric">
-                <span class="metric-label">Usage</span>
-                <span class="metric-value">{{ (systemStore.systemStatus.system?.cpu_percent || 0).toFixed(1) }}%</span>
-              </div>
-              <ProgressBar :value="systemStore.systemStatus.system?.cpu_percent || 0" />
-            </div>
-          </div>
-
-          <div class="overview-card">
-            <div class="overview-header">
-              <span style="color: #22d3ee; font-size: 1.75rem; font-weight: bold;">💾</span>
-              <h3>Memory</h3>
-            </div>
-            <div class="overview-content">
-              <div class="metric">
-                <span class="metric-label">Usage</span>
-                <span class="metric-value">{{ (systemStore.systemStatus.system?.memory?.percent || 0).toFixed(1) }}%</span>
-              </div>
-              <div class="metric">
-                <span class="metric-label">Available</span>
-                <span class="metric-value">{{ formatFileSize(systemStore.systemStatus.system?.memory?.available || 0) }}</span>
-              </div>
-              <ProgressBar :value="systemStore.systemStatus.system?.memory?.percent || 0" />
-            </div>
-          </div>
-
-          <div class="overview-card">
-            <div class="overview-header">
-              <span style="color: #22d3ee; font-size: 1.75rem; font-weight: bold;">💿</span>
-              <h3>Storage</h3>
-            </div>
-            <div class="overview-content">
-              <div class="metric">
-                <span class="metric-label">Usage</span>
-                <span class="metric-value">{{ (systemStore.systemStatus.system?.disk?.percent || 0).toFixed(1) }}%</span>
-              </div>
-              <div class="metric">
-                <span class="metric-label">Free</span>
-                <span class="metric-value">{{ formatFileSize(systemStore.systemStatus.system?.disk?.free || 0) }}</span>
-              </div>
-              <ProgressBar :value="systemStore.systemStatus.system?.disk?.percent || 0" />
-            </div>
-          </div>
-
-          <div class="overview-card">
-            <div class="overview-header">
-              <span style="color: #22d3ee; font-size: 1.75rem; font-weight: bold;">🎮</span>
-              <h3>GPU</h3>
-            </div>
-            <div class="overview-content">
-              <div class="metric">
-                <span class="metric-label">Count</span>
-                <span class="metric-value">{{ systemStore.gpuInfo.device_count || 0 }}</span>
-              </div>
-              <div class="metric">
-                <span class="metric-label">Total VRAM</span>
-                <span class="metric-value">{{ formatFileSize(systemStore.gpuInfo.total_vram || 0) }}</span>
-              </div>
-              <div class="metric">
-                <span class="metric-label">Available</span>
-                <span class="metric-value">{{ formatFileSize(systemStore.gpuInfo.available_vram || 0) }}</span>
-              </div>
-              <div v-if="systemStore.gpuInfo.nvlink_topology?.has_nvlink" class="metric">
-                <span class="metric-label">NVLink</span>
-                <span class="metric-value">{{ systemStore.gpuInfo.nvlink_topology.recommended_strategy }}</span>
-              </div>
-            </div>
-          </div>
-        </div>
-      </div>
-
-      <!-- GPU Details -->
-      <div v-if="systemStore.gpuInfo.gpus && systemStore.gpuInfo.gpus.length > 0" class="gpu-details">
-        <h3>GPU Details</h3>
-        <div class="gpu-list">
-          <div 
-            v-for="gpu in systemStore.gpuInfo.gpus" 
-            :key="gpu.index"
-            class="gpu-card"
-          >
-            <div class="gpu-header">
-              <h4>GPU {{ gpu.index }}: {{ gpu.name }}</h4>
-              <div class="gpu-status">
-                <span 
-                  :class="['status-indicator', gpu.utilization?.gpu ? 'status-running' : 'status-stopped']"
-                >
-                  <i :class="gpu.utilization?.gpu ? 'pi pi-play' : 'pi pi-pause'"></i>
-                  {{ gpu.utilization?.gpu ? `${gpu.utilization.gpu}%` : 'Idle' }}
-                </span>
-              </div>
-            </div>
-            
-            <div class="gpu-metrics">
-              <div class="metric-row">
-                <span class="metric-label">Memory Usage</span>
-                <div class="metric-bar">
-                  <ProgressBar 
-                    :value="(gpu.memory.used / gpu.memory.total) * 100"
-                    :showValue="false"
-                  />
-                  <span class="metric-text">
-                    {{ formatFileSize(gpu.memory.used) }} / {{ formatFileSize(gpu.memory.total) }}
-                  </span>
-                </div>
-              </div>
-              
-              <div class="metric-row">
-                <span class="metric-label">Compute Capability</span>
-                <span class="metric-value">{{ gpu.compute_capability }}</span>
-              </div>
-              
-              <div v-if="gpu.nvlink && gpu.nvlink.connections.length > 0" class="metric-row">
-                <span class="metric-label">NVLink</span>
-                <div class="nvlink-info">
-                  <span class="nvlink-version">v{{ gpu.nvlink.nvlink_version }}</span>
-                  <span class="nvlink-bandwidth">{{ gpu.nvlink.total_bandwidth }} GB/s</span>
-                  <span class="nvlink-connections">{{ gpu.nvlink.connections.length }} links</span>
-                </div>
-              </div>
-              
-              <div v-if="gpu.temperature" class="metric-row">
-                <span class="metric-label">Temperature</span>
-                <span class="metric-value">{{ gpu.temperature }}°C</span>
-              </div>
-            </div>
-          </div>
-        </div>
-      </div>
-
-      <!-- Proxy Status -->
-      <div v-if="systemStore.systemStatus.proxy_status" class="proxy-status">
-        <h3>Multi-Model Proxy</h3>
-        <div class="proxy-card">
-          <div class="proxy-header">
-            <i class="pi pi-share-alt"></i>
-            <h4>llama-swap Proxy</h4>
-            <span class="status-indicator status-running">
-              <i class="pi pi-check"></i>
-              Active
-            </span>
-          </div>
-          <div class="proxy-details">
-            <div class="detail-row">
-              <span class="detail-label">Port:</span>
-              <span class="detail-value">{{ systemStore.systemStatus.proxy_status.port }}</span>
-            </div>
-            <div class="detail-row">
-              <span class="detail-label">API Endpoint:</span>
-              <span class="detail-value">{{ systemStore.systemStatus.proxy_status.endpoint }}</span>
-            </div>
-            <div class="detail-row">
-              <span class="detail-label">Models Available:</span>
-              <span class="detail-value">{{ sortedRunningInstances.length }}</span>
-            </div>
-          </div>
-        </div>
-      </div>
-
-      <!-- Running Instances -->
-      <div v-if="sortedRunningInstances && sortedRunningInstances.length > 0" class="running-instances">
-        <h3>Running Models</h3>
-        <div class="instance-list">
-          <div 
-            v-for="instance in sortedRunningInstances" 
-            :key="instance.id"
-            class="instance-card"
-          >
-            <div class="instance-header">
-              <h4>{{ instance.proxy_model_name || `Model #${instance.model_id}` }}</h4>
-              <div class="instance-status">
-                <span class="status-indicator status-running">
-                  <i class="pi pi-play"></i>
-                  Running
-                </span>
-              </div>
-            </div>
-            
-            <div class="instance-metrics">
-              <div class="metric-row">
-                <span class="metric-label">Model ID</span>
-                <span class="metric-value">{{ instance.model_id }}</span>
-              </div>
-            </div>
-          </div>
-        </div>
-      </div>
-
-      <!-- Empty State -->
-      <div v-if="!sortedRunningInstances || sortedRunningInstances.length === 0" class="empty-instances">
-        <i class="pi pi-play-circle" style="font-size: 3rem; color: var(--text-secondary);"></i>
-        <h4>No Running Instances</h4>
-        <p>Start a model from the Model Library to see running instances here.</p>
-      </div>
-    </div>
-
-    <!-- CUDA Toolkit Manager -->
-    <div class="card">
-      <CudaInstaller />
-    </div>
-  </div>
-</template>
-
-<script setup>
-import { onMounted, onUnmounted, computed } from 'vue'
-import { useSystemStore } from '@/stores/system'
-import { useWebSocketStore } from '@/stores/websocket'
-import { toast } from 'vue3-toastify'
-import Button from 'primevue/button'
-import ProgressBar from 'primevue/progressbar'
-import CudaInstaller from '@/components/system/CudaInstaller.vue'
-import { formatFileSize } from '@/utils/formatting'
-
-const systemStore = useSystemStore()
-const wsStore = useWebSocketStore()
-let unsubscribeUnifiedMonitoring = null
-
-// Computed property to sort running instances by model name (alphabetical)
-const sortedRunningInstances = computed(() => {
-  if (!systemStore.systemStatus.running_instances) return []
-  
-  return [...systemStore.systemStatus.running_instances].sort((a, b) => {
-    const nameA = (a.proxy_model_name || a.model_id || '').toLowerCase()
-    const nameB = (b.proxy_model_name || b.model_id || '').toLowerCase()
-    return nameA.localeCompare(nameB)
-  })
-})
-
-onMounted(() => {
-  refreshStatus()
-  
-  // Subscribe to unified monitoring updates for real-time status
-  unsubscribeUnifiedMonitoring = wsStore.subscribeToUnifiedMonitoring((data) => {
-    // Update system status with real-time data
-    if (data.system) {
-      systemStore.updateSystemStatus({
-        system: data.system,
-        proxy_status: data.proxy_status
-      })
-    }
-    
-    // Update running instances from both database and llama-swap
-    if (data.models) {
-      const runningInstances = data.models.running_instances || []
-      const llamaSwapModels = data.models.llama_swap_models || []
-      
-      // Combine both sources of running models
-      const allRunningInstances = [...runningInstances]
-      
-      // Add llama-swap models as additional instances
-      if (Array.isArray(llamaSwapModels)) {
-        llamaSwapModels.forEach(model => {
-          // Check if this model is already in running_instances
-          const exists = allRunningInstances.some(instance => 
-            instance.proxy_model_name === model.model || 
-            instance.model_id === model.model ||
-            instance.proxy_model_name === model.name
-          )
-          
-          if (!exists) {
-            allRunningInstances.push({
-              id: `llama-swap-${model.model}`,
-              model_id: model.model,
-              proxy_model_name: model.model,
-              port: 'N/A',
-              started_at: new Date().toISOString(),
-              source: 'llama-swap',
-              state: model.state
-            })
-          }
-        })
-      }
-      
-      systemStore.updateSystemStatus({
-        running_instances: allRunningInstances
-      })
-    }
-    
-    // Update GPU info if available
-    if (data.gpu) {
-      systemStore.updateGpuInfo(data.gpu)
-    }
-  })
-})
-
-onUnmounted(() => {
-  if (unsubscribeUnifiedMonitoring) {
-    unsubscribeUnifiedMonitoring()
-  }
-})
-
-const refreshStatus = async () => {
-  try {
-    await systemStore.fetchSystemStatus()
-  } catch (error) {
-    toast.error('Failed to refresh system status')
-  }
-}
-
-// formatFileSize is now imported from @/utils/formatting
-</script>
-
-<style scoped>
-.system-status {
-  max-width: 1400px;
-  margin: 0 auto;
-}
-
-.system-overview {
-  margin-bottom: 2rem;
-}
-
-.overview-grid {
-  display: grid;
-  grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
-  gap: 1rem;
-}
-
-.overview-card {
-  background: var(--gradient-card);
-  border: 1px solid var(--border-primary);
-  border-radius: var(--radius-xl);
-  padding: var(--spacing-xl);
-  box-shadow: var(--shadow-md);
-  transition: all var(--transition-normal);
-  position: relative;
-  overflow: hidden;
-  backdrop-filter: blur(10px);
-  animation: fadeIn 0.6s ease-out;
-}
-
-.overview-card::before {
-  content: '';
-  position: absolute;
-  top: 0;
-  left: 0;
-  right: 0;
-  height: 3px;
-  background: var(--gradient-primary);
-  opacity: 0;
-  transition: opacity var(--transition-normal);
-}
-
-.overview-card:hover {
-  transform: translateY(-5px) scale(1.02);
-  box-shadow: var(--shadow-lg), var(--glow-primary);
-}
-
-.overview-card:hover::before {
-  opacity: 1;
-}
-
-.overview-header {
-  display: flex;
-  align-items: center;
-  gap: 0.5rem;
-  margin-bottom: 1rem;
-}
-
-.overview-header i {
-  font-size: 1.75rem !important;
-  color: var(--accent-cyan) !important;
-  display: inline-block !important;
-  visibility: visible !important;
-  opacity: 1 !important;
-}
-
-.overview-header h3 {
-  margin: 0;
-  color: var(--text-primary);
-  font-weight: 700;
-  font-size: 1.2rem;
-}
-
-.overview-content {
-  display: flex;
-  flex-direction: column;
-  gap: 0.5rem;
-}
-
-.metric {
-  display: flex;
-  justify-content: space-between;
-  align-items: center;
-}
-
-.metric-label {
-  font-size: 0.9rem;
-  color: var(--text-secondary);
-  font-weight: 500;
-}
-
-.metric-value {
-  font-weight: 700;
-  color: var(--text-primary);
-  font-size: 1rem;
-}
-
-.gpu-details,
-.proxy-status {
-  margin-bottom: 2rem;
-}
-
-.proxy-card {
-  background: var(--gradient-card);
-  border: 1px solid var(--border-primary);
-  border-radius: var(--radius-xl);
-  padding: var(--spacing-xl);
-  box-shadow: var(--shadow-md);
-  transition: all var(--transition-normal);
-  position: relative;
-  overflow: hidden;
-}
-
-.proxy-card::before {
-  content: '';
-  position: absolute;
-  top: 0;
-  left: 0;
-  right: 0;
-  height: 3px;
-  background: var(--gradient-success);
-  opacity: 0;
-  transition: opacity var(--transition-normal);
-}
-
-.proxy-card:hover {
-  transform: translateY(-3px);
-  box-shadow: var(--shadow-lg);
-}
-
-.proxy-card:hover::before {
-  opacity: 1;
-}
-
-.proxy-header {
-  display: flex;
-  align-items: center;
-  gap: 0.5rem;
-  margin-bottom: 1rem;
-}
-
-.proxy-header i {
-  font-size: 1.75rem;
-  background: var(--gradient-success);
-  -webkit-background-clip: text;
-  -webkit-text-fill-color: transparent;
-  background-clip: text;
-}
-
-.proxy-header h4 {
-  margin: 0;
-  color: var(--text-primary);
-  font-weight: 700;
-  font-size: 1.2rem;
-}
-
-.proxy-details {
-  display: flex;
-  flex-direction: column;
-  gap: 0.5rem;
-}
-
-.detail-row {
-  display: flex;
-  justify-content: space-between;
-  align-items: center;
-}
-
-.detail-label {
-  font-weight: 600;
-  color: var(--text-secondary);
-  font-size: 0.9rem;
-}
-
-.detail-value {
-  font-weight: 600;
-  color: var(--text-primary);
-  font-family: 'Courier New', monospace;
-  background: var(--bg-surface);
-  padding: var(--spacing-xs) var(--spacing-sm);
-  border-radius: var(--radius-sm);
-  border: 1px solid var(--border-primary);
-}
-
-.running-instances {
-  margin-top: 2rem;
-}
-
-.gpu-list,
-.instance-list {
-  display: flex;
-  flex-direction: column;
-  gap: 1rem;
-  margin-top: 1rem;
-}
-
-.gpu-card,
-.instance-card {
-  background: var(--gradient-card);
-  border: 1px solid var(--border-primary);
-  border-radius: var(--radius-xl);
-  padding: var(--spacing-xl);
-  box-shadow: var(--shadow-md);
-  transition: all var(--transition-normal);
-  position: relative;
-  overflow: hidden;
-}
-
-.gpu-card::before,
-.instance-card::before {
-  content: '';
-  position: absolute;
-  top: 0;
-  left: 0;
-  right: 0;
-  height: 3px;
-  background: var(--gradient-primary);
-  opacity: 0;
-  transition: opacity var(--transition-normal);
-}
-
-.gpu-card:hover,
-.instance-card:hover {
-  transform: translateY(-3px);
-  box-shadow: var(--shadow-lg);
-}
-
-.gpu-card:hover::before,
-.instance-card:hover::before {
-  opacity: 1;
-}
-
-.gpu-header,
-.instance-header {
-  display: flex;
-  justify-content: space-between;
-  align-items: center;
-  margin-bottom: 1rem;
-}
-
-.gpu-header h4,
-.instance-header h4 {
-  margin: 0;
-  color: var(--text-primary);
-  font-weight: 700;
-  font-size: 1.1rem;
-}
-
-.gpu-metrics,
-.instance-metrics {
-  display: flex;
-  flex-direction: column;
-  gap: 0.75rem;
-}
-
-.metric-row {
-  display: flex;
-  justify-content: space-between;
-  align-items: center;
-}
-
-.metric-bar {
-  display: flex;
-  align-items: center;
-  gap: 0.5rem;
-  flex: 1;
-  margin-left: 1rem;
-}
-
-.metric-text {
-  font-size: 0.875rem;
-  color: var(--text-secondary);
-  min-width: 120px;
-  text-align: right;
-}
-
-.empty-instances {
-  text-align: center;
-  padding: var(--spacing-3xl) var(--spacing-xl);
-  color: var(--text-secondary);
-  background: var(--gradient-surface);
-  border-radius: var(--radius-xl);
-  border: 2px dashed var(--border-secondary);
-  margin: var(--spacing-xl) 0;
-  position: relative;
-  overflow: hidden;
-}
-
-.empty-instances::before {
-  content: '';
-  position: absolute;
-  top: 0;
-  left: 0;
-  right: 0;
-  height: 2px;
-  background: var(--gradient-primary);
-  opacity: 0.3;
-}
-
-.empty-instances i {
-  font-size: 3rem !important;
-  background: var(--gradient-primary);
-  -webkit-background-clip: text;
-  -webkit-text-fill-color: transparent;
-  background-clip: text;
-  margin-bottom: var(--spacing-lg);
-}
-
-.empty-instances h4 {
-  margin: var(--spacing-lg) 0 var(--spacing-md);
-  color: var(--text-primary);
-  font-weight: 700;
-  font-size: 1.3rem;
-}
-
-@media (max-width: 768px) {
-  .overview-grid {
-    grid-template-columns: 1fr;
-  }
-  
-  .gpu-header,
-  .instance-header {
-    flex-direction: column;
-    align-items: flex-start;
-    gap: 0.5rem;
-  }
-  
-  .metric-bar {
-    flex-direction: column;
-    align-items: flex-start;
-    margin-left: 0;
-    margin-top: 0.5rem;
-  }
-  
-  .metric-text {
-    text-align: left;
-    min-width: auto;
-  }
-}
-
-.nvlink-info {
-  display: flex;
-  flex-direction: column;
-  gap: 0.25rem;
-  font-size: 0.875rem;
-}
-
-.nvlink-version {
-  font-weight: 700;
-  color: var(--accent-cyan);
-}
-
-.nvlink-bandwidth {
-  color: var(--text-secondary);
-  font-weight: 500;
-}
-
-.nvlink-connections {
-  color: var(--text-secondary);
-  font-size: 0.8rem;
-  font-weight: 500;
-}
-
-.connection-info {
-  display: flex;
-  align-items: center;
-}
-
-.live-indicator,
-.connection-status {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-xs);
-  font-size: 0.875rem;
-  color: var(--text-secondary);
-  font-weight: 500;
-}
-
-.live-indicator i {
-  animation: pulse 2s infinite;
-}
-
-.connection-status {
-  color: #ef4444;
-}
-
-@keyframes pulse {
-  0% { opacity: 1; }
-  50% { opacity: 0.5; }
-  100% { opacity: 1; }
-}
-
-.status-indicator {
-  display: inline-flex;
-  align-items: center;
-  gap: 0.25rem;
-  padding: 0.25rem 0.5rem;
-  border-radius: var(--radius-sm);
-  font-size: 0.875rem;
-  font-weight: 600;
-}
-
-.status-indicator.status-running {
-  background: var(--green-50);
-  color: var(--green-700);
-}
-
-.status-indicator.status-stopped {
-  background: var(--gray-50);
-  color: var(--gray-700);
-}
-</style>
-
diff --git a/frontend/src/components/system/VersionTable.vue b/frontend/src/components/system/VersionTable.vue
new file mode 100644
index 0000000..f6974d4
--- /dev/null
+++ b/frontend/src/components/system/VersionTable.vue
@@ -0,0 +1,131 @@
+<template>
+  <div>
+    <div v-if="!versions.length" class="empty-state-mini">
+      <i class="pi pi-code" />
+      <span>No versions installed — use the buttons above to install one.</span>
+    </div>
+    <div v-else class="version-table">
+      <div
+        v-for="v in versions"
+        :key="v.id ?? v.version"
+        class="version-row"
+        :class="{ active: v.is_active }"
+      >
+        <div class="version-info">
+          <code class="version-name">{{ v.version }}</code>
+          <Tag v-if="v.is_active" value="Active" severity="success" />
+          <Tag :value="v.type || 'release'" severity="secondary" />
+          <small v-if="v.repository_source" class="repo-label">{{ v.repository_source }}</small>
+          <small v-if="v.build_config?.cuda" class="cuda-badge">CUDA</small>
+        </div>
+        <div class="version-actions">
+          <Button
+            v-if="!v.is_active"
+            label="Activate"
+            icon="pi pi-play"
+            size="small"
+            severity="success"
+            outlined
+            :loading="activating === (v.id ?? v.version)"
+            @click="$emit('activate', v.id ?? v.version)"
+          />
+          <Button
+            icon="pi pi-trash"
+            text
+            severity="danger"
+            size="small"
+            v-tooltip.top="'Delete version'"
+            @click="$emit('delete', v.id ?? v.version)"
+          />
+        </div>
+      </div>
+    </div>
+  </div>
+</template>
+
+<script setup>
+import Button from 'primevue/button'
+import Tag from 'primevue/tag'
+
+defineProps({
+  versions: {
+    type: Array,
+    default: () => [],
+  },
+  activating: {
+    type: [String, Number],
+    default: null,
+  },
+})
+
+defineEmits(['activate', 'delete'])
+</script>
+
+<style scoped>
+.empty-state-mini {
+  display: flex;
+  align-items: center;
+  gap: 0.5rem;
+  color: var(--text-secondary);
+  font-size: 0.875rem;
+  padding: 0.75rem 0;
+}
+
+.version-table {
+  display: flex;
+  flex-direction: column;
+  gap: 0.5rem;
+}
+
+.version-row {
+  display: flex;
+  justify-content: space-between;
+  align-items: center;
+  padding: 0.5rem 0.75rem;
+  background: var(--bg-surface);
+  border: 1px solid var(--border-primary);
+  border-radius: var(--radius-md);
+  gap: 0.5rem;
+  transition: border-color 0.15s;
+}
+
+.version-row.active {
+  border-color: var(--accent-green);
+}
+
+.version-info {
+  display: flex;
+  align-items: center;
+  gap: 0.5rem;
+  flex: 1;
+  min-width: 0;
+  flex-wrap: wrap;
+}
+
+.version-name {
+  font-weight: 600;
+  font-size: 0.875rem;
+  font-family: monospace;
+}
+
+.repo-label {
+  color: var(--text-secondary);
+  font-size: 0.75rem;
+}
+
+.cuda-badge {
+  background: rgba(34, 211, 238, 0.1);
+  color: var(--accent-cyan);
+  border: 1px solid rgba(34, 211, 238, 0.3);
+  border-radius: 0.25rem;
+  padding: 0.1em 0.4em;
+  font-size: 0.7rem;
+  font-weight: 600;
+}
+
+.version-actions {
+  display: flex;
+  gap: 0.25rem;
+  flex-shrink: 0;
+}
+</style>
diff --git a/frontend/src/main.js b/frontend/src/main.js
index 7d51b17..35cb65a 100644
--- a/frontend/src/main.js
+++ b/frontend/src/main.js
@@ -4,8 +4,6 @@ import PrimeVue from 'primevue/config'
 import ConfirmationService from 'primevue/confirmationservice'
 import ToastService from 'primevue/toastservice'
 import Tooltip from 'primevue/tooltip'
-import Toastify from 'vue3-toastify'
-import 'vue3-toastify/dist/index.css'
 
 import App from './App.vue'
 import router from './router'
@@ -25,11 +23,6 @@ app.use(router)
 app.use(PrimeVue)
 app.use(ConfirmationService)
 app.use(ToastService)
-app.use(Toastify, {
-  autoClose: 3000,
-  position: 'top-right',
-  theme: 'dark'
-})
 app.directive('tooltip', Tooltip)
 
 app.mount('#app')
diff --git a/frontend/src/router/index.js b/frontend/src/router/index.js
index da0a9ac..b39978f 100644
--- a/frontend/src/router/index.js
+++ b/frontend/src/router/index.js
@@ -2,7 +2,7 @@ import { createRouter, createWebHistory } from 'vue-router'
 import ModelLibrary from '@/views/ModelLibrary.vue'
 import ModelSearch from '@/views/ModelSearch.vue'
 import ModelConfig from '@/views/ModelConfig.vue'
-import System from '@/views/System.vue'
+import EnginesView from '@/views/EnginesView.vue'
 
 const routes = [
   {
@@ -27,8 +27,12 @@ const routes = [
   },
   {
     path: '/system',
-    name: 'system',
-    component: System
+    redirect: '/engines'
+  },
+  {
+    path: '/engines',
+    name: 'engines',
+    component: EnginesView
   }
 ]
 
diff --git a/frontend/src/stores/engines.js b/frontend/src/stores/engines.js
new file mode 100644
index 0000000..0642899
--- /dev/null
+++ b/frontend/src/stores/engines.js
@@ -0,0 +1,189 @@
+import { ref } from 'vue'
+import { defineStore } from 'pinia'
+import axios from 'axios'
+
+export const useEnginesStore = defineStore('engines', () => {
+  const llamaVersions = ref([])
+  const ikLlamaVersions = ref([])
+  const lmdeployStatus = ref({})
+  const cudaStatus = ref({})
+  const gpuInfo = ref({})
+  const systemStatus = ref({})
+  const loading = ref(false)
+
+  // --- llama.cpp versions ---
+
+  async function fetchLlamaVersions() {
+    const { data } = await axios.get('/api/llama-versions')
+    const all = Array.isArray(data) ? data : []
+    llamaVersions.value = all.filter(v => !v.repository_source || v.repository_source === 'llama.cpp')
+    ikLlamaVersions.value = all.filter(v => v.repository_source === 'ik_llama.cpp')
+  }
+
+  async function checkLlamaCppUpdates() {
+    const { data } = await axios.get('/api/llama-versions/check-updates')
+    return data
+  }
+
+  async function checkIkLlamaUpdates() {
+    const { data } = await axios.get('/api/llama-versions/check-updates', {
+      params: { source: 'ik_llama' },
+    })
+    return data
+  }
+
+  async function checkLmdeployUpdates() {
+    const { data } = await axios.get('/api/lmdeploy/check-updates')
+    return data
+  }
+
+  async function fetchReleaseAssets(tagName) {
+    const { data } = await axios.get(`/api/llama-versions/releases/${encodeURIComponent(tagName)}/assets`)
+    return data
+  }
+
+  async function installRelease(params) {
+    const { data } = await axios.post('/api/llama-versions/install-release', params)
+    await fetchLlamaVersions()
+    return data
+  }
+
+  async function buildSource(params) {
+    const { data } = await axios.post('/api/llama-versions/build-source', params)
+    await fetchLlamaVersions()
+    return data
+  }
+
+  async function activateVersion(versionId) {
+    await axios.post('/api/llama-versions/versions/activate', { version_id: versionId })
+    await fetchLlamaVersions()
+  }
+
+  async function deleteVersion(versionId) {
+    await axios.delete(`/api/llama-versions/${versionId}`)
+    await fetchLlamaVersions()
+  }
+
+  // --- CUDA ---
+
+  async function fetchCudaStatus() {
+    const { data } = await axios.get('/api/llama-versions/cuda-status')
+    cudaStatus.value = data
+    return data
+  }
+
+  async function installCuda(params) {
+    const { data } = await axios.post('/api/llama-versions/cuda-install', params)
+    return data
+  }
+
+  async function uninstallCuda(params = {}) {
+    const { data } = await axios.post('/api/llama-versions/cuda-uninstall', params)
+    await fetchCudaStatus()
+    return data
+  }
+
+  async function fetchCudaLogs() {
+    const { data } = await axios.get('/api/llama-versions/cuda-logs')
+    return data
+  }
+
+  // --- LMDeploy ---
+
+  async function fetchLmdeployStatus() {
+    const { data } = await axios.get('/api/lmdeploy/status')
+    lmdeployStatus.value = data
+    return data
+  }
+
+  async function installLmdeploy(params = {}) {
+    const { data } = await axios.post('/api/lmdeploy/install', params)
+    return data
+  }
+
+  async function installLmdeployFromSource(params) {
+    const { data } = await axios.post('/api/lmdeploy/install-source', params)
+    return data
+  }
+
+  async function removeLmdeploy() {
+    await axios.post('/api/lmdeploy/remove')
+    await fetchLmdeployStatus()
+  }
+
+  async function fetchLmdeployLogs(maxBytes = 8192) {
+    const { data } = await axios.get('/api/lmdeploy/logs', { params: { max_bytes: maxBytes } })
+    return data
+  }
+
+  // --- GPU / System ---
+
+  async function fetchGpuInfo() {
+    const { data } = await axios.get('/api/gpu-info')
+    gpuInfo.value = data
+    return data
+  }
+
+  async function fetchSystemStatus() {
+    loading.value = true
+    try {
+      const [statusRes, gpuRes] = await Promise.all([
+        axios.get('/api/status'),
+        axios.get('/api/gpu-info'),
+      ])
+      systemStatus.value = statusRes.data
+      gpuInfo.value = gpuRes.data
+    } catch (err) {
+      console.error('Failed to fetch system status:', err)
+      throw err
+    } finally {
+      loading.value = false
+    }
+  }
+
+  // --- Bulk fetch ---
+
+  async function fetchAll() {
+    await Promise.allSettled([
+      fetchLlamaVersions(),
+      fetchCudaStatus(),
+      fetchLmdeployStatus(),
+      fetchSystemStatus(),
+    ])
+  }
+
+  return {
+    llamaVersions,
+    ikLlamaVersions,
+    lmdeployStatus,
+    cudaStatus,
+    gpuInfo,
+    systemStatus,
+    loading,
+
+    fetchLlamaVersions,
+    checkLlamaCppUpdates,
+    checkIkLlamaUpdates,
+    checkLmdeployUpdates,
+    fetchReleaseAssets,
+    installRelease,
+    buildSource,
+    activateVersion,
+    deleteVersion,
+
+    fetchCudaStatus,
+    installCuda,
+    uninstallCuda,
+    fetchCudaLogs,
+
+    fetchLmdeployStatus,
+    installLmdeploy,
+    installLmdeployFromSource,
+    removeLmdeploy,
+    fetchLmdeployLogs,
+
+    fetchGpuInfo,
+    fetchSystemStatus,
+    fetchAll,
+  }
+})
diff --git a/frontend/src/stores/lmdeploy.js b/frontend/src/stores/lmdeploy.js
deleted file mode 100644
index 9662666..0000000
--- a/frontend/src/stores/lmdeploy.js
+++ /dev/null
@@ -1,172 +0,0 @@
-import { defineStore } from 'pinia'
-import { ref } from 'vue'
-import axios from 'axios'
-import { useWebSocketStore } from './websocket'
-
-export const useLmdeployStore = defineStore('lmdeployInstaller', () => {
-  const wsStore = useWebSocketStore()
-  let statusUnsubscribe = null
-  let installLogUnsubscribe = null
-  let runtimeLogUnsubscribe = null
-  
-  const status = ref(null)
-  const loading = ref(false)
-  const installing = ref(false)
-  const removing = ref(false)
-  const logs = ref('')
-  const logLoading = ref(false)
-  const runtimeLogs = ref('')
-  const runtimeLogLoading = ref(false)
-
-  const fetchStatus = async () => {
-    loading.value = true
-    try {
-      const response = await axios.get('/api/lmdeploy/status')
-      status.value = response.data
-    } catch (error) {
-      console.error('Failed to fetch LMDeploy installer status:', error)
-      throw error
-    } finally {
-      loading.value = false
-    }
-  }
-
-  const fetchLogs = async (maxBytes = 8192) => {
-    logLoading.value = true
-    try {
-      const response = await axios.get('/api/lmdeploy/logs', {
-        params: { max_bytes: maxBytes }
-      })
-      logs.value = response.data?.log || ''
-    } catch (error) {
-      console.error('Failed to fetch LMDeploy installer logs:', error)
-      throw error
-    } finally {
-      logLoading.value = false
-    }
-  }
-
-  const fetchRuntimeLogs = async (maxBytes = 8192) => {
-    runtimeLogLoading.value = true
-    try {
-      const response = await axios.get('/api/lmdeploy/runtime-logs', {
-        params: { max_bytes: maxBytes }
-      })
-      runtimeLogs.value = response.data?.log || ''
-    } catch (error) {
-      console.error('Failed to fetch LMDeploy runtime logs:', error)
-      throw error
-    } finally {
-      runtimeLogLoading.value = false
-    }
-  }
-
-  const install = async (options = {}) => {
-    installing.value = true
-    try {
-      await axios.post('/api/lmdeploy/install', options)
-      await fetchStatus()
-    } catch (error) {
-      console.error('Failed to start LMDeploy install:', error)
-      throw error
-    } finally {
-      installing.value = false
-    }
-  }
-
-  const remove = async () => {
-    removing.value = true
-    try {
-      await axios.post('/api/lmdeploy/remove')
-      await fetchStatus()
-    } catch (error) {
-      console.error('Failed to start LMDeploy removal:', error)
-      throw error
-    } finally {
-      removing.value = false
-    }
-  }
-
-  const startWebSocketSubscriptions = () => {
-    // Subscribe to status updates
-    if (!statusUnsubscribe) {
-      statusUnsubscribe = wsStore.subscribeToLmdeployStatus((data) => {
-        status.value = data
-      })
-    }
-    
-    // Subscribe to install log lines
-    if (!installLogUnsubscribe) {
-      installLogUnsubscribe = wsStore.subscribeToLmdeployInstallLog((data) => {
-        if (data.line) {
-          // Prevent duplicates: check if this line already exists in current logs
-          // This handles the case where HTTP fetch and WebSocket might send the same line
-          // We check the last 500 chars to avoid checking entire log for performance
-          const recentLogs = logs.value.slice(-500)
-          if (!recentLogs.includes(data.line)) {
-            // Append new log line
-            if (logs.value) {
-              logs.value += '\n' + data.line
-            } else {
-              logs.value = data.line
-            }
-          }
-        }
-      })
-    }
-    
-    // Subscribe to runtime log lines
-    if (!runtimeLogUnsubscribe) {
-      runtimeLogUnsubscribe = wsStore.subscribeToLmdeployRuntimeLog((data) => {
-        if (data.line) {
-          // Prevent duplicates: check if this line already exists in current logs
-          // This handles the case where HTTP fetch and WebSocket might send the same line
-          // We check the last 500 chars to avoid checking entire log for performance
-          const recentLogs = runtimeLogs.value.slice(-500)
-          if (!recentLogs.includes(data.line)) {
-            // Append new log line
-            if (runtimeLogs.value) {
-              runtimeLogs.value += '\n' + data.line
-            } else {
-              runtimeLogs.value = data.line
-            }
-          }
-        }
-      })
-    }
-  }
-
-  const stopWebSocketSubscriptions = () => {
-    if (statusUnsubscribe) {
-      statusUnsubscribe()
-      statusUnsubscribe = null
-    }
-    if (installLogUnsubscribe) {
-      installLogUnsubscribe()
-      installLogUnsubscribe = null
-    }
-    if (runtimeLogUnsubscribe) {
-      runtimeLogUnsubscribe()
-      runtimeLogUnsubscribe = null
-    }
-  }
-
-  return {
-    status,
-    logs,
-    runtimeLogs,
-    loading,
-    logLoading,
-    runtimeLogLoading,
-    installing,
-    removing,
-    fetchStatus,
-    fetchLogs,
-    fetchRuntimeLogs,
-    install,
-    remove,
-    startWebSocketSubscriptions,
-    stopWebSocketSubscriptions
-  }
-})
-
diff --git a/frontend/src/stores/models.js b/frontend/src/stores/models.js
index 5650fb3..c8ac178 100644
--- a/frontend/src/stores/models.js
+++ b/frontend/src/stores/models.js
@@ -3,7 +3,7 @@ import { ref, computed } from 'vue'
 import axios from 'axios'
 
 export const useModelStore = defineStore('models', () => {
-  const models = ref([]) // This will now contain grouped models
+  const models = ref([])        // array of groups: { huggingface_id, base_model_name, quantizations[] }
   const loading = ref(false)
   const searchResults = ref([])
   const searchLoading = ref(false)
@@ -11,469 +11,217 @@ export const useModelStore = defineStore('models', () => {
   const huggingfaceToken = ref(null)
   const hasHuggingfaceToken = ref(false)
   const tokenFromEnvironment = ref(false)
-  const safetensorsMetadata = ref({})
-  const safetensorsMetadataLoading = ref({})
   const safetensorsModels = ref([])
   const safetensorsLoading = ref(false)
-  const safetensorsRuntime = ref({})
-  const safetensorsRuntimeLoading = ref({})
-  const safetensorsMetadataRefreshing = ref({})
-  const lmdeployStatus = ref(null)
-  const lmdeployStatusLoading = ref(false)
-  const lmdeployStarting = ref({})
-  const lmdeployStopping = ref({})
+  const safetensorsMetadata = ref({})
+  const safetensorsMetadataLoading = ref({})
   const hfMetadata = ref({})
   const hfMetadataLoading = ref({})
-  
-  // Model loading state tracking (models currently being loaded by llama-swap)
-  const loadingModels = ref({})  // { proxyName: { started_at, elapsed_seconds } }
 
-  // Flatten all quantizations for backward compatibility
+  // ── Computed ──────────────────────────────────────────────
+
   const allQuantizations = computed(() => {
-    const quantizations = []
+    const result = []
     models.value.forEach(group => {
-      group.quantizations.forEach(quant => {
-        quantizations.push({
-          ...quant,
+      ;(group.quantizations || []).forEach(q => {
+        result.push({
+          ...q,
           base_model_name: group.base_model_name,
           huggingface_id: group.huggingface_id,
           model_type: group.model_type,
-          pipeline_tag: quant.pipeline_tag || group.pipeline_tag,
-          is_embedding_model: quant.is_embedding_model ?? group.is_embedding_model ?? false
+          pipeline_tag: q.pipeline_tag || group.pipeline_tag,
+          is_embedding_model: q.is_embedding_model ?? group.is_embedding_model ?? false,
         })
       })
     })
-    return quantizations
+    return result
   })
 
-  const downloadedModels = computed(() => 
-    allQuantizations.value.filter(model => model.file_path)
-  )
-
-  const runningModels = computed(() => 
-    allQuantizations.value.filter(model => model.is_active)
-  )
-
-  // Get all model groups (for grouped display)
-  const modelGroups = computed(() => models.value)
-  
-  // Check if a model is currently loading (by model ID or proxy name)
-  const isModelLoading = (modelIdOrProxyName) => {
-    // Check by proxy name first
-    if (loadingModels.value[modelIdOrProxyName]) {
-      return true
-    }
-    // Check by model ID - find the model and check its proxy name
-    const model = allQuantizations.value.find(m => m.id === modelIdOrProxyName)
-    if (model?.proxy_name && loadingModels.value[model.proxy_name]) {
-      return true
-    }
-    return false
-  }
-  
-  // Get loading progress for a model (elapsed seconds)
-  const getModelLoadingProgress = (modelIdOrProxyName) => {
-    // Check by proxy name first
-    if (loadingModels.value[modelIdOrProxyName]) {
-      return loadingModels.value[modelIdOrProxyName]
-    }
-    // Check by model ID
-    const model = allQuantizations.value.find(m => m.id === modelIdOrProxyName)
-    if (model?.proxy_name && loadingModels.value[model.proxy_name]) {
-      return loadingModels.value[model.proxy_name]
-    }
-    return null
-  }
-  
-  // Update loading models from unified monitoring data
-  const updateLoadingModels = (loadingData) => {
-    loadingModels.value = loadingData || {}
-  }
-  
-  // Check if any models are currently loading
-  const hasLoadingModels = computed(() => Object.keys(loadingModels.value).length > 0)
+  const downloadedModels = computed(() => allQuantizations.value.filter(m => m.downloaded_at))
+  const runningModels    = computed(() => allQuantizations.value.filter(m => m.is_active))
+  const modelGroups      = computed(() => models.value)
+
+  // ── Models CRUD ───────────────────────────────────────────
 
-  const fetchModels = async () => {
+  async function fetchModels() {
     loading.value = true
     try {
-      const response = await axios.get('/api/models')
-      models.value = response.data
-    } catch (error) {
-      console.error('Failed to fetch models:', error)
-      throw error
+      const { data } = await axios.get('/api/models')
+      models.value = data
+    } catch (e) {
+      console.error('Failed to fetch models:', e)
+      throw e
     } finally {
       loading.value = false
     }
   }
 
-  const fetchSafetensorsModels = async () => {
+  async function fetchSafetensorsModels() {
     safetensorsLoading.value = true
     try {
-      const response = await axios.get('/api/models/safetensors')
-      safetensorsModels.value = Array.isArray(response.data) ? response.data : []
-    } catch (error) {
-      console.error('Failed to fetch safetensors models:', error)
-      throw error
+      const { data } = await axios.get('/api/models/safetensors')
+      safetensorsModels.value = Array.isArray(data) ? data : []
+    } catch (e) {
+      console.error('Failed to fetch safetensors models:', e)
+      throw e
     } finally {
       safetensorsLoading.value = false
     }
   }
 
-  const searchModels = async (query, limit = 20, modelFormat = searchFormat.value) => {
+  async function deleteModel(modelId) {
+    await axios.delete(`/api/models/${modelId}`)
+    await fetchModels()
+  }
+
+  async function deleteModelGroup(huggingfaceId) {
+    await axios.post('/api/models/delete-group', { huggingface_id: huggingfaceId })
+    await fetchModels()
+  }
+
+  async function deleteSafetensorsModel(huggingfaceId) {
+    await axios.delete('/api/models/safetensors', { data: { huggingface_id: huggingfaceId } })
+    await fetchSafetensorsModels()
+  }
+
+  // ── Search ────────────────────────────────────────────────
+
+  async function searchModels(query, limit = 20, modelFormat = searchFormat.value) {
     searchLoading.value = true
     try {
-      const response = await axios.post('/api/models/search', { query, limit, model_format: modelFormat })
-      // Ensure searchResults is always an array
-      searchResults.value = Array.isArray(response.data) ? response.data : []
+      const { data } = await axios.post('/api/models/search', { query, limit, model_format: modelFormat })
+      searchResults.value = Array.isArray(data) ? data : []
       searchFormat.value = modelFormat
       return searchResults.value
-    } catch (error) {
-      console.error('Failed to search models:', error)
-      // Ensure searchResults is reset to empty array on error
+    } catch (e) {
+      console.error('Failed to search models:', e)
       searchResults.value = []
-      throw error
+      throw e
     } finally {
       searchLoading.value = false
     }
   }
 
-  const downloadModel = async (huggingfaceId, filename, totalBytes = 0, modelFormat = 'gguf', pipelineTag = null) => {
-  try {
-    const response = await axios.post('/api/models/download', {
+  // ── Download ──────────────────────────────────────────────
+
+  async function downloadModel(huggingfaceId, filename, totalBytes = 0, modelFormat = 'gguf', pipelineTag = null) {
+    const { data } = await axios.post('/api/models/download', {
       huggingface_id: huggingfaceId,
       filename,
       total_bytes: totalBytes,
       model_format: modelFormat,
-      pipeline_tag: pipelineTag
+      pipeline_tag: pipelineTag,
     })
-    // Refresh models list after download starts
     await fetchModels()
-    if (modelFormat === 'safetensors') {
-      await fetchSafetensorsModels()
-    }
-    return response.data
-  } catch (error) {
-    console.error('Failed to download model:', error)
-    throw error
-  }
-}
-
-  const deleteModel = async (modelId) => {
-    try {
-      await axios.delete(`/api/models/${modelId}`)
-      await fetchModels()
-    } catch (error) {
-      console.error('Failed to delete model:', error)
-      throw error
-    }
-  }
-
-  const downloadSafetensorsBundle = async (huggingfaceId, files) => {
-    try {
-      const response = await axios.post('/api/models/safetensors/download-bundle', {
-        huggingface_id: huggingfaceId,
-        files
-      })
-      return response.data
-    } catch (error) {
-      console.error('Failed to start safetensors bundle download:', error)
-      throw error
-    }
-  }
-
-  const downloadGgufBundle = async (huggingfaceId, quantization, files, pipelineTag = null) => {
-    try {
-      const response = await axios.post('/api/models/gguf/download-bundle', {
-        huggingface_id: huggingfaceId,
-        quantization,
-        files,
-        pipeline_tag: pipelineTag
-      })
-      return response.data
-    } catch (error) {
-      console.error('Failed to start GGUF bundle download:', error)
-      throw error
-    }
+    return data
   }
 
-  const deleteModelGroup = async (huggingfaceId) => {
-    try {
-      await axios.post('/api/models/delete-group', { huggingface_id: huggingfaceId })
-      await fetchModels()
-    } catch (error) {
-      console.error('Failed to delete model group:', error)
-      throw error
-    }
-  }
-
-  const deleteSafetensorsModel = async (huggingfaceId) => {
-    try {
-      await axios.delete('/api/models/safetensors', { data: { huggingface_id: huggingfaceId } })
-      await fetchSafetensorsModels()
-    } catch (error) {
-      console.error('Failed to delete safetensors model:', error)
-      throw error
-    }
-  }
-
-  const fetchHfMetadata = async (modelId) => {
-    if (!modelId) return null
-    if (hfMetadata.value[modelId]) {
-      return hfMetadata.value[modelId]
-    }
-    hfMetadataLoading.value[modelId] = true
-    try {
-      const response = await axios.get(`/api/models/${modelId}/hf-metadata`)
-      hfMetadata.value[modelId] = response.data || {}
-      return hfMetadata.value[modelId]
-    } catch (error) {
-      console.error('Failed to fetch HF metadata:', error)
-      throw error
-    } finally {
-      hfMetadataLoading.value[modelId] = false
-    }
-  }
-
-  const fetchLmdeployStatus = async () => {
-    lmdeployStatusLoading.value = true
-    try {
-      const response = await axios.get('/api/models/safetensors/lmdeploy/status')
-      lmdeployStatus.value = response.data || null
-      return response.data
-    } catch (error) {
-      console.error('Failed to fetch LMDeploy status:', error)
-      throw error
-    } finally {
-      lmdeployStatusLoading.value = false
-    }
-  }
-
-  const fetchSafetensorsRuntimeConfig = async (modelId) => {
-    if (!modelId) return null
-    safetensorsRuntimeLoading.value[modelId] = true
-    try {
-      const response = await axios.get(`/api/models/safetensors/${modelId}/lmdeploy/config`)
-      safetensorsRuntime.value[modelId] = response.data
-      if (response.data?.manager) {
-        lmdeployStatus.value = {
-          ...(lmdeployStatus.value || {}),
-          manager: response.data.manager
-        }
-      }
-      return response.data
-    } catch (error) {
-      console.error('Failed to fetch LMDeploy config:', error)
-      throw error
-    } finally {
-      safetensorsRuntimeLoading.value[modelId] = false
-    }
+  async function downloadSafetensorsBundle(huggingfaceId, files) {
+    const { data } = await axios.post('/api/models/safetensors/download-bundle', {
+      huggingface_id: huggingfaceId,
+      files,
+    })
+    return data
   }
 
-  const regenerateSafetensorsMetadata = async (modelId) => {
-    if (!modelId) return
-    safetensorsMetadataRefreshing.value[modelId] = true
-    try {
-      await axios.post(`/api/models/safetensors/${modelId}/metadata/regenerate`)
-      await fetchSafetensorsRuntimeConfig(modelId)
-      await fetchLmdeployStatus()
-      await fetchSafetensorsModels()
-    } catch (error) {
-      console.error('Failed to regenerate safetensors metadata:', error)
-      throw error
-    } finally {
-      safetensorsMetadataRefreshing.value[modelId] = false
-    }
+  async function downloadGgufBundle(huggingfaceId, quantization, files, pipelineTag = null) {
+    const { data } = await axios.post('/api/models/gguf/download-bundle', {
+      huggingface_id: huggingfaceId,
+      quantization,
+      files,
+      pipeline_tag: pipelineTag,
+    })
+    return data
   }
 
-  const updateSafetensorsRuntimeConfig = async (modelId, config) => {
-    if (!modelId) return
-    try {
-      await axios.put(`/api/models/safetensors/${modelId}/lmdeploy/config`, config)
-      await fetchSafetensorsRuntimeConfig(modelId)
-    } catch (error) {
-      console.error('Failed to update LMDeploy config:', error)
-      throw error
-    }
-  }
+  // ── Start / Stop ──────────────────────────────────────────
 
-  const startSafetensorsRuntime = async (modelId, configOverride = null) => {
-    if (!modelId) return
-    lmdeployStarting.value[modelId] = true
-    try {
-      const payload = configOverride ? { config: configOverride } : {}
-      await axios.post(`/api/models/safetensors/${modelId}/lmdeploy/start`, payload)
-      await fetchSafetensorsRuntimeConfig(modelId)
-      await fetchLmdeployStatus()
-    } catch (error) {
-      console.error('Failed to start LMDeploy runtime:', error)
-      throw error
-    } finally {
-      lmdeployStarting.value[modelId] = false
-    }
+  async function startModel(modelId) {
+    const { data } = await axios.post(`/api/models/${modelId}/start`)
+    await fetchModels()
+    return data
   }
 
-  const stopSafetensorsRuntime = async (modelId) => {
-    if (!modelId) return
-    lmdeployStopping.value[modelId] = true
-    try {
-      await axios.post(`/api/models/safetensors/${modelId}/lmdeploy/stop`)
-      await fetchLmdeployStatus()
-      await fetchSafetensorsRuntimeConfig(modelId)
-    } catch (error) {
-      console.error('Failed to stop LMDeploy runtime:', error)
-      throw error
-    } finally {
-      lmdeployStopping.value[modelId] = false
-    }
+  async function stopModel(modelId) {
+    await axios.post(`/api/models/${modelId}/stop`)
+    await fetchModels()
   }
 
-  const fetchHuggingfaceTokenStatus = async () => {
-    try {
-      const response = await axios.get('/api/models/huggingface-token')
-      hasHuggingfaceToken.value = response.data.has_token
-      huggingfaceToken.value = response.data.token_preview
-      tokenFromEnvironment.value = response.data.from_environment
-    } catch (error) {
-      console.error('Failed to fetch HuggingFace token status:', error)
-      throw error
-    }
-  }
+  // ── Config ────────────────────────────────────────────────
 
-  const setHuggingfaceToken = async (token) => {
-    try {
-      const response = await axios.post('/api/models/huggingface-token', { token })
-      await fetchHuggingfaceTokenStatus()
-      return response.data
-    } catch (error) {
-      console.error('Failed to set HuggingFace token:', error)
-      throw error
-    }
+  async function getModelConfig(modelId) {
+    const { data } = await axios.get(`/api/models/${modelId}/config`)
+    return data
   }
 
-  const clearHuggingfaceToken = async () => {
-    try {
-      const response = await axios.post('/api/models/huggingface-token', { token: '' })
-      await fetchHuggingfaceTokenStatus()
-      return response.data
-    } catch (error) {
-      console.error('Failed to clear HuggingFace token:', error)
-      throw error
-    }
+  async function updateModelConfig(modelId, config) {
+    await axios.put(`/api/models/${modelId}/config`, config)
   }
 
-  const startModel = async (modelId) => {
-    try {
-      const response = await axios.post(`/api/models/${modelId}/start`)
-      await fetchModels()
-      return response.data
-    } catch (error) {
-      console.error('Failed to start model:', error)
-      throw error
-    }
+  async function getModelDetails(modelId) {
+    const { data } = await axios.get(`/api/models/${modelId}/details`)
+    return data
   }
 
-  const stopModel = async (modelId) => {
-    try {
-      await axios.post(`/api/models/${modelId}/stop`)
-      await fetchModels()
-    } catch (error) {
-      console.error('Failed to stop model:', error)
-      throw error
-    }
-  }
+  // ── HuggingFace Token ─────────────────────────────────────
 
-  const getModelConfig = async (modelId) => {
-    try {
-      const response = await axios.get(`/api/models/${modelId}/config`)
-      return response.data
-    } catch (error) {
-      console.error('Failed to get model config:', error)
-      throw error
-    }
+  async function fetchHuggingfaceTokenStatus() {
+    const { data } = await axios.get('/api/models/huggingface-token')
+    hasHuggingfaceToken.value = data.has_token
+    huggingfaceToken.value = data.token_preview
+    tokenFromEnvironment.value = data.from_environment
   }
 
-  const updateModelConfig = async (modelId, config) => {
-    try {
-      await axios.put(`/api/models/${modelId}/config`, config)
-    } catch (error) {
-      console.error('Failed to update model config:', error)
-      throw error
-    }
+  async function setHuggingfaceToken(token) {
+    const { data } = await axios.post('/api/models/huggingface-token', { token })
+    await fetchHuggingfaceTokenStatus()
+    return data
   }
 
-  const generateAutoConfig = async (modelId) => {
-    try {
-      const response = await axios.post(`/api/models/${modelId}/auto-config`)
-      return response.data
-    } catch (error) {
-      console.error('Failed to generate auto config:', error)
-      throw error
-    }
+  async function clearHuggingfaceToken() {
+    const { data } = await axios.post('/api/models/huggingface-token', { token: '' })
+    await fetchHuggingfaceTokenStatus()
+    return data
   }
 
-  const getModelDetails = async (modelId) => {
-    try {
-      const response = await axios.get(`/api/models/${modelId}/details`)
-      return response.data
-    } catch (error) {
-      console.error('Failed to get model details:', error)
-      throw error
-    }
-  }
+  // ── Metadata ──────────────────────────────────────────────
 
-  const getQuantizationSizes = async (huggingfaceId, quantizations) => {
+  async function fetchHfMetadata(modelId) {
+    if (!modelId) return null
+    if (hfMetadata.value[modelId]) return hfMetadata.value[modelId]
+    hfMetadataLoading.value[modelId] = true
     try {
-      const response = await axios.post('/api/models/quantization-sizes', {
-        huggingface_id: huggingfaceId,
-        quantizations: quantizations
-      })
-      return response.data.quantizations
-    } catch (error) {
-      console.error('Failed to get quantization sizes:', error)
-      throw error
+      const { data } = await axios.get(`/api/models/${modelId}/hf-metadata`)
+      hfMetadata.value[modelId] = data || {}
+      return hfMetadata.value[modelId]
+    } finally {
+      hfMetadataLoading.value[modelId] = false
     }
   }
 
-  const fetchSafetensorsMetadata = async (modelId) => {
+  async function fetchSafetensorsMetadata(modelId) {
     if (!modelId) return null
-    if (safetensorsMetadata.value[modelId]) {
-      return safetensorsMetadata.value[modelId]
-    }
+    if (safetensorsMetadata.value[modelId]) return safetensorsMetadata.value[modelId]
+    safetensorsMetadataLoading.value[modelId] = true
     try {
-      safetensorsMetadataLoading.value[modelId] = true
-      const encodedId = encodeURIComponent(modelId)
-      const response = await axios.get(`/api/models/safetensors/${encodedId}/metadata`)
-      safetensorsMetadata.value[modelId] = response.data
-      return response.data
-    } catch (error) {
-      console.error('Failed to fetch safetensors metadata:', error)
-      throw error
+      const encoded = encodeURIComponent(modelId)
+      const { data } = await axios.get(`/api/models/safetensors/${encoded}/metadata`)
+      safetensorsMetadata.value[modelId] = data
+      return data
     } finally {
       safetensorsMetadataLoading.value[modelId] = false
     }
   }
 
-  const updateModelStatus = (modelId, status) => {
-    // Find and update the model in the grouped structure
-    models.value.forEach(group => {
-      const quantization = group.quantizations.find(q => q.id === modelId)
-      if (quantization) {
-        // Create a new object to trigger reactivity
-        Object.assign(quantization, status)
-      }
+  async function getQuantizationSizes(huggingfaceId, quantizations) {
+    const { data } = await axios.post('/api/models/quantization-sizes', {
+      huggingface_id: huggingfaceId,
+      quantizations,
     })
+    return data.quantizations
   }
 
-  const updateModelStatusByFilename = (filename, status) => {
-    // Find and update the model by filename (for llama-swap matching)
-    models.value.forEach(group => {
-      group.quantizations.forEach(quantization => {
-        if (quantization.filename === filename || quantization.name === filename) {
-          Object.assign(quantization, status)
-        }
-      })
-    })
-  }
+  // ── Return ────────────────────────────────────────────────
 
   return {
     models,
@@ -488,53 +236,32 @@ export const useModelStore = defineStore('models', () => {
     runningModels,
     modelGroups,
     allQuantizations,
+    safetensorsModels,
+    safetensorsLoading,
+    safetensorsMetadata,
+    safetensorsMetadataLoading,
+    hfMetadata,
+    hfMetadataLoading,
+
     fetchModels,
-    searchModels,
-    downloadModel,
+    fetchSafetensorsModels,
     deleteModel,
     deleteModelGroup,
     deleteSafetensorsModel,
-    fetchHuggingfaceTokenStatus,
-    setHuggingfaceToken,
-    clearHuggingfaceToken,
+    searchModels,
+    downloadModel,
+    downloadSafetensorsBundle,
+    downloadGgufBundle,
     startModel,
     stopModel,
     getModelConfig,
     updateModelConfig,
-    generateAutoConfig,
     getModelDetails,
-    getQuantizationSizes,
-    downloadSafetensorsBundle,
-    downloadGgufBundle,
-    safetensorsModels,
-    safetensorsLoading,
-    fetchSafetensorsModels,
-    safetensorsMetadata,
-    safetensorsMetadataLoading,
-    fetchSafetensorsMetadata,
-    safetensorsRuntime,
-    safetensorsRuntimeLoading,
-    safetensorsMetadataRefreshing,
-    lmdeployStatus,
-    lmdeployStatusLoading,
-    lmdeployStarting,
-    lmdeployStopping,
-    fetchLmdeployStatus,
-    fetchSafetensorsRuntimeConfig,
-    regenerateSafetensorsMetadata,
-    updateSafetensorsRuntimeConfig,
-    startSafetensorsRuntime,
-    stopSafetensorsRuntime,
-    updateModelStatus,
-    updateModelStatusByFilename,
-    hfMetadata,
-    hfMetadataLoading,
+    fetchHuggingfaceTokenStatus,
+    setHuggingfaceToken,
+    clearHuggingfaceToken,
     fetchHfMetadata,
-    // Loading state tracking
-    loadingModels,
-    isModelLoading,
-    getModelLoadingProgress,
-    updateLoadingModels,
-    hasLoadingModels
+    fetchSafetensorsMetadata,
+    getQuantizationSizes,
   }
 })
diff --git a/frontend/src/stores/progress.js b/frontend/src/stores/progress.js
new file mode 100644
index 0000000..7cfecda
--- /dev/null
+++ b/frontend/src/stores/progress.js
@@ -0,0 +1,151 @@
+/**
+ * SSE-based progress and events store. Subscribes to GET /api/events.
+ */
+import { defineStore } from 'pinia'
+import { ref, computed } from 'vue'
+
+const SSE_EVENT_TYPES = [
+  'task_created',
+  'task_updated',
+  'download_progress',
+  'download_complete',
+  'build_progress',
+  'notification',
+  'model_status',
+  'model_event',
+  'unified_monitoring',
+  'lmdeploy_status',
+  'lmdeploy_runtime_log',
+  'lmdeploy_install_status',
+  'lmdeploy_install_log',
+  'cuda_install_status',
+  'cuda_install_progress',
+  'cuda_install_log',
+  'broadcast'
+]
+
+export const useProgressStore = defineStore('progress', () => {
+  const tasks = ref({})
+  const eventSource = ref(null)
+  const connected = ref(false)
+  const subscribers = ref(new Map()) // eventType -> Set<callback>
+
+  const activeTasks = computed(() => {
+    return Object.values(tasks.value).filter(t => t.status === 'running')
+  })
+
+  const connectionStatus = computed(() => (connected.value ? 'connected' : 'disconnected'))
+  const isConnected = computed(() => connected.value && eventSource.value?.readyState === EventSource.OPEN)
+
+  function notifySubscribers(eventType, data) {
+    const callbacks = subscribers.value.get(eventType)
+    if (callbacks) {
+      callbacks.forEach(cb => {
+        try {
+          const result = cb(data)
+          if (result && typeof result.then === 'function') result.catch(() => {})
+        } catch (_) {}
+      })
+    }
+    const any = subscribers.value.get('*')
+    if (any) any.forEach(cb => { try { cb(eventType, data) } catch (_) {} })
+  }
+
+  function handleEvent(eventType, rawData) {
+    let data = rawData
+    try {
+      if (typeof rawData === 'string') data = JSON.parse(rawData)
+    } catch (_) { return }
+    if (eventType === 'task_created' || eventType === 'task_updated') {
+      const task = data?.data ?? data
+      if (task?.task_id) tasks.value = { ...tasks.value, [task.task_id]: task }
+    }
+    const payload = data?.data != null ? data.data : data
+    notifySubscribers(eventType, payload)
+    if (payload?.type && payload.type !== eventType) notifySubscribers(payload.type, payload)
+  }
+
+  function connect() {
+    if (eventSource.value?.readyState === EventSource.OPEN) return
+    // In dev, connect directly to backend to avoid proxy buffering SSE (port must match vite proxy target)
+    const base = typeof window !== 'undefined' && window.location?.origin ? window.location.origin : ''
+    const isDev = typeof import.meta !== 'undefined' && import.meta.env?.DEV
+    const devPort = typeof import.meta !== 'undefined' && import.meta.env?.VITE_API_PORT ? Number(import.meta.env.VITE_API_PORT) : 8081
+    const url = isDev ? `http://localhost:${devPort}/api/events` : `${base}/api/events`
+    if (isDev) console.log('[SSE] Connecting to', url, '(dev: direct to backend)')
+    const es = new EventSource(url)
+    es.onopen = () => {
+      if (isDev) console.log('[SSE] onopen, readyState=', es.readyState)
+      connected.value = true
+    }
+    es.onerror = (e) => {
+      if (isDev) console.warn('[SSE] onerror, readyState=', es.readyState, 'event=', e)
+      es.close()
+      eventSource.value = null
+      connected.value = false
+      setTimeout(() => connect(), 3000)
+    }
+    es.onmessage = (e) => handleEvent('message', e.data)
+    SSE_EVENT_TYPES.forEach(type => {
+      es.addEventListener(type, (e) => handleEvent(type, e.data))
+    })
+    eventSource.value = es
+  }
+
+  function disconnect() {
+    if (eventSource.value) {
+      eventSource.value.close()
+      eventSource.value = null
+    }
+    connected.value = false
+  }
+
+  function getTask(taskId) {
+    return tasks.value[taskId] || null
+  }
+
+  function subscribe(eventType, callback) {
+    if (!subscribers.value.has(eventType)) subscribers.value.set(eventType, new Set())
+    subscribers.value.get(eventType).add(callback)
+    return () => {
+      const set = subscribers.value.get(eventType)
+      if (set) {
+        set.delete(callback)
+        if (set.size === 0) subscribers.value.delete(eventType)
+      }
+    }
+  }
+
+  const subscribeToDownloadProgress = (cb) => subscribe('download_progress', cb)
+  const subscribeToBuildProgress = (cb) => subscribe('build_progress', cb)
+  const subscribeToModelStatus = (cb) => subscribe('model_status', cb)
+  const subscribeToNotifications = (cb) => subscribe('notification', cb)
+  const subscribeToDownloadComplete = (cb) => subscribe('download_complete', cb)
+  const subscribeToUnifiedMonitoring = (cb) => subscribe('unified_monitoring', cb)
+  const subscribeToModelEvents = (cb) => subscribe('model_event', cb)
+  const subscribeToLmdeployStatus = (cb) => subscribe('lmdeploy_status', cb)
+  const subscribeToLmdeployInstallLog = (cb) => subscribe('lmdeploy_install_log', cb)
+  const subscribeToLmdeployRuntimeLog = (cb) => subscribe('lmdeploy_runtime_log', cb)
+
+  return {
+    tasks,
+    activeTasks,
+    connected,
+    connectionStatus,
+    isConnected,
+    connect,
+    disconnect,
+    getTask,
+    subscribe,
+    subscribeToDownloadProgress,
+    subscribeToBuildProgress,
+    subscribeToModelStatus,
+    subscribeToNotifications,
+    subscribeToDownloadComplete,
+    subscribeToUnifiedMonitoring,
+    subscribeToModelEvents,
+    subscribeToLmdeployStatus,
+    subscribeToLmdeployInstallLog,
+    subscribeToLmdeployRuntimeLog
+  }
+})
diff --git a/frontend/src/stores/system.js b/frontend/src/stores/system.js
deleted file mode 100644
index 77d29a7..0000000
--- a/frontend/src/stores/system.js
+++ /dev/null
@@ -1,178 +0,0 @@
-import { defineStore } from 'pinia'
-import { ref } from 'vue'
-import axios from 'axios'
-
-export const useSystemStore = defineStore('system', () => {
-  const systemStatus = ref({})
-  const gpuInfo = ref({})
-  const llamaVersions = ref([])
-  const loading = ref(false)
-
-  const fetchSystemStatus = async () => {
-    loading.value = true
-    try {
-      const [statusResponse, gpuResponse] = await Promise.all([
-        axios.get('/api/monitoring/status'),
-        axios.get('/api/gpu-info')
-      ])
-      
-      systemStatus.value = statusResponse.data
-      gpuInfo.value = gpuResponse.data
-    } catch (error) {
-      console.error('Failed to fetch system status:', error)
-      throw error
-    } finally {
-      loading.value = false
-    }
-  }
-
-  const fetchLlamaVersions = async () => {
-    try {
-      const response = await axios.get('/api/llama-versions')
-      llamaVersions.value = response.data
-    } catch (error) {
-      console.error('Failed to fetch llama versions:', error)
-      throw error
-    }
-  }
-
-  const checkUpdates = async () => {
-    try {
-      const response = await axios.get('/api/llama-versions/check-updates')
-      return response.data
-    } catch (error) {
-      console.error('Failed to check updates:', error)
-      throw error
-    }
-  }
-
-  const fetchReleaseAssets = async (tagName) => {
-    try {
-      const response = await axios.get(`/api/llama-versions/releases/${encodeURIComponent(tagName)}/assets`)
-      return response.data
-    } catch (error) {
-      console.error('Failed to fetch release assets:', error)
-      throw error
-    }
-  }
-
-  const installRelease = async (tagName, assetId) => {
-    try {
-      const payload = { tag_name: tagName }
-      if (assetId !== undefined && assetId !== null) {
-        payload.asset_id = assetId
-      }
-      await axios.post('/api/llama-versions/install-release', payload)
-      await fetchLlamaVersions()
-    } catch (error) {
-      console.error('Failed to install release:', error)
-      throw error
-    }
-  }
-
-  const buildSource = async (commitSha, patches = [], buildConfig = {}, repositorySource = 'llama.cpp', versionSuffix = null) => {
-    try {
-      const payload = {
-        commit_sha: commitSha,
-        patches,
-        build_config: buildConfig,
-        repository_source: repositorySource
-      }
-      if (versionSuffix) {
-        payload.version_suffix = versionSuffix
-      }
-      await axios.post('/api/llama-versions/build-source', payload)
-      await fetchLlamaVersions()
-    } catch (error) {
-      console.error('Failed to build from source:', error)
-      throw error
-    }
-  }
-
-  const activateVersion = async (versionId) => {
-    try {
-      await axios.post(`/api/llama-versions/${versionId}/activate`)
-      await fetchLlamaVersions()
-    } catch (error) {
-      console.error('Failed to activate version:', error)
-      throw error
-    }
-  }
-
-  const deleteVersion = async (versionId) => {
-    try {
-      await axios.delete(`/api/llama-versions/${versionId}`)
-      await fetchLlamaVersions()
-    } catch (error) {
-      console.error('Failed to delete version:', error)
-      throw error
-    }
-  }
-
-  const updateSystemStatus = (status) => {
-    systemStatus.value = { ...systemStatus.value, ...status }
-  }
-
-  const updateGpuInfo = (gpuData) => {
-    gpuInfo.value = { ...gpuInfo.value, ...gpuData }
-  }
-
-  const getCudaStatus = async () => {
-    try {
-      const response = await axios.get('/api/llama-versions/cuda-status')
-      return response.data
-    } catch (error) {
-      console.error('Failed to get CUDA status:', error)
-      throw error
-    }
-  }
-
-  const installCuda = async (version = '12.6') => {
-    try {
-      await axios.post('/api/llama-versions/cuda-install', { version })
-    } catch (error) {
-      console.error('Failed to install CUDA:', error)
-      throw error
-    }
-  }
-
-  const getCudaLogs = async () => {
-    try {
-      const response = await axios.get('/api/llama-versions/cuda-logs')
-      return response.data
-    } catch (error) {
-      console.error('Failed to get CUDA logs:', error)
-      throw error
-    }
-  }
-
-  const uninstallCuda = async (version = null) => {
-    try {
-      await axios.post('/api/llama-versions/cuda-uninstall', { version })
-    } catch (error) {
-      console.error('Failed to uninstall CUDA:', error)
-      throw error
-    }
-  }
-
-  return {
-    systemStatus,
-    gpuInfo,
-    llamaVersions,
-    loading,
-    fetchSystemStatus,
-    fetchLlamaVersions,
-    checkUpdates,
-    fetchReleaseAssets,
-    installRelease,
-    buildSource,
-    activateVersion,
-    deleteVersion,
-    updateSystemStatus,
-    updateGpuInfo,
-    getCudaStatus,
-    installCuda,
-    uninstallCuda,
-    getCudaLogs
-  }
-})
diff --git a/frontend/src/stores/websocket.js b/frontend/src/stores/websocket.js
deleted file mode 100644
index 3c53d95..0000000
--- a/frontend/src/stores/websocket.js
+++ /dev/null
@@ -1,322 +0,0 @@
-import { defineStore } from 'pinia'
-import { ref, computed } from 'vue'
-
-export const useWebSocketStore = defineStore('websocket', () => {
-  
-  const ws = ref(null)
-  const connected = ref(false)
-  const reconnecting = ref(false)
-  const lastMessage = ref(null)
-  const messageHistory = ref([])
-  const subscribers = ref(new Map())
-  
-  // Connection settings
-  const maxReconnectAttempts = 10
-  const reconnectDelay = 1000 // Start with 1 second
-  const maxReconnectDelay = 30000 // Max 30 seconds
-  let reconnectAttempts = 0
-  let reconnectTimeout = null
-  
-  // Computed
-  const connectionStatus = computed(() => {
-    if (connected.value) return 'connected'
-    if (reconnecting.value) return 'reconnecting'
-    return 'disconnected'
-  })
-  
-  const isConnected = computed(() => connected.value && ws.value?.readyState === WebSocket.OPEN)
-  
-  // WebSocket URL
-  const getWebSocketUrl = () => {
-    const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:'
-    const host = window.location.host
-    return `${protocol}//${host}/ws`
-  }
-  
-  // Connect to WebSocket
-        const connect = () => {
-            if (ws.value?.readyState === WebSocket.OPEN || ws.value?.readyState === WebSocket.CONNECTING) {
-              return
-            }
-            
-            const url = getWebSocketUrl()
-            
-            try {
-              ws.value = new WebSocket(url)
-              
-        ws.value.onopen = () => {
-          console.log('WebSocket connected successfully to:', url)
-          connected.value = true
-                reconnecting.value = false
-                reconnectAttempts = 0
-                
-                // Send a test message to verify connection
-                send({
-                  type: 'test',
-                  message: 'WebSocket connection test',
-                  timestamp: new Date().toISOString()
-                })
-                
-                // Send any queued messages
-                flushQueuedMessages()
-              }
-              
-        ws.value.onmessage = async (event) => {
-          try {
-            const data = JSON.parse(event.data)
-            console.log('WebSocket received:', data)
-            lastMessage.value = data
-            messageHistory.value.push({
-              ...data,
-              timestamp: new Date().toISOString()
-            })
-            
-            // Keep only last 100 messages
-            if (messageHistory.value.length > 100) {
-              messageHistory.value = messageHistory.value.slice(-100)
-            }
-            
-            // Notify subscribers (now async)
-            await notifySubscribers(data.type, data)
-            
-          } catch (error) {
-            console.error('Failed to parse WebSocket message:', error)
-          }
-        }
-              
-              ws.value.onclose = (event) => {
-                console.log('WebSocket disconnected:', event.code, event.reason)
-                connected.value = false
-                ws.value = null
-                
-                // Attempt reconnection if not manually closed
-                if (event.code !== 1000 && reconnectAttempts < maxReconnectAttempts) {
-                  scheduleReconnect()
-                }
-              }
-              
-              ws.value.onerror = (error) => {
-                console.error('WebSocket error:', error)
-                connected.value = false
-              }
-              
-            } catch (error) {
-              console.error('Failed to create WebSocket connection:', error)
-              scheduleReconnect()
-            }
-        }
-  
-  // Schedule reconnection with exponential backoff
-  const scheduleReconnect = () => {
-    if (reconnectTimeout) {
-      clearTimeout(reconnectTimeout)
-    }
-    
-    reconnectAttempts++
-    reconnecting.value = true
-    
-    const delay = Math.min(
-      reconnectDelay * Math.pow(2, reconnectAttempts - 1),
-      maxReconnectDelay
-    )
-    
-    console.log(`Scheduling WebSocket reconnection in ${delay}ms (attempt ${reconnectAttempts}/${maxReconnectAttempts})`)
-    
-    reconnectTimeout = setTimeout(() => {
-      connect()
-    }, delay)
-  }
-  
-  // Disconnect WebSocket
-  const disconnect = () => {
-    if (reconnectTimeout) {
-      clearTimeout(reconnectTimeout)
-      reconnectTimeout = null
-    }
-    
-    reconnectAttempts = maxReconnectAttempts // Prevent reconnection
-    
-    if (ws.value) {
-      ws.value.close(1000, 'Manual disconnect')
-      ws.value = null
-    }
-    
-    connected.value = false
-    reconnecting.value = false
-  }
-  
-  // Send message
-  const send = (message) => {
-    if (isConnected.value) {
-      try {
-        ws.value.send(JSON.stringify(message))
-        return true
-      } catch (error) {
-        console.error('Failed to send WebSocket message:', error)
-        return false
-      }
-    } else {
-      console.warn('WebSocket not connected, message queued')
-      queueMessage(message)
-      return false
-    }
-  }
-  
-  // Message queue for when disconnected
-  const queuedMessages = ref([])
-  
-  const queueMessage = (message) => {
-    queuedMessages.value.push({
-      ...message,
-      timestamp: Date.now()
-    })
-  }
-  
-  const flushQueuedMessages = () => {
-    while (queuedMessages.value.length > 0 && isConnected.value) {
-      const message = queuedMessages.value.shift()
-      send(message)
-    }
-  }
-  
-  // Subscribe to specific message types
-  const subscribe = (messageType, callback) => {
-    if (!subscribers.value.has(messageType)) {
-      subscribers.value.set(messageType, new Set())
-    }
-    subscribers.value.get(messageType).add(callback)
-    
-    // Return unsubscribe function
-    return () => {
-      const callbacks = subscribers.value.get(messageType)
-      if (callbacks) {
-        callbacks.delete(callback)
-        if (callbacks.size === 0) {
-          subscribers.value.delete(messageType)
-        }
-      }
-    }
-  }
-  
-  // Notify subscribers
-  const notifySubscribers = async (messageType, data) => {
-    const callbacks = subscribers.value.get(messageType)
-    if (callbacks) {
-      // Use Promise.allSettled to handle both sync and async callbacks
-      const promises = Array.from(callbacks).map(async (callback) => {
-        try {
-          const result = callback(data)
-          // If callback returns a promise, await it
-          if (result && typeof result.then === 'function') {
-            await result
-          }
-        } catch (error) {
-          console.error('Error in WebSocket subscriber callback:', error)
-        }
-      })
-      
-      // Wait for all callbacks to complete (or fail)
-      await Promise.allSettled(promises)
-    }
-  }
-  
-  // Convenience methods for specific message types
-  const subscribeToDownloadProgress = (callback) => {
-    return subscribe('download_progress', callback)
-  }
-  
-  const subscribeToBuildProgress = (callback) => {
-    return subscribe('build_progress', callback)
-  }
-  
-  const subscribeToModelStatus = (callback) => {
-    return subscribe('model_status', callback)
-  }
-  
-  const subscribeToSystemMetrics = (callback) => {
-    return subscribe('system_metrics', callback)
-  }
-  
-  const subscribeToNotifications = (callback) => {
-    return subscribe('notification', callback)
-  }
-  
-  const subscribeToRamUpdates = (callback) => {
-    return subscribe('ram_update', callback)
-  }
-  
-  const subscribeToVramUpdates = (callback) => {
-    return subscribe('vram_update', callback)
-  }
-  
-  const subscribeToDownloadComplete = (callback) => {
-    return subscribe('download_complete', callback)
-  }
-  
-  // Unified monitoring subscription for real-time system data
-  const subscribeToUnifiedMonitoring = (callback) => {
-    return subscribe('unified_monitoring', callback)
-  }
-  
-  // Model events subscription for instant start/stop/loading updates
-  const subscribeToModelEvents = (callback) => {
-    return subscribe('model_event', callback)
-  }
-  
-  const subscribeToLmdeployStatus = (callback) => {
-    return subscribe('lmdeploy_status', callback)
-  }
-  
-  const subscribeToLmdeployInstallLog = (callback) => {
-    return subscribe('lmdeploy_install_log', callback)
-  }
-  
-  const subscribeToLmdeployRuntimeLog = (callback) => {
-    return subscribe('lmdeploy_runtime_log', callback)
-  }
-  
-  // Get recent messages of specific type
-  const getRecentMessages = (messageType, limit = 10) => {
-    return messageHistory.value
-      .filter(msg => msg.type === messageType)
-      .slice(-limit)
-  }
-  
-  // Clear message history
-  const clearHistory = () => {
-    messageHistory.value = []
-  }
-  
-  return {
-    // State
-    connected,
-    reconnecting,
-    lastMessage,
-    messageHistory,
-    
-    // Computed
-    connectionStatus,
-    isConnected,
-    
-    // Methods
-    connect,
-    disconnect,
-    send,
-    subscribe,
-    subscribeToDownloadProgress,
-    subscribeToBuildProgress,
-    subscribeToModelStatus,
-    subscribeToSystemMetrics,
-    subscribeToNotifications,
-    subscribeToRamUpdates,
-    subscribeToVramUpdates,
-    subscribeToDownloadComplete,
-    subscribeToUnifiedMonitoring,
-    subscribeToModelEvents,
-    subscribeToLmdeployStatus,
-    subscribeToLmdeployInstallLog,
-    subscribeToLmdeployRuntimeLog,
-    getRecentMessages,
-    clearHistory
-  }
-})
diff --git a/frontend/src/styles/_base.css b/frontend/src/styles/_base.css
index 12178c7..f316d6e 100644
--- a/frontend/src/styles/_base.css
+++ b/frontend/src/styles/_base.css
@@ -192,7 +192,8 @@ a:hover {
   font-size: 0.875rem;
 }
 
-.connection-status {
+.connection-status,
+.live-status {
   display: flex;
   align-items: center;
   gap: var(--spacing-sm);
diff --git a/frontend/src/styles/_components.css b/frontend/src/styles/_components.css
index 1dce5ef..2154168 100644
--- a/frontend/src/styles/_components.css
+++ b/frontend/src/styles/_components.css
@@ -994,67 +994,6 @@
   margin-bottom: var(--spacing-md);
 }
 
-/* Vue3 Toastify Custom Styling */
-.Toastify__toast {
-  background: transparent;
-  color: var(--text-primary);
-  border: none;
-  border-radius: var(--radius-md);
-  box-shadow: var(--shadow-lg);
-}
-
-.Toastify__toast-container {
-  background: transparent;
-}
-
-.Toastify__toast--success {
-  background: var(--bg-primary);
-  border-left: 4px solid var(--accent-green);
-}
-
-.Toastify__toast--info {
-  background: var(--bg-primary);
-  border-left: 4px solid var(--accent-blue);
-}
-
-.Toastify__toast--warning {
-  background: var(--bg-primary);
-  border-left: 4px solid var(--accent-amber);
-}
-
-.Toastify__toast--error {
-  background: var(--bg-primary);
-  border-left: 4px solid var(--accent-red);
-}
-
-.Toastify__toast-body {
-  color: var(--text-primary);
-}
-
-.Toastify__close-button {
-  color: var(--text-secondary);
-}
-
-.Toastify__close-button:hover {
-  color: var(--text-primary);
-}
-
-.Toastify__progress-bar {
-  background: var(--accent-blue);
-}
-
-.Toastify__toast--success .Toastify__progress-bar {
-  background: var(--accent-green);
-}
-
-.Toastify__toast--warning .Toastify__progress-bar {
-  background: var(--accent-amber);
-}
-
-.Toastify__toast--error .Toastify__progress-bar {
-  background: var(--accent-red);
-}
-
 /* Tour highlight */
 .tour-highlight {
   position: relative;
diff --git a/frontend/src/styles/_variables.css b/frontend/src/styles/_variables.css
index 4dca028..f86ba76 100644
--- a/frontend/src/styles/_variables.css
+++ b/frontend/src/styles/_variables.css
@@ -44,6 +44,7 @@
   
   /* Interactive States */
   --hover-bg: rgba(34, 211, 238, 0.1);
+  --bg-card-hover: rgba(34, 211, 238, 0.06);
   --active-bg: rgba(34, 211, 238, 0.2);
   --focus-ring: rgba(34, 211, 238, 0.3);
   
@@ -137,6 +138,7 @@
   --accent-cyan-soft: rgba(8, 145, 178, 0.12);
   
   --hover-bg: rgba(8, 145, 178, 0.1);
+  --bg-card-hover: rgba(8, 145, 178, 0.06);
   --active-bg: rgba(8, 145, 178, 0.2);
   --focus-ring: rgba(8, 145, 178, 0.3);
   
diff --git a/frontend/src/utils/formatting.js b/frontend/src/utils/formatting.js
index 1b8a53e..003536a 100644
--- a/frontend/src/utils/formatting.js
+++ b/frontend/src/utils/formatting.js
@@ -1,5 +1,5 @@
 /**
- * Format bytes to human-readable string
+ * Format bytes to human-readable string (decimal: 1 MB = 1000² bytes, matches Hugging Face)
  * @param {number} bytes - Bytes to format
  * @returns {string} Formatted string (e.g., "1.5 GB")
  */
@@ -7,11 +7,29 @@ export const formatBytes = (bytes) => {
   if (Number.isNaN(bytes) || bytes === null || bytes === undefined) return 'Unknown size'
   if (bytes === 0) return '0 B'
   const units = ['B', 'KB', 'MB', 'GB', 'TB']
-  const exponent = Math.min(Math.floor(Math.log(bytes) / Math.log(1024)), units.length - 1)
-  const value = bytes / Math.pow(1024, exponent)
+  const exponent = Math.min(Math.floor(Math.log(bytes) / Math.log(1000)), units.length - 1)
+  const value = bytes / Math.pow(1000, exponent)
   return `${value.toFixed(value >= 10 || exponent === 0 ? 0 : 1)} ${units[exponent]}`
 }
 
+/**
+ * Format bytes using IEC units (base 1024: KiB, MiB, GiB). Use for RAM, disk, VRAM.
+ * @param {number} bytes - Bytes to format
+ * @returns {string} Formatted string (e.g., "1.5 GiB")
+ */
+export const formatBytesIEC = (bytes) => {
+  if (Number.isNaN(bytes) || bytes === null || bytes === undefined) return '0 B'
+  if (bytes === 0) return '0 B'
+  const units = ['B', 'KiB', 'MiB', 'GiB', 'TiB']
+  let i = 0
+  let val = bytes
+  while (val >= 1024 && i < units.length - 1) {
+    val /= 1024
+    i++
+  }
+  return `${val.toFixed(val >= 10 || i === 0 ? 0 : 1)} ${units[i]}`
+}
+
 /**
  * Format file size (alias for formatBytes)
  * @param {number} bytes - Bytes to format
diff --git a/frontend/src/views/EnginesView.vue b/frontend/src/views/EnginesView.vue
new file mode 100644
index 0000000..a7c78b0
--- /dev/null
+++ b/frontend/src/views/EnginesView.vue
@@ -0,0 +1,1059 @@
+<template>
+  <div class="engines-view">
+
+    <!-- ── System Info ─────────────────────────────────────── -->
+    <section class="ev-section">
+      <div class="ev-section-header" @click="systemExpanded = !systemExpanded">
+        <div class="ev-section-title">
+          <i class="pi pi-desktop" />
+          <h2>System</h2>
+        </div>
+        <div class="ev-section-actions">
+          <Button icon="pi pi-refresh" text severity="secondary" size="small"
+            :loading="enginesStore.loading" @click.stop="enginesStore.fetchSystemStatus()" />
+          <i :class="['pi', systemExpanded ? 'pi-chevron-up' : 'pi-chevron-down']" />
+        </div>
+      </div>
+      <Transition name="ev-collapse">
+        <div v-if="systemExpanded" class="ev-section-body">
+          <div class="metrics-grid">
+            <div class="metric-card">
+              <i class="pi pi-desktop metric-icon" />
+              <div class="metric-data">
+                <div class="metric-label">CPU</div>
+                <div class="metric-value">{{ (sys.cpu_percent || 0).toFixed(1) }}%</div>
+                <ProgressBar :value="sys.cpu_percent || 0" :showValue="false" class="metric-bar" />
+              </div>
+            </div>
+            <div class="metric-card">
+              <i class="pi pi-database metric-icon" />
+              <div class="metric-data">
+                <div class="metric-label">Memory</div>
+                <div class="metric-value">
+                  {{ formatBytesIEC(sys.memory?.used) }} / {{ formatBytesIEC(sys.memory?.total) }} ({{ memPercent }}%)
+                </div>
+                <ProgressBar :value="memPercent" :showValue="false" class="metric-bar" />
+              </div>
+            </div>
+            <div class="metric-card">
+              <i class="pi pi-save metric-icon" />
+              <div class="metric-data">
+                <div class="metric-label">Disk</div>
+                <div class="metric-value">
+                  {{ formatBytesIEC(sys.disk?.used) }} / {{ formatBytesIEC(sys.disk?.total) }} ({{ diskPercent }}%)
+                </div>
+                <ProgressBar :value="diskPercent" :showValue="false" class="metric-bar" />
+              </div>
+            </div>
+            <div v-if="gpu" class="metric-card">
+              <i class="pi pi-bolt metric-icon" />
+              <div class="metric-data">
+                <div class="metric-label">GPU — {{ gpu.name }}</div>
+                <div class="metric-value">
+                  {{ formatBytesIEC(gpu.memory_used_mb * 1048576) }} /
+                  {{ formatBytesIEC(gpu.memory_total_mb * 1048576) }} VRAM
+                </div>
+                <ProgressBar :value="gpuPercent" :showValue="false" class="metric-bar" />
+              </div>
+            </div>
+          </div>
+        </div>
+      </Transition>
+    </section>
+
+    <!-- ── llama.cpp ──────────────────────────────────────── -->
+    <section class="ev-section">
+      <div class="ev-section-header">
+        <div class="ev-section-title">
+          <i class="pi pi-microchip" />
+          <h2>llama.cpp</h2>
+          <Tag v-if="activeLlamaCpp" :value="activeLlamaCpp.version" severity="success" />
+          <Tag v-else-if="enginesStore.llamaVersions.length" value="No Active" severity="warning" />
+        </div>
+        <div class="ev-section-actions">
+          <Button label="Updates" icon="pi pi-search" text severity="info" size="small"
+            :loading="checkingLlamaCpp" @click="checkLlamaCppUpdates" />
+          <Button icon="pi pi-refresh" text severity="secondary" size="small"
+            @click="enginesStore.fetchLlamaVersions()" />
+        </div>
+      </div>
+      <div class="ev-section-body">
+        <div v-if="llamaCppUpdateInfo?.update_available" class="update-banner">
+          <i class="pi pi-arrow-up-right" />
+          Update available: <strong>{{ llamaCppUpdateInfo.latest_version }}</strong>
+          <a :href="llamaCppUpdateInfo.release_url" target="_blank" class="update-link">View release</a>
+        </div>
+        <div v-else-if="llamaCppUpdateInfo" class="update-current">
+          <i class="pi pi-check" /> Up to date ({{ llamaCppUpdateInfo.current_version }})
+        </div>
+
+        <ProgressTracker :type="['build', 'install_release']" />
+
+        <div class="ev-actions">
+          <Button label="Install Release" icon="pi pi-download" severity="success" outlined size="small"
+            @click="openReleaseDialog('llama_cpp')" />
+          <Button label="Build from Source" icon="pi pi-code" severity="info" outlined size="small"
+            @click="openBuildDialog('llama_cpp')" />
+        </div>
+
+        <VersionTable
+          :versions="enginesStore.llamaVersions"
+          :activating="activating"
+          @activate="activateVersion"
+          @delete="confirmDeleteVersion"
+        />
+      </div>
+    </section>
+
+    <!-- ── ik_llama.cpp ───────────────────────────────────── -->
+    <section class="ev-section">
+      <div class="ev-section-header">
+        <div class="ev-section-title">
+          <i class="pi pi-microchip" />
+          <h2>ik_llama.cpp</h2>
+          <Tag v-if="activeIkLlama" :value="activeIkLlama.version" severity="success" />
+          <Tag v-else-if="enginesStore.ikLlamaVersions.length" value="No Active" severity="warning" />
+        </div>
+        <div class="ev-section-actions">
+          <Button label="Updates" icon="pi pi-search" text severity="info" size="small"
+            :loading="checkingIkLlama" @click="checkIkLlamaUpdates" />
+          <Button icon="pi pi-refresh" text severity="secondary" size="small"
+            @click="enginesStore.fetchLlamaVersions()" />
+        </div>
+      </div>
+      <div class="ev-section-body">
+        <div v-if="ikLlamaUpdateInfo?.update_available" class="update-banner">
+          <i class="pi pi-arrow-up-right" />
+          Update available: <strong>{{ ikLlamaUpdateInfo.latest_version }}</strong>
+          <a :href="ikLlamaUpdateInfo.release_url" target="_blank" class="update-link">View</a>
+        </div>
+        <div v-else-if="ikLlamaUpdateInfo" class="update-current">
+          <i class="pi pi-check" /> Up to date ({{ ikLlamaUpdateInfo.current_version }})
+        </div>
+
+        <ProgressTracker type="build" />
+
+        <div class="ev-actions">
+          <Button label="Build from Source" icon="pi pi-code" severity="info" outlined size="small"
+            @click="openBuildDialog('ik_llama')" />
+        </div>
+
+        <VersionTable
+          :versions="enginesStore.ikLlamaVersions"
+          :activating="activating"
+          @activate="activateVersion"
+          @delete="confirmDeleteVersion"
+        />
+      </div>
+    </section>
+
+    <!-- ── CUDA Toolkit ───────────────────────────────────── -->
+    <section class="ev-section">
+      <div class="ev-section-header">
+        <div class="ev-section-title">
+          <i class="pi pi-bolt" />
+          <h2>CUDA Toolkit</h2>
+          <Tag v-if="cuda.installed" :value="`CUDA ${cuda.version}`" severity="success" />
+          <Tag v-else value="Not Installed" severity="secondary" />
+        </div>
+        <div class="ev-section-actions">
+          <Button icon="pi pi-refresh" text severity="secondary" size="small"
+            @click="enginesStore.fetchCudaStatus()" />
+        </div>
+      </div>
+      <div class="ev-section-body">
+        <ProgressTracker type="install" />
+
+        <div v-if="cuda.installed" class="status-detail">
+          <span class="detail-label">Path:</span>
+          <code>{{ cuda.cuda_path || 'unknown' }}</code>
+        </div>
+
+        <div v-if="cuda.installed_versions?.length" class="ev-version-list">
+          <div v-for="v in cuda.installed_versions" :key="v.version" class="ev-version-row">
+            <code class="version-name">CUDA {{ v.version }}</code>
+            <Tag v-if="v.is_current" value="Active" severity="success" />
+            <Button icon="pi pi-trash" text severity="danger" size="small"
+              @click="confirmUninstallCuda(v.version)" />
+          </div>
+        </div>
+        <div v-else-if="cuda.installed" class="empty-state-mini">
+          <i class="pi pi-bolt" />
+          <span>No CUDA versions listed.</span>
+        </div>
+
+        <div class="ev-actions">
+          <Button label="Install CUDA" icon="pi pi-download" severity="success" outlined size="small"
+            @click="cudaInstallDialogVisible = true" />
+        </div>
+      </div>
+    </section>
+
+    <!-- ── CUDA Install Dialog ────────────────────────────── -->
+    <Dialog v-model:visible="cudaInstallDialogVisible" header="Install CUDA Toolkit" modal :style="{ width: '400px' }">
+      <div class="dialog-body">
+        <div class="form-field">
+          <label>Version</label>
+          <Dropdown v-model="cudaInstallVersion" :options="cudaVersionOptions"
+            placeholder="Select version…" style="width:100%" />
+        </div>
+      </div>
+      <template #footer>
+        <Button label="Cancel" severity="secondary" outlined @click="cudaInstallDialogVisible = false" />
+        <Button label="Install" icon="pi pi-download" severity="success"
+          :disabled="!cudaInstallVersion" :loading="cudaInstalling"
+          @click="installCuda" />
+      </template>
+    </Dialog>
+
+    <!-- ── LMDeploy ───────────────────────────────────────── -->
+    <section class="ev-section">
+      <div class="ev-section-header">
+        <div class="ev-section-title">
+          <i class="pi pi-server" />
+          <h2>LMDeploy</h2>
+          <Tag v-if="lm.installed" :value="`v${lm.version || '?'}`" severity="success" />
+          <Tag v-else value="Not Installed" severity="secondary" />
+        </div>
+        <div class="ev-section-actions">
+          <Button label="Updates" icon="pi pi-search" text severity="info" size="small"
+            :loading="checkingLmdeploy" @click="checkLmdeployUpdates" />
+          <Button icon="pi pi-refresh" text severity="secondary" size="small"
+            @click="enginesStore.fetchLmdeployStatus()" />
+        </div>
+      </div>
+      <div class="ev-section-body">
+        <div v-if="lmdeployUpdateInfo?.update_available" class="update-banner">
+          <i class="pi pi-arrow-up-right" />
+          Update available: <strong>v{{ lmdeployUpdateInfo.latest_version }}</strong>
+          <a href="https://pypi.org/project/lmdeploy/" target="_blank" class="update-link">View on PyPI</a>
+        </div>
+        <div v-else-if="lmdeployUpdateInfo" class="update-current">
+          <i class="pi pi-check" /> Up to date (v{{ lmdeployUpdateInfo.current_version || 'none' }})
+        </div>
+
+        <ProgressTracker type="install" />
+
+        <div v-if="lm.installed" class="status-detail">
+          <span class="detail-label">Install type:</span>
+          <Tag :value="lm.install_type || 'pip'" severity="info" />
+          <template v-if="lm.venv_path">
+            <span class="detail-label ml">Venv:</span>
+            <code>{{ lm.venv_path }}</code>
+          </template>
+        </div>
+        <div v-if="lm.source_repo" class="status-detail">
+          <span class="detail-label">Source:</span>
+          <code>{{ lm.source_repo }} ({{ lm.source_branch }})</code>
+        </div>
+
+        <div class="ev-actions">
+          <Button label="Install from pip" icon="pi pi-download" severity="success" outlined size="small"
+            :disabled="lm.installed" @click="lmPipDialogVisible = true" />
+          <Button label="Install from Source" icon="pi pi-code" severity="info" outlined size="small"
+            :disabled="lm.installed" @click="lmSourceDialogVisible = true" />
+        </div>
+
+        <div v-if="lm.installed" class="ev-actions" style="margin-top:1rem; border-top:1px solid var(--border-primary); padding-top:1rem">
+          <Button label="Remove LMDeploy" icon="pi pi-trash" severity="danger" outlined
+            :loading="lmdeployRemoving" @click="confirmRemoveLmdeploy" />
+        </div>
+      </div>
+    </section>
+
+    <!-- ── Install Release Dialog ─────────────────────────── -->
+    <Dialog v-model:visible="releaseDialogVisible"
+      :header="`Install ${releaseTarget === 'ik_llama' ? 'ik_llama.cpp' : 'llama.cpp'} Release`"
+      modal :style="{ width: '520px' }">
+      <div class="dialog-body">
+        <div v-if="loadingReleases" class="dialog-loading">
+          <ProgressSpinner style="width:40px;height:40px" strokeWidth="4" />
+          <span>Fetching releases…</span>
+        </div>
+        <template v-else>
+          <div class="form-field">
+            <label>Release Tag</label>
+            <Dropdown v-model="selectedReleaseTag" :options="releaseTagOptions"
+              placeholder="Select release…" style="width:100%"
+              @change="loadReleaseAssets" />
+          </div>
+          <div v-if="releaseAssets.length" class="form-field">
+            <label>Asset</label>
+            <div class="asset-list">
+              <div v-for="asset in releaseAssets" :key="asset.id"
+                class="asset-option" :class="{ selected: selectedAssetId === asset.id }"
+                @click="selectedAssetId = asset.id">
+                <RadioButton :value="asset.id" v-model="selectedAssetId" />
+                <span class="asset-name">{{ asset.name }}</span>
+                <span class="asset-size">{{ formatBytes(asset.size) }}</span>
+              </div>
+            </div>
+          </div>
+          <Message v-if="!releaseTagOptions.length" severity="warn" :closable="false">
+            No compatible release assets found.
+          </Message>
+        </template>
+      </div>
+      <template #footer>
+        <Button label="Cancel" severity="secondary" outlined @click="releaseDialogVisible = false" />
+        <Button label="Install" icon="pi pi-download" severity="success"
+          :disabled="!selectedReleaseTag || loadingReleases || installingRelease"
+          :loading="installingRelease"
+          @click="doInstallRelease" />
+      </template>
+    </Dialog>
+
+    <!-- ── Build from Source Dialog ──────────────────────── -->
+    <Dialog v-model:visible="buildDialogVisible"
+      :header="`Build ${buildTarget === 'ik_llama' ? 'ik_llama.cpp' : 'llama.cpp'} from Source`"
+      modal :style="{ width: '560px' }">
+      <div class="dialog-body">
+        <div class="form-field">
+          <label>Commit / Branch</label>
+          <InputText v-model="buildForm.commitSha"
+            :placeholder="buildTarget === 'ik_llama' ? 'main' : 'master'"
+            style="width:100%" />
+          <small>Leave blank for default branch</small>
+        </div>
+        <div class="form-field">
+          <label>Build Name Suffix <span class="optional">(optional)</span></label>
+          <InputText v-model="buildForm.versionSuffix" placeholder="e.g. my-build" style="width:100%" />
+          <small>Appended to version name. Defaults to timestamp if empty.</small>
+        </div>
+        <div class="form-field">
+          <label>Build Options</label>
+          <div class="toggle-grid">
+            <div v-for="opt in buildOptions" :key="opt.key" class="toggle-row">
+              <InputSwitch v-model="buildForm.buildConfig[opt.key]" />
+              <div>
+                <span class="opt-label">{{ opt.label }}</span>
+                <small class="opt-desc">{{ opt.desc }}</small>
+              </div>
+            </div>
+          </div>
+        </div>
+        <div v-if="buildForm.buildConfig.cuda" class="form-field">
+          <label>CUDA Architectures <span class="optional">(optional)</span></label>
+          <InputText v-model="buildForm.buildConfig.cuda_architectures"
+            placeholder="e.g. 86;89 (blank = auto)" style="width:100%" />
+        </div>
+      </div>
+      <template #footer>
+        <Button label="Cancel" severity="secondary" outlined @click="buildDialogVisible = false" />
+        <Button label="Start Build" icon="pi pi-cog" severity="info"
+          :loading="building" @click="doStartBuild" />
+      </template>
+    </Dialog>
+
+    <!-- ── LMDeploy Install from pip Dialog ───────────────── -->
+    <Dialog v-model:visible="lmPipDialogVisible" header="Install LMDeploy from pip" modal :style="{ width: '420px' }">
+      <div class="dialog-body">
+        <div class="form-field">
+          <label>Version</label>
+          <InputText v-model="lmdeployPipVersion" placeholder="Blank = latest" style="width:100%" />
+          <small>Leave blank to install the latest from PyPI.</small>
+        </div>
+      </div>
+      <template #footer>
+        <Button label="Cancel" severity="secondary" outlined @click="lmPipDialogVisible = false" />
+        <Button label="Install" icon="pi pi-download" severity="success"
+          :loading="lmdeployInstalling" :disabled="lmdeployInstalling"
+          @click="installLmdeployPip" />
+      </template>
+    </Dialog>
+
+    <!-- ── LMDeploy Install from Source Dialog ─────────────── -->
+    <Dialog v-model:visible="lmSourceDialogVisible" header="Install LMDeploy from Source" modal :style="{ width: '480px' }">
+      <div class="dialog-body">
+        <div class="form-field">
+          <label>Repo URL</label>
+          <InputText v-model="lmSourceRepo" placeholder="https://github.com/InternLM/lmdeploy.git" style="width:100%" />
+        </div>
+        <div class="form-field">
+          <label>Branch</label>
+          <InputText v-model="lmSourceBranch" placeholder="main" style="width:100%" />
+        </div>
+      </div>
+      <template #footer>
+        <Button label="Cancel" severity="secondary" outlined @click="lmSourceDialogVisible = false" />
+        <Button label="Install from Source" icon="pi pi-code" severity="info"
+          :loading="lmdeployInstalling" :disabled="lmdeployInstalling"
+          @click="installLmdeploySource" />
+      </template>
+    </Dialog>
+
+    <ConfirmDialog />
+
+  </div>
+</template>
+
+<script setup>
+import { ref, computed, onMounted } from 'vue'
+import { useConfirm } from 'primevue/useconfirm'
+import { useToast } from 'primevue/usetoast'
+import Button from 'primevue/button'
+import Tag from 'primevue/tag'
+import ProgressBar from 'primevue/progressbar'
+import ProgressSpinner from 'primevue/progressspinner'
+import Dialog from 'primevue/dialog'
+import Dropdown from 'primevue/dropdown'
+import InputText from 'primevue/inputtext'
+import InputSwitch from 'primevue/inputswitch'
+import RadioButton from 'primevue/radiobutton'
+import ConfirmDialog from 'primevue/confirmdialog'
+import Message from 'primevue/message'
+import ProgressTracker from '@/components/common/ProgressTracker.vue'
+import VersionTable from '@/components/system/VersionTable.vue'
+import { useEnginesStore } from '@/stores/engines'
+import { formatBytes, formatBytesIEC } from '@/utils/formatting'
+
+const enginesStore = useEnginesStore()
+const confirm = useConfirm()
+const toast = useToast()
+
+// ── System metrics ─────────────────────────────────────────
+const systemExpanded = ref(true)
+
+const sys = computed(() => {
+  const s = enginesStore.systemStatus
+  return s?.system || s || {}
+})
+
+const gpu = computed(() => enginesStore.gpuInfo?.gpus?.[0] ?? null)
+
+const memPercent = computed(() => {
+  const m = sys.value.memory
+  const used = m?.used ?? 0
+  const total = m?.total ?? 0
+  return total > 0 ? Math.round((used / total) * 100) : 0
+})
+
+const diskPercent = computed(() => {
+  const d = sys.value.disk
+  const used = d?.used ?? 0
+  const total = d?.total ?? 0
+  return total > 0 ? Math.round((used / total) * 100) : 0
+})
+
+const gpuPercent = computed(() => {
+  const g = gpu.value
+  const used = g?.memory_used_mb ?? 0
+  const total = g?.memory_total_mb ?? 0
+  return total > 0 ? Math.round((used / total) * 100) : 0
+})
+
+// ── Active versions ────────────────────────────────────────
+const activeLlamaCpp = computed(() => enginesStore.llamaVersions.find(v => v.is_active) ?? null)
+const activeIkLlama = computed(() => enginesStore.ikLlamaVersions.find(v => v.is_active) ?? null)
+
+// ── Version activate / delete ──────────────────────────────
+const activating = ref(null)
+
+async function activateVersion(versionId) {
+  activating.value = versionId
+  try {
+    await enginesStore.activateVersion(versionId)
+    toast.add({ severity: 'success', summary: 'Version activated', life: 3000 })
+  } catch (e) {
+    toast.add({ severity: 'error', summary: 'Failed', detail: e.message, life: 4000 })
+  } finally {
+    activating.value = null
+  }
+}
+
+function confirmDeleteVersion(versionId) {
+  confirm.require({
+    message: `Delete version "${versionId}"?`,
+    header: 'Confirm Delete',
+    icon: 'pi pi-exclamation-triangle',
+    acceptClass: 'p-button-danger',
+    accept: async () => {
+      try {
+        await enginesStore.deleteVersion(versionId)
+        toast.add({ severity: 'info', summary: 'Version deleted', life: 3000 })
+      } catch (e) {
+        toast.add({ severity: 'error', summary: 'Failed', detail: e.message, life: 4000 })
+      }
+    },
+  })
+}
+
+// ── Update check (shared normalizer for llama/ik_llama API shape) ─────────
+function normalizeLlamaUpdateInfo(raw, currentVersion, commitUrlPrefix) {
+  if (!raw?.latest_release && !raw?.latest_commit) return null
+  const latestVersion = raw.latest_release?.tag_name || raw.latest_commit?.sha?.slice(0, 8) || null
+  const releaseUrl = raw.latest_release?.html_url ||
+    (raw.latest_commit ? `${commitUrlPrefix}/commit/${raw.latest_commit.sha}` : null)
+  const current = currentVersion || 'none'
+  const updateAvailable = latestVersion && current !== latestVersion
+  return {
+    update_available: updateAvailable,
+    latest_version: latestVersion,
+    release_url: releaseUrl,
+    current_version: current,
+    available_tags: raw.available_tags || (raw.latest_release?.tag_name ? [raw.latest_release.tag_name] : []),
+  }
+}
+
+const checkingLlamaCpp = ref(false)
+const llamaCppUpdateInfo = ref(null)
+
+async function checkLlamaCppUpdates() {
+  checkingLlamaCpp.value = true
+  try {
+    const raw = await enginesStore.checkLlamaCppUpdates()
+    llamaCppUpdateInfo.value = normalizeLlamaUpdateInfo(
+      raw,
+      activeLlamaCpp.value?.version,
+      'https://github.com/ggerganov/llama.cpp',
+    )
+    if (llamaCppUpdateInfo.value?.available_tags?.length) {
+      releaseTagOptions.value = llamaCppUpdateInfo.value.available_tags
+    } else if (llamaCppUpdateInfo.value?.latest_version) {
+      releaseTagOptions.value = [llamaCppUpdateInfo.value.latest_version]
+    }
+  } catch (e) {
+    toast.add({ severity: 'warn', summary: 'Could not check updates', detail: e.message, life: 3000 })
+  } finally {
+    checkingLlamaCpp.value = false
+  }
+}
+
+const checkingIkLlama = ref(false)
+const ikLlamaUpdateInfo = ref(null)
+
+async function checkIkLlamaUpdates() {
+  checkingIkLlama.value = true
+  try {
+    const raw = await enginesStore.checkIkLlamaUpdates()
+    ikLlamaUpdateInfo.value = normalizeLlamaUpdateInfo(
+      raw,
+      activeIkLlama.value?.version,
+      'https://github.com/ikawrakow/ik_llama.cpp',
+    )
+  } catch (e) {
+    toast.add({ severity: 'warn', summary: 'Could not check updates', detail: e.message, life: 3000 })
+  } finally {
+    checkingIkLlama.value = false
+  }
+}
+
+// ── Release install dialog ─────────────────────────────────
+const releaseDialogVisible = ref(false)
+const releaseTarget = ref('llama_cpp')
+const loadingReleases = ref(false)
+const releaseTagOptions = ref([])
+const releaseAssets = ref([])
+const selectedReleaseTag = ref(null)
+const selectedAssetId = ref(null)
+const installingRelease = ref(false)
+
+async function openReleaseDialog(engineKey) {
+  releaseTarget.value = engineKey
+  releaseDialogVisible.value = true
+  releaseAssets.value = []
+  selectedAssetId.value = null
+  if (!releaseTagOptions.value.length) {
+    loadingReleases.value = true
+    try {
+      await checkLlamaCppUpdates()
+    } finally {
+      loadingReleases.value = false
+    }
+  }
+  if (releaseTagOptions.value.length) {
+    selectedReleaseTag.value = releaseTagOptions.value[0]
+    await loadReleaseAssets()
+  }
+}
+
+async function loadReleaseAssets() {
+  if (!selectedReleaseTag.value) return
+  loadingReleases.value = true
+  try {
+    const data = await enginesStore.fetchReleaseAssets(selectedReleaseTag.value)
+    releaseAssets.value = data?.assets || []
+    if (releaseAssets.value.length) selectedAssetId.value = releaseAssets.value[0].id
+  } catch {
+    releaseAssets.value = []
+  } finally {
+    loadingReleases.value = false
+  }
+}
+
+async function doInstallRelease() {
+  installingRelease.value = true
+  try {
+    await enginesStore.installRelease({
+      tag_name: selectedReleaseTag.value,
+      asset_id: selectedAssetId.value || undefined,
+    })
+    releaseDialogVisible.value = false
+    toast.add({ severity: 'success', summary: 'Install started', detail: 'Track progress below', life: 3000 })
+  } catch (e) {
+    toast.add({ severity: 'error', summary: 'Install failed', detail: e.message, life: 4000 })
+  } finally {
+    installingRelease.value = false
+  }
+}
+
+// ── Build from source dialog ───────────────────────────────
+const buildDialogVisible = ref(false)
+const buildTarget = ref('llama_cpp')
+const building = ref(false)
+const buildForm = ref({
+  commitSha: '',
+  versionSuffix: '',
+  buildConfig: {
+    cuda: false,
+    flash_attention: false,
+    native: true,
+    backend_dl: false,
+    cpu_all_variants: false,
+    cuda_architectures: '',
+  },
+})
+const buildOptions = [
+  { key: 'cuda',             label: 'CUDA Support',             desc: 'GGML_CUDA=on' },
+  { key: 'flash_attention',  label: 'Flash Attention',          desc: 'GGML_CUDA_FA_ALL_QUANTS=on (requires CUDA)' },
+  { key: 'native',           label: 'Native CPU Optimizations', desc: 'GGML_NATIVE=on' },
+  { key: 'backend_dl',       label: 'Backend Dynamic Loading',  desc: 'GGML_BACKEND_DL=on' },
+  { key: 'cpu_all_variants', label: 'CPU All Variants',         desc: 'GGML_CPU_ALL_VARIANTS=on' },
+]
+
+function openBuildDialog(engineKey) {
+  buildTarget.value = engineKey
+  buildForm.value.commitSha = engineKey === 'ik_llama' ? 'main' : 'master'
+  buildForm.value.versionSuffix = ''
+  buildForm.value.buildConfig = {
+    cuda: false, flash_attention: false, native: true,
+    backend_dl: false, cpu_all_variants: false, cuda_architectures: '',
+  }
+  buildDialogVisible.value = true
+}
+
+async function doStartBuild() {
+  building.value = true
+  try {
+    const repoSource = buildTarget.value === 'ik_llama' ? 'ik_llama.cpp' : 'llama.cpp'
+    const config = { ...buildForm.value.buildConfig }
+    if (!config.cuda_architectures) delete config.cuda_architectures
+    await enginesStore.buildSource({
+      commit_sha: buildForm.value.commitSha || (buildTarget.value === 'ik_llama' ? 'main' : 'master'),
+      repository_source: repoSource,
+      version_suffix: buildForm.value.versionSuffix || undefined,
+      build_config: config,
+    })
+    buildDialogVisible.value = false
+    toast.add({ severity: 'success', summary: 'Build started', detail: 'Track progress below', life: 3000 })
+  } catch (e) {
+    toast.add({ severity: 'error', summary: 'Build failed', detail: e.message, life: 4000 })
+  } finally {
+    building.value = false
+  }
+}
+
+// ── CUDA ───────────────────────────────────────────────────
+const cuda = computed(() => enginesStore.cudaStatus || {})
+const cudaVersionOptions = ['12.9', '12.8', '12.7', '12.6', '12.5', '12.4', '12.3', '12.2', '12.1', '12.0', '11.9', '11.8']
+const cudaInstallVersion = ref(null)
+const cudaInstalling = ref(false)
+const cudaInstallDialogVisible = ref(false)
+
+async function installCuda() {
+  cudaInstalling.value = true
+  try {
+    await enginesStore.installCuda({ version: cudaInstallVersion.value })
+    cudaInstallDialogVisible.value = false
+    toast.add({ severity: 'success', summary: 'CUDA install started', detail: 'Track progress below', life: 3000 })
+    await enginesStore.fetchCudaStatus()
+  } catch (e) {
+    toast.add({ severity: 'error', summary: 'Failed', detail: e.message, life: 4000 })
+  } finally {
+    cudaInstalling.value = false
+  }
+}
+
+function confirmUninstallCuda(version) {
+  confirm.require({
+    message: `Uninstall CUDA ${version}?`,
+    header: 'Confirm Uninstall',
+    icon: 'pi pi-exclamation-triangle',
+    acceptClass: 'p-button-danger',
+    accept: async () => {
+      try {
+        await enginesStore.uninstallCuda({ version })
+        toast.add({ severity: 'info', summary: `CUDA ${version} uninstalled`, life: 3000 })
+      } catch (e) {
+        toast.add({ severity: 'error', summary: 'Failed', detail: e.message, life: 4000 })
+      }
+    },
+  })
+}
+
+// ── LMDeploy ───────────────────────────────────────────────
+const lm = computed(() => enginesStore.lmdeployStatus || {})
+const lmdeployPipVersion = ref('')
+const lmSourceRepo = ref('https://github.com/InternLM/lmdeploy.git')
+const lmSourceBranch = ref('main')
+const lmdeployInstalling = ref(false)
+const lmdeployRemoving = ref(false)
+const checkingLmdeploy = ref(false)
+const lmdeployUpdateInfo = ref(null)
+const lmPipDialogVisible = ref(false)
+const lmSourceDialogVisible = ref(false)
+
+async function checkLmdeployUpdates() {
+  checkingLmdeploy.value = true
+  try {
+    const raw = await enginesStore.checkLmdeployUpdates()
+    const current = lm.value?.version || null
+    const latest = raw?.latest_version || null
+    const updateAvailable = latest && current !== latest
+    lmdeployUpdateInfo.value = {
+      update_available: updateAvailable,
+      latest_version: latest,
+      current_version: current,
+    }
+  } catch (e) {
+    toast.add({ severity: 'warn', summary: 'Could not check updates', detail: e.message, life: 3000 })
+  } finally {
+    checkingLmdeploy.value = false
+  }
+}
+
+async function installLmdeployPip() {
+  lmdeployInstalling.value = true
+  try {
+    await enginesStore.installLmdeploy(lmdeployPipVersion.value ? { version: lmdeployPipVersion.value } : {})
+    lmPipDialogVisible.value = false
+    toast.add({ severity: 'success', summary: 'LMDeploy install started', detail: 'Track progress below', life: 3000 })
+  } catch (e) {
+    toast.add({ severity: 'error', summary: 'Failed', detail: e.message, life: 4000 })
+  } finally {
+    lmdeployInstalling.value = false
+  }
+}
+
+async function installLmdeploySource() {
+  lmdeployInstalling.value = true
+  try {
+    await enginesStore.installLmdeployFromSource({
+      repo_url: lmSourceRepo.value,
+      branch: lmSourceBranch.value,
+    })
+    lmSourceDialogVisible.value = false
+    toast.add({ severity: 'success', summary: 'Install from source started', detail: 'Track progress below', life: 3000 })
+  } catch (e) {
+    toast.add({ severity: 'error', summary: 'Failed', detail: e.message, life: 4000 })
+  } finally {
+    lmdeployInstalling.value = false
+  }
+}
+
+function confirmRemoveLmdeploy() {
+  confirm.require({
+    message: 'Remove LMDeploy from the venv?',
+    header: 'Confirm Remove',
+    icon: 'pi pi-exclamation-triangle',
+    acceptClass: 'p-button-danger',
+    accept: async () => {
+      lmdeployRemoving.value = true
+      try {
+        await enginesStore.removeLmdeploy()
+        toast.add({ severity: 'info', summary: 'LMDeploy removed', life: 3000 })
+      } catch (e) {
+        toast.add({ severity: 'error', summary: 'Failed', detail: e.message, life: 4000 })
+      } finally {
+        lmdeployRemoving.value = false
+      }
+    },
+  })
+}
+
+// ── Lifecycle ──────────────────────────────────────────────
+onMounted(() => enginesStore.fetchAll())
+</script>
+
+<style scoped>
+.engines-view {
+  max-width: 960px;
+  margin: 0 auto;
+  padding: var(--spacing-lg);
+  display: flex;
+  flex-direction: column;
+  gap: var(--spacing-lg);
+}
+
+/* ── Collapse transition ─────────────────────────────── */
+.ev-collapse-enter-active,
+.ev-collapse-leave-active { transition: all 0.2s ease; overflow: hidden; }
+.ev-collapse-enter-from,
+.ev-collapse-leave-to    { max-height: 0; opacity: 0; }
+.ev-collapse-enter-to,
+.ev-collapse-leave-from  { max-height: 600px; opacity: 1; }
+
+/* ── Section ─────────────────────────────────────────── */
+.ev-section {
+  background: var(--bg-card);
+  border: 1px solid var(--border-primary);
+  border-radius: var(--radius-lg);
+  overflow: hidden;
+}
+
+.ev-section-header {
+  display: flex;
+  justify-content: space-between;
+  align-items: center;
+  padding: 0.75rem 1.25rem;
+  background: var(--bg-surface);
+  border-bottom: 1px solid var(--border-primary);
+  cursor: default;
+  user-select: none;
+}
+
+.ev-section-title {
+  display: flex;
+  align-items: center;
+  gap: 0.5rem;
+}
+
+.ev-section-title h2 {
+  font-size: 1rem;
+  font-weight: 600;
+  margin: 0;
+}
+
+.ev-section-actions {
+  display: flex;
+  align-items: center;
+  gap: 0.25rem;
+}
+
+.ev-section-body {
+  padding: 1.25rem;
+}
+
+/* ── Metrics ─────────────────────────────────────────── */
+.metrics-grid {
+  display: grid;
+  grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
+  gap: 0.75rem;
+}
+
+.metric-card {
+  display: flex;
+  gap: 0.5rem;
+  align-items: flex-start;
+  background: var(--bg-surface);
+  padding: 0.75rem;
+  border-radius: var(--radius-md);
+  border: 1px solid var(--border-primary);
+}
+
+.metric-icon { font-size: 1.5rem; flex-shrink: 0; line-height: 1; color: var(--accent-cyan); }
+.metric-data { flex: 1; min-width: 0; }
+.metric-label { font-size: 0.7rem; text-transform: uppercase; letter-spacing: 0.05em; color: var(--text-secondary); margin-bottom: 0.2rem; }
+.metric-value { font-size: 0.875rem; font-weight: 600; }
+.metric-bar { margin-top: 0.5rem; }
+/* No text inside the bar so low percentages don’t get clipped; value is shown above */
+
+/* ── Actions ─────────────────────────────────────────── */
+.ev-actions {
+  display: flex;
+  gap: 0.5rem;
+  align-items: center;
+  flex-wrap: wrap;
+  margin-bottom: 0.75rem;
+}
+
+.ev-subsection { margin-top: 1.25rem; }
+.ev-subsection h4 {
+  font-size: 0.75rem;
+  font-weight: 600;
+  text-transform: uppercase;
+  letter-spacing: 0.05em;
+  color: var(--text-secondary, #9ca3af);
+  margin: 0 0 0.5rem;
+}
+
+.ev-form {
+  display: flex;
+  flex-direction: column;
+  gap: 0.5rem;
+}
+
+.form-row {
+  display: flex;
+  align-items: center;
+  gap: 0.5rem;
+}
+
+.form-row label {
+  font-size: 0.875rem;
+  width: 88px;
+  flex-shrink: 0;
+  color: var(--text-secondary);
+}
+
+.form-input      { flex: 1; }
+.form-input-short { width: 140px; }
+
+/* ── Status details ──────────────────────────────────── */
+.status-detail {
+  display: flex;
+  align-items: center;
+  gap: 0.5rem;
+  font-size: 0.875rem;
+  margin-bottom: 0.75rem;
+  flex-wrap: wrap;
+}
+
+.detail-label { color: var(--text-secondary); flex-shrink: 0; }
+.ml { margin-left: 0.75rem; }
+
+code {
+  background: var(--bg-surface);
+  padding: 0.1em 0.4em;
+  border-radius: 0.25rem;
+  font-size: 0.8rem;
+  font-family: monospace;
+  word-break: break-all;
+}
+
+/* ── Version list (CUDA / table-like) ───────────────── */
+.ev-version-list {
+  display: flex;
+  flex-direction: column;
+  gap: 0.25rem;
+  margin-bottom: 0.75rem;
+}
+
+.ev-version-row {
+  display: flex;
+  align-items: center;
+  gap: 0.5rem;
+  padding: 0.5rem 0.75rem;
+  background: var(--bg-surface);
+  border: 1px solid var(--border-primary);
+  border-radius: var(--radius-md);
+  font-size: 0.875rem;
+}
+
+.ev-version-row .version-name { flex: 1; margin: 0; }
+
+.empty-state-mini {
+  display: flex;
+  align-items: center;
+  gap: 0.5rem;
+  padding: 0.75rem;
+  color: var(--text-secondary);
+  font-size: 0.875rem;
+  margin-bottom: 0.75rem;
+}
+
+.empty-state-mini i { color: var(--text-muted); }
+
+.cuda-version-select { min-width: 160px; }
+.lm-version-input { width: 220px; }
+
+/* ── Update banners ──────────────────────────────────── */
+.update-banner, .update-current {
+  display: flex;
+  align-items: center;
+  gap: 0.5rem;
+  padding: 0.5rem 0.75rem;
+  border-radius: var(--radius-md);
+  font-size: 0.875rem;
+  margin-bottom: 0.75rem;
+}
+
+.update-banner {
+  background: var(--status-warning-soft);
+  border: 1px solid rgba(245, 158, 11, 0.3);
+  color: var(--status-warning);
+}
+
+.update-current {
+  background: var(--status-success-soft);
+  border: 1px solid rgba(16, 185, 129, 0.3);
+  color: var(--status-success);
+}
+
+.update-link {
+  color: inherit;
+  margin-left: 0.5rem;
+  text-decoration: underline;
+  opacity: 0.8;
+}
+
+/* ── Dialog ──────────────────────────────────────────── */
+.dialog-body {
+  display: flex;
+  flex-direction: column;
+  gap: 0.75rem;
+}
+
+.dialog-loading {
+  display: flex;
+  flex-direction: column;
+  align-items: center;
+  gap: 0.75rem;
+  padding: 2rem 0;
+  color: var(--text-secondary);
+}
+
+.form-field {
+  display: flex;
+  flex-direction: column;
+  gap: 0.25rem;
+}
+
+.form-field label {
+  font-size: 0.875rem;
+  font-weight: 500;
+  color: var(--text-secondary);
+}
+
+.form-field small { font-size: 0.75rem; color: var(--text-secondary); }
+.optional { font-weight: 400; opacity: 0.6; }
+
+.asset-list {
+  display: flex;
+  flex-direction: column;
+  gap: 0.25rem;
+  max-height: 240px;
+  overflow-y: auto;
+}
+
+.asset-option {
+  display: flex;
+  align-items: center;
+  gap: 0.5rem;
+  padding: 0.4rem 0.6rem;
+  border-radius: var(--radius-md, 0.5rem);
+  cursor: pointer;
+  border: 1px solid transparent;
+  transition: background 0.15s;
+}
+
+.asset-option:hover { background: var(--bg-surface); }
+.asset-option.selected {
+  background: var(--bg-surface);
+  border-color: var(--accent-cyan);
+}
+
+.asset-name { flex: 1; font-size: 0.8rem; font-family: monospace; }
+.asset-size { font-size: 0.75rem; color: var(--text-secondary); }
+
+.toggle-grid { display: flex; flex-direction: column; gap: 0.5rem; }
+
+.toggle-row {
+  display: flex;
+  align-items: flex-start;
+  gap: 0.75rem;
+}
+
+.opt-label { font-size: 0.875rem; font-weight: 500; display: block; }
+.opt-desc  { font-size: 0.75rem; color: var(--text-secondary); display: block; }
+</style>
diff --git a/frontend/src/views/LMDeploy.vue b/frontend/src/views/LMDeploy.vue
deleted file mode 100644
index e65b35a..0000000
--- a/frontend/src/views/LMDeploy.vue
+++ /dev/null
@@ -1,298 +0,0 @@
-<template>
-  <div class="lmdeploy-page">
-    <section class="card">
-      <header class="card-header">
-        <div>
-          <h2>LMDeploy Installer</h2>
-          <p class="card-subtitle">Install or remove LMDeploy inside the running container without rebuilding the image.</p>
-        </div>
-        <Tag
-          :value="installed ? 'Installed' : 'Not Installed'"
-          :severity="installed ? 'success' : 'warning'"
-        />
-      </header>
-
-      <div class="status-grid">
-        <div>
-          <label>Status</label>
-          <div class="status-value">
-            <i
-              :class="[
-                'pi',
-                operationInProgress ? 'pi-spin pi-spinner text-warning' : installed ? 'pi-check-circle text-success' : 'pi-info-circle text-muted'
-              ]"
-            ></i>
-            <span>
-              {{ operationInProgress ? `Running ${status?.operation}…` : installed ? 'Ready' : 'Install required' }}
-            </span>
-          </div>
-          <small v-if="status?.operation_started_at">
-            Started at {{ formatDate(status.operation_started_at) }}
-          </small>
-        </div>
-        <div>
-          <label>Version</label>
-          <div class="status-value monospace">
-            {{ status?.version || 'Unknown' }}
-          </div>
-        </div>
-        <div>
-          <label>Binary Path</label>
-          <div class="status-value monospace truncate">
-            {{ status?.binary_path || 'Not found' }}
-          </div>
-        </div>
-        <div>
-          <label>Virtual Environment</label>
-          <div class="status-value monospace truncate">
-            {{ status?.venv_path || 'Not created' }}
-          </div>
-        </div>
-        <div>
-          <label>Last Error</label>
-          <div class="status-value error-text">
-            {{ status?.last_error || 'None' }}
-          </div>
-        </div>
-      </div>
-
-      <div class="card-actions">
-        <Button
-          label="Install LMDeploy"
-          icon="pi pi-download"
-          severity="success"
-          :loading="installing"
-          :disabled="operationInProgress || installed"
-          @click="startInstall"
-        />
-        <Button
-          label="Remove LMDeploy"
-          icon="pi pi-trash"
-          severity="danger"
-          outlined
-          :loading="removing"
-          :disabled="operationInProgress || !installed"
-          @click="startRemoval"
-        />
-        <Button
-          label="Refresh"
-          icon="pi pi-refresh"
-          severity="secondary"
-          text
-          :loading="statusLoading"
-          @click="refresh"
-        />
-      </div>
-
-      <p class="helper-text">
-        Need to run safetensors models? Install LMDeploy here, then start runtimes from the Safetensors panel.
-      </p>
-    </section>
-
-    <section class="card">
-      <header class="card-header">
-        <div>
-          <h3>Installer Logs</h3>
-          <p class="card-subtitle">Newest lines first. Use this to monitor pip progress if an install is running.</p>
-        </div>
-        <Button
-          icon="pi pi-refresh"
-          severity="secondary"
-          text
-          :loading="logLoading"
-          @click="refreshLogs"
-        />
-      </header>
-      <LogViewer v-if="logContent" :logs="parsedLogLines" mode="structured" />
-      <div v-else class="empty-log">
-        <i class="pi pi-info-circle"></i>
-        <p>No LMDeploy installer logs yet.</p>
-      </div>
-    </section>
-  </div>
-</template>
-
-<script setup>
-import { computed, onMounted, onUnmounted } from 'vue'
-import { toast } from 'vue3-toastify'
-import Button from 'primevue/button'
-import Tag from 'primevue/tag'
-
-import LogViewer from '@/components/common/LogViewer.vue'
-import { useLmdeployStore } from '@/stores/lmdeploy'
-import { formatDate } from '@/utils/formatting'
-
-const store = useLmdeployStore()
-
-const status = computed(() => store.status)
-const installed = computed(() => !!status.value?.installed)
-const operationInProgress = computed(() => !!status.value?.operation)
-const installing = computed(() => store.installing || status.value?.operation === 'install')
-const removing = computed(() => store.removing || status.value?.operation === 'remove')
-const statusLoading = computed(() => store.loading)
-const logLoading = computed(() => store.logLoading)
-const logContent = computed(() => store.logs || '')
-const parsedLogLines = computed(() =>
-  (store.logs || '')
-    .split('\n')
-    .filter(Boolean)
-    .map((line, index) => {
-      const match = line.match(/^\[(.*?)\]\s*(.*)$/)
-      const timestamp = match ? match[1] : ''
-      const data = match ? match[2] : line
-      return {
-        timestamp,
-        log_type: 'install',
-        data,
-        id: `${timestamp || 'log'}-${index}`
-      }
-    })
-)
-
-const refresh = async () => {
-  try {
-    await Promise.all([store.fetchStatus(), store.fetchLogs()])
-  } catch (error) {
-    toast.error('Failed to refresh LMDeploy status')
-  }
-}
-
-const refreshLogs = async () => {
-  try {
-    await store.fetchLogs(16384)
-  } catch (error) {
-    toast.error('Failed to refresh LMDeploy logs')
-  }
-}
-
-const startInstall = async () => {
-  try {
-    await store.install()
-    toast.success('LMDeploy installation started')
-  } catch (error) {
-    toast.error(error?.response?.data?.detail || 'Failed to start installation')
-  }
-}
-
-const startRemoval = async () => {
-  try {
-    await store.remove()
-    toast.success('LMDeploy removal started')
-  } catch (error) {
-    toast.error(error?.response?.data?.detail || 'Failed to start removal')
-  }
-}
-
-// formatDate is now imported from @/utils/formatting
-
-onMounted(() => {
-  refresh()
-  store.startWebSocketSubscriptions()
-})
-
-onUnmounted(() => {
-  store.stopWebSocketSubscriptions()
-})
-</script>
-
-<style scoped>
-.lmdeploy-page {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-xl);
-}
-
-.card {
-  background: var(--bg-card);
-  border-radius: var(--radius-xl);
-  padding: var(--spacing-xl);
-  box-shadow: var(--shadow-lg);
-  border: 1px solid var(--border-primary);
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-lg);
-}
-
-.card-header {
-  display: flex;
-  justify-content: space-between;
-  align-items: center;
-  gap: var(--spacing-lg);
-}
-
-.card-subtitle {
-  margin: 0;
-  color: var(--text-secondary);
-  font-size: 0.95rem;
-}
-
-.status-grid {
-  display: grid;
-  grid-template-columns: repeat(auto-fit, minmax(220px, 1fr));
-  gap: var(--spacing-lg);
-}
-
-.status-grid label {
-  display: block;
-  font-size: 0.75rem;
-  text-transform: uppercase;
-  letter-spacing: 0.08em;
-  color: var(--text-secondary);
-  margin-bottom: var(--spacing-xs);
-}
-
-.status-value {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-sm);
-}
-
-.monospace {
-  font-family: 'JetBrains Mono', 'Fira Code', monospace;
-}
-
-.truncate {
-  max-width: 320px;
-  white-space: nowrap;
-  overflow: hidden;
-  text-overflow: ellipsis;
-}
-
-.error-text {
-  color: var(--status-error);
-}
-
-.card-actions {
-  display: flex;
-  flex-wrap: wrap;
-  gap: var(--spacing-md);
-}
-
-.helper-text {
-  margin: 0;
-  font-size: 0.9rem;
-  color: var(--text-secondary);
-}
-
-.empty-log {
-  display: flex;
-  flex-direction: column;
-  align-items: center;
-  gap: var(--spacing-sm);
-  color: var(--text-secondary);
-  padding: var(--spacing-xl);
-}
-
-.text-success {
-  color: var(--status-success);
-}
-
-.text-warning {
-  color: var(--status-warning);
-}
-
-.text-muted {
-  color: var(--text-secondary);
-}
-</style>
-
diff --git a/frontend/src/views/LlamaCppManager.vue b/frontend/src/views/LlamaCppManager.vue
deleted file mode 100644
index 1e7c3c4..0000000
--- a/frontend/src/views/LlamaCppManager.vue
+++ /dev/null
@@ -1,339 +0,0 @@
-<template>
-  <div class="llama-manager">
-    <BaseCard>
-      <template #header>
-        <div class="card-header">
-          <div class="header-content">
-            <h2 class="card-title">llama.cpp Version Manager</h2>
-            <div class="active-version-header">
-              <Tag 
-                v-if="activeVersion"
-                :value="`Active: ${activeVersion.version}`" 
-                severity="success"
-                class="active-header-badge"
-              />
-              <Tag 
-                v-else-if="systemStore.llamaVersions.length > 0"
-                value="No Active Version" 
-                severity="warning"
-                class="active-header-badge"
-              />
-            </div>
-          </div>
-          <div class="header-actions">
-            <Button 
-              icon="pi pi-refresh" 
-              @click="refreshVersions"
-              :loading="refreshingVersions"
-              severity="secondary"
-              text
-              v-tooltip.top="'Refresh installed versions'"
-            />
-            <Button 
-              label="Check for Updates"
-              icon="pi pi-search" 
-              @click="checkUpdates"
-              :loading="checkingUpdates"
-              severity="info"
-              outlined
-              v-tooltip.top="'Check for available updates'"
-            />
-          </div>
-        </div>
-      </template>
-
-      <!-- Build Progress -->
-      <BuildProgress />
-
-      <!-- Update Information -->
-      <UpdateInfo
-        v-if="updateInfo"
-        :update-info="updateInfo"
-        :installing-release="installingRelease"
-        :building-source="buildingSource"
-        @install-release="openReleaseDialog"
-        @build-source="showBuildDialog"
-      />
-
-      <!-- Installed Versions -->
-      <VersionList
-        :versions="systemStore.llamaVersions"
-        :activating="activating"
-        @activate="activateVersion"
-        @delete="confirmDeleteVersion"
-      />
-    </BaseCard>
-
-    <!-- Release Install Dialog -->
-    <ReleaseDialog
-      v-model:visible="releaseDialogVisible"
-      :release-tag="selectedReleaseTag"
-      @installed="handleReleaseInstalled"
-      @hide="handleReleaseDialogHide"
-    />
-
-    <!-- Build from Source Dialog -->
-    <BuildDialog
-      v-model:visible="buildDialogVisible"
-      :build-capabilities="buildCapabilities"
-      :cuda-status="cudaStatus"
-      @build="handleBuild"
-      @show-cuda-install="showCudaInstallDialog"
-    />
-
-    <!-- CUDA Installation Dialog -->
-    <CudaInstallDialog
-      v-model:visible="cudaInstallDialogVisible"
-      :cuda-status="cudaStatus"
-      :cuda-install-progress="cudaInstallProgress"
-      :cuda-install-logs="cudaInstallLogs"
-      @install="handleCudaInstall"
-    />
-  </div>
-</template>
-
-<script setup>
-// Vue
-import { ref, onMounted, computed } from 'vue'
-
-// PrimeVue
-import Button from 'primevue/button'
-import Tag from 'primevue/tag'
-import { useConfirm } from 'primevue/useconfirm'
-
-// Third-party
-import { toast } from 'vue3-toastify'
-
-// Stores
-import { useSystemStore } from '@/stores/system'
-import { useWebSocketStore } from '@/stores/websocket'
-
-// Components
-import BuildProgress from '@/components/BuildProgress.vue'
-import BaseCard from '@/components/common/BaseCard.vue'
-import UpdateInfo from '@/components/system/LlamaCppManager/UpdateInfo.vue'
-import VersionList from '@/components/system/LlamaCppManager/VersionList.vue'
-import ReleaseDialog from '@/components/system/LlamaCppManager/ReleaseDialog.vue'
-import BuildDialog from '@/components/system/LlamaCppManager/BuildDialog.vue'
-import CudaInstallDialog from '@/components/system/LlamaCppManager/CudaInstallDialog.vue'
-
-const systemStore = useSystemStore()
-const wsStore = useWebSocketStore()
-const confirm = useConfirm()
-
-const updateInfo = ref(null)
-const activeVersion = computed(() => {
-  return systemStore.llamaVersions.find(version => version.is_active)
-})
-const checkingUpdates = ref(false)
-const refreshingVersions = ref(false)
-const installingRelease = ref(false)
-const buildingSource = ref(false)
-const buildDialogVisible = ref(false)
-const activating = ref(null)
-
-// CUDA installation
-const cudaStatus = ref(null)
-const cudaInstallDialogVisible = ref(false)
-const cudaInstallProgress = ref(null)
-const cudaInstallLogs = ref([])
-
-// Release dialog
-const releaseDialogVisible = ref(false)
-const selectedReleaseTag = ref(null)
-
-// Build capabilities
-const buildCapabilities = ref(null)
-
-const fetchBuildCapabilities = async () => {
-  try {
-    const response = await fetch('/api/llama-versions/build-capabilities')
-    if (response.ok) {
-      buildCapabilities.value = await response.json()
-    }
-  } catch (error) {
-    console.error('Failed to fetch build capabilities:', error)
-  }
-}
-
-const fetchCudaStatus = async () => {
-  try {
-    cudaStatus.value = await systemStore.getCudaStatus()
-  } catch (error) {
-    console.error('Failed to fetch CUDA status:', error)
-  }
-}
-
-const checkUpdates = async () => {
-  checkingUpdates.value = true
-  try {
-    updateInfo.value = await systemStore.checkUpdates()
-    toast.success('Updates checked successfully')
-  } catch (error) {
-    let errorMessage = 'Failed to check for updates'
-    if (error.response?.status === 429) {
-      errorMessage = 'GitHub API rate limit exceeded. Please try again later.'
-    } else if (error.response?.status === 404) {
-      errorMessage = 'GitHub repository not found'
-    } else if (error.response?.data?.detail) {
-      errorMessage = error.response.data.detail
-    }
-    toast.error(errorMessage)
-  } finally {
-    checkingUpdates.value = false
-  }
-}
-
-const refreshVersions = async () => {
-  refreshingVersions.value = true
-  try {
-    await systemStore.fetchLlamaVersions()
-    toast.success('Versions refreshed successfully')
-  } catch (error) {
-    toast.error('Failed to refresh versions')
-  } finally {
-    refreshingVersions.value = false
-  }
-}
-
-const openReleaseDialog = (tagName) => {
-  selectedReleaseTag.value = tagName
-  releaseDialogVisible.value = true
-}
-
-const handleReleaseDialogHide = () => {
-  selectedReleaseTag.value = null
-}
-
-const handleReleaseInstalled = () => {
-  releaseDialogVisible.value = false
-  selectedReleaseTag.value = null
-}
-
-const showBuildDialog = async () => {
-  await fetchBuildCapabilities()
-  buildDialogVisible.value = true
-}
-
-const handleBuild = () => {
-  buildingSource.value = true
-  buildDialogVisible.value = false
-  buildingSource.value = false
-}
-
-const activateVersion = async (versionId) => {
-  activating.value = versionId
-  try {
-    await systemStore.activateVersion(versionId)
-    toast.success('Version activated successfully')
-    await systemStore.fetchLlamaVersions()
-  } catch (error) {
-    toast.error('Failed to activate version')
-  } finally {
-    activating.value = null
-  }
-}
-
-const confirmDeleteVersion = (version) => {
-  confirm.require({
-    message: `Are you sure you want to delete version "${version.version}"? This will remove the binary files and cannot be undone.`,
-    header: 'Delete Version',
-    icon: 'pi pi-exclamation-triangle',
-    rejectLabel: 'Cancel',
-    acceptLabel: 'Delete',
-    accept: async () => {
-      try {
-        await systemStore.deleteVersion(version.id)
-        toast.success(`${version.version} has been deleted`)
-      } catch (error) {
-        toast.error('Failed to delete version')
-      }
-    }
-  })
-}
-
-const showCudaInstallDialog = async () => {
-  await fetchCudaStatus()
-  cudaInstallDialogVisible.value = true
-  cudaInstallProgress.value = null
-  cudaInstallLogs.value = []
-}
-
-const handleCudaInstall = async (version) => {
-  // Installation started, WebSocket will handle updates
-}
-
-// WebSocket handlers for CUDA installation
-const handleCudaInstallStatus = (data) => {
-  if (data.status === 'completed') {
-    cudaInstallProgress.value = { progress: 100, message: 'Installation completed!' }
-    toast.success(data.message || 'CUDA installation completed')
-    fetchCudaStatus()
-  } else if (data.status === 'failed') {
-    toast.error(data.message || 'CUDA installation failed')
-  }
-}
-
-const handleCudaInstallProgress = (data) => {
-  cudaInstallProgress.value = {
-    progress: data.progress || 0,
-    message: data.message || 'Installing...',
-    stage: data.stage || 'install'
-  }
-}
-
-const handleCudaInstallLog = (data) => {
-  if (data.line) {
-    cudaInstallLogs.value.push(data.line)
-    if (cudaInstallLogs.value.length > 100) {
-      cudaInstallLogs.value = cudaInstallLogs.value.slice(-100)
-    }
-  }
-}
-
-onMounted(async () => {
-  // Subscribe to CUDA installation events
-  wsStore.subscribe('cuda_install_status', handleCudaInstallStatus)
-  wsStore.subscribe('cuda_install_progress', handleCudaInstallProgress)
-  wsStore.subscribe('cuda_install_log', handleCudaInstallLog)
-
-  await fetchCudaStatus()
-  await systemStore.fetchLlamaVersions()
-  await fetchBuildCapabilities()
-  
-  if (!wsStore.isConnected) {
-    wsStore.connect()
-  }
-})
-</script>
-
-<style scoped>
-.llama-manager {
-  max-width: 1400px;
-  margin: 0 auto;
-}
-
-.card-header {
-  display: flex;
-  justify-content: space-between;
-  align-items: center;
-  margin-bottom: 0;
-  padding-bottom: 0;
-  border-bottom: none;
-}
-
-.header-content {
-  display: flex;
-  align-items: center;
-  gap: 1rem;
-}
-
-.active-version-header {
-  margin-left: 1rem;
-}
-
-.active-header-badge {
-  font-size: 0.875rem;
-  font-weight: 600;
-}
-</style>
diff --git a/frontend/src/views/ModelConfig.vue b/frontend/src/views/ModelConfig.vue
index 8e1d969..cb5ea3f 100644
--- a/frontend/src/views/ModelConfig.vue
+++ b/frontend/src/views/ModelConfig.vue
@@ -1,3768 +1,567 @@
 <template>
-  <div class="model-config">
-    <div class="config-layout">
-      <!-- Main Configuration Panel -->
-      <div class="config-main">
-        <div class="card">
-          <div class="card-header">
-            <div class="config-header">
-              <ModelInfoSection
-                :model="model"
-                :model-layer-info="modelLayerInfo"
-                :has-hf-metadata="hasHfMetadata"
-                :hf-metadata="hfMetadata"
-                :hf-metadata-loading="hfMetadataLoading"
-                :regenerating-info="regeneratingInfo"
-                :save-loading="saveLoading"
-              >
-                <template #actions>
-                  <Button 
-                    label="Quick Start" 
-                    icon="pi pi-bolt" 
-                    size="small"
-                    severity="info"
-                    outlined
-                    @click="showQuickStartModal = true"
-                    v-tooltip="'Choose a preset, use the wizard, or let Smart Auto optimize for you'"
-                  />
-                  <Button icon="pi pi-refresh" @click="regenerateModelInfo" :loading="regeneratingInfo"
-                    severity="secondary" size="small" outlined
-                    v-tooltip="'Regenerate model information from GGUF metadata'" />
-                  <Button label="Save Config" icon="pi pi-save" @click="saveConfig" :loading="saveLoading"
-                    severity="success" size="small" />
-                </template>
-              </ModelInfoSection>
-            </div>
-          </div>
-        </div>
-
-        <!-- Quick Start Modal -->
-        <QuickStartModal
-          v-model:visible="showQuickStartModal"
-          :auto-config-loading="autoConfigLoading"
-          :smart-auto-usage-mode="smartAutoUsageMode"
-          @wizard="showWizard = true"
-          @preset="applyPreset"
-          @smart-auto="handleQuickStartSmartAuto"
-        />
-
-        <!-- Empty State for New Models -->
-        <EmptyState
-          v-if="hasNoConfig && !showWizard && !showPreview && !showConfig"
-          :visible="hasNoConfig && !showConfig"
-          icon="🎯"
-          title="Configure Your Model"
-          description="This model doesn't have a configuration yet. Start with Smart Auto, choose a preset, or configure manually."
-          :show-smart-auto="true"
-          :show-presets="true"
-          @smart-auto="generateAutoConfig"
-          @presets="handleEmptyStatePresets"
-          @manual="handleEmptyStateManual"
-        />
-
-        <!-- Configuration Grid -->
-        <div 
-          v-if="!hasNoConfig || showWizard || showPreview || showConfig" 
-          class="config-tabs-wrapper"
-          @touchstart="handleTabSwipeStart"
-          @touchend="handleTabSwipeEnd"
-        >
-          <div class="config-search-bar" :class="{ 'search-focused': searchFocused }">
-            <span class="p-input-icon-left">
-              <i class="pi pi-search" />
-              <InputText 
-                v-model="configSearchQuery" 
-                placeholder="Search settings..."
-                class="config-search-input"
-                aria-label="Search configuration settings"
-                @focus="searchFocused = true"
-                @blur="searchFocused = false"
-              />
-            </span>
-            <Button 
-              v-if="configSearchQuery"
-              icon="pi pi-times" 
-              @click="configSearchQuery = ''" 
-              size="small" 
-              text 
-              rounded
-              severity="secondary"
-              aria-label="Clear search"
-              v-tooltip="'Clear search'"
-            />
-          </div>
-          <TabView v-model:activeIndex="activeTabIndex" :scrollable="true" class="config-tabs">
-          <!-- Essential Settings Tab -->
-          <TabPanel header="Essential" icon="pi pi-microchip">
-            <div class="tab-content">
-              <EssentialSettingsSection
-                :config="config"
-                :max-gpu-layers="maxGpuLayers"
-                :recommended-gpu-layers="recommendedGpuLayers"
-                :gpu-layers-tooltip="gpuLayersTooltip"
-                :gpu-layers-validation="gpuLayersValidation"
-                :gpu-available="gpuAvailable"
-                @update-vram-estimate="updateVramEstimate"
-              />
-            </div>
-          </TabPanel>
-
-          <!-- Memory & Context Tab -->
-          <TabPanel header="Memory & Context" icon="pi pi-memory">
-            <div class="tab-content">
-              <ContextParamsSection
-                :config="config"
-                :max-context-size="maxContextSize"
-                :max-batch-size="maxBatchSize"
-                :recommended-context-size="recommendedContextSize"
-                :recommended-batch-size="recommendedBatchSize"
-                :context-size-validation="contextSizeValidation"
-                :batch-size-validation="batchSizeValidation"
-                :context-size-tooltip="contextSizeTooltip"
-                :batch-size-tooltip="batchSizeTooltip"
-                @update-vram-estimate="updateVramEstimate"
-              />
-            </div>
-          </TabPanel>
-
-          <!-- Generation Tab -->
-          <TabPanel header="Generation" icon="pi pi-cog">
-            <div class="tab-content">
-              <GenerationParamsSection
-                :config="config"
-                :max-top-k="maxTopK"
-                :recommended-temperature="recommendedTemperature"
-                :recommended-top-k="recommendedTopK"
-                :recommended-top-p="recommendedTopP"
-                :is-min-p-supported="isMinPSupported"
-                :is-typical-p-supported="isTypicalPSupported"
-                :is-tfs-z-supported="isTfsZSupported"
-                :is-presence-penalty-supported="isPresencePenaltySupported"
-                :is-frequency-penalty-supported="isFrequencyPenaltySupported"
-                :temperature-tooltip="temperatureTooltip"
-                :top-k-tooltip="topKTooltip"
-                :top-p-tooltip="topPTooltip"
-                :repeat-penalty-tooltip="repeatPenaltyTooltip"
-              />
-              <div class="tab-section">
-                <h4 class="tab-section-title">
-                  <i class="pi pi-cog"></i>
-                  Additional Options
-                </h4>
-                <ConfigField label="Seed" help-text="Random seed (-1 for random)">
-                  <template #input>
-                    <InputNumber v-model="config.seed" :min="-1" :max="2147483647" />
-                  </template>
-                </ConfigField>
-                <ConfigField label="Stop Words (comma-separated)" help-text="Words that stop generation">
-                  <template #input>
-                    <InputText v-model="stopWordsInput" @blur="applyStopWords"
-                      placeholder="e.g. \\n, \\n\\n, &lt;/s&gt;" />
-                  </template>
-                </ConfigField>
-                <ConfigField label="Grammar" help-text="Optional grammar string">
-                  <template #input>
-                    <InputText v-model="config.grammar" placeholder="optional grammar" />
-                  </template>
-                </ConfigField>
-                <ConfigField v-if="isJsonSchemaSupported" label="JSON Schema" help-text="Optional JSON schema">
-                  <template #input>
-                    <InputText v-model="config.json_schema" placeholder="optional JSON schema" />
-                  </template>
-                </ConfigField>
-                <ConfigField label="Use Jinja Template" help-text="Enable Jinja templating">
-                  <template #input>
-                    <Checkbox v-model="config.jinja" binary />
-                  </template>
-                </ConfigField>
-              </div>
-            </div>
-          </TabPanel>
-
-          <!-- Performance Tab -->
-          <TabPanel header="Performance" icon="pi pi-tachometer">
-            <div class="tab-content">
-              <PerformanceSection
-                :config="config"
-                :model-layer-info="modelLayerInfo"
-                :max-parallel="maxParallel"
-                :parallel-validation="parallelValidation"
-                :gpu-available="gpuAvailable"
-                :is-embedding-model="isEmbeddingModel"
-                :is-cache-type-v-supported="isCacheTypeVSupported"
-              />
-            </div>
-          </TabPanel>
-
-          <!-- Advanced Tab -->
-          <TabPanel header="Advanced" icon="pi pi-wrench">
-            <div class="tab-content">
-              <AdvancedSection :config="config" />
-            </div>
-          </TabPanel>
-
-          <!-- Custom Arguments Tab -->
-          <TabPanel header="Custom Arguments" icon="pi pi-code">
-            <div class="tab-content">
-              <CustomArgsSection :config="config" />
-            </div>
-          </TabPanel>
-        </TabView>
-      </div>
-      </div>
-
-      <!-- Configuration Warnings -->
-      <ConfigWarnings :warnings="configWarnings" />
-      
-      <!-- Memory Status Dashboard (prominent top cards) -->
-      <div class="memory-dashboard" id="memory-dashboard">
-        <!-- RAM Monitor Card -->
-        <MemoryMonitor
-          title="System RAM"
-          :current-value="realtimeRamData?.used || null"
-          :estimated-value="ramEstimate?.estimated_ram || null"
-          :total-capacity="totalRamBytes"
-          :loading="ramLoading && !ramEstimate && !realtimeRamData"
-        />
-
-        <!-- VRAM Monitor Card (only shown when GPU is available) -->
-        <MemoryMonitor
-          v-if="!systemStore.gpuInfo.cpu_only_mode && systemStore.gpuInfo.device_count > 0"
-          title="VRAM"
-          :current-value="currentVramBytes"
-          :estimated-value="vramEstimate?.estimated_vram || null"
-          :total-capacity="totalVramBytes"
-          :loading="vramLoading && !vramEstimate && !realtimeVramData"
-        />
-      </div>
-
-    </div>
-
-    <!-- Configuration Wizard -->
-    <ConfigWizard
-      :model-visible="showWizard"
-      :model="model"
-      :gpu-info="systemStore.gpuInfo"
-      :model-layer-info="modelLayerInfo"
-      @close="showWizard = false"
-      @apply-config="handleWizardConfig"
-      @go-to-advanced="showWizard = false"
-    />
-
-    <!-- Configuration Change Preview -->
-    <ConfigChangePreview
-      :visible="showPreview"
-      @update:visible="showPreview = $event"
-      :type="previewData?.type || 'smart-auto'"
-      :preset-name="previewData?.presetName || ''"
-      :changes="previewData?.changes || []"
-      :impact="previewData?.impact"
-      :applying="previewApplying"
-      @apply="applyPreviewChanges"
-      @cancel="showPreview = false"
-    />
-
-    <!-- Onboarding Tour -->
-    <OnboardingTour
-      :visible="showOnboarding"
-      :steps="onboardingSteps"
-      @update:visible="showOnboarding = $event"
-      @complete="handleOnboardingComplete"
-      @skip="handleOnboardingSkip"
-    />
-  </div>
-</template>
-
-<script setup>
-import { ref, computed, onMounted, onUnmounted, watch, nextTick } from 'vue'
-import { useRoute, useRouter } from 'vue-router'
-import { useModelStore } from '@/stores/models'
-import { useSystemStore } from '@/stores/system'
-import { useWebSocketStore } from '@/stores/websocket'
-import { toast } from 'vue3-toastify'
-import Button from 'primevue/button'
-import InputNumber from 'primevue/inputnumber'
-import InputText from 'primevue/inputtext'
-import Textarea from 'primevue/textarea'
-import Checkbox from 'primevue/checkbox'
-import Dropdown from 'primevue/dropdown'
-import TabView from 'primevue/tabview'
-import TabPanel from 'primevue/tabpanel'
-import SliderInput from '@/components/SliderInput.vue'
-import MemoryMonitor from '@/components/config/MemoryMonitor.vue'
-import ConfigField from '@/components/config/ConfigField.vue'
-import ConfigWizard from '@/components/config/ConfigWizard.vue'
-import ConfigChangePreview from '@/components/config/ConfigChangePreview.vue'
-import OnboardingTour from '@/components/config/OnboardingTour.vue'
-import EmptyState from '@/components/config/EmptyState.vue'
-import ModelInfoSection from '@/components/config/ModelInfoSection.vue'
-import QuickStartModal from '@/components/config/QuickStartModal.vue'
-import GenerationParamsSection from '@/components/config/GenerationParamsSection.vue'
-import ContextParamsSection from '@/components/config/ContextParamsSection.vue'
-import MemoryParamsSection from '@/components/config/MemoryParamsSection.vue'
-import AdvancedSettingsSection from '@/components/config/AdvancedSettingsSection.vue'
-import PerformanceSection from '@/components/config/PerformanceSection.vue'
-import AdvancedSection from '@/components/config/AdvancedSection.vue'
-import CustomArgsSection from '@/components/config/CustomArgsSection.vue'
-import EssentialSettingsSection from '@/components/config/EssentialSettingsSection.vue'
-import ConfigWarnings from '@/components/config/ConfigWarnings.vue'
-
-// Utils
-import { formatFileSize } from '@/utils/formatting'
-
-const route = useRoute()
-const router = useRouter()
-const modelStore = useModelStore()
-const systemStore = useSystemStore()
-const wsStore = useWebSocketStore()
-
-// Reactive state
-const model = ref(null)
-const config = ref({})
-const showConfig = ref(false)
-const EMBEDDING_PIPELINE_TAGS = ['text-embedding', 'feature-extraction', 'sentence-similarity']
-const EMBEDDING_KEYWORDS = ['embedding', 'embed', 'nomic', 'gte', 'e5', 'bge', 'minilm']
-const isEmbeddingModel = computed(() => {
-  const current = model.value
-  if (!current) return false
-  if (current.is_embedding_model) {
-    return true
-  }
-  const pipeline = (current.pipeline_tag || '').toLowerCase()
-  if (EMBEDDING_PIPELINE_TAGS.includes(pipeline)) {
-    return true
-  }
-  const combined = [
-    current.huggingface_id,
-    current.base_model_name,
-    current.name
-  ].filter(Boolean).join(' ').toLowerCase()
-  return EMBEDDING_KEYWORDS.some(keyword => combined.includes(keyword))
-})
-const stopWordsInput = ref('')
-const applyStopWords = () => {
-  const parts = (stopWordsInput.value || '').split(',').map(s => s.trim()).filter(Boolean)
-  config.value.stop = parts
-}
-const vramEstimate = ref(null)
-const vramLoading = ref(false)
-const ramEstimate = ref(null)
-const ramLoading = ref(false)
-let vramEstimateTimeout = null
-let ramEstimateTimeout = null
-let vramAbortController = null
-let ramAbortController = null
-const ESTIMATE_DEBOUNCE_MS = 400
-const autoConfigLoading = ref(false)
-const saveLoading = ref(false)
-const modelLayerInfo = ref(null)
-const modelRecommendations = ref(null)
-const hfMetadata = ref(null)
-const hfLayerInfo = ref(null)
-const hfMetadataLoading = ref(false)
-const regeneratingInfo = ref(false)
-const layerInfoLoading = ref(false)
-const recommendationsLoading = ref(false)
-const smartAutoUsageMode = ref('single_user')
-const configSearchQuery = ref('')
-const searchFocused = ref(false)
-const showWizard = ref(false)
-const showQuickStartModal = ref(false)
-const showPreview = ref(false)
-const previewData = ref(null)
-const previewApplying = ref(false)
-
-// Handlers for empty state actions
-const handleEmptyStatePresets = () => {
-  showConfig.value = true
-  activeTabIndex.value = 0
-}
-
-const handleEmptyStateManual = () => {
-  showConfig.value = true
-  // Keep activeTabIndex on whatever tab the user last interacted with (default 0)
-}
-
-watch(isEmbeddingModel, (value) => {
-  if (value && !config.value.embedding) {
-    config.value.embedding = true
-  }
-})
-
-// Real-time memory data from WebSocket
-const realtimeRamData = ref(null)
-const realtimeVramData = ref(null)
-
-// Performance metrics from unified monitoring
-
-// Active tab index for tabbed interface
-const activeTabIndex = ref(0)
-
-// Tab labels for search matching
-const tabLabels = [
-  { key: 'essential', label: 'Essential', keywords: ['gpu', 'layer', 'thread', 'cpu', 'loading', 'model'] },
-  { key: 'memory', label: 'Memory & Context', keywords: ['context', 'memory', 'batch', 'size', 'mmap', 'mlock', 'ram'] },
-  { key: 'generation', label: 'Generation', keywords: ['temperature', 'temp', 'top-k', 'top-p', 'repeat', 'penalty', 'sampling', 'token', 'generate', 'mirostat', 'seed', 'grammar', 'jinja'] },
-  { key: 'performance', label: 'Performance', keywords: ['parallel', 'flash', 'attention', 'vram', 'low', 'batching', 'offload', 'logits', 'embedding', 'cache', 'kv', 'quantization', 'moe', 'expert'] },
-  { key: 'advanced', label: 'Advanced', keywords: ['rope', 'yarn', 'freq', 'scale', 'scaling', 'yaml'] },
-  { key: 'custom', label: 'Custom Arguments', keywords: ['custom', 'argument', 'args', 'command', 'line'] }
-]
-
-// Swipe between tabs functionality
-const tabSwipeStartX = ref(0)
-const tabSwipeThreshold = 100
-
-const handleTabSwipeStart = (e) => {
-  if (!e.touches || e.touches.length === 0) return
-  tabSwipeStartX.value = e.touches[0].clientX
-}
-
-const handleTabSwipeEnd = (e) => {
-  if (!e.changedTouches || e.changedTouches.length === 0) return
-  
-  const deltaX = e.changedTouches[0].clientX - tabSwipeStartX.value
-  
-  // Swipe left (next tab) or right (previous tab)
-  if (Math.abs(deltaX) > tabSwipeThreshold) {
-    if (deltaX < 0 && activeTabIndex.value < tabLabels.length - 1) {
-      // Swipe left - next tab
-      activeTabIndex.value++
-    } else if (deltaX > 0 && activeTabIndex.value > 0) {
-      // Swipe right - previous tab
-      activeTabIndex.value--
-    }
-  }
-  
-  tabSwipeStartX.value = 0
-}
-
-// Supported flags from llama-server
-const supportedFlags = ref(null)
-const supportedConfigKeys = ref({})
-
-// GPU availability
-const gpuAvailable = computed(() => {
-  const dc = systemStore.gpuInfo?.device_count || 0
-  return dc > 0
-})
-
-watch(gpuAvailable, (avail) => {
-  if (!avail) {
-    if (config.value && config.value.n_gpu_layers !== 0) {
-      config.value.n_gpu_layers = 0
-      updateVramEstimate(true)
-    }
-  }
-})
-
-onMounted(() => {
-  if (!gpuAvailable.value) {
-    if (config.value && config.value.n_gpu_layers !== 0) {
-      config.value.n_gpu_layers = 0
-      updateVramEstimate(true)
-    }
-  }
-})
-
-// VRAM usage percentage
-const vramUsagePercentage = computed(() => {
-  if (!vramEstimate.value || !systemStore.gpuInfo.total_vram) return 0
-  return Math.round((vramEstimate.value.estimated_vram / systemStore.gpuInfo.total_vram) * 100)
-})
-
-// RAM usage percentage
-const ramUsagePercentage = computed(() => {
-  if (!ramEstimate.value || !systemStore.systemStatus.system?.memory?.total) return 0
-  return Math.round((ramEstimate.value.estimated_ram / systemStore.systemStatus.system.memory.total) * 100)
-})
-
-// Only show RAM warning when we have valid totals and estimate exceeds available
-const showRamWarning = computed(() => {
-  const est = ramEstimate.value
-  if (!est) return false
-  const total = est.system_ram_total || systemStore.systemStatus.system?.memory?.total || 0
-  const estimated = est.estimated_ram || 0
-  return total > 0 && estimated > total
-})
-
-// Stacked progress computed values for RAM (current + estimated additional)
-const totalRamBytes = computed(() => ramEstimate.value?.system_ram_total || systemStore.systemStatus.system?.memory?.total || 0)
-const currentRamBytes = computed(() => ramEstimate.value?.system_ram_used || systemStore.systemStatus.system?.memory?.used || 0)
-const estimatedRamBytes = computed(() => ramEstimate.value?.estimated_ram || 0)
-const totalEstimatedRamBytes = computed(() => {
-  // Total estimated RAM = current system usage + estimated additional RAM
-  return currentRamBytes.value + estimatedRamBytes.value
-})
-const currentRamPercent = computed(() => {
-  const total = totalRamBytes.value || 1
-  return Math.min(100, Math.max(0, Math.round((currentRamBytes.value / total) * 100)))
-})
-const additionalRamPercent = computed(() => {
-  const total = totalRamBytes.value || 1
-  const add = ramEstimate.value?.estimated_ram || 0
-  // Cap so current + additional does not exceed 100
-  const pct = Math.round((add / total) * 100)
-  return Math.max(0, Math.min(100 - currentRamPercent.value, pct))
-})
-
-// Stacked progress computed values for VRAM (current + estimated additional)
-const totalVramBytes = computed(() => {
-  if (realtimeVramData.value) {
-    if (typeof realtimeVramData.value.total === 'number') {
-      return realtimeVramData.value.total
-    }
-    if (typeof realtimeVramData.value.total_vram === 'number') {
-      return realtimeVramData.value.total_vram
-    }
-  }
-  return systemStore.gpuInfo.total_vram || 0
-})
-const currentVramBytes = computed(() => {
-  if (realtimeVramData.value && typeof realtimeVramData.value.used_vram === 'number') {
-    return realtimeVramData.value.used_vram
-  }
-  const total = systemStore.gpuInfo.total_vram || 0
-  const available = systemStore.gpuInfo.available_vram
-  if (total && typeof available === 'number') {
-    return Math.max(0, total - available)
-  }
-  return 0
-})
-const estimatedVramBytes = computed(() => vramEstimate.value?.estimated_vram || 0)
-const totalEstimatedVramBytes = computed(() => {
-  // Total estimated VRAM = current GPU usage + estimated additional VRAM
-  return currentVramBytes.value + estimatedVramBytes.value
-})
-const currentVramPercent = computed(() => {
-  const total = totalVramBytes.value || 1
-  return Math.min(100, Math.max(0, Math.round(((currentVramBytes.value || 0) / total) * 100)))
-})
-const additionalVramPercent = computed(() => {
-  const total = totalVramBytes.value || 1
-  const add = vramEstimate.value?.estimated_vram || 0
-  const pct = Math.round((add / total) * 100)
-  return Math.max(0, Math.min(100 - currentVramPercent.value, pct))
-})
-
-// RAM Status indicators for new dashboard
-const ramStatus = computed(() => {
-  if (!ramEstimate.value && !realtimeRamData.value) return 'unknown'
-  
-  let usagePercent = 0
-  if (ramEstimate.value?.system_ram_total) {
-    usagePercent = ((totalEstimatedRamBytes.value / ramEstimate.value.system_ram_total) * 100)
-  } else if (realtimeRamData.value?.percent) {
-    usagePercent = realtimeRamData.value.percent
-  }
-  
-  if (usagePercent < 70) return 'good'
-  if (usagePercent < 90) return 'warning'
-  return 'critical'
-})
-
-const ramStatusClass = computed(() => {
-  const status = ramStatus.value
-  if (status === 'good') return 'status-good'
-  if (status === 'warning') return 'status-warning'
-  if (status === 'critical') return 'status-critical'
-  return 'status-unknown'
-})
-
-const ramStatusText = computed(() => {
-  const status = ramStatus.value
-  if (status === 'good') return 'Fits Comfortably'
-  if (status === 'warning') return 'Tight Fit'
-  if (status === 'critical') return 'Won\'t Fit'
-  return 'Unknown'
-})
-
-
-const hasHfMetadata = computed(() => {
-  return (
-    hfMetadata.value ||
-    hfLayerInfo.value ||
-    (!!hfMetadataLoading.value)
-  )
-})
-
-
-const ramProgressText = computed(() => {
-  const current = currentRamPercent.value
-  const additional = additionalRamPercent.value
-  return `${current}% used + ${additional}% est • ${formatFileSize(totalEstimatedRamBytes.value)} total`
-})
-
-const ramStatusMessage = computed(() => {
-  const status = ramStatus.value
-  if (status === 'good') {
-    const available = totalRamBytes.value - totalEstimatedRamBytes.value
-    return `✅ Fits Comfortably - ${formatFileSize(available)} buffer remaining`
-  }
-  if (status === 'warning') {
-    return '⚠️ Memory usage is high - consider reducing context size or batch size'
-  }
-  if (status === 'critical') {
-    return '❌ Usage exceeds available memory - configuration will not work'
-  }
-  return 'Loading memory information...'
-})
-
-// VRAM Status indicators for new dashboard
-const vramStatus = computed(() => {
-  if (!vramEstimate.value && !realtimeVramData.value) return 'unknown'
-  
-  const usagePercent = totalVramBytes.value > 0 
-    ? (totalEstimatedVramBytes.value / totalVramBytes.value) * 100
-    : 0
-  
-  if (usagePercent < 70) return 'good'
-  if (usagePercent < 90) return 'warning'
-  return 'critical'
-})
-
-const vramStatusClass = computed(() => {
-  const status = vramStatus.value
-  if (status === 'good') return 'status-good'
-  if (status === 'warning') return 'status-warning'
-  if (status === 'critical') return 'status-critical'
-  return 'status-unknown'
-})
-
-const vramStatusText = computed(() => {
-  const status = vramStatus.value
-  if (status === 'good') return 'Fits Comfortably'
-  if (status === 'warning') return 'Tight Fit'
-  if (status === 'critical') return 'Won\'t Fit'
-  return 'Unknown'
-})
-
-const vramProgressText = computed(() => {
-  const current = currentVramPercent.value
-  const additional = additionalVramPercent.value
-  return `${current}% used + ${additional}% est • ${formatFileSize(totalEstimatedVramBytes.value)} total`
-})
-
-const vramStatusMessage = computed(() => {
-  const status = vramStatus.value
-  if (status === 'good') {
-    const available = totalVramBytes.value - totalEstimatedVramBytes.value
-    return `✅ Fits Comfortably - ${formatFileSize(available)} VRAM available`
-  }
-  if (status === 'warning') {
-    return '⚠️ VRAM usage is high - consider reducing GPU layers or batch size'
-  }
-  if (status === 'critical') {
-    return '❌ Usage exceeds available VRAM - reduce GPU layers or context size'
-  }
-  return 'Loading VRAM information...'
-})
-
-// Maximum values from backend recommendations
-const maxGpuLayers = computed(() => {
-  return modelRecommendations.value?.gpu_layers?.max || modelLayerInfo.value?.layer_count || 32
-})
-
-const maxContextSize = computed(() => {
-  return modelRecommendations.value?.context_size?.max || modelLayerInfo.value?.context_length || 131072
-})
-
-const maxBatchSize = computed(() => {
-  return modelRecommendations.value?.batch_size?.max || 512
-})
-
-// Maximum parallel processing based on attention head count
-const maxParallel = computed(() => {
-  return modelRecommendations.value?.parallel?.max || 8
-})
-
-const maxTopK = computed(() => {
-  return modelRecommendations.value?.top_k?.max || 200
-})
-
-// Recommended values from backend
-const recommendedGpuLayers = computed(() => modelRecommendations.value?.gpu_layers?.recommended_value)
-const recommendedContextSize = computed(() => modelRecommendations.value?.context_size?.recommended_value)
-const recommendedBatchSize = computed(() => modelRecommendations.value?.batch_size?.recommended_value)
-const recommendedTemperature = computed(() => modelRecommendations.value?.temperature?.recommended_value)
-const recommendedTopK = computed(() => modelRecommendations.value?.top_k?.recommended_value)
-const recommendedTopP = computed(() => modelRecommendations.value?.top_p?.recommended_value)
-
-// Load supported flags from the active llama-server binary
-const loadSupportedFlags = async () => {
-  try {
-    const response = await fetch('/api/models/supported-flags')
-    if (response.ok) {
-      const data = await response.json()
-      supportedFlags.value = data.supported_flags || []
-      supportedConfigKeys.value = data.supported_config_keys || {}
-    } else {
-      console.warn('Failed to load supported flags:', response.statusText)
-      supportedFlags.value = []
-      supportedConfigKeys.value = {}
-    }
-  } catch (error) {
-    console.error('Error loading supported flags:', error)
-    supportedFlags.value = []
-    supportedConfigKeys.value = {}
-  }
-}
-
-// Computed properties to check if specific flags are supported
-const isTypicalPSupported = computed(() => supportedConfigKeys.value.typical_p === true)
-const isMinPSupported = computed(() => supportedConfigKeys.value.min_p === true)
-const isTfsZSupported = computed(() => supportedConfigKeys.value.tfs_z === true)
-const isPresencePenaltySupported = computed(() => supportedConfigKeys.value.presence_penalty === true)
-const isFrequencyPenaltySupported = computed(() => supportedConfigKeys.value.frequency_penalty === true)
-const isJsonSchemaSupported = computed(() => supportedConfigKeys.value.json_schema === true)
-const isCacheTypeVSupported = computed(() => supportedConfigKeys.value.cache_type_v === true)
-
-// Individual field validation
-const gpuLayersValidation = computed(() => {
-  if (!modelLayerInfo.value) return null
-  if (config.value.n_gpu_layers > modelLayerInfo.value.layer_count) {
-    return {
-      type: 'error',
-      message: `Exceeds model's ${modelLayerInfo.value.layer_count} layers`
-    }
-  }
-  if (config.value.n_gpu_layers === modelLayerInfo.value.layer_count && modelLayerInfo.value.layer_count > 0) {
-    return {
-      type: 'success',
-      message: 'Fully offloaded to GPU'
-    }
-  }
-  return null
-})
-
-// Automatically translate context sizes that exceed the GGUF-reported
-// context_length into YARN-style scaling parameters for llama.cpp. This makes
-// "I want a bigger context" a single control while keeping advanced fields
-// in sync with that choice.
-watch(
-  () => [config.value.ctx_size, modelLayerInfo.value?.context_length],
-  ([ctxSize, ctxLengthRaw]) => {
-    const targetCtx = Number(ctxSize) || 0
-    const baseCtx = Number(ctxLengthRaw) || 0
-
-    // If we don't have a meaningful base context, or the user is within the
-    // native context window, clear auto YARN settings (when they still match
-    // our previous auto values) and leave any manual tuning untouched.
-    if (!baseCtx || targetCtx <= baseCtx) {
-      if (config.value.rope_scaling === autoYarnConfig.value.rope_scaling) {
-        config.value.rope_scaling = ''
-      }
-      if (config.value.yarn_ext_factor === autoYarnConfig.value.yarn_ext_factor) {
-        config.value.yarn_ext_factor = null
-      }
-      if (config.value.yarn_attn_factor === autoYarnConfig.value.yarn_attn_factor) {
-        config.value.yarn_attn_factor = null
-      }
-      autoYarnConfig.value = {
-        rope_scaling: '',
-        yarn_ext_factor: null,
-        yarn_attn_factor: null
-      }
-      return
-    }
-
-    const rawFactor = targetCtx / baseCtx
-    const factor = rawFactor > 1 ? Number(rawFactor.toFixed(2)) : 1.0
-
-    const nextAuto = {
-      rope_scaling: 'yarn',
-      yarn_ext_factor: factor,
-      // Keep attention factor neutral by default; users can override if needed.
-      yarn_attn_factor: 1.0
-    }
-
-    // Only auto-populate fields when they are empty or still equal to the
-    // last auto-generated values, so manual edits win.
-    if (!config.value.rope_scaling || config.value.rope_scaling === autoYarnConfig.value.rope_scaling) {
-      config.value.rope_scaling = nextAuto.rope_scaling
-    }
-    if (
-      config.value.yarn_ext_factor == null ||
-      config.value.yarn_ext_factor === autoYarnConfig.value.yarn_ext_factor
-    ) {
-      config.value.yarn_ext_factor = nextAuto.yarn_ext_factor
-    }
-    if (
-      config.value.yarn_attn_factor == null ||
-      config.value.yarn_attn_factor === autoYarnConfig.value.yarn_attn_factor
-    ) {
-      config.value.yarn_attn_factor = nextAuto.yarn_attn_factor
-    }
-
-    autoYarnConfig.value = nextAuto
-  }
-)
-
-const contextSizeValidation = computed(() => {
-  if (!modelLayerInfo.value) return null
-  if (config.value.ctx_size > modelLayerInfo.value.context_length) {
-    return {
-      type: 'warning',
-      message: `Exceeds model's max of ${modelLayerInfo.value.context_length.toLocaleString()}`
-    }
-  }
-  return null
-})
-
-const batchSizeValidation = computed(() => {
-  if (!modelLayerInfo.value) return null
-  if (config.value.batch_size > maxBatchSize.value) {
-    return {
-      type: 'warning',
-      message: `Exceeds recommended max of ${maxBatchSize.value}`
-    }
-  }
-  return null
-})
-
-const parallelValidation = computed(() => {
-  if (!modelLayerInfo.value) return null
-  if (config.value.parallel > maxParallel.value) {
-    return {
-      type: 'warning',
-      message: `Exceeds recommended max of ${maxParallel.value}`
-    }
-  }
-  return null
-})
-
-// Configuration validation warnings
-const configWarnings = computed(() => {
-  const warnings = []
-
-  if (modelLayerInfo.value) {
-    // Check context size
-    if (config.value.ctx_size > modelLayerInfo.value.context_length) {
-      warnings.push({
-        type: 'warning',
-        message: `Context size (${config.value.ctx_size}) exceeds model's maximum context length (${modelLayerInfo.value.context_length})`,
-        field: 'ctx_size'
-      })
-    }
-
-    // Check batch size
-    if (config.value.batch_size > maxBatchSize.value) {
-      warnings.push({
-        type: 'warning',
-        message: `Batch size (${config.value.batch_size}) exceeds recommended maximum (${maxBatchSize.value}) based on model architecture`,
-        field: 'batch_size'
-      })
-    }
-
-    // Check GPU layers
-    if (config.value.n_gpu_layers > modelLayerInfo.value.layer_count) {
-      warnings.push({
-        type: 'error',
-        message: `GPU layers (${config.value.n_gpu_layers}) exceeds model's total layers (${modelLayerInfo.value.layer_count})`,
-        field: 'n_gpu_layers'
-      })
-    }
-
-    // Check parallel processing
-    if (config.value.parallel > maxParallel.value) {
-      warnings.push({
-        type: 'warning',
-        message: `Parallel processing (${config.value.parallel}) exceeds recommended maximum (${maxParallel.value}) based on attention head count`,
-        field: 'parallel'
-      })
-    }
-  }
-
-  return warnings
-})
-
-onMounted(async () => {
-  await loadModel()
-  await systemStore.fetchSystemStatus()
-  await loadSupportedFlags()
-  initializeConfig()
-
-  // Subscribe to unified monitoring updates for real-time memory data
-  wsStore.subscribeToUnifiedMonitoring((data) => {
-    // Extract RAM data from unified stream
-    if (data.system?.memory) {
-      const memory = data.system.memory
-      const total = memory.total ?? 0
-      const available = memory.available ?? memory.free ?? 0
-      const used = memory.used ?? (total > 0 ? Math.max(0, total - available) : 0)
-      const percent = memory.percent ?? (total > 0 ? (used / total) * 100 : 0)
-      const swapTotal = memory.swap_total ?? 0
-      const swapUsed = memory.swap_used ?? 0
-      const swapPercent = memory.swap_percent ?? (swapTotal > 0 ? (swapUsed / swapTotal) * 100 : 0)
-
-      realtimeRamData.value = {
-        total,
-        available,
-        used,
-        percent,
-        free: memory.free ?? null,
-        cached: memory.cached ?? null,
-        buffers: memory.buffers ?? null,
-        swap_total: swapTotal,
-        swap_used: swapUsed,
-        swap_percent: swapPercent,
-        timestamp: Date.now()
-      }
-    }
-
-    // Extract VRAM data from unified stream
-    if (data.gpu?.vram_data) {
-      realtimeVramData.value = data.gpu.vram_data
-    }
-
-    // Extract running instance data for performance metrics
-  })
-
-  // Automatically trigger initial estimates if no real-time data yet
-  if (!realtimeRamData.value) {
-    estimateRam()
-  }
-  if (!realtimeVramData.value && !systemStore.gpuInfo.cpu_only_mode) {
-    estimateVram()
-  }
-})
-
-onUnmounted(() => {
-  // WebSocket subscriptions are automatically cleaned up by the store
-  if (vramEstimateTimeout) {
-    clearTimeout(vramEstimateTimeout)
-    vramEstimateTimeout = null
-  }
-  if (ramEstimateTimeout) {
-    clearTimeout(ramEstimateTimeout)
-    ramEstimateTimeout = null
-  }
-  if (vramAbortController) {
-    vramAbortController.abort()
-    vramAbortController = null
-  }
-  if (ramAbortController) {
-    ramAbortController.abort()
-    ramAbortController = null
-  }
-})
-
-const loadModel = async () => {
-  const modelId = route.params.id
-  if (modelId) {
-    try {
-      await modelStore.fetchModels()
-      // Find the specific quantization in the grouped structure
-      model.value = modelStore.allQuantizations.find(m => m.id === parseInt(modelId))
-      if (!model.value) {
-        toast.error('The requested model quantization could not be found')
-        router.push('/models')
-        return
-      }
-
-      // Load model layer information
-      await loadModelLayerInfo()
-      
-      // Load model recommendations
-      await loadModelRecommendations()
-      await loadHfMetadata()
-
-    } catch (error) {
-      console.error('Failed to load model:', error)
-      toast.error('Failed to load model')
-    }
-  }
-}
-
-const loadModelLayerInfo = async () => {
-  if (!model.value) return
-
-  layerInfoLoading.value = true
-  try {
-    const response = await fetch(`/api/models/${model.value.id}/layer-info`)
-    if (response.ok) {
-      modelLayerInfo.value = await response.json()
-      console.log('Model layer info:', modelLayerInfo.value)
-    } else {
-      console.warn('Failed to load model layer info, using defaults')
-      modelLayerInfo.value = { layer_count: 32 }
-    }
-  } catch (error) {
-    console.error('Error loading model layer info:', error)
-    modelLayerInfo.value = { layer_count: 32 }
-  } finally {
-    layerInfoLoading.value = false
-  }
-}
-
-const loadModelRecommendations = async () => {
-  if (!model.value) return
-
-  recommendationsLoading.value = true
-  try {
-    const response = await fetch(`/api/models/${model.value.id}/recommendations`)
-    if (response.ok) {
-      modelRecommendations.value = await response.json()
-      console.log('Model recommendations:', modelRecommendations.value)
-    } else {
-      console.warn('Failed to load model recommendations')
-      modelRecommendations.value = null
-    }
-  } catch (error) {
-    console.error('Error loading model recommendations:', error)
-    modelRecommendations.value = null
-  } finally {
-    recommendationsLoading.value = false
-  }
-}
-
-const loadHfMetadata = async () => {
-  if (!model.value) return
-  hfMetadataLoading.value = true
-  try {
-    const data = await modelStore.fetchHfMetadata(model.value.id)
-    hfMetadata.value = data?.metadata || null
-    hfLayerInfo.value = data?.gguf_layer_info || null
-  } catch (error) {
-    console.error('Failed to load Hugging Face metadata:', error)
-    hfMetadata.value = null
-    hfLayerInfo.value = null
-  } finally {
-    hfMetadataLoading.value = false
-  }
-}
-
-const regenerateModelInfo = async () => {
-  if (!model.value) return
-
-  regeneratingInfo.value = true
-  try {
-    const response = await fetch(`/api/models/${model.value.id}/regenerate-info`, {
-      method: 'POST',
-      headers: {
-        'Content-Type': 'application/json'
-      }
-    })
-
-    if (response.ok) {
-      const result = await response.json()
-      toast.success('Model information regenerated successfully')
-
-      // Update model from store if available
-      if (modelStore) {
-        await modelStore.fetchModels()
-        // Reload the current model
-        model.value = modelStore.allQuantizations.find(m => m.id === model.value.id)
-      }
-
-      // Reload layer info to get updated metadata
-      await loadModelLayerInfo()
-      
-      // Reload recommendations with updated metadata
-      await loadModelRecommendations()
-      await loadHfMetadata()
-
-      console.log('Regenerated model info:', result)
-    } else {
-      const error = await response.json().catch(() => ({ detail: 'Unknown error' }))
-      toast.error(`Failed to regenerate model info: ${error.detail || 'Unknown error'}`)
-    }
-  } catch (error) {
-    console.error('Error regenerating model info:', error)
-    toast.error('Failed to regenerate model information')
-  } finally {
-    regeneratingInfo.value = false
-  }
-}
-
-const applyEmbeddingDefaults = () => {
-  if (isEmbeddingModel.value && !config.value.embedding) {
-    config.value.embedding = true
-  }
-}
-
-const parseModelConfig = (rawConfig) => {
-  if (!rawConfig) return null
-  if (typeof rawConfig === 'object') {
-    return { ...rawConfig }
-  }
-  if (typeof rawConfig === 'string') {
-    try {
-      return JSON.parse(rawConfig)
-    } catch (error) {
-      console.error('Failed to parse model config:', error)
-      return null
-    }
-  }
-  return null
-}
-
-const initializeConfig = () => {
-  const defaults = getDefaultConfig()
-  const parsed = parseModelConfig(model.value?.config)
-  if (parsed) {
-    config.value = { ...defaults, ...parsed }
-  } else {
-    config.value = { ...defaults }
-  }
-  if ('port' in config.value) {
-    delete config.value.port
-  }
-  for (const key in defaults) {
-    if (config.value[key] === undefined || config.value[key] === null) {
-      config.value[key] = defaults[key]
-    }
-    if (typeof defaults[key] === 'boolean' && typeof config.value[key] !== 'boolean') {
-      config.value[key] = Boolean(config.value[key])
-    }
-    if (typeof defaults[key] === 'number' && typeof config.value[key] !== 'number') {
-      const num = Number(config.value[key])
-      config.value[key] = isNaN(num) ? defaults[key] : num
-    }
-  }
-  applyEmbeddingDefaults()
-}
-
-const getDefaultConfig = () => ({
-  n_gpu_layers: 0,
-  main_gpu: 0,
-  tensor_split: '',
-  ctx_size: 4096,
-  batch_size: 256,
-  ubatch_size: 128,
-  no_mmap: false,
-  mlock: false,
-  low_vram: false,
-  logits_all: false,
-  embedding: false,
-  cont_batching: true,
-  no_kv_offload: false,
-  n_predict: -1,
-  temp: 0.8,
-  temperature: 0.8,
-  top_k: 40,
-  top_p: 0.9,
-  repeat_penalty: 1.1,
-  threads: 6,
-  threads_batch: 6,
-  parallel: 1,
-  flash_attn: false,
-  cache_type_k: 'f16',
-  cache_type_v: null,
-  moe_offload_pattern: 'none',
-  moe_offload_custom: '',
-  rope_freq_base: null,
-  rope_freq_scale: null,
-  yarn_ext_factor: null,
-  yarn_attn_factor: null,
-  rope_scaling: '',
-  yaml: '',
-  customArgs: '',
-  min_p: null,
-  typical_p: null,
-  tfs_z: null,
-  presence_penalty: null,
-  frequency_penalty: null,
-  mirostat: null,
-  mirostat_tau: null,
-  mirostat_eta: null,
-  seed: null,
-  stop: [],
-  grammar: '',
-  json_schema: '',
-  jinja: false,
-  host: '0.0.0.0',
-  timeout: 300
-})
-
-// Track the last auto-generated YARN settings so we don't overwrite or erase
-// user-provided values when they manually tune advanced parameters.
-const autoYarnConfig = ref({
-  rope_scaling: '',
-  yarn_ext_factor: null,
-  yarn_attn_factor: null
-})
-
-// Apply architecture preset
-const selectedPreset = ref(null)
-const applyPreset = async (presetName, skipPreview = false) => {
-  if (!model.value) return
-
-  try {
-    // Verify preset exists
-    const response = await fetch(`/api/models/${model.value.id}/architecture-presets`)
-    if (!response.ok) throw new Error('Failed to fetch presets')
-
-    const data = await response.json()
-    const preset = data.presets[presetName]
-
-    if (!preset) {
-      toast.error(`Preset '${presetName}' not found`)
-      return
-    }
-
-    // Presets should generate full smart-auto config with preset tuning
-    // Set the preset and generate smart-auto configuration
-    selectedPreset.value = presetName
-    autoConfigLoading.value = true
-
-    try {
-      // Call smart-auto with the preset parameter
-      const params = new URLSearchParams()
-      params.append('preset', presetName)
-      if (smartAutoUsageMode.value) {
-        params.append('usage_mode', smartAutoUsageMode.value)
-      }
-      const url = `/api/models/${model.value.id}/smart-auto?${params.toString()}`
-      const smartAutoResponse = await fetch(url, {
-        method: 'POST',
-        headers: { 'Content-Type': 'application/json' }
-      })
-
-      if (!smartAutoResponse.ok) {
-        throw new Error(`Smart auto failed: ${smartAutoResponse.statusText}`)
-      }
-
-      const smartConfig = await smartAutoResponse.json()
-      const defaults = getDefaultConfig()
-      const newConfig = { ...defaults, ...smartConfig }
-
-      // Calculate changes - merge defaults with current config to get accurate "before" values
-      const currentConfig = { ...defaults, ...config.value }
-      const changes = calculateChanges(newConfig, currentConfig)
-      
-      // Calculate impact (only if changes detected)
-      let impact = null
-      if (changes.length > 0) {
-        try {
-          impact = await calculateImpact(newConfig, config.value)
-        } catch (impactError) {
-          console.warn('Failed to calculate impact for preset:', impactError)
-        }
-      }
-
-      // Always show preview for presets (unless skipPreview is true)
-      if (!skipPreview) {
-        // Show preview (even if no changes detected)
-        previewData.value = {
-          type: 'preset',
-          presetName: presetName,
-          changes: changes.length > 0 ? changes : [{
-            field: 'Configuration',
-            before: 'Current settings',
-            after: `${presetName} preset settings`,
-            description: `The ${presetName} preset will be applied`
-          }],
-          impact: impact,
-          newConfig: newConfig
-        }
-        showPreview.value = true
-        autoConfigLoading.value = false
-        return
-      }
-
-      // Apply directly if skipPreview is true
-      // Use proper merging to ensure reactivity (reuse defaults from above)
-      const mergedConfig = { ...defaults, ...config.value, ...newConfig }
-      config.value = mergedConfig
-      
-      // Ensure configuration UI is visible
-      showConfig.value = true
-
-      // Re-estimate memory to reflect config changes
-      await estimateVram()
-      await estimateRam()
-
-      toast.success(`${presetName.charAt(0).toUpperCase() + presetName.slice(1)} preset applied successfully`)
-    } finally {
-      autoConfigLoading.value = false
-    }
-  } catch (error) {
-    console.error('Error applying preset:', error)
-    toast.error(`Failed to apply preset: ${error.message || 'Unknown error'}`)
-    autoConfigLoading.value = false
-  }
-}
-
-// Watch for flash attention changes to warn about V cache
-watch(() => config.value.flash_attn, (newVal) => {
-  // Ensure cache_type_v has a safe value based on flash_attn state
-  if (!newVal) {
-    // When Flash Attention is disabled, V cache should be null or f16
-    if (!config.value.cache_type_v || typeof config.value.cache_type_v !== 'string') {
-      config.value.cache_type_v = null
-    }
-  } else {
-    // When Flash Attention is enabled, ensure V cache has a valid string value
-    if (!config.value.cache_type_v || typeof config.value.cache_type_v !== 'string') {
-      config.value.cache_type_v = config.value.cache_type_k || 'f16'
-    }
-  }
-})
-
-// Handle wizard-generated configuration
-const handleWizardConfig = async (wizardConfig) => {
-  try {
-    if (!model.value) {
-      toast.error('No model selected')
-      return
-    }
-
-    // Apply wizard configuration
-    const defaults = getDefaultConfig()
-    Object.assign(config.value, defaults, wizardConfig)
-
-    // Re-estimate memory to reflect config changes
-    await estimateVram()
-    await estimateRam()
-
-    // Automatically save to backend
-    saveLoading.value = true
-    try {
-      await modelStore.updateModelConfig(model.value.id, config.value)
-      
-      // Reload model to get updated config
-      await loadModel()
-      
-      toast.success('Configuration wizard settings applied and saved successfully')
-      showWizard.value = false
-    } catch (saveError) {
-      console.error('Save config error:', saveError)
-      toast.error('Configuration applied but failed to save. Please click Save Config manually.')
-    } finally {
-      saveLoading.value = false
-    }
-  } catch (error) {
-    console.error('Error applying wizard config:', error)
-    toast.error('Failed to apply wizard configuration')
-  }
-}
-
-// Handle smart auto from Quick Start Modal
-const handleQuickStartSmartAuto = (usageMode) => {
-  smartAutoUsageMode.value = usageMode
-  generateAutoConfig()
-}
-
-const generateAutoConfig = async (skipPreview = false) => {
-  autoConfigLoading.value = true
-  try {
-    if (!model.value) {
-      toast.error('No model selected')
-      return
-    }
-
-    // Call backend smart auto API (send preset and usage_mode if provided)
-    const params = new URLSearchParams()
-    if (selectedPreset.value) {
-      params.append('preset', selectedPreset.value)
-    }
-    if (smartAutoUsageMode.value) {
-      params.append('usage_mode', smartAutoUsageMode.value)
-    }
-    const queryString = params.toString()
-    const url = `/api/models/${model.value.id}/smart-auto${queryString ? '?' + queryString : ''}`
-    const response = await fetch(url, {
-      method: 'POST',
-      headers: { 'Content-Type': 'application/json' }
-    })
-
-    if (!response.ok) {
-      throw new Error(`Smart auto failed: ${response.statusText}`)
-    }
-
-    const smartConfig = await response.json()
-    console.log('Smart auto API response received:', smartConfig)
-    
-    const defaults = getDefaultConfig()
-    const newConfig = { ...defaults, ...smartConfig }
-    console.log('Merged new config for application:', newConfig)
-
-    // Calculate changes - merge defaults with current config to get accurate "before" values
-    const currentConfig = { ...defaults, ...config.value }
-    const changes = calculateChanges(newConfig, currentConfig)
-    console.log(`Detected ${changes.length} configuration changes`, changes)
-    
-    // Calculate impact (don't fail if this errors) - only if we have changes
-    let impact = null
-    if (changes.length > 0) {
-      try {
-        impact = await calculateImpact(newConfig, config.value)
-      } catch (impactError) {
-        console.warn('Failed to calculate impact, continuing anyway:', impactError)
-        // Continue without impact calculation
-      }
-    }
-
-    // Always show preview for smart-auto (unless skipPreview is true)
-    // This ensures users can see what will be applied
-    if (!skipPreview) {
-      console.log('Showing preview dialog', changes.length > 0 ? 'with changes' : 'to confirm application')
-      // Show preview (even if no changes detected, to confirm application)
-      previewData.value = {
-        type: 'smart-auto',
-        presetName: '',
-        changes: changes.length > 0 ? changes : [{
-          field: 'Configuration',
-          before: 'Current settings',
-          after: 'Optimized settings',
-          description: 'Smart Auto will apply optimized configuration'
-        }],
-        impact: impact,
-        newConfig: newConfig
-      }
-      showPreview.value = true
-      autoConfigLoading.value = false
-      return
-    }
-
-    // Apply the smart configuration directly (either skipPreview=true or no changes detected)
-    console.log('Applying smart auto config directly (no preview)')
-    
-    // Merge all fields from newConfig into current config, preserving reactivity
-    // Start with defaults, then current config, then overlay with new config
-    const mergedConfig = { ...defaults, ...config.value, ...newConfig }
-    console.log('Merged config before applying:', mergedConfig)
-    
-    // Ensure all fields have safe values (handle nulls/undefined)
-    for (const key in mergedConfig) {
-      const value = mergedConfig[key]
-      // Handle null values - use default if available, otherwise keep null for optional fields
-      if (value === null && key in defaults) {
-        mergedConfig[key] = defaults[key]
-      }
-      // Type coercion for critical fields based on defaults
-      if (key in defaults) {
-        const defaultVal = defaults[key]
-        if (typeof defaultVal === 'boolean' && typeof mergedConfig[key] !== 'boolean') {
-          mergedConfig[key] = Boolean(mergedConfig[key])
-        }
-        if (typeof defaultVal === 'number' && typeof mergedConfig[key] !== 'number') {
-          const num = Number(mergedConfig[key])
-          mergedConfig[key] = isNaN(num) ? defaultVal : num
-        }
-        if (typeof defaultVal === 'string' && typeof mergedConfig[key] !== 'string') {
-          mergedConfig[key] = String(mergedConfig[key] ?? defaultVal)
-        }
-      }
-    }
-    
-    // Assign the merged config to trigger reactivity
-    config.value = mergedConfig
-    
-    // Ensure configuration UI is visible
-    showConfig.value = true
-
-    // Show success message with optimization details
-    const isCpuOnlyMode = systemStore.gpuInfo.cpu_only_mode
-    const optimizationType = isCpuOnlyMode ? 'CPU-optimized' : 'GPU-optimized'
-
-    toast.success(`${optimizationType} configuration generated and applied successfully`)
-
-    // Update estimates after applying smart config
-    try {
-      await estimateVram()
-      await estimateRam()
-    } catch (estimateError) {
-      console.warn('Failed to update estimates:', estimateError)
-      // Don't fail if estimates fail
-    }
-
-  } catch (error) {
-    console.error('Error generating auto config:', error)
-    toast.error(`Failed to generate automatic configuration: ${error.message || 'Unknown error'}`)
-  } finally {
-    autoConfigLoading.value = false
-  }
-}
-
-const estimateVram = async () => {
-  if (!model.value) return
-
-  if (vramAbortController) {
-    vramAbortController.abort()
-  }
-
-  const controller = new AbortController()
-  vramAbortController = controller
-  vramLoading.value = true
-
-  try {
-    const response = await fetch('/api/models/vram-estimate', {
-      method: 'POST',
-      headers: {
-        'Content-Type': 'application/json'
-      },
-      body: JSON.stringify({
-        model_id: model.value.id,
-        config: config.value,
-        usage_mode: smartAutoUsageMode.value
-      }),
-      signal: controller.signal
-    })
-
-    if (!response.ok) {
-      throw new Error('VRAM estimation failed')
-    }
-
-    const data = await response.json()
-    vramEstimate.value = data
-  } catch (error) {
-    if (error?.name === 'AbortError') {
-      return
-    }
-    console.error('VRAM estimation error:', error)
-    toast.error('Could not estimate VRAM usage')
-  } finally {
-    if (vramAbortController === controller) {
-      vramLoading.value = false
-      vramAbortController = null
-    }
-  }
-}
-
-const estimateRam = async () => {
-  if (!model.value) return
-
-  if (ramAbortController) {
-    ramAbortController.abort()
-  }
-
-  const controller = new AbortController()
-  ramAbortController = controller
-  ramLoading.value = true
-
-  try {
-    const response = await fetch('/api/models/ram-estimate', {
-      method: 'POST',
-      headers: {
-        'Content-Type': 'application/json'
-      },
-      body: JSON.stringify({
-        model_id: model.value.id,
-        config: config.value,
-        usage_mode: smartAutoUsageMode.value
-      }),
-      signal: controller.signal
-    })
-
-    if (!response.ok) {
-      throw new Error('RAM estimation failed')
-    }
-
-    ramEstimate.value = await response.json()
-  } catch (error) {
-    if (error?.name === 'AbortError') {
-      return
-    }
-    console.error('RAM estimation error:', error)
-    toast.error('Could not estimate RAM usage')
-  } finally {
-    if (ramAbortController === controller) {
-      ramLoading.value = false
-      ramAbortController = null
-    }
-  }
-}
-
-const updateVramEstimate = (force = false) => {
-  if (force || !vramEstimate.value) {
-    return estimateVram()
-  }
-  if (vramEstimateTimeout) {
-    clearTimeout(vramEstimateTimeout)
-  }
-  vramEstimateTimeout = setTimeout(() => {
-    estimateVram()
-  }, ESTIMATE_DEBOUNCE_MS)
-  return Promise.resolve()
-}
-
-const updateRamEstimate = (force = false) => {
-  if (force || !ramEstimate.value) {
-    return estimateRam()
-  }
-  if (ramEstimateTimeout) {
-    clearTimeout(ramEstimateTimeout)
-  }
-  ramEstimateTimeout = setTimeout(() => {
-    estimateRam()
-  }, ESTIMATE_DEBOUNCE_MS)
-  return Promise.resolve()
-}
-
-const saveConfig = async () => {
-  if (!model.value) return
-
-  saveLoading.value = true
-  try {
-    await modelStore.updateModelConfig(model.value.id, config.value)
-
-    toast.success('Model configuration has been updated')
-  } catch (error) {
-    console.error('Save config error:', error)
-    toast.error('Failed to save configuration')
-  } finally {
-    saveLoading.value = false
-  }
-}
-
-// Calculate changes between old and new config
-const calculateChanges = (newConfig, oldConfig) => {
-  const changes = []
-  const importantFields = {
-    n_gpu_layers: 'GPU Layers',
-    ctx_size: 'Context Size',
-    batch_size: 'Batch Size',
-    ubatch_size: 'Micro Batch Size',
-    temp: 'Temperature',
-    temperature: 'Temperature',
-    top_k: 'Top-K',
-    top_p: 'Top-P',
-    repeat_penalty: 'Repeat Penalty',
-    threads: 'CPU Threads',
-    threads_batch: 'Batch Threads',
-    flash_attn: 'Flash Attention',
-    cont_batching: 'Continuous Batching',
-    tensor_split: 'Tensor Split',
-    parallel: 'Parallel Processing',
-    cache_type_k: 'KV Cache Type',
-    cache_type_v: 'V Cache Type',
-    main_gpu: 'Main GPU'
-  }
-
-  for (const [key, label] of Object.entries(importantFields)) {
-    const oldValue = oldConfig[key]
-    const newValue = newConfig[key]
-    
-    // Compare values more carefully (handle null/undefined, strings vs numbers)
-    const oldVal = oldValue === null || oldValue === undefined ? null : String(oldValue)
-    const newVal = newValue === null || newValue === undefined ? null : String(newValue)
-    
-    if (oldVal !== newVal) {
-      changes.push({
-        field: label,
-        before: oldValue,
-        after: newValue,
-        description: getFieldDescription(key, oldValue, newValue)
-      })
-    }
-  }
-
-  return changes
-}
-
-// Get description for field change
-const getFieldDescription = (key, oldValue, newValue) => {
-  const descriptions = {
-    n_gpu_layers: oldValue < newValue 
-      ? 'More layers offloaded to GPU for faster inference' 
-      : 'Fewer layers offloaded to GPU to reduce VRAM usage',
-    ctx_size: oldValue < newValue 
-      ? 'Increased context window for longer conversations' 
-      : 'Reduced context window to save memory',
-    batch_size: oldValue < newValue 
-      ? 'Increased batch size for better throughput' 
-      : 'Reduced batch size to save memory',
-    temp: oldValue < newValue 
-      ? 'Higher temperature for more creative outputs' 
-      : 'Lower temperature for more focused outputs'
-  }
-  return descriptions[key] || ''
-}
-
-/**
- * Calculate impact of config changes (VRAM usage and performance).
- * Used by the preview dialog (ConfigChangePreview.vue) to show expected impact.
- * Called before showing preview dialog when applying smart-auto or presets.
- * Returns null if no significant impact detected.
- */
-const calculateImpact = async (newConfig, oldConfig) => {
-  const impact = {}
-  
-  // Estimate VRAM changes
-  const oldVram = vramEstimate.value?.estimated_vram || 0
-  const newVram = await estimateVramForConfig(newConfig)
-  
-  if (oldVram && newVram) {
-    const diff = newVram - oldVram
-    if (Math.abs(diff) > 1000000000) { // > 1GB
-      const diffPercent = ((diff / oldVram) * 100).toFixed(1)
-      impact.vram = diff > 0 
-        ? `VRAM Usage: ${formatFileSize(oldVram)} → ${formatFileSize(newVram)} (+${diffPercent}%)`
-        : `VRAM Usage: ${formatFileSize(oldVram)} → ${formatFileSize(newVram)} (${diffPercent}%)`
-    }
-  }
-
-  // Estimate performance impact
-  const oldLayers = oldConfig.n_gpu_layers || 0
-  const newLayers = newConfig.n_gpu_layers || 0
-  if (newLayers > oldLayers) {
-    const layerIncrease = newLayers - oldLayers
-    const estimatedSpeedup = Math.min(50, Math.round((layerIncrease / 32) * 50))
-    if (estimatedSpeedup > 5) {
-      impact.performance = `Performance: ~${estimatedSpeedup}% faster with more GPU layers`
-    }
-  }
-
-  return Object.keys(impact).length > 0 ? impact : null
-}
-
-// Estimate VRAM for a config without applying it
-const estimateVramForConfig = async (testConfig) => {
-  if (!model.value) return 0
-  
-  try {
-    const response = await fetch('/api/models/vram-estimate', {
-      method: 'POST',
-      headers: { 'Content-Type': 'application/json' },
-      body: JSON.stringify({
-        model_id: model.value.id,
-        config: testConfig
-      })
-    })
-    if (response.ok) {
-      const estimate = await response.json()
-      return estimate.estimated_vram || 0
-    }
-  } catch (error) {
-    console.error('VRAM estimation error:', error)
-  }
-  return vramEstimate.value?.estimated_vram || 0
-}
-
-// Apply preview changes
-const applyPreviewChanges = async () => {
-  if (!previewData.value) {
-    console.error('No preview data to apply')
-    return
-  }
-  
-  previewApplying.value = true
-  try {
-    const newConfig = previewData.value.newConfig
-    const defaults = getDefaultConfig()
-    
-    // Merge all fields from newConfig into current config, preserving reactivity
-    // Start with defaults, then current config, then overlay with new config
-    const mergedConfig = { ...defaults, ...config.value, ...newConfig }
-    
-    // Ensure all fields have safe values (handle nulls/undefined)
-    for (const key in mergedConfig) {
-      const value = mergedConfig[key]
-      // Handle null values - use default if available, otherwise keep null for optional fields
-      if (value === null && key in defaults) {
-        mergedConfig[key] = defaults[key]
-      }
-      // Type coercion for critical fields based on defaults
-      if (key in defaults) {
-        const defaultVal = defaults[key]
-        if (typeof defaultVal === 'boolean' && typeof mergedConfig[key] !== 'boolean') {
-          mergedConfig[key] = Boolean(mergedConfig[key])
-        }
-        if (typeof defaultVal === 'number' && typeof mergedConfig[key] !== 'number') {
-          const num = Number(mergedConfig[key])
-          mergedConfig[key] = isNaN(num) ? defaultVal : num
-        }
-        if (typeof defaultVal === 'string' && typeof mergedConfig[key] !== 'string') {
-          mergedConfig[key] = String(mergedConfig[key] ?? defaultVal)
-        }
-      }
-    }
-    
-    // Assign the merged config to trigger reactivity
-    config.value = mergedConfig
-    console.log('Applied config from preview:', config.value)
-    
-    // Ensure configuration UI is visible
-    showConfig.value = true
-    
-    // Re-estimate memory
-    await estimateVram()
-    await estimateRam()
-    
-    // Save the preset name and close preview
-    const previewType = previewData.value.type
-    const presetName = previewData.value.presetName
-    showPreview.value = false
-    previewData.value = null
-    
-    // Show success message
-    if (previewType === 'preset' && presetName) {
-      selectedPreset.value = presetName
-      toast.success(`${presetName.charAt(0).toUpperCase() + presetName.slice(1)} preset applied`)
-    } else {
-      const isCpuOnlyMode = systemStore.gpuInfo.cpu_only_mode
-      const optimizationType = isCpuOnlyMode ? 'CPU-optimized' : 'GPU-optimized'
-      toast.success(`${optimizationType} configuration applied successfully`)
-    }
-  } catch (error) {
-    console.error('Error applying changes:', error)
-    toast.error('Failed to apply configuration changes')
-  } finally {
-    previewApplying.value = false
-  }
-}
-
-// Tooltip generators with architecture-specific recommendations
-const getTemperatureTooltip = () => {
-  const architecture = modelLayerInfo.value?.architecture?.toLowerCase() || ''
-  let baseMsg = 'Controls randomness (0.1=deterministic, 2.0=creative)'
-
-  if (architecture.includes('glm')) {
-    baseMsg += ' | GLM: Recommended 1.0'
-  } else if (architecture.includes('deepseek')) {
-    baseMsg += ' | DeepSeek: Recommended 1.0'
-  } else if (architecture.includes('qwen')) {
-    baseMsg += ' | Qwen: Recommended 0.7'
-  } else if (architecture.includes('codellama') || model.value?.name.toLowerCase().includes('code')) {
-    baseMsg += ' | Coding: Recommended 0.1-0.7'
-  }
-
-  return baseMsg
-}
-
-const getTopKTooltip = () => {
-  return `Top-K sampling (limit to top K tokens) | Recommended: 40 for GLM/DeepSeek, 50 for others`
-}
-
-const getTopPTooltip = () => {
-  const architecture = modelLayerInfo.value?.architecture?.toLowerCase() || ''
-  let baseMsg = 'Top-P (nucleus) sampling'
-
-  if (architecture.includes('glm') || architecture.includes('deepseek')) {
-    baseMsg += ' | Recommended: 0.95'
-  } else if (architecture.includes('qwen')) {
-    baseMsg += ' | Recommended: 0.9-0.95'
-  } else {
-    baseMsg += ' | Recommended: 0.95'
-  }
-
-  return baseMsg
-}
-
-const getRepeatPenaltyTooltip = () => {
-  let baseMsg = 'Penalty for repeating tokens (1.0=no penalty)'
-
-  const ctxLength = modelLayerInfo.value?.context_length || 0
-  if (ctxLength > 32768) {
-    baseMsg += ' | Long context: Use 1.0-1.05'
-  } else if (ctxLength < 2048) {
-    baseMsg += ' | Short context: Use 1.1-1.2'
-  } else {
-    baseMsg += ' | Standard: Use 1.1'
-  }
-
-  return baseMsg
-}
-
-// Rich tooltip data for key settings
-const gpuLayersTooltip = computed(() => {
-  const rec = modelRecommendations.value?.gpu_layers
-  if (!rec) {
-    return {
-      description: 'Controls how many model layers are offloaded to GPU. More layers = faster inference but higher VRAM usage.',
-      whenToAdjust: 'Increase for faster generation if you have VRAM. Decrease if running out of VRAM.',
-      tradeoffs: [
-        'Higher values: Faster inference, better for long conversations',
-        'Lower values: Lower VRAM usage, falls back to CPU which is slower'
-      ],
-      recommended: 'For this model: Loading recommendations...',
-      ranges: []
-    }
-  }
-  
-  return {
-    description: 'Controls how many model layers are offloaded to GPU. More layers = faster inference but higher VRAM usage.',
-    whenToAdjust: 'Increase for faster generation if you have VRAM. Decrease if running out of VRAM.',
-    tradeoffs: [
-      'Higher values: Faster inference, better for long conversations',
-      'Lower values: Lower VRAM usage, falls back to CPU which is slower'
-    ],
-    recommended: rec.description,
-    ranges: rec.ranges.map(r => `${r.value} layers: ${r.description}`)
-  }
-})
-
-const contextSizeTooltip = computed(() => {
-  const rec = modelRecommendations.value?.context_size
-  if (!rec) {
-    return {
-      description: 'Maximum number of tokens the model can process in context. Higher = can remember more but uses more memory.',
-      whenToAdjust: 'Increase for long conversations or documents. Decrease if running low on memory.',
-      tradeoffs: [
-        'Higher values: Can handle longer conversations, better for documents',
-        'Lower values: Lower memory usage, faster processing'
-      ],
-      recommended: 'For this model: Loading recommendations...',
-      ranges: []
-    }
-  }
-  
-  return {
-    description: 'Maximum number of tokens the model can process in context. Higher = can remember more but uses more memory.',
-    whenToAdjust: 'Increase for long conversations or documents. Decrease if running low on memory.',
-    tradeoffs: [
-      'Higher values: Can handle longer conversations, better for documents',
-      'Lower values: Lower memory usage, faster processing'
-    ],
-    recommended: rec.description,
-    ranges: rec.ranges.map(r => `${r.min}-${r.max}: ${r.description}`)
-  }
-})
-
-const temperatureTooltip = computed(() => {
-  const rec = modelRecommendations.value?.temperature
-  if (!rec) {
-    return {
-      description: 'Controls randomness in model responses. Lower = more focused and deterministic, Higher = more creative and varied.',
-      whenToAdjust: 'Lower for code/technical tasks. Higher for creative writing. Adjust if outputs are too repetitive or too random.',
-      tradeoffs: [
-        'Low (0.1-0.3): Focused, deterministic, good for code',
-        'Medium (0.7-1.0): Balanced, natural conversations',
-        'High (1.5-2.0): Creative, varied, unpredictable'
-      ],
-      recommended: 'For this model: Loading recommendations...',
-      ranges: []
-    }
-  }
-  
-  return {
-    description: 'Controls randomness in model responses. Lower = more focused and deterministic, Higher = more creative and varied.',
-    whenToAdjust: 'Lower for code/technical tasks. Higher for creative writing. Adjust if outputs are too repetitive or too random.',
-    tradeoffs: [
-      'Low (0.1-0.3): Focused, deterministic, good for code',
-      'Medium (0.7-1.0): Balanced, natural conversations',
-      'High (1.5-2.0): Creative, varied, unpredictable'
-    ],
-    recommended: rec.description,
-    ranges: rec.ranges.map(r => `${r.min}-${r.max}: ${r.description}`)
-  }
-})
-
-const batchSizeTooltip = computed(() => {
-  const rec = modelRecommendations.value?.batch_size
-  if (!rec) {
-    return {
-      description: 'Number of tokens processed in parallel. Higher = faster but uses more memory.',
-      whenToAdjust: 'Increase if you have VRAM available. Decrease if getting out-of-memory errors.',
-      tradeoffs: [
-        'Higher values: Faster token generation, better throughput',
-        'Lower values: Lower memory usage, more sequential processing'
-      ],
-      recommended: 'For this model: Loading recommendations...',
-      ranges: []
-    }
-  }
-  
-  return {
-    description: 'Number of tokens processed in parallel. Higher = faster but uses more memory.',
-    whenToAdjust: 'Increase if you have VRAM available. Decrease if getting out-of-memory errors.',
-    tradeoffs: [
-      'Higher values: Faster token generation, better throughput',
-      'Lower values: Lower memory usage, more sequential processing'
-    ],
-    recommended: rec.description,
-    ranges: rec.ranges.map(r => `${r.min}-${r.max}: ${r.description}`)
-  }
-})
-
-const topKTooltip = computed(() => {
-  const rec = modelRecommendations.value?.top_k
-  if (!rec) {
-    return {
-      description: 'Limits sampling to the top K most likely tokens. Lower = more focused, Higher = more diverse.',
-      whenToAdjust: 'Lower for focused outputs. Higher for more variety. Works together with Top-P.',
-      tradeoffs: [
-        'Lower values (10-30): More focused, deterministic outputs',
-        'Medium values (40-50): Balanced diversity (recommended)',
-        'Higher values (100+): More random, less coherent outputs'
-      ],
-      recommended: 'For this model: Loading recommendations...',
-      ranges: []
-    }
-  }
-  
-  return {
-    description: 'Limits sampling to the top K most likely tokens. Lower = more focused, Higher = more diverse.',
-    whenToAdjust: 'Lower for focused outputs. Higher for more variety. Works together with Top-P.',
-    tradeoffs: [
-      'Lower values (10-30): More focused, deterministic outputs',
-      'Medium values (40-50): Balanced diversity (recommended)',
-      'Higher values (100+): More random, less coherent outputs'
-    ],
-    recommended: rec.description,
-    ranges: rec.ranges.map(r => `${r.min}-${r.max}: ${r.description}`)
-  }
-})
-
-const topPTooltip = computed(() => {
-  const rec = modelRecommendations.value?.top_p
-  if (!rec) {
-    return {
-      description: 'Nucleus sampling: considers tokens with cumulative probability mass up to P. Works with Top-K.',
-      whenToAdjust: 'Lower for more focused outputs. Higher for more diversity. Typically keep at 0.9-0.95.',
-      tradeoffs: [
-        'Lower values (0.7-0.8): More conservative, focused sampling',
-        'Medium values (0.9-0.95): Balanced diversity (recommended)',
-        'Higher values (0.98-1.0): Includes very low-probability tokens'
-      ],
-      recommended: 'For this model: Loading recommendations...',
-      ranges: []
-    }
-  }
-  
-  return {
-    description: 'Nucleus sampling: considers tokens with cumulative probability mass up to P. Works with Top-K.',
-    whenToAdjust: 'Lower for more focused outputs. Higher for more diversity. Typically keep at 0.9-0.95.',
-    tradeoffs: [
-      'Lower values (0.7-0.8): More conservative, focused sampling',
-      'Medium values (0.9-0.95): Balanced diversity (recommended)',
-      'Higher values (0.98-1.0): Includes very low-probability tokens'
-    ],
-    recommended: rec.description,
-    ranges: rec.ranges.map(r => `${r.min}-${r.max}: ${r.description}`)
-  }
-})
-
-const repeatPenaltyTooltip = computed(() => {
-  const ctxLength = modelLayerInfo.value?.context_length || 0
-  let recommended = '1.1 for standard contexts'
-  let ranges = [
-    '1.0: No penalty (allows repetition)',
-    '1.1: Standard penalty (recommended)',
-    '1.2: Strong penalty (prevents most repetition)'
-  ]
-
-  if (ctxLength > 32768) {
-    recommended = '1.0-1.05 for long contexts'
-    ranges = [
-      '1.0-1.05: Minimal penalty (long context models)',
-      '1.1: Standard (good for most cases)',
-      '1.2+: Strong penalty (short contexts)'
-    ]
-  } else if (ctxLength < 2048) {
-    recommended = '1.1-1.2 for short contexts'
-  }
-
-  return {
-    description: 'Penalty applied to tokens that have appeared in the context. Higher = less repetition.',
-    whenToAdjust: 'Increase if model repeats too much. Decrease if model avoids valid repetition. Adjust based on context length.',
-    tradeoffs: [
-      'Lower values (1.0-1.05): Allows natural repetition, good for long contexts',
-      'Medium values (1.1): Balanced, prevents excessive repetition (recommended)',
-      'Higher values (1.2-2.0): Strong prevention, may avoid valid repetition'
-    ],
-    recommended: recommended,
-    ranges: ranges
-  }
-})
-
-// Search functionality - computed properties to check section/field matches
-// Tab matching for search - find which tab contains matching content
-const matchingTabIndex = computed(() => {
-  const query = configSearchQuery.value.toLowerCase().trim()
-  if (!query) return null
-  
-  const searchTerms = query.split(/\s+/)
-  
-  for (let i = 0; i < tabLabels.length; i++) {
-    const tab = tabLabels[i]
-    const matchesAny = searchTerms.some(term => 
-      tab.keywords.some(keyword => keyword.includes(term) || term.includes(keyword))
-    )
-    if (matchesAny) return i
-  }
-  
-  return null
-})
-
-// Auto-switch to matching tab when searching
-watch(matchingTabIndex, (tabIndex) => {
-  if (tabIndex !== null && tabIndex !== undefined) {
-    activeTabIndex.value = tabIndex
-  }
-})
-
-// Check if current model is running
-// Handle stop model
-const handleStopModel = async () => {
-  if (!model.value) return
-  
-  try {
-    await modelStore.stopModel(model.value.id)
-    toast.success('Model stopped successfully')
-  } catch (error) {
-    console.error('Error stopping model:', error)
-    toast.error('Failed to stop model')
-  }
-}
-
-// Watch for config changes to update estimates
-watch(config, () => {
-  updateVramEstimate()
-  updateRamEstimate()
-}, { deep: true })
-
-// Watch for model changes to update running state
-// Handle onboarding tour
-const handleOnboardingComplete = () => {
-  localStorage.setItem('model-config-onboarding-completed', 'true')
-  showOnboarding.value = false
-}
-
-const handleOnboardingSkip = () => {
-  showOnboarding.value = false
-  // Don't mark as completed if skipped
-}
-
-// Check if model has no configuration (empty state)
-const valuesDiffer = (value, defaultValue) => {
-  if (Array.isArray(value) && Array.isArray(defaultValue)) {
-    if (value.length !== defaultValue.length) return true
-    return value.some((v, idx) => valuesDiffer(v, defaultValue[idx]))
-  }
-  return value !== defaultValue && value !== null && value !== undefined && value !== ''
-}
-
-const hasNoConfig = computed(() => {
-  if (!model.value) return false
-  if (!model.value.config) return true
-  const parsed = parseModelConfig(model.value.config)
-  if (!parsed) return true
-  const defaults = getDefaultConfig()
-  return !Object.keys(parsed).some(key => valuesDiffer(parsed[key], defaults[key]))
-})
-
-</script>
-
-<style scoped>
-.model-config {
-  min-height: 100vh;
-  background: var(--bg-primary);
-  overflow-x: hidden;
-}
-
-.config-layout {
-  display: grid;
-  grid-template-columns: 1fr 320px;
-  gap: var(--spacing-lg);
-  max-width: 1400px;
-  margin: 0 auto;
-  padding: var(--spacing-lg);
-  box-sizing: border-box;
-  align-items: start;
-}
-
-.config-main {
-  min-width: 0;
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-md);
-  overflow-x: hidden;
-}
-
-.config-tabs-wrapper {
-  width: 100%;
-  margin-top: var(--spacing-sm);
-}
-
-.config-tabs {
-  width: 100%;
-}
-
-.config-tabs :deep(.p-tabview-nav) {
-  background: transparent;
-  border-bottom: 2px solid var(--border-primary);
-  border-radius: 0;
-  padding: 0;
-  margin-bottom: var(--spacing-md);
-  gap: 0;
-  display: flex;
-  align-items: center;
-}
-
-.config-tabs :deep(.p-tabview-nav li) {
-  margin-right: var(--spacing-sm);
-}
-
-.config-tabs :deep(.p-tabview-nav li:last-child) {
-  margin-right: 0;
-}
-
-.config-tabs :deep(.p-tabview-nav li .p-tabview-nav-link) {
-  padding: var(--spacing-sm) var(--spacing-md);
-  border-radius: var(--radius-md) var(--radius-md) 0 0;
-  transition: all var(--transition-normal);
-  color: var(--text-secondary);
-  border: none;
-  border-bottom: 3px solid transparent;
-  margin-bottom: -2px;
-  background: transparent;
-  font-weight: 500;
-}
-
-.config-tabs :deep(.p-tabview-nav li.p-highlight .p-tabview-nav-link) {
-  background: transparent;
-  color: var(--accent-cyan);
-  font-weight: 600;
-  border-bottom-color: var(--accent-cyan);
-}
-
-.config-tabs :deep(.p-tabview-nav li .p-tabview-nav-link:hover) {
-  background: var(--bg-surface);
-  color: var(--text-primary);
-  border-bottom-color: var(--text-secondary);
-}
-
-.config-tabs :deep(.p-tabview-nav li.p-highlight .p-tabview-nav-link:hover) {
-  border-bottom-color: var(--accent-cyan);
-  color: var(--accent-cyan);
-}
-
-.config-tabs :deep(.p-tabview-panels) {
-  padding: 0;
-  background: transparent;
-  border: none;
-}
-
-.tab-content {
-  padding: var(--spacing-md);
-  min-height: 200px;
-  background: transparent;
-}
-
-.tab-section {
-  margin-bottom: var(--spacing-md);
-  display: grid;
-  grid-template-columns: repeat(3, 1fr);
-  gap: var(--spacing-md);
-}
-
-.tab-section:last-child {
-  margin-bottom: 0;
-}
-
-.tab-section-title {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-xs);
-  margin-bottom: var(--spacing-md);
-  padding-bottom: var(--spacing-xs);
-  border-bottom: 1px solid var(--border-primary);
-  color: var(--text-primary);
-  font-size: 1rem;
-  font-weight: 600;
-  grid-column: 1 / -1;
-}
-
-.tab-section-title i {
-  font-size: 1.2rem;
-  color: var(--accent-cyan);
-}
-
-.config-header {
-  display: flex;
-  justify-content: space-between;
-  align-items: center;
-  gap: var(--spacing-md);
-}
-
-.model-info {
-  flex: 1;
-}
-
-.model-tag.tag-architecture {
-  background: rgba(59, 130, 246, 0.1);
-  color: var(--accent-blue);
-  border: 1px solid rgba(59, 130, 246, 0.2);
-}
-
-.model-tag.tag-layers {
-  background: rgba(34, 211, 238, 0.1);
-  color: var(--accent-cyan);
-  border: 1px solid rgba(34, 211, 238, 0.2);
-}
-
-.model-tag.tag-pipeline {
-  background: rgba(59, 130, 246, 0.12);
-  color: var(--accent-blue);
-  border: 1px solid rgba(59, 130, 246, 0.35);
-  text-transform: uppercase;
-  letter-spacing: 0.05em;
-  font-weight: 600;
-}
-
-.embedding-notice {
-  margin-top: var(--spacing-sm);
-  padding: var(--spacing-sm);
-  border-radius: var(--radius-md);
-  border: 1px solid rgba(59, 130, 246, 0.3);
-  background: rgba(59, 130, 246, 0.08);
-  display: flex;
-  gap: var(--spacing-sm);
-  align-items: flex-start;
-}
-
-.embedding-notice i {
-  font-size: 1.25rem;
-  color: var(--accent-blue);
-}
-
-.embedding-notice p {
-  margin: 0;
-  color: var(--text-secondary);
-  font-size: 0.85rem;
-}
-
-/* Configuration Warnings */
-.config-warnings {
-  margin-top: var(--spacing-lg);
-  padding: var(--spacing-md);
-  background: var(--bg-card);
-  border: 1px solid var(--border-primary);
-  border-radius: var(--radius-xl);
-  box-shadow: var(--shadow-md);
-}
-
-.warnings-list {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-sm);
-  margin-top: var(--spacing-md);
-}
-
-.warning-item {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-sm);
-  padding: var(--spacing-sm) var(--spacing-md);
-  border-radius: var(--radius-md);
-  font-size: 0.9rem;
-  transition: all var(--transition-normal);
-}
-
-.warning-item.warning {
-  background: rgba(245, 158, 11, 0.1);
-  border: 1px solid rgba(245, 158, 11, 0.3);
-  color: var(--status-warning);
-}
-
-.warning-item.error {
-  background: rgba(239, 68, 68, 0.1);
-  border: 1px solid rgba(239, 68, 68, 0.3);
-  color: var(--status-error);
-}
-
-.warning-item i {
-  font-size: 1rem;
-  flex-shrink: 0;
-}
-
-.warning-item.warning i {
-  color: var(--status-warning);
-}
-
-.warning-item.error i {
-  color: var(--status-error);
-}
-
-.meta-item {
-  padding: var(--spacing-xs) var(--spacing-sm);
-  background: var(--bg-surface);
-  border-radius: var(--radius-sm);
-  font-size: 0.875rem;
-  color: var(--text-secondary);
-}
-
-.header-actions {
-  display: flex;
-  flex-wrap: wrap;
-  gap: var(--spacing-sm);
-  align-items: center;
-}
-
-.preset-buttons {
-  display: flex;
-  gap: var(--spacing-xs);
-}
-
-.smart-auto-settings-button {
-  margin-left: var(--spacing-xs);
-}
-
-/* Menu styling for Smart Auto */
-:deep(.p-menu) {
-  padding: 0.75rem !important;
-  min-width: 220px;
-  background: var(--surface-ground) !important;
-  border: 1px solid var(--surface-border) !important;
-  box-shadow: 0 4px 12px rgba(0, 0, 0, 0.15) !important;
-}
-
-:deep(.p-menu .p-menuitem) {
-  margin: 0.375rem 0 !important;
-}
-
-:deep(.p-menu .p-menuitem-link) {
-  padding: 0.875rem 1.125rem !important;
-  border-radius: 0.5rem !important;
-  transition: all 0.2s ease !important;
-  color: var(--text-primary) !important;
-  background: transparent !important;
-}
-
-:deep(.p-menu .p-menuitem-link .p-menuitem-text) {
-  color: var(--text-primary) !important;
-  font-weight: 500 !important;
-}
-
-:deep(.p-menu .p-menuitem-link .p-menuitem-icon) {
-  color: var(--text-secondary) !important;
-  margin-right: 0.75rem !important;
-}
-
-:deep(.p-menu .p-menuitem-link:hover) {
-  background: var(--primary-color) !important;
-}
-
-:deep(.p-menu .p-menuitem-link:hover .p-menuitem-text) {
-  color: white !important;
-}
-
-:deep(.p-menu .p-menuitem-link:hover .p-menuitem-icon) {
-  color: white !important;
-}
-
-:deep(.p-menu .p-menuitem-separator) {
-  margin: 0.625rem 0 !important;
-  border-top: 1px solid var(--surface-border) !important;
-}
-
-:deep(.p-menu .p-menuitem-link.menu-item-selected),
-:deep(.p-menu .p-menuitem-link.active) {
-  background: color-mix(in srgb, var(--accent-blue) 18%, transparent) !important;
-}
-
-:deep(.p-menu .p-menuitem-link.menu-item-selected .p-menuitem-text),
-:deep(.p-menu .p-menuitem-link.active .p-menuitem-text) {
-  color: var(--primary-color) !important;
-  font-weight: 600 !important;
-}
-
-:deep(.p-menu .p-menuitem-link.menu-item-selected .p-menuitem-icon),
-:deep(.p-menu .p-menuitem-link.active .p-menuitem-icon) {
-  color: var(--primary-color) !important;
-}
-
-:deep(.p-menu .p-menuitem-link.menu-item-selected:hover),
-:deep(.p-menu .p-menuitem-link.active:hover) {
-  background: var(--primary-color) !important;
-}
-
-:deep(.p-menu .p-menuitem-link.menu-item-selected:hover .p-menuitem-text),
-:deep(.p-menu .p-menuitem-link.active:hover .p-menuitem-text) {
-  color: white !important;
-}
-
-:deep(.p-menu .p-menuitem-link.menu-item-selected:hover .p-menuitem-icon),
-:deep(.p-menu .p-menuitem-link.active:hover .p-menuitem-icon) {
-  color: white !important;
-}
-
-/* Badge styling for Smart Auto button */
-:deep(.p-button .p-badge) {
-  padding: 0.4rem 0.7rem !important;
-  margin-left: 0.5rem !important;
-  border-radius: 0.5rem !important;
-  font-size: 0.7rem !important;
-  font-weight: 600 !important;
-  line-height: 1 !important;
-  min-width: auto !important;
-  background: rgba(255, 255, 255, 0.35) !important;
-  color: rgba(0, 0, 0, 0.9) !important;
-  border: 1px solid rgba(0, 0, 0, 0.15) !important;
-  box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1) !important;
-}
-
-.usage-mode-badge {
-  padding: 0.4rem 0.7rem !important;
-  margin-left: 0.5rem !important;
-  border-radius: 0.5rem !important;
-  font-size: 0.7rem !important;
-  font-weight: 600 !important;
-  line-height: 1 !important;
-  min-width: auto !important;
-  background: rgba(255, 255, 255, 0.35) !important;
-  color: rgba(0, 0, 0, 0.9) !important;
-  border: 1px solid rgba(0, 0, 0, 0.15) !important;
-  box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1) !important;
-  display: inline-flex !important;
-  align-items: center !important;
-  justify-content: center !important;
-}
-
-
-.action-buttons {
-  display: flex;
-  gap: var(--spacing-xs);
-  align-items: center;
-}
-
-/* Quick Start Button - Now in header, no separate styles needed */
-
-/* Quick Start Modal Styles */
-:deep(.quick-start-modal) {
-  max-width: 800px;
-  width: 90vw;
-}
-
-:deep(.quick-start-modal .p-dialog-header) {
-  padding: var(--spacing-xl);
-  border-bottom: 1px solid var(--border-primary);
-}
-
-:deep(.quick-start-modal .p-dialog-content) {
-  padding: var(--spacing-xl);
-  max-height: 70vh;
-  overflow-y: auto;
-}
-
-:deep(.quick-start-modal .p-dialog-footer) {
-  padding: var(--spacing-lg) var(--spacing-xl);
-  border-top: 1px solid var(--border-primary);
-}
-
-.quick-start-modal-header {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-lg);
-}
-
-.quick-start-icon {
-  font-size: 3rem;
-  flex-shrink: 0;
-  line-height: 1;
-}
-
-.quick-start-modal-header h3 {
-  margin: 0 0 var(--spacing-xs) 0;
-  font-size: 1.75rem;
-  font-weight: 600;
-  color: var(--text-primary);
-}
-
-.quick-start-modal-header p {
-  margin: 0;
-  color: var(--text-secondary);
-  font-size: 1rem;
-}
-
-.quick-start-content {
-  display: grid;
-  grid-template-columns: 1fr 1.5fr;
-  gap: var(--spacing-xl);
-}
-
-.preset-cards {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-md);
-}
-
-.preset-card {
-  background: var(--bg-surface);
-  border: 2px solid var(--border-primary);
-  border-radius: var(--radius-lg);
-  padding: var(--spacing-lg);
-  cursor: pointer;
-  transition: all var(--transition-normal);
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-md);
-}
-
-.preset-card:hover {
-  border-color: var(--accent-cyan);
-  transform: translateY(-2px);
-  box-shadow: var(--shadow-md);
-}
-
-.preset-card.wizard-card {
-  border-color: rgba(34, 211, 238, 0.5);
-  background: linear-gradient(135deg, rgba(34, 211, 238, 0.05), rgba(59, 130, 246, 0.05));
-}
-
-.preset-card.wizard-card:hover {
-  border-color: var(--accent-cyan);
-  background: linear-gradient(135deg, rgba(34, 211, 238, 0.1), rgba(59, 130, 246, 0.1));
-}
-
-.preset-icon {
-  font-size: 2rem;
-  flex-shrink: 0;
-}
-
-.preset-info h4 {
-  margin: 0 0 var(--spacing-xs) 0;
-  font-size: 1.1rem;
-  font-weight: 600;
-  color: var(--text-primary);
-}
-
-.preset-info p {
-  margin: 0;
-  font-size: 0.875rem;
-  color: var(--text-secondary);
-  line-height: 1.4;
-}
-
-.smart-auto-section {
-  background: var(--bg-surface);
-  border: 1px solid var(--border-primary);
-  border-radius: var(--radius-lg);
-  padding: var(--spacing-lg);
-}
-
-.smart-auto-header {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-sm);
-  margin-bottom: var(--spacing-sm);
-}
-
-.smart-auto-header i {
-  font-size: 1.5rem;
-  color: var(--accent-primary);
-}
-
-.smart-auto-header h4 {
-  margin: 0;
-  font-size: 1.1rem;
-  font-weight: 600;
-  color: var(--text-primary);
-}
-
-.smart-auto-description {
-  margin: 0 0 var(--spacing-lg) 0;
-  color: var(--text-secondary);
-  font-size: 0.9rem;
-  line-height: 1.5;
-}
-
-.usage-mode-selector {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-sm);
-  margin-bottom: var(--spacing-lg);
-}
-
-.radio-option {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-md);
-  padding: var(--spacing-md);
-  border: 2px solid var(--border-primary);
-  border-radius: var(--radius-md);
-  cursor: pointer;
-  transition: all var(--transition-normal);
-  background: transparent;
-}
-
-.radio-option:hover {
-  border-color: var(--accent-cyan);
-  background: rgba(34, 211, 238, 0.05);
-}
-
-.radio-option.active {
-  border-color: var(--accent-cyan);
-  background: rgba(34, 211, 238, 0.1);
-}
-
-.radio-option i {
-  font-size: 1.5rem;
-  color: var(--accent-cyan);
-  flex-shrink: 0;
-}
-
-.radio-option strong {
-  display: block;
-  margin-bottom: var(--spacing-xs);
-  font-size: 1rem;
-  color: var(--text-primary);
-}
-
-.radio-option small {
-  display: block;
-  font-size: 0.875rem;
-  color: var(--text-secondary);
-}
-
-.smart-auto-button {
-  width: 100%;
-}
-
-.flash-attention-warning {
-  display: flex;
-  align-items: flex-start;
-  gap: var(--spacing-sm);
-  padding: var(--spacing-md);
-  margin-bottom: var(--spacing-md);
-  background: rgba(239, 68, 68, 0.1);
-  border: 1px solid rgba(239, 68, 68, 0.3);
-  border-radius: var(--radius-md);
-  color: var(--status-error);
-}
-
-.flash-attention-warning i {
-  font-size: 1.5rem;
-  flex-shrink: 0;
-  margin-top: 2px;
-}
-
-.warning-content strong {
-  display: block;
-  margin-bottom: var(--spacing-xs);
-  font-weight: 600;
-}
-
-.warning-content p {
-  margin: 0;
-  font-size: 0.875rem;
-  line-height: 1.5;
-}
-
-.expert-info {
-  display: flex;
-  gap: var(--spacing-sm);
-  align-items: center;
-  font-size: 0.875rem;
-  color: var(--text-secondary);
-}
-
-/* Configuration Grid */
-
-.config-search-bar {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-xs);
-  width: 250px;
-  transition: width var(--transition-normal);
-  margin-bottom: var(--spacing-md);
-}
-
-.config-search-bar.search-focused {
-  width: 350px;
-}
-
-.search-section {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-xs);
-  flex: 0 0 auto;
-  width: 200px;
-  transition: width var(--transition-normal);
-}
-
-.search-section.search-focused {
-  width: 300px;
-}
-
-.config-search-input {
-  width: 100%;
-  padding-left: 2rem;
-  transition: all var(--transition-normal);
-}
-
-.config-search-bar .p-input-icon-left {
-  position: relative;
-  width: 100%;
-}
-
-.config-search-bar .p-input-icon-left i {
-  position: absolute;
-  left: 0.5rem;
-  top: 50%;
-  transform: translateY(-50%);
-  color: var(--text-secondary);
-  z-index: 1;
-  font-size: 0.875rem;
-}
-
-.section-controls {
-  display: flex;
-  gap: var(--spacing-sm);
-  flex-shrink: 0;
-}
-
-.config-grid {
-  display: grid;
-  grid-template-columns: repeat(auto-fit, minmax(400px, 1fr));
-  gap: var(--spacing-xl);
-  margin-top: var(--spacing-lg);
-}
-
-.config-section {
-  background: var(--bg-card);
-  border: 1px solid var(--border-primary);
-  border-radius: var(--radius-xl);
-  box-shadow: var(--shadow-md);
-  transition: all var(--transition-normal);
-}
-
-.config-section > summary {
-  cursor: pointer;
-  padding: var(--spacing-lg);
-  list-style: none;
-}
-
-.config-section > summary::-webkit-details-marker {
-  display: none;
-}
-
-.config-section[open] > summary {
-  border-bottom: 1px solid var(--border-primary);
-}
-
-.config-section:hover {
-  box-shadow: var(--shadow-lg);
-  transform: translateY(-2px);
-}
-
-.section-title {
-  display: flex;
-  align-items: center;
-  justify-content: space-between;
-  margin: 0;
-  color: var(--text-primary);
-  font-size: 1.1rem;
-  font-weight: 600;
-  user-select: none;
-}
-
-.title-left {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-sm);
-}
-
-.section-badge {
-  display: inline-block;
-  padding: var(--spacing-xs) var(--spacing-sm);
-  border-radius: var(--radius-sm);
-  font-size: 0.75rem;
-  font-weight: 600;
-  letter-spacing: 0.5px;
-  text-transform: uppercase;
-}
-
-.section-badge.essential-badge {
-  background: rgba(16, 185, 129, 0.15);
-  color: var(--status-success);
-}
-
-.section-badge.advanced-badge {
-  background: rgba(245, 158, 11, 0.15);
-  color: var(--status-warning);
-}
-
-.section-grid {
-  display: grid;
-  grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
-  gap: var(--spacing-lg);
-  width: 100%;
-  min-width: 0;
-  padding: var(--spacing-xl);
-}
-
-.config-field {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-sm);
-  min-width: 0;
-  width: 100%;
-}
-
-.config-field.full-width {
-  grid-column: 1 / -1;
-}
-
-.config-field label {
-  font-weight: 500;
-  color: var(--text-primary);
-  font-size: 0.9rem;
-}
-
-.config-field small {
-  color: var(--text-secondary);
-  font-size: 0.75rem;
-  line-height: 1.3;
-}
-
-.inline-validation {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-xs);
-  padding: var(--spacing-xs) var(--spacing-sm);
-  border-radius: var(--radius-sm);
-  font-size: 0.75rem;
-  font-weight: 500;
-  margin-top: var(--spacing-xs);
-  animation: slideIn 0.2s ease-out;
-}
-
-.inline-validation.error {
-  background: rgba(239, 68, 68, 0.1);
-  color: var(--status-error);
-  border: 1px solid rgba(239, 68, 68, 0.2);
-}
-
-.inline-validation.warning {
-  background: rgba(245, 158, 11, 0.1);
-  color: var(--status-warning);
-  border: 1px solid rgba(245, 158, 11, 0.2);
-}
-
-.inline-validation.success {
-  background: rgba(16, 185, 129, 0.1);
-  color: var(--status-success);
-  border: 1px solid rgba(16, 185, 129, 0.2);
-}
-
-.inline-validation i {
-  font-size: 0.875rem;
-  flex-shrink: 0;
-}
-
-@keyframes slideIn {
-  from {
-    opacity: 0;
-    transform: translateY(-4px);
-  }
-  to {
-    opacity: 1;
-    transform: translateY(0);
-  }
-}
-
-/* Sidebar */
-.config-sidebar {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-lg);
-  position: sticky;
-  top: var(--spacing-xl);
-  max-height: calc(100vh - 2 * var(--spacing-xl));
-  overflow-y: auto;
-  align-self: start;
-}
-
-.vram-monitor {
-  background: var(--bg-card);
-  border: 1px solid var(--border-primary);
-  border-radius: var(--radius-xl);
-  padding: var(--spacing-lg);
-  box-shadow: var(--shadow-md);
-  transition: all var(--transition-normal);
-}
-
-.vram-monitor:hover {
-  box-shadow: var(--shadow-lg);
-  transform: translateY(-2px);
-}
-
-.monitor-header {
-  display: flex;
-  justify-content: space-between;
-  align-items: center;
-  margin-bottom: var(--spacing-lg);
-}
-
-.monitor-header h3 {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-sm);
-  margin: 0;
-  color: var(--text-primary);
-  font-size: 1.1rem;
-}
-
-.monitor-meta {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-sm);
-  font-size: 0.875rem;
-  color: var(--text-secondary);
-}
-
-.mode-badge {
-  background: var(--bg-surface);
-  border: 1px solid var(--border-primary);
-  border-radius: var(--radius-sm);
-  padding: 2px 6px;
-  font-weight: 500;
-  color: var(--text-primary);
-}
-
-.ram-snapshot {
-  background: var(--bg-surface);
-  border: 1px solid var(--border-primary);
-  border-radius: var(--radius-sm);
-  padding: 2px 6px;
-  font-weight: 500;
-  color: var(--text-primary);
-}
-
-.monitor-content {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-md);
-}
-
-.vram-summary {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-sm);
-}
-
-.vram-total {
-  display: flex;
-  justify-content: space-between;
-  align-items: center;
-}
-
-.total-label {
-  font-size: 0.875rem;
-  color: var(--text-secondary);
-}
-
-.total-value {
-  font-weight: 700;
-  font-size: 1.1rem;
-}
-
-.total-value.success {
-  color: var(--status-success);
-}
-
-.total-value.warning {
-  color: var(--status-warning);
-}
-
-.vram-progress {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-xs);
-}
-
-.progress-text {
-  font-size: 0.75rem;
-  color: var(--text-secondary);
-  text-align: center;
-}
-
-.vram-breakdown {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-xs);
-}
-
-.breakdown-item {
-  display: flex;
-  justify-content: space-between;
-  font-size: 0.875rem;
-}
-
-.item-label {
-  color: var(--text-secondary);
-}
-
-.item-value {
-  color: var(--text-primary);
-  font-weight: 500;
-}
-
-.vram-warning {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-sm);
-  padding: var(--spacing-sm);
-  background: rgba(245, 158, 11, 0.1);
-  border: 1px solid rgba(245, 158, 11, 0.3);
-  border-radius: var(--radius-md);
-  color: var(--status-warning);
-  font-size: 0.875rem;
-}
-
-.nvlink-info {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-sm);
-  padding: var(--spacing-sm);
-  background: rgba(34, 211, 238, 0.1);
-  border: 1px solid rgba(34, 211, 238, 0.3);
-  border-radius: var(--radius-md);
-  color: var(--accent-cyan);
-  font-size: 0.875rem;
-}
-
-.monitor-empty {
-  display: flex;
-  flex-direction: column;
-  align-items: center;
-  gap: var(--spacing-sm);
-  padding: var(--spacing-xl);
-  color: var(--text-muted);
-  text-align: center;
-}
-
-.monitor-empty i {
-  font-size: 2rem;
-  color: var(--text-muted);
-}
-
-.monitor-loading {
-  display: flex;
-  flex-direction: column;
-  align-items: center;
-  gap: var(--spacing-sm);
-  padding: var(--spacing-xl);
-  color: var(--text-secondary);
-  text-align: center;
-}
+  <div class="model-config-view">
 
-.monitor-loading i {
-  font-size: 2rem;
-  color: var(--accent-primary);
-  animation: spin 1s linear infinite;
-}
-
-@keyframes spin {
-  from {
-    transform: rotate(0deg);
-  }
-
-  to {
-    transform: rotate(360deg);
-  }
-}
+    <div v-if="loading" class="loading-state">
+      <ProgressSpinner style="width:40px;height:40px" />
+      <span>Loading configuration…</span>
+    </div>
 
-.cpu-mode-info {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-md);
-}
+    <div v-else-if="!model" class="empty-state">
+      <i class="pi pi-exclamation-circle" style="font-size:3rem;color:var(--text-secondary)" />
+      <h3>Model not found</h3>
+      <Button label="Back to Models" icon="pi pi-arrow-left" @click="$router.push('/models')" />
+    </div>
 
-.cpu-mode-header {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-sm);
-  font-weight: 600;
-  color: var(--text-primary);
-  font-size: 1rem;
-}
+    <template v-else>
+      <!-- Header -->
+      <div class="config-header">
+        <Button icon="pi pi-arrow-left" text severity="secondary" @click="$router.push('/models')" />
+        <div class="header-info">
+          <h1>{{ model.display_name || model.base_model_name }}</h1>
+          <div class="header-meta">
+            <Tag :value="model.format || 'gguf'" severity="info" />
+            <Tag v-if="model.quantization" :value="model.quantization" severity="secondary" />
+            <a :href="`https://huggingface.co/${model.huggingface_id}`" target="_blank" class="hf-link">
+              <i class="pi pi-external-link" /> {{ model.huggingface_id }}
+            </a>
+          </div>
+        </div>
+      </div>
 
-.cpu-threads-info {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-sm);
-}
+      <!-- Engine Selector -->
+      <div class="config-card">
+        <div class="section-label">Engine</div>
+        <div class="engine-selector">
+          <div
+            v-for="eng in engineOptions"
+            :key="eng.value"
+            class="engine-option"
+            :class="{ selected: config.engine === eng.value }"
+            @click="changeEngine(eng.value)"
+          >
+            <i :class="['pi', eng.icon]" />
+            <span>{{ eng.label }}</span>
+          </div>
+        </div>
+      </div>
 
-.threads-item {
-  display: flex;
-  justify-content: space-between;
-  font-size: 0.875rem;
-}
+      <!-- Basic Parameters -->
+      <div class="config-card">
+        <div class="section-label">Basic Parameters</div>
+        <div class="params-grid">
+          <div v-for="param in basicParams" :key="param.key" class="param-field">
+            <label :for="`param-${param.key}`">
+              {{ param.label }}
+              <i class="pi pi-info-circle param-info" v-tooltip.top="param.description" />
+            </label>
+            <InputNumber
+              v-if="param.type === 'int'"
+              :id="`param-${param.key}`"
+              v-model="config[param.key]"
+              :placeholder="String(param.default ?? '')"
+              class="param-input"
+            />
+            <InputNumber
+              v-else-if="param.type === 'float'"
+              :id="`param-${param.key}`"
+              v-model="config[param.key]"
+              :minFractionDigits="1"
+              :maxFractionDigits="4"
+              :placeholder="String(param.default ?? '')"
+              class="param-input"
+            />
+            <InputSwitch
+              v-else-if="param.type === 'bool'"
+              :id="`param-${param.key}`"
+              v-model="config[param.key]"
+            />
+            <InputText
+              v-else
+              :id="`param-${param.key}`"
+              v-model="config[param.key]"
+              :placeholder="param.default != null ? String(param.default) : ''"
+              class="param-input"
+            />
+          </div>
+        </div>
+      </div>
 
-.threads-label {
-  color: var(--text-secondary);
-}
+      <!-- Advanced Parameters -->
+      <div class="config-card">
+        <div class="section-label">
+          Advanced Parameters
+          <span class="section-count" v-if="activeAdvancedParams.length">
+            {{ activeAdvancedParams.length }} active
+          </span>
+        </div>
 
-.threads-value {
-  color: var(--text-primary);
-  font-weight: 500;
-}
+        <div v-if="activeAdvancedParams.length" class="params-grid" style="margin-bottom:1rem">
+          <div v-for="param in activeAdvancedParams" :key="param.key" class="param-field">
+            <label :for="`adv-${param.key}`">
+              {{ param.label }}
+              <i class="pi pi-info-circle param-info" v-tooltip.top="param.description" />
+              <Button
+                icon="pi pi-times"
+                text
+                severity="danger"
+                size="small"
+                class="remove-param-btn"
+                v-tooltip.top="'Remove parameter'"
+                @click="removeAdvancedParam(param.key)"
+              />
+            </label>
+            <InputNumber
+              v-if="param.type === 'int'"
+              :id="`adv-${param.key}`"
+              v-model="config[param.key]"
+              :placeholder="String(param.default ?? '')"
+              class="param-input"
+            />
+            <InputNumber
+              v-else-if="param.type === 'float'"
+              :id="`adv-${param.key}`"
+              v-model="config[param.key]"
+              :minFractionDigits="1"
+              :maxFractionDigits="4"
+              :placeholder="String(param.default ?? '')"
+              class="param-input"
+            />
+            <InputSwitch
+              v-else-if="param.type === 'bool'"
+              :id="`adv-${param.key}`"
+              v-model="config[param.key]"
+            />
+            <InputText
+              v-else
+              :id="`adv-${param.key}`"
+              v-model="config[param.key]"
+              :placeholder="param.default != null ? String(param.default) : ''"
+              class="param-input"
+            />
+          </div>
+        </div>
 
-.cpu-performance-tip {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-sm);
-  padding: var(--spacing-sm);
-  background: rgba(34, 211, 238, 0.1);
-  border: 1px solid rgba(34, 211, 238, 0.3);
-  border-radius: var(--radius-md);
-  color: var(--accent-cyan);
-  font-size: 0.875rem;
-}
+        <div class="add-param-row">
+          <Dropdown
+            v-model="selectedNewParam"
+            :options="availableAdvancedParams"
+            optionLabel="label"
+            optionValue="key"
+            placeholder="Add parameter…"
+            filter
+            :filterPlaceholder="'Search parameters…'"
+            class="add-param-dropdown"
+          >
+            <template #option="{ option }">
+              <div class="param-option">
+                <span class="param-option-label">{{ option.label }}</span>
+                <span class="param-option-desc">{{ option.description }}</span>
+              </div>
+            </template>
+          </Dropdown>
+          <Button
+            icon="pi pi-plus"
+            label="Add"
+            severity="info"
+            outlined
+            :disabled="!selectedNewParam"
+            @click="addAdvancedParam"
+          />
+        </div>
+      </div>
 
+      <!-- Custom CLI Arguments -->
+      <div class="config-card">
+        <div class="section-label">
+          Custom Arguments
+          <small class="section-hint">Raw CLI flags appended to the server command</small>
+        </div>
+        <Textarea
+          v-model="config.custom_args"
+          rows="2"
+          placeholder="e.g. --some-flag value --another-flag"
+          style="width:100%;font-family:monospace;font-size:0.875rem"
+          autoResize
+        />
+      </div>
 
-/* Responsive */
-@media (max-width: 1200px) {
-  .config-grid {
-    grid-template-columns: 1fr;
-  }
+      <!-- Actions -->
+      <div class="config-actions">
+        <Button
+          label="Save Configuration"
+          icon="pi pi-save"
+          severity="success"
+          :loading="saving"
+          @click="saveConfig"
+        />
+        <Button
+          label="Reset to Saved"
+          icon="pi pi-refresh"
+          severity="secondary"
+          outlined
+          @click="resetConfig"
+        />
+      </div>
+    </template>
+  </div>
+</template>
 
-  .section-grid {
-    grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
-  }
-}
+<script setup>
+import { ref, computed, watch, onMounted } from 'vue'
+import { useRoute, useRouter } from 'vue-router'
+import { useToast } from 'primevue/usetoast'
+import axios from 'axios'
+import Button from 'primevue/button'
+import Tag from 'primevue/tag'
+import InputText from 'primevue/inputtext'
+import InputNumber from 'primevue/inputnumber'
+import InputSwitch from 'primevue/inputswitch'
+import Dropdown from 'primevue/dropdown'
+import Textarea from 'primevue/textarea'
+import ProgressSpinner from 'primevue/progressspinner'
+import { useModelStore } from '@/stores/models'
 
-@media (max-width: 768px) {
-  .config-layout {
-    grid-template-columns: 1fr;
-    gap: var(--spacing-lg);
-  }
+const route = useRoute()
+const router = useRouter()
+const toast = useToast()
+const modelStore = useModelStore()
 
-  /* Tablet: Stack memory dashboard */
-  .memory-dashboard {
-    grid-template-columns: 1fr;
-    gap: var(--spacing-lg);
-  }
+// ── State ──────────────────────────────────────────────────
+const loading = ref(true)
+const saving = ref(false)
+const model = ref(null)
+const config = ref({})
+const savedConfig = ref({})          // for reset
+const paramRegistry = ref({ basic: [], advanced: [] })
+const selectedNewParam = ref(null)
+const activeAdvancedKeys = ref([])   // keys of advanced params currently in the form
+
+const allEngineOptions = [
+  { value: 'llama_cpp', label: 'llama.cpp', icon: 'pi-microchip' },
+  { value: 'ik_llama',  label: 'ik_llama.cpp', icon: 'pi-microchip' },
+  { value: 'lmdeploy',  label: 'LMDeploy', icon: 'pi-server' },
+]
 
-  /* Tablet: Adjust quick start layout */
-  .quick-start-content {
-    grid-template-columns: 1fr;
-    gap: var(--spacing-xl);
-  }
+// GGUF is not compatible with LMDeploy; show LMDeploy only for safetensors
+const engineOptions = computed(() => {
+  const fmt = model.value?.format
+  if (fmt === 'safetensors') return allEngineOptions
+  return allEngineOptions.filter(eng => eng.value !== 'lmdeploy')
+})
 
-  /* Tablet: Full width preset cards */
-  .preset-cards {
-    width: 100%;
-  }
+// ── Computed ───────────────────────────────────────────────
+const basicParams = computed(() => paramRegistry.value.basic || [])
+const allAdvancedParams = computed(() => paramRegistry.value.advanced || [])
 
-  /* Tablet: Adjust search section */
-  .search-section {
-    max-width: 100%;
-  }
+const activeAdvancedParams = computed(() =>
+  allAdvancedParams.value.filter(p => activeAdvancedKeys.value.includes(p.key))
+)
 
-  /* Tablet: 2 column grid for config fields */
-  .tab-section {
-    grid-template-columns: repeat(2, 1fr);
-  }
+const availableAdvancedParams = computed(() =>
+  allAdvancedParams.value.filter(p => !activeAdvancedKeys.value.includes(p.key))
+)
 
-  .config-search-bar {
-    width: 100%;
+// ── Helpers ────────────────────────────────────────────────
+function findModelById(id) {
+  for (const group of modelStore.models) {
+    for (const q of group.quantizations || []) {
+      if (q.id === id) return { ...q, base_model_name: group.base_model_name, huggingface_id: group.huggingface_id }
+    }
   }
+  // Fallback: search allQuantizations
+  return modelStore.allQuantizations.find(m => m.id === id) ?? null
 }
 
-@media (max-width: 600px) {
-  .config-layout {
-    padding: var(--spacing-md);
-  }
-
-  .config-header {
-    flex-direction: column;
-    align-items: stretch;
-    gap: var(--spacing-md);
-  }
-
-  .header-actions {
-    flex-wrap: wrap;
-    justify-content: flex-start;
-    gap: var(--spacing-sm);
-  }
-
-  .action-buttons {
-    flex-wrap: wrap;
-  }
-
-  .config-grid {
-    grid-template-columns: 1fr;
-    gap: var(--spacing-lg);
-  }
-
-  /* Mobile: Single column grid for config fields */
-  .tab-section {
-    grid-template-columns: 1fr;
-  }
-
-  .config-search-bar {
-    width: 100%;
-  }
-
-  /* Memory Dashboard - Stack on mobile */
-  .memory-dashboard {
-    grid-template-columns: 1fr;
-    gap: var(--spacing-md);
-  }
-
-  /* Quick Start Modal - Full width on mobile */
-  :deep(.quick-start-modal) {
-    width: 95vw;
-    max-width: none;
-  }
-  
-  :deep(.quick-start-modal .p-dialog-content) {
-    max-height: 85vh;
-    padding: var(--spacing-lg);
-  }
-  
-  .quick-start-content {
-    padding: var(--spacing-lg);
-  }
-
-  .quick-start-content {
-    grid-template-columns: 1fr;
-    gap: var(--spacing-lg);
-  }
-
-  .preset-cards {
-    flex-direction: column;
-  }
-
-  .preset-card {
-    min-height: 80px;
-    padding: var(--spacing-md);
-  }
-
-  .preset-icon {
-    font-size: 1.5rem;
-  }
-
-  .preset-info h4 {
-    font-size: 1rem;
-  }
-
-  .preset-info p {
-    font-size: 0.85rem;
-  }
-
-  /* Smart Auto Section - Full width */
-  .smart-auto-section {
-    width: 100%;
-  }
-
-  .usage-mode-selector {
-    flex-direction: column;
-    gap: var(--spacing-sm);
-  }
-
-  .usage-mode-selector .radio-option {
-    min-height: 60px;
-    padding: var(--spacing-md);
-  }
-
-  /* Config Controls - Full width search */
-  .config-controls {
-    padding: var(--spacing-md);
-  }
-
-  .controls-row {
-    flex-direction: column;
-    gap: var(--spacing-md);
-  }
-
-  .search-section {
-    width: 100%;
-  }
-
-  .config-search-input {
-    width: 100%;
-  }
-
-  .section-controls {
-    width: 100%;
-    justify-content: space-between;
-  }
-
-  /* Performance Card - Stack metrics */
-  .performance-metrics {
-    grid-template-columns: 1fr;
-    gap: var(--spacing-md);
-  }
-
-  .metric-item {
-    padding: var(--spacing-md);
-  }
-
-  /* Touch targets - Minimum 44x44px */
-  .preset-card,
-  .radio-option,
-  button,
-  .p-button {
-    min-height: 44px;
-    min-width: 44px;
-  }
-
-  /* Larger spacing for touch */
-  .config-field {
-    gap: var(--spacing-md);
-  }
-
-  /* Full width inputs on mobile */
-  input[type="number"],
-  input[type="text"],
-  textarea,
-  .p-inputnumber-input,
-  .p-inputtext {
-    width: 100%;
-    min-width: 0;
-  }
-
-  /* Adjust section spacing */
-  .config-section {
-    margin-bottom: var(--spacing-lg);
-  }
-
-  /* Collapsible sections - Larger tap area */
-  .config-section summary {
-    min-height: 48px;
-    padding: var(--spacing-md);
-  }
-
-  .section-grid {
-    grid-template-columns: 1fr;
-  }
-
-  .model-meta {
-    flex-wrap: wrap;
+async function fetchParamRegistry(engine) {
+  try {
+    const { data } = await axios.get('/api/models/param-registry', { params: { engine } })
+    paramRegistry.value = data
+  } catch (e) {
+    console.error('Failed to fetch param registry:', e)
+    paramRegistry.value = { basic: [], advanced: [] }
   }
 }
 
-/* Enhanced Focus Styles for Accessibility */
-.preset-card:focus {
-  outline: 3px solid var(--accent-cyan);
-  outline-offset: 2px;
-  box-shadow: 0 0 0 3px var(--focus-ring), var(--shadow-md);
-}
-
-.preset-card:focus-visible {
-  outline: 3px solid var(--accent-cyan);
-  outline-offset: 2px;
-}
-
-.radio-option:focus {
-  outline: 3px solid var(--accent-cyan);
-  outline-offset: 2px;
-  box-shadow: 0 0 0 3px var(--focus-ring);
-}
-
-.radio-option:focus-visible {
-  outline: 3px solid var(--accent-cyan);
-  outline-offset: 2px;
-}
-
-.config-section summary:focus {
-  outline: 3px solid var(--accent-cyan);
-  outline-offset: 2px;
-}
-
-.config-section summary:focus-visible {
-  outline: 3px solid var(--accent-cyan);
-  outline-offset: 2px;
-}
-
-/* Skip to main content link for keyboard navigation */
-.skip-link {
-  position: absolute;
-  top: -40px;
-  left: 0;
-  background: var(--accent-cyan);
-  color: white;
-  padding: var(--spacing-sm) var(--spacing-md);
-  text-decoration: none;
-  z-index: 1000;
-  border-radius: var(--radius-md);
-}
-
-.skip-link:focus {
-  top: var(--spacing-md);
-  outline: 3px solid var(--accent-blue);
-  outline-offset: 2px;
-}
-
-.stacked-bar {
-  position: relative;
-  width: 100%;
-  height: 10px;
-  background: var(--bg-secondary);
-  border-radius: 5px;
-  overflow: hidden;
-}
-
-.stacked-bar .bar-current {
-  position: absolute;
-  left: 0;
-  top: 0;
-  bottom: 0;
-  background: var(--accent-red);
-}
-
-.stacked-bar .bar-additional {
-  position: absolute;
-  top: 0;
-  bottom: 0;
-  background: linear-gradient(90deg, var(--accent-blue), var(--accent-cyan));
-  opacity: 0.8;
-  transform-origin: left;
-  /* left position is set via inline style dynamically */
+function detectActiveAdvancedKeys(cfg) {
+  const basicKeys = new Set([
+    ...(paramRegistry.value.basic || []).map(p => p.key),
+    'engine', 'custom_args',
+  ])
+  return Object.keys(cfg).filter(
+    k => !basicKeys.has(k) && cfg[k] != null && cfg[k] !== ''
+  )
 }
 
-/* Use inline styles for widths; classes control colors */
-.stacked-bar.success {
-  box-shadow: inset 0 0 0 1px rgba(16, 185, 129, 0.2);
+// ── Engine change ──────────────────────────────────────────
+async function changeEngine(engine) {
+  config.value.engine = engine
+  await fetchParamRegistry(engine)
+  // Recompute which advanced keys are active with new registry
+  activeAdvancedKeys.value = detectActiveAdvancedKeys(config.value)
 }
 
-.stacked-bar.warning {
-  box-shadow: inset 0 0 0 1px rgba(234, 179, 8, 0.3);
+// ── Advanced param management ──────────────────────────────
+function addAdvancedParam() {
+  if (!selectedNewParam.value) return
+  const param = allAdvancedParams.value.find(p => p.key === selectedNewParam.value)
+  if (!param) return
+  if (!activeAdvancedKeys.value.includes(param.key)) {
+    activeAdvancedKeys.value.push(param.key)
+    if (config.value[param.key] == null) {
+      config.value[param.key] = param.default ?? null
+    }
+  }
+  selectedNewParam.value = null
 }
 
-/* New Memory Dashboard Styles */
-.memory-dashboard {
-  display: grid;
-  grid-template-columns: repeat(auto-fit, minmax(min(400px, 100%), 1fr));
-  gap: var(--spacing-lg);
-  margin-bottom: var(--spacing-xl);
-  width: 100%;
-  max-width: 100%;
-  box-sizing: border-box;
-  min-width: 0;
-  overflow-x: visible;
+function removeAdvancedParam(key) {
+  activeAdvancedKeys.value = activeAdvancedKeys.value.filter(k => k !== key)
+  delete config.value[key]
 }
 
-@media (max-width: 1400px) {
-  .memory-dashboard {
-    grid-template-columns: 1fr;
+// ── Load ───────────────────────────────────────────────────
+async function loadAll() {
+  loading.value = true
+  try {
+    if (!modelStore.models.length) await modelStore.fetchModels()
+    const found = findModelById(route.params.id)
+    if (!found) { loading.value = false; return }
+    model.value = found
+
+    let engine = found.engine || 'llama_cpp'
+    // GGUF is not compatible with LMDeploy; force llama_cpp if saved config had lmdeploy
+    if (found.format !== 'safetensors' && engine === 'lmdeploy') engine = 'llama_cpp'
+    await fetchParamRegistry(engine)
+
+    const { data: cfg } = await axios.get(`/api/models/${route.params.id}/config`)
+    const merged = { engine, ...cfg }
+    config.value = merged
+    savedConfig.value = JSON.parse(JSON.stringify(merged))
+    activeAdvancedKeys.value = detectActiveAdvancedKeys(merged)
+  } catch (e) {
+    toast.add({ severity: 'error', summary: 'Failed to load config', detail: e.message, life: 4000 })
+  } finally {
+    loading.value = false
   }
 }
 
-@media (max-width: 900px) {
-  .memory-dashboard {
-    grid-template-columns: 1fr;
-    min-width: 0;
+// ── Save ───────────────────────────────────────────────────
+async function saveConfig() {
+  saving.value = true
+  try {
+    // Build clean config: only include non-null values for advanced params
+    const toSave = { ...config.value }
+    // Remove advanced params with null/empty values (treat as "not set")
+    for (const key of activeAdvancedKeys.value) {
+      if (toSave[key] == null || toSave[key] === '') {
+        delete toSave[key]
+      }
+    }
+    await axios.put(`/api/models/${route.params.id}/config`, toSave)
+    savedConfig.value = JSON.parse(JSON.stringify(toSave))
+    toast.add({ severity: 'success', summary: 'Saved', detail: 'Configuration saved', life: 2000 })
+  } catch (e) {
+    toast.add({ severity: 'error', summary: 'Save failed', detail: e.message, life: 4000 })
+  } finally {
+    saving.value = false
   }
 }
 
-@media (max-width: 480px) {
-  .memory-dashboard {
-    grid-template-columns: 1fr;
-    gap: var(--spacing-md);
-  }
+// ── Reset ──────────────────────────────────────────────────
+function resetConfig() {
+  config.value = JSON.parse(JSON.stringify(savedConfig.value))
+  activeAdvancedKeys.value = detectActiveAdvancedKeys(config.value)
+  toast.add({ severity: 'info', summary: 'Reset', detail: 'Config reset to saved values', life: 2000 })
 }
 
-.memory-status-card {
-  background: var(--gradient-card);
-  border: 2px solid var(--border-primary);
-  border-radius: var(--radius-xl);
-  padding: var(--spacing-xl);
-  transition: all var(--transition-normal);
-  position: relative;
-  overflow: hidden;
-  min-width: 0;
-  max-width: 100%;
-  box-sizing: border-box;
-  word-wrap: break-word;
-}
+// ── Lifecycle ──────────────────────────────────────────────
+onMounted(loadAll)
+</script>
 
-.memory-status-card::before {
-  content: '';
-  position: absolute;
-  top: 0;
-  left: 0;
-  right: 0;
-  height: 4px;
-  background: var(--border-primary);
-  transition: all var(--transition-normal);
+<style scoped>
+.model-config-view {
+  max-width: 960px;
+  margin: 0 auto;
+  padding: var(--spacing-lg, 1.5rem);
+  display: flex;
+  flex-direction: column;
+  gap: var(--spacing-lg, 1.5rem);
 }
 
-.memory-status-card.status-good::before {
-  background: var(--gradient-success);
+/* ── Loading / Empty ──────────────────────────────────── */
+.loading-state,
+.empty-state {
+  display: flex;
+  flex-direction: column;
+  align-items: center;
+  gap: 1rem;
+  padding: 4rem 0;
+  color: var(--text-secondary, #9ca3af);
 }
 
-.memory-status-card.status-warning::before {
-  background: var(--gradient-warning);
+/* ── Header ───────────────────────────────────────────── */
+.config-header {
+  display: flex;
+  align-items: flex-start;
+  gap: 0.75rem;
 }
 
-.memory-status-card.status-critical::before {
-  background: var(--gradient-error);
+.header-info { flex: 1; }
+
+.header-info h1 {
+  font-size: 1.25rem;
+  font-weight: 700;
+  margin: 0 0 0.4rem;
+  line-height: 1.2;
 }
 
-.memory-card-header {
+.header-meta {
   display: flex;
   align-items: center;
-  gap: var(--spacing-md);
-  margin-bottom: var(--spacing-lg);
+  gap: 0.5rem;
+  flex-wrap: wrap;
 }
 
-.memory-status-icon {
-  width: 48px;
-  height: 48px;
+.hf-link {
+  font-size: 0.875rem;
+  color: var(--accent-cyan, #22d3ee);
+  text-decoration: none;
   display: flex;
   align-items: center;
-  justify-content: center;
-  border-radius: var(--radius-lg);
-  font-size: 1.5rem;
-  flex-shrink: 0;
-}
-
-.memory-status-card.status-good .memory-status-icon {
-  background: rgba(16, 185, 129, 0.15);
-  color: var(--status-success);
-}
-
-.memory-status-card.status-warning .memory-status-icon {
-  background: rgba(245, 158, 11, 0.15);
-  color: var(--status-warning);
+  gap: 0.25rem;
 }
 
-.memory-status-card.status-critical .memory-status-icon {
-  background: rgba(239, 68, 68, 0.15);
-  color: var(--status-error);
-}
+.hf-link:hover { text-decoration: underline; }
 
-.memory-card-title {
-  flex: 1;
+/* ── Card ─────────────────────────────────────────────── */
+.config-card {
+  background: var(--bg-card, #161b2e);
+  border: 1px solid var(--border-primary, #2a2f45);
+  border-radius: var(--radius-lg, 0.75rem);
+  padding: 1.25rem;
 }
 
-.memory-card-title h4 {
-  margin: 0 0 var(--spacing-xs) 0;
-  font-size: 1.25rem;
-  font-weight: 600;
-  color: var(--text-primary);
+.section-label {
+  font-size: 0.75rem;
+  font-weight: 700;
+  text-transform: uppercase;
+  letter-spacing: 0.08em;
+  color: var(--text-secondary, #9ca3af);
+  margin-bottom: 0.875rem;
+  display: flex;
+  align-items: center;
+  gap: 0.5rem;
 }
 
-.memory-status-badge {
-  display: inline-block;
-  padding: var(--spacing-xs) var(--spacing-sm);
-  border-radius: var(--radius-sm);
-  font-size: 0.875rem;
+.section-count {
+  background: var(--accent-cyan, #22d3ee);
+  color: #000;
+  border-radius: 999px;
+  padding: 0.1em 0.5em;
+  font-size: 0.7rem;
   font-weight: 600;
-  letter-spacing: 0.5px;
 }
 
-.memory-status-card.status-good .memory-status-badge {
-  background: rgba(16, 185, 129, 0.15);
-  color: var(--status-success);
-}
-
-.memory-status-card.status-warning .memory-status-badge {
-  background: rgba(245, 158, 11, 0.15);
-  color: var(--status-warning);
-}
-
-.memory-status-card.status-critical .memory-status-badge {
-  background: rgba(239, 68, 68, 0.15);
-  color: var(--status-error);
-}
-
-.memory-status-content {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-md);
+.section-hint {
+  font-weight: 400;
+  text-transform: none;
+  letter-spacing: normal;
+  color: var(--text-secondary, #9ca3af);
+  opacity: 0.7;
 }
 
-.memory-usage-display {
+/* ── Engine selector ──────────────────────────────────── */
+.engine-selector {
   display: flex;
-  flex-direction: column;
-  gap: var(--spacing-sm);
-  padding: var(--spacing-md);
-  background: var(--bg-surface);
-  border-radius: var(--radius-md);
+  gap: 0.5rem;
+  flex-wrap: wrap;
 }
 
-.usage-item {
+.engine-option {
   display: flex;
   align-items: center;
-  gap: var(--spacing-sm);
-  font-size: 0.9rem;
-}
-
-.usage-item.total {
-  font-weight: 600;
-  padding-top: var(--spacing-sm);
-  border-top: 1px solid var(--border-primary);
-  font-size: 1rem;
-}
-
-.usage-label {
-  color: var(--text-secondary);
-  min-width: 80px;
+  gap: 0.5rem;
+  padding: 0.5rem 1rem;
+  border-radius: var(--radius-md, 0.5rem);
+  border: 1px solid var(--border-primary, #2a2f45);
+  cursor: pointer;
+  transition: all 0.15s;
+  font-size: 0.875rem;
+  user-select: none;
 }
 
-.usage-value {
-  color: var(--text-primary);
-  font-weight: 500;
-  flex: 1;
+.engine-option:hover {
+  border-color: var(--accent-cyan, #22d3ee);
+  background: rgba(34, 211, 238, 0.05);
 }
 
-.usage-item.total .usage-value {
-  font-weight: 700;
-  color: var(--accent-cyan);
+.engine-option.selected {
+  border-color: var(--accent-cyan, #22d3ee);
+  background: rgba(34, 211, 238, 0.1);
+  color: var(--accent-cyan, #22d3ee);
+  font-weight: 600;
 }
 
-.usage-fraction {
-  color: var(--text-secondary);
-  font-size: 0.875rem;
+/* ── Params grid ──────────────────────────────────────── */
+.params-grid {
+  display: grid;
+  grid-template-columns: repeat(auto-fill, minmax(240px, 1fr));
+  gap: 0.875rem;
 }
 
-.memory-progress-bar {
+.param-field {
   display: flex;
   flex-direction: column;
-  gap: var(--spacing-sm);
-}
-
-.progress-label {
-  font-size: 0.875rem;
-  color: var(--text-secondary);
-  text-align: center;
+  gap: 0.25rem;
 }
 
-.memory-message {
-  padding: var(--spacing-md);
-  border-radius: var(--radius-md);
-  font-size: 0.9rem;
-  line-height: 1.5;
+.param-field label {
+  font-size: 0.8rem;
   font-weight: 500;
+  color: var(--text-secondary, #9ca3af);
+  display: flex;
+  align-items: center;
+  gap: 0.3rem;
 }
 
-.memory-message.status-good {
-  background: rgba(16, 185, 129, 0.1);
-  color: var(--status-success);
-  border: 1px solid rgba(16, 185, 129, 0.2);
-}
+.param-input { width: 100%; }
 
-.memory-message.status-warning {
-  background: rgba(245, 158, 11, 0.1);
-  color: var(--status-warning);
-  border: 1px solid rgba(245, 158, 11, 0.2);
+.param-info {
+  font-size: 0.7rem;
+  cursor: help;
+  opacity: 0.6;
 }
 
-.memory-message.status-critical {
-  background: rgba(239, 68, 68, 0.1);
-  color: var(--status-error);
-  border: 1px solid rgba(239, 68, 68, 0.2);
+.remove-param-btn {
+  margin-left: auto;
+  padding: 0 !important;
+  height: auto !important;
+  width: auto !important;
 }
 
-.memory-loading {
+/* ── Add param row ────────────────────────────────────── */
+.add-param-row {
   display: flex;
-  flex-direction: column;
+  gap: 0.5rem;
   align-items: center;
-  justify-content: center;
-  gap: var(--spacing-md);
-  padding: var(--spacing-xl);
-  color: var(--text-secondary);
 }
 
-.memory-loading i {
-  font-size: 2rem;
-  color: var(--accent-primary);
-  animation: spin 1s linear infinite;
-}
+.add-param-dropdown { flex: 1; }
 
-.advanced-section {
-  grid-column: 1 / -1;
-  padding: var(--spacing-sm) 0;
+.param-option {
+  display: flex;
+  flex-direction: column;
+  gap: 0.15rem;
 }
 
-.advanced-section summary {
-  cursor: pointer;
-  font-weight: 600;
-  color: var(--text-primary);
-  margin-bottom: var(--spacing-sm);
-}
+.param-option-label { font-size: 0.875rem; font-weight: 500; }
+.param-option-desc  { font-size: 0.75rem; color: var(--text-secondary, #9ca3af); }
 
-.advanced-grid {
-  display: grid;
-  grid-template-columns: repeat(auto-fit, minmax(220px, 1fr));
-  gap: var(--spacing-md);
+/* ── Actions ──────────────────────────────────────────── */
+.config-actions {
+  display: flex;
+  gap: 0.75rem;
+  justify-content: flex-end;
+  padding-bottom: var(--spacing-lg, 1.5rem);
 }
-</style>
\ No newline at end of file
+</style>
diff --git a/frontend/src/views/ModelLibrary.vue b/frontend/src/views/ModelLibrary.vue
index a49820c..fe75ba8 100644
--- a/frontend/src/views/ModelLibrary.vue
+++ b/frontend/src/views/ModelLibrary.vue
@@ -1,775 +1,554 @@
-<template>
-  <div class="model-library">
-    <div class="card">
-      <div class="card-header">
-        <h2 class="card-title">Downloaded Models</h2>
-        <div class="header-actions">
-          <div class="connection-info">
-            <div class="live-indicator" v-if="wsStore.isConnected">
-              <i class="pi pi-circle-fill" style="color: #22d3ee; font-size: 0.5rem;"></i>
-              <span>Live</span>
-            </div>
-            <div class="connection-status" v-else>
-              <i class="pi pi-circle" style="color: #ef4444; font-size: 0.5rem;"></i>
-              <span>{{ wsStore.connectionStatus }}</span>
-            </div>
-          </div>
-          <Button 
-            icon="pi pi-refresh" 
-            @click="refreshModels"
-            :loading="modelStore.loading"
-            severity="secondary"
-            text
-          />
-        </div>
-      </div>
-
-      <!-- Download Progress -->
-      <DownloadProgress />
-
-      <!-- Downloaded Models -->
-      <div 
-        v-if="hasAnyModels" 
-        class="downloaded-models"
-        @touchstart="handlePullToRefreshStart"
-        @touchmove="handlePullToRefreshMove"
-        @touchend="handlePullToRefreshEnd"
-      >
-        <div v-if="pullToRefreshDistance > 0" class="pull-to-refresh-indicator" :style="{ transform: `translateY(${Math.min(pullToRefreshDistance, 60)}px)` }">
-          <i v-if="!modelStore.loading" class="pi pi-arrow-down" :class="{ 'rotated': pullToRefreshDistance >= 60 }"></i>
-          <i v-else class="pi pi-spin pi-spinner"></i>
-          <span>{{ pullToRefreshDistance >= 60 ? 'Release to refresh' : 'Pull to refresh' }}</span>
-        </div>
-        <GgufModelList
-          v-if="hasGgufModels"
-          :model-groups="modelStore.modelGroups"
-          :selected-quantization="selectedQuantization"
-          :starting-models="startingModels"
-          :stopping-models="stoppingModels"
-          @select-quantization="handleSelectQuantization"
-          @start="startSelectedQuantization"
-          @stop="stopRunningQuantization"
-          @configure="configureSelectedQuantization"
-          @delete-quantization="confirmDeleteQuantization"
-          @delete-group="confirmDeleteGroup"
-        />
-        <SafetensorsModelList
-          v-if="hasSafetensorsModels"
-          :models="modelStore.safetensorsModels"
-          :loading="modelStore.safetensorsLoading"
-          @refresh="refreshSafetensors"
-          @delete="confirmDeleteSafetensors"
-        />
-      </div>
-
-      <!-- Empty State -->
-      <div v-else class="empty-state">
-        <i class="pi pi-download"></i>
-        <h3>No Models Downloaded</h3>
-        <p>Download models from HuggingFace to get started.</p>
-        <Button 
-          label="Search Models" 
-          icon="pi pi-search"
-          @click="goToSearch"
-          severity="info"
-        />
-      </div>
-
-    </div>
-  </div>
-</template>
-
-<script setup>
-import { ref, onMounted, onUnmounted, computed } from 'vue'
-import { useRouter } from 'vue-router'
-import { useModelStore } from '@/stores/models'
-import { useWebSocketStore } from '@/stores/websocket'
-import { toast } from 'vue3-toastify'
-import { useConfirm } from 'primevue/useconfirm'
-import Button from 'primevue/button'
-import DownloadProgress from '@/components/DownloadProgress.vue'
-import GgufModelList from '@/components/GgufModelList.vue'
-import SafetensorsModelList from '@/components/SafetensorsModelList.vue'
-
-const router = useRouter()
-const modelStore = useModelStore()
-const wsStore = useWebSocketStore()
-const confirm = useConfirm()
-
-// Reactive state
-const startingModels = ref({})
-const stoppingModels = ref({})
-const selectedQuantization = ref({}) // Track selected quantization per model group
-
-// Pull-to-refresh state
-const pullToRefreshStartY = ref(0)
-const pullToRefreshDistance = ref(0)
-const pullToRefreshThreshold = 60
-const isPullToRefreshActive = ref(false)
-
-let unsubscribeModelStatus = null
-let unsubscribeUnifiedMonitoring = null
-let unsubscribeModelEvents = null
-
-const hasGgufModels = computed(() => modelStore.modelGroups.length > 0)
-const hasSafetensorsModels = computed(() => (modelStore.safetensorsModels || []).length > 0)
-const hasAnyModels = computed(() => hasGgufModels.value || hasSafetensorsModels.value)
-
-const autoSelectQuantizations = () => {
-  modelStore.modelGroups.forEach(group => {
-    if (!selectedQuantization.value[group.huggingface_id] && group.quantizations.length > 0) {
-      selectedQuantization.value[group.huggingface_id] = group.quantizations[0].id
-    }
-  })
-}
-
-onMounted(async () => {
-  await modelStore.fetchModels()
-  await modelStore.fetchSafetensorsModels()
-
-  try {
-    await modelStore.fetchLmdeployStatus()
-  } catch (error) {
-    console.error('Failed to load LMDeploy status', error)
-  }
-
-  // Subscribe to model status updates
-  unsubscribeModelStatus = wsStore.subscribeToModelStatus((data) => {
-    if (data.model_id) {
-      // Find the quantization in the grouped structure
-      modelStore.modelGroups.forEach(group => {
-        const quantization = group.quantizations.find(q => q.id === data.model_id)
-        if (quantization) {
-          quantization.is_active = data.is_active
-          quantization.loading = false
-        }
-      })
-    }
-  })
-  
-  // Subscribe to unified monitoring for real-time model status updates
-  unsubscribeUnifiedMonitoring = wsStore.subscribeToUnifiedMonitoring((data) => {
-    if (data.models) {
-      const runningInstances = data.models.running_instances || []
-      const loadingModels = data.models.loading || {}
-      
-      // Update loading models in the store
-      modelStore.updateLoadingModels(loadingModels)
-      
-      // Create a set of all running model proxy names
-      const runningProxyNames = new Set()
-      
-      // Add proxy names from running instances
-      runningInstances.forEach(instance => {
-        if (instance.proxy_model_name) {
-          runningProxyNames.add(instance.proxy_model_name)
-        }
-      })
-      
-      // Create a set of loading model proxy names
-      const loadingProxyNames = new Set(Object.keys(loadingModels))
-      
-      // Update all model quantizations based on running/loading status
-      modelStore.modelGroups.forEach(group => {
-        group.quantizations.forEach(quantization => {
-          const proxyName = quantization.proxy_name || ''
-          const isRunning = runningProxyNames.has(proxyName)
-          const isLoading = loadingProxyNames.has(proxyName)
-          
-          // Determine state
-          let state = null
-          if (isLoading) {
-            state = 'loading'
-          } else if (isRunning) {
-            state = 'ready'
-          }
-          
-          // Update model status based on whether it's running, loading, or stopped
-          modelStore.updateModelStatus(quantization.id, {
-            is_active: isRunning,
-            llama_swap_status: isLoading ? 'loading' : (isRunning ? 'running' : 'stopped'),
-            llama_swap_model_name: isRunning || isLoading ? proxyName : null,
-            llama_swap_state: state
-          })
-        })
-      })
-    }
-  })
-  
-  // Subscribe to model events for instant updates (no polling)
-  unsubscribeModelEvents = wsStore.subscribeToModelEvents((data) => {
-    const { event, model: proxyName } = data
-    
-    // Find the quantization by proxy name and update immediately
-    modelStore.modelGroups.forEach(group => {
-      group.quantizations.forEach(quantization => {
-        if (quantization.proxy_name === proxyName) {
-          switch (event) {
-            case 'loading':
-              modelStore.updateModelStatus(quantization.id, {
-                llama_swap_status: 'loading',
-                llama_swap_state: 'loading'
-              })
-              break
-            case 'ready':
-              modelStore.updateModelStatus(quantization.id, {
-                is_active: true,
-                llama_swap_status: 'running',
-                llama_swap_state: 'ready'
-              })
-              break
-            case 'stopped':
-              modelStore.updateModelStatus(quantization.id, {
-                is_active: false,
-                llama_swap_status: 'stopped',
-                llama_swap_state: null
-              })
-              break
-          }
-        }
-      })
-    })
-  })
-  
-  autoSelectQuantizations()
-})
-
-onUnmounted(() => {
-  if (typeof unsubscribeModelStatus === 'function') {
-    unsubscribeModelStatus()
-    unsubscribeModelStatus = null
-  }
-  if (typeof unsubscribeUnifiedMonitoring === 'function') {
-    unsubscribeUnifiedMonitoring()
-    unsubscribeUnifiedMonitoring = null
-  }
-  if (typeof unsubscribeModelEvents === 'function') {
-    unsubscribeModelEvents()
-    unsubscribeModelEvents = null
-  }
-})
-
-const handleSelectQuantization = ({ huggingfaceId, quantizationId }) => {
-  if (!huggingfaceId || !quantizationId) return
-  selectedQuantization.value[huggingfaceId] = quantizationId
-}
-
-const startSelectedQuantization = async (modelGroup) => {
-  const quantizationId = selectedQuantization.value[modelGroup.huggingface_id]
-  if (!quantizationId) return
-  
-  startingModels.value[quantizationId] = true
-  try {
-    await modelStore.startModel(quantizationId)
-    toast.success('Model is starting up')
-  } catch (error) {
-    toast.error('Failed to start model')
-  } finally {
-    startingModels.value[quantizationId] = false
-  }
-}
-
-const stopRunningQuantization = async ({ quantizationId }) => {
-  const runningId = quantizationId
-  if (!runningId) return
-  
-  stoppingModels.value[runningId] = true
-  try {
-    await modelStore.stopModel(runningId)
-    toast.success('Model has been stopped')
-  } catch (error) {
-    toast.error('Failed to stop model')
-  } finally {
-    stoppingModels.value[runningId] = false
-  }
-}
-
-const configureSelectedQuantization = (modelGroup) => {
-  const quantizationId = selectedQuantization.value[modelGroup.huggingface_id]
-  if (!quantizationId) return
-  
-  router.push(`/models/${quantizationId}/config`)
-}
-
-const confirmDeleteQuantization = (quantization) => {
-  confirm.require({
-    message: `Are you sure you want to delete the "${quantization.quantization}" quantization? This will remove the model file and cannot be undone.`,
-    header: 'Delete Quantization',
-    icon: 'pi pi-exclamation-triangle',
-    rejectLabel: 'Cancel',
-    acceptLabel: 'Delete',
-    accept: async () => {
-      try {
-        await modelStore.deleteModel(quantization.id)
-        toast.success(`${quantization.quantization} quantization has been deleted`)
-        
-        // If this was the selected quantization, select another one
-        const modelGroup = modelStore.modelGroups.find(g => 
-          g.quantizations.some(q => q.id === quantization.id)
-        )
-        if (modelGroup && selectedQuantization.value[modelGroup.huggingface_id] === quantization.id) {
-          const remaining = modelGroup.quantizations.filter(q => q.id !== quantization.id)
-          if (remaining.length > 0) {
-            selectedQuantization.value[modelGroup.huggingface_id] = remaining[0].id
-          } else {
-            delete selectedQuantization.value[modelGroup.huggingface_id]
-          }
-        }
-      } catch (error) {
-        toast.error('Failed to delete quantization')
-      }
-    }
-  })
-}
-
-const confirmDeleteGroup = (modelGroup) => {
-  confirm.require({
-    message: `Are you sure you want to delete all quantizations of "${modelGroup.huggingface_id}"? This will remove all model files and cannot be undone.`,
-    header: 'Delete All Quantizations',
-    icon: 'pi pi-exclamation-triangle',
-    rejectLabel: 'Cancel',
-    acceptLabel: 'Delete All',
-    accept: async () => {
-      try {
-        await modelStore.deleteModelGroup(modelGroup.huggingface_id)
-        toast.success(`${modelGroup.huggingface_id} has been deleted`)
-        
-        // Remove from selected quantizations
-        delete selectedQuantization.value[modelGroup.huggingface_id]
-      } catch (error) {
-        toast.error('Failed to delete model group')
-      }
-    }
-  })
-}
-
-const confirmDeleteSafetensors = (group) => {
-  const modelName = group?.huggingface_id || 'this model'
-  const fileCount = group?.files?.length || 0
-  confirm.require({
-    message: `Delete safetensors model "${modelName}" (${fileCount} file${fileCount !== 1 ? 's' : ''})? This action cannot be undone.`,
-    header: 'Delete Safetensors Model',
-    icon: 'pi pi-exclamation-triangle',
-    rejectLabel: 'Cancel',
-    acceptLabel: 'Delete',
-    accept: async () => {
-      try {
-        await modelStore.deleteSafetensorsModel(group.huggingface_id)
-        toast.success('Safetensors model deleted')
-      } catch (error) {
-        toast.error('Failed to delete safetensors model')
-      }
-    }
-  })
-}
-
-const refreshModels = async () => {
-  try {
-    await modelStore.fetchModels()
-    await modelStore.fetchSafetensorsModels()
-    autoSelectQuantizations()
-    toast.success('Models refreshed')
-  } catch (error) {
-    toast.error('Failed to refresh models')
-  }
-}
-
-const refreshSafetensors = async () => {
-  try {
-    await modelStore.fetchSafetensorsModels()
-    await modelStore.fetchLmdeployStatus()
-    toast.success('Safetensors list refreshed')
-  } catch (error) {
-    toast.error('Failed to refresh safetensors list')
-  }
-}
-
-// Pull-to-refresh handlers
-const handlePullToRefreshStart = (e) => {
-  // Only trigger if user is at the top of the page
-  if (window.scrollY === 0 && e.touches && e.touches.length > 0) {
-    pullToRefreshStartY.value = e.touches[0].clientY
-    isPullToRefreshActive.value = true
-  }
-}
-
-const handlePullToRefreshMove = (e) => {
-  if (!isPullToRefreshActive.value || !e.touches || e.touches.length === 0) return
-  
-  const currentY = e.touches[0].clientY
-  const deltaY = currentY - pullToRefreshStartY.value
-  
-  // Only allow pull if scrolling from top
-  if (window.scrollY === 0 && deltaY > 0) {
-    pullToRefreshDistance.value = deltaY
-    // Prevent default scrolling if pulling down significantly
-    if (deltaY > 10) {
-      e.preventDefault()
-    }
-  } else {
-    // Reset if user scrolls up
-    pullToRefreshDistance.value = 0
-    isPullToRefreshActive.value = false
-  }
-}
-
-const handlePullToRefreshEnd = (e) => {
-  if (pullToRefreshDistance.value >= pullToRefreshThreshold && window.scrollY === 0) {
-    // Trigger refresh
-    refreshModels()
-  }
-  
-  // Reset state
-  pullToRefreshDistance.value = 0
-  pullToRefreshStartY.value = 0
-  isPullToRefreshActive.value = false
-}
-
-const goToSearch = () => {
-  router.push('/search')
-}
-
-
-
-</script>
-
-<style scoped>
-.model-library {
-  max-width: 1400px;
-  margin: 0 auto;
-}
-
-.downloaded-models {
-  position: relative;
-  margin-top: var(--spacing-md);
-}
-
-.pull-to-refresh-indicator {
-  position: absolute;
-  top: -50px;
-  left: 50%;
-  transform: translateX(-50%);
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-sm);
-  padding: var(--spacing-md);
-  color: var(--accent-cyan);
-  font-size: 0.9rem;
-  font-weight: 500;
-  z-index: 10;
-  transition: transform 0.2s ease-out;
-  pointer-events: none;
-}
-
-.pull-to-refresh-indicator i {
-  transition: transform 0.3s ease-out;
-}
-
-.pull-to-refresh-indicator i.rotated {
-  transform: rotate(180deg);
-}
-
-.model-grid {
-  display: grid;
-  grid-template-columns: repeat(auto-fill, minmax(280px, 1fr));
-  gap: var(--spacing-md);
-}
-
-.model-card {
-  background: var(--gradient-card);
-  border: 1px solid var(--border-primary);
-  border-radius: var(--radius-xl);
-  padding: var(--spacing-lg);
-  transition: all var(--transition-normal);
-  box-shadow: var(--shadow-md);
-  position: relative;
-  overflow: hidden;
-  backdrop-filter: blur(10px);
-  animation: fadeIn 0.6s ease-out;
-}
-
-.model-card::before {
-  content: '';
-  position: absolute;
-  top: 0;
-  left: 0;
-  right: 0;
-  height: 3px;
-  background: var(--gradient-primary);
-  opacity: 0;
-  transition: opacity var(--transition-normal);
-}
-
-.model-card:hover {
-  box-shadow: var(--shadow-lg), var(--glow-primary);
-  transform: translateY(-5px) scale(1.02);
-  border-color: var(--accent-cyan);
-}
-
-.model-card:hover::before {
-  opacity: 1;
-}
-
-.model-card-header {
-  display: flex;
-  justify-content: space-between;
-  align-items: flex-start;
-  margin-bottom: var(--spacing-sm);
-}
-
-.model-name {
-  font-weight: 700;
-  color: var(--text-primary);
-  margin-bottom: var(--spacing-sm);
-  font-size: 1.1rem;
-  line-height: 1.3;
-}
-
-.model-status {
-  display: flex;
-  align-items: center;
-}
-
-.status-indicator {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-xs);
-  padding: var(--spacing-xs) var(--spacing-sm);
-  border-radius: var(--radius-sm);
-  font-size: 0.75rem;
-  font-weight: 500;
-}
-
-.status-running {
-  background: rgba(16, 185, 129, 0.1);
-  color: var(--accent-green);
-  border: 1px solid rgba(16, 185, 129, 0.2);
-}
-
-.status-stopped {
-  background: var(--bg-surface);
-  color: var(--text-secondary);
-  border: 1px solid var(--border-secondary);
-}
-
-.model-actions {
-  display: flex;
-  justify-content: space-between;
-  align-items: center;
-  gap: var(--spacing-sm);
-  margin-top: var(--spacing-md);
-  padding-top: var(--spacing-sm);
-  border-top: 1px solid var(--border-primary);
-}
-
-.action-group {
-  display: flex;
-  gap: var(--spacing-xs);
-  flex-wrap: wrap;
-}
-
-.quantization-list {
-  margin: var(--spacing-sm) 0;
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-xs);
-}
-
-.quantization-item {
-  display: flex;
-  justify-content: space-between;
-  align-items: center;
-  padding: var(--spacing-sm);
-  background: var(--bg-surface);
-  border: 1px solid var(--border-primary);
-  border-radius: var(--radius-sm);
-  transition: all var(--transition-normal);
-}
-
-.quantization-item:hover {
-  border-color: var(--accent-cyan);
-  background: var(--bg-tertiary);
-}
-
-.quantization-item.selected {
-  border-color: var(--accent-blue);
-  background: rgba(59, 130, 246, 0.1);
-}
-
-.quantization-info {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-xs);
-  flex: 1;
-}
-
-.quantization-name {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-xs);
-  font-weight: 600;
-  color: var(--text-primary);
-  font-size: 0.875rem;
-}
-
-.quantization-details {
-  display: flex;
-  gap: var(--spacing-sm);
-  align-items: center;
-  font-size: 0.75rem;
-}
-
-.quantization-size {
-  color: var(--text-secondary);
-}
-
-.quantization-status {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-xs);
-  padding: 2px 6px;
-  border-radius: var(--radius-sm);
-  font-size: 0.75rem;
-}
-
-.quantization-status.running {
-  background: rgba(16, 185, 129, 0.1);
-  color: var(--accent-green);
-  border: 1px solid rgba(16, 185, 129, 0.2);
-}
-
-.quantization-status.running.llama-swap-running {
-  background: rgba(59, 130, 246, 0.1);
-  color: var(--accent-blue);
-  border: 1px solid rgba(59, 130, 246, 0.2);
-}
-
-.status-indicator.llama-swap-running {
-  background: rgba(59, 130, 246, 0.1);
-  color: var(--accent-blue);
-  border: 1px solid rgba(59, 130, 246, 0.2);
-}
-
-.upstream-link {
-  font-size: 0.7rem !important;
-  padding: 1px 3px !important;
-  height: auto !important;
-  background: rgba(34, 211, 238, 0.1) !important;
-  color: var(--accent-cyan) !important;
-  border: 1px solid rgba(34, 211, 238, 0.2) !important;
-  border-radius: var(--radius-sm) !important;
-  transition: all var(--transition-normal) !important;
-  min-width: 20px !important;
-  margin-left: var(--spacing-xs) !important;
-}
-
-.upstream-link:hover {
-  background: rgba(34, 211, 238, 0.2) !important;
-  border-color: var(--accent-cyan) !important;
-  transform: translateY(-1px) !important;
-  box-shadow: var(--shadow-sm) !important;
-}
-
-.connection-info {
-  display: flex;
-  align-items: center;
-}
-
-.live-indicator,
-.connection-status {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-xs);
-  font-size: 0.875rem;
-  color: var(--text-secondary);
-  font-weight: 500;
-}
-
-.live-indicator i {
-  animation: pulse 2s infinite;
-}
-
-.connection-status {
-  color: var(--status-error);
-}
-
-@keyframes pulse {
-  0% { opacity: 1; }
-  50% { opacity: 0.5; }
-  100% { opacity: 1; }
-}
-
-.quantization-actions {
-  display: flex;
-  gap: var(--spacing-xs);
-  align-items: center;
-}
-
-.quantization-actions .p-button {
-  padding: 2px 4px !important;
-  min-width: 24px !important;
-}
-
-.model-tag.tag-count {
-  background: var(--accent-cyan-soft);
-  color: var(--accent-cyan);
-  border: 1px solid color-mix(in srgb, var(--accent-cyan) 40%, transparent);
-}
-
-.empty-state {
-  text-align: center;
-  padding: var(--spacing-3xl) var(--spacing-xl);
-  color: var(--text-secondary);
-  background: var(--gradient-surface);
-  border-radius: var(--radius-xl);
-  border: 2px dashed var(--border-secondary);
-  margin: var(--spacing-xl) 0;
-  position: relative;
-  overflow: hidden;
-}
-
-.empty-state::before {
-  content: '';
-  position: absolute;
-  top: 0;
-  left: 0;
-  right: 0;
-  height: 2px;
-  background: var(--gradient-primary);
-  opacity: 0.3;
-}
-
-.empty-state i {
-  font-size: 3rem !important;
-  background: var(--gradient-primary);
-  -webkit-background-clip: text;
-  -webkit-text-fill-color: transparent;
-  background-clip: text;
-  margin-bottom: var(--spacing-lg);
-}
-
-.empty-state h3 {
-  margin: var(--spacing-lg) 0 var(--spacing-md);
-  color: var(--text-primary);
-  font-weight: 700;
-  font-size: 1.3rem;
-}
-
-.empty-state p {
-  font-size: 1rem;
-  line-height: 1.6;
-  max-width: 400px;
-  margin: 0 auto var(--spacing-lg);
-}
-
-/* Responsive */
-@media (max-width: 768px) {
-  .model-grid {
-    grid-template-columns: 1fr;
-  }
-  
-  .model-actions {
-    flex-direction: column;
-  }
-}
-</style>
\ No newline at end of file
+<template>
+  <div class="model-library">
+
+    <!-- Header -->
+    <div class="library-header">
+      <div class="header-left">
+        <h1>Models</h1>
+        <Tag v-if="totalModels" :value="`${totalModels} model${totalModels !== 1 ? 's' : ''}`" severity="info" />
+      </div>
+      <div class="header-actions">
+        <Button
+          icon="pi pi-refresh"
+          text
+          severity="secondary"
+          :loading="modelStore.loading"
+          v-tooltip.top="'Refresh'"
+          @click="modelStore.fetchModels()"
+        />
+        <Button
+          label="Search &amp; Download"
+          icon="pi pi-search"
+          severity="success"
+          outlined
+          @click="$router.push('/search')"
+        />
+      </div>
+    </div>
+
+    <!-- Token Warning -->
+    <div v-if="!modelStore.hasHuggingfaceToken" class="token-warning">
+      <i class="pi pi-key" />
+      <span>No HuggingFace token set. Gated models won't be accessible.</span>
+      <Button label="Set Token" icon="pi pi-pencil" size="small" text @click="showTokenDialog = true" />
+    </div>
+
+    <!-- Loading -->
+    <div v-if="modelStore.loading && !modelStore.models.length" class="loading-state">
+      <ProgressSpinner style="width:40px;height:40px" />
+      <span>Loading models…</span>
+    </div>
+
+    <!-- Empty state -->
+    <div v-else-if="!modelStore.loading && !modelStore.models.length" class="empty-state">
+      <i class="pi pi-inbox" style="font-size:3rem;color:var(--text-secondary)" />
+      <h3>No models downloaded yet</h3>
+      <p>Search HuggingFace to find and download models.</p>
+      <Button label="Search Models" icon="pi pi-search" @click="$router.push('/search')" />
+    </div>
+
+    <!-- Model groups -->
+    <div v-else class="model-groups">
+      <div
+        v-for="group in modelStore.models"
+        :key="group.huggingface_id"
+        class="model-group"
+      >
+        <!-- Group header -->
+        <div class="group-header" @click="toggleGroup(group.huggingface_id)">
+          <div class="group-title">
+            <i :class="['pi', 'group-chevron', expandedGroups.has(group.huggingface_id) ? 'pi-chevron-down' : 'pi-chevron-right']" />
+            <span class="group-name">{{ group.base_model_name || group.huggingface_id }}</span>
+            <Tag
+              v-if="group.quantizations?.some(q => q.is_active)"
+              value="Running"
+              severity="success"
+              class="running-badge"
+            />
+          </div>
+          <div class="group-meta">
+            <small>{{ group.huggingface_id }}</small>
+            <Button
+              icon="pi pi-trash"
+              text
+              severity="danger"
+              size="small"
+              v-tooltip.top="'Delete all quantizations'"
+              @click.stop="confirmDeleteGroup(group.huggingface_id)"
+            />
+          </div>
+        </div>
+
+        <!-- Quantizations list -->
+        <Transition name="group-collapse">
+          <div v-if="expandedGroups.has(group.huggingface_id)" class="quantizations">
+            <div
+              v-for="quant in group.quantizations"
+              :key="quant.id"
+              class="quant-row"
+              :class="{ 'is-active': quant.is_active }"
+            >
+              <div class="quant-info">
+                <div class="quant-main">
+                  <code class="quant-name">{{ quant.quantization || quant.name }}</code>
+                  <Tag v-if="quant.is_active" value="Running" severity="success" />
+                  <Tag :value="quant.engine || 'llama_cpp'" severity="secondary" />
+                  <Tag v-if="quant.format" :value="quant.format" severity="info" />
+                </div>
+                <div class="quant-sub">
+                  <span v-if="quant.file_size" class="file-size">
+                    {{ formatBytes(quant.file_size) }}
+                  </span>
+                  <span v-if="quant.downloaded_at" class="downloaded-at">
+                    Downloaded {{ formatDate(quant.downloaded_at) }}
+                  </span>
+                </div>
+              </div>
+
+              <div class="quant-actions">
+                <Button
+                  v-if="!quant.is_active"
+                  label="Start"
+                  icon="pi pi-play"
+                  size="small"
+                  severity="success"
+                  outlined
+                  :loading="startingModels.has(quant.id)"
+                  @click="startModel(quant.id)"
+                />
+                <Button
+                  v-else
+                  label="Stop"
+                  icon="pi pi-stop"
+                  size="small"
+                  severity="warning"
+                  outlined
+                  :loading="stoppingModels.has(quant.id)"
+                  @click="stopModel(quant.id)"
+                />
+                <Button
+                  icon="pi pi-cog"
+                  text
+                  severity="secondary"
+                  size="small"
+                  v-tooltip.top="'Configure'"
+                  @click="configureModel(quant.id)"
+                />
+                <Button
+                  icon="pi pi-trash"
+                  text
+                  severity="danger"
+                  size="small"
+                  v-tooltip.top="'Delete'"
+                  @click="confirmDeleteModel(quant.id)"
+                />
+              </div>
+            </div>
+          </div>
+        </Transition>
+      </div>
+    </div>
+
+    <!-- HuggingFace Token Dialog -->
+    <Dialog v-model:visible="showTokenDialog" header="HuggingFace Token" modal :style="{ width: '420px' }">
+      <div class="token-form">
+        <p class="token-desc">Required to access gated models (e.g. Llama, Gemma).</p>
+        <div class="form-field">
+          <label>Token</label>
+          <Password v-model="tokenInput" placeholder="hf_…" :feedback="false" toggleMask style="width:100%" />
+        </div>
+        <div v-if="modelStore.hasHuggingfaceToken" class="token-current">
+          <i class="pi pi-check-circle" style="color:#22c55e" />
+          <span>Token set: {{ modelStore.huggingfaceToken || '••••••••' }}</span>
+          <Button label="Clear" severity="danger" text size="small" @click="clearToken" />
+        </div>
+      </div>
+      <template #footer>
+        <Button label="Cancel" severity="secondary" outlined @click="showTokenDialog = false" />
+        <Button label="Save Token" icon="pi pi-save" severity="success"
+          :disabled="!tokenInput" :loading="savingToken" @click="saveToken" />
+      </template>
+    </Dialog>
+
+    <ConfirmDialog />
+  </div>
+</template>
+
+<script setup>
+import { ref, computed, onMounted, onUnmounted } from 'vue'
+import { useRouter } from 'vue-router'
+import { useConfirm } from 'primevue/useconfirm'
+import { useToast } from 'primevue/usetoast'
+import Button from 'primevue/button'
+import Tag from 'primevue/tag'
+import ProgressSpinner from 'primevue/progressspinner'
+import Dialog from 'primevue/dialog'
+import Password from 'primevue/password'
+import ConfirmDialog from 'primevue/confirmdialog'
+import { useModelStore } from '@/stores/models'
+
+const router = useRouter()
+const confirm = useConfirm()
+const toast = useToast()
+const modelStore = useModelStore()
+
+// ── State ──────────────────────────────────────────────────
+const expandedGroups = ref(new Set())
+const startingModels = ref(new Set())
+const stoppingModels = ref(new Set())
+const showTokenDialog = ref(false)
+const tokenInput = ref('')
+const savingToken = ref(false)
+let pollTimer = null
+
+// ── Computed ───────────────────────────────────────────────
+const totalModels = computed(() =>
+  modelStore.models.reduce((acc, g) => acc + (g.quantizations?.length ?? 0), 0)
+)
+
+// ── Group expand/collapse ──────────────────────────────────
+function toggleGroup(hfId) {
+  if (expandedGroups.value.has(hfId)) {
+    expandedGroups.value.delete(hfId)
+  } else {
+    expandedGroups.value.add(hfId)
+  }
+}
+
+function expandAllGroups() {
+  modelStore.models.forEach(g => expandedGroups.value.add(g.huggingface_id))
+}
+
+// ── Model actions ──────────────────────────────────────────
+async function startModel(modelId) {
+  startingModels.value.add(modelId)
+  try {
+    await modelStore.startModel(modelId)
+    toast.add({ severity: 'success', summary: 'Model started', life: 3000 })
+  } catch (e) {
+    toast.add({ severity: 'error', summary: 'Failed to start', detail: e.message, life: 4000 })
+  } finally {
+    startingModels.value.delete(modelId)
+    startingModels.value = new Set(startingModels.value) // trigger reactivity
+  }
+}
+
+async function stopModel(modelId) {
+  stoppingModels.value.add(modelId)
+  try {
+    await modelStore.stopModel(modelId)
+    toast.add({ severity: 'info', summary: 'Model stopped', life: 3000 })
+  } catch (e) {
+    toast.add({ severity: 'error', summary: 'Failed to stop', detail: e.message, life: 4000 })
+  } finally {
+    stoppingModels.value.delete(modelId)
+    stoppingModels.value = new Set(stoppingModels.value)
+  }
+}
+
+function configureModel(modelId) {
+  router.push(`/models/${encodeURIComponent(modelId)}/config`)
+}
+
+function confirmDeleteModel(modelId) {
+  confirm.require({
+    message: 'Remove this model from the library? (Files in HF cache are NOT deleted.)',
+    header: 'Confirm Remove',
+    icon: 'pi pi-exclamation-triangle',
+    acceptClass: 'p-button-danger',
+    accept: async () => {
+      try {
+        await modelStore.deleteModel(modelId)
+        toast.add({ severity: 'info', summary: 'Model removed', life: 3000 })
+      } catch (e) {
+        toast.add({ severity: 'error', summary: 'Failed', detail: e.message, life: 4000 })
+      }
+    },
+  })
+}
+
+function confirmDeleteGroup(huggingfaceId) {
+  confirm.require({
+    message: `Remove all quantizations for "${huggingfaceId}"?`,
+    header: 'Confirm Remove Group',
+    icon: 'pi pi-exclamation-triangle',
+    acceptClass: 'p-button-danger',
+    accept: async () => {
+      try {
+        await modelStore.deleteModelGroup(huggingfaceId)
+        toast.add({ severity: 'info', summary: 'Group removed', life: 3000 })
+      } catch (e) {
+        toast.add({ severity: 'error', summary: 'Failed', detail: e.message, life: 4000 })
+      }
+    },
+  })
+}
+
+// ── Token management ───────────────────────────────────────
+async function saveToken() {
+  savingToken.value = true
+  try {
+    await modelStore.setHuggingfaceToken(tokenInput.value)
+    tokenInput.value = ''
+    showTokenDialog.value = false
+    toast.add({ severity: 'success', summary: 'Token saved', life: 3000 })
+  } catch (e) {
+    toast.add({ severity: 'error', summary: 'Failed', detail: e.message, life: 4000 })
+  } finally {
+    savingToken.value = false
+  }
+}
+
+async function clearToken() {
+  try {
+    await modelStore.clearHuggingfaceToken()
+    toast.add({ severity: 'info', summary: 'Token cleared', life: 3000 })
+  } catch (e) {
+    toast.add({ severity: 'error', summary: 'Failed', detail: e.message, life: 4000 })
+  }
+}
+
+// ── Formatters ─────────────────────────────────────────────
+// Decimal (1000) so MB/GB match Hugging Face
+function formatBytes(bytes) {
+  if (!bytes) return ''
+  const units = ['B', 'KB', 'MB', 'GB', 'TB']
+  let i = 0; let val = bytes
+  while (val >= 1000 && i < units.length - 1) { val /= 1000; i++ }
+  return `${val.toFixed(1)} ${units[i]}`
+}
+
+function formatDate(iso) {
+  if (!iso) return ''
+  try {
+    return new Intl.RelativeTimeFormat('en', { numeric: 'auto' }).format(
+      Math.round((new Date(iso) - Date.now()) / 86400000), 'day'
+    )
+  } catch {
+    return iso.slice(0, 10)
+  }
+}
+
+// ── Lifecycle ──────────────────────────────────────────────
+onMounted(async () => {
+  await modelStore.fetchModels()
+  await modelStore.fetchHuggingfaceTokenStatus()
+  expandAllGroups()
+  // Poll every 10 seconds for status updates
+  pollTimer = setInterval(() => modelStore.fetchModels(), 10000)
+})
+
+onUnmounted(() => {
+  if (pollTimer) clearInterval(pollTimer)
+})
+</script>
+
+<style scoped>
+.model-library {
+  max-width: 960px;
+  margin: 0 auto;
+  padding: var(--spacing-lg, 1.5rem);
+  display: flex;
+  flex-direction: column;
+  gap: var(--spacing-md, 0.75rem);
+}
+
+/* ── Header ───────────────────────────────────────────── */
+.library-header {
+  display: flex;
+  justify-content: space-between;
+  align-items: center;
+  gap: 0.75rem;
+  flex-wrap: wrap;
+}
+
+.header-left {
+  display: flex;
+  align-items: center;
+  gap: 0.75rem;
+}
+
+.header-left h1 { font-size: 1.5rem; font-weight: 700; margin: 0; }
+
+.header-actions {
+  display: flex;
+  gap: 0.5rem;
+  align-items: center;
+}
+
+/* ── Token warning ────────────────────────────────────── */
+.token-warning {
+  display: flex;
+  align-items: center;
+  gap: 0.5rem;
+  padding: 0.5rem 0.875rem;
+  background: rgba(234, 179, 8, 0.08);
+  border: 1px solid rgba(234, 179, 8, 0.25);
+  border-radius: var(--radius-md, 0.5rem);
+  font-size: 0.875rem;
+  color: #eab308;
+}
+
+/* ── Loading / Empty ──────────────────────────────────── */
+.loading-state,
+.empty-state {
+  display: flex;
+  flex-direction: column;
+  align-items: center;
+  gap: 1rem;
+  padding: 4rem 0;
+  text-align: center;
+  color: var(--text-secondary, #9ca3af);
+}
+
+.empty-state h3 { margin: 0; font-size: 1.1rem; color: var(--text-primary, #f1f5f9); }
+.empty-state p  { margin: 0; font-size: 0.875rem; }
+
+/* ── Groups ───────────────────────────────────────────── */
+.model-groups {
+  display: flex;
+  flex-direction: column;
+  gap: 0.5rem;
+}
+
+.model-group {
+  background: var(--bg-card, #161b2e);
+  border: 1px solid var(--border-primary, #2a2f45);
+  border-radius: var(--radius-lg, 0.75rem);
+  overflow: hidden;
+}
+
+.group-header {
+  display: flex;
+  justify-content: space-between;
+  align-items: center;
+  padding: 0.75rem 1rem;
+  cursor: pointer;
+  user-select: none;
+  background: var(--bg-surface, #1e2235);
+  transition: background 0.15s;
+  gap: 0.5rem;
+}
+
+.group-header:hover { background: var(--bg-card-hover, #232a42); }
+
+.group-title {
+  display: flex;
+  align-items: center;
+  gap: 0.5rem;
+  flex: 1;
+  min-width: 0;
+}
+
+.group-chevron { font-size: 0.75rem; color: var(--text-secondary, #9ca3af); }
+
+.group-name {
+  font-weight: 600;
+  font-size: 0.9rem;
+  white-space: nowrap;
+  overflow: hidden;
+  text-overflow: ellipsis;
+}
+
+.running-badge { flex-shrink: 0; }
+
+.group-meta {
+  display: flex;
+  align-items: center;
+  gap: 0.5rem;
+  flex-shrink: 0;
+}
+
+.group-meta small {
+  font-size: 0.75rem;
+  color: var(--text-secondary, #9ca3af);
+  font-family: monospace;
+  max-width: 200px;
+  overflow: hidden;
+  text-overflow: ellipsis;
+  white-space: nowrap;
+}
+
+/* ── Quantizations ────────────────────────────────────── */
+.group-collapse-enter-active,
+.group-collapse-leave-active { transition: all 0.2s ease; overflow: hidden; }
+.group-collapse-enter-from,
+.group-collapse-leave-to    { max-height: 0; opacity: 0; }
+.group-collapse-enter-to,
+.group-collapse-leave-from  { max-height: 1000px; opacity: 1; }
+
+.quantizations {
+  padding: 0.5rem;
+  display: flex;
+  flex-direction: column;
+  gap: 0.375rem;
+}
+
+.quant-row {
+  display: flex;
+  justify-content: space-between;
+  align-items: center;
+  padding: 0.5rem 0.75rem;
+  background: var(--bg-surface, #1e2235);
+  border: 1px solid var(--border-primary, #2a2f45);
+  border-radius: var(--radius-md, 0.5rem);
+  gap: 0.75rem;
+  transition: border-color 0.15s;
+}
+
+.quant-row.is-active {
+  border-color: rgba(34, 197, 94, 0.4);
+  background: rgba(34, 197, 94, 0.04);
+}
+
+.quant-info { flex: 1; min-width: 0; }
+
+.quant-main {
+  display: flex;
+  align-items: center;
+  gap: 0.4rem;
+  flex-wrap: wrap;
+}
+
+.quant-name {
+  font-weight: 600;
+  font-size: 0.875rem;
+  font-family: monospace;
+}
+
+.quant-sub {
+  display: flex;
+  gap: 0.75rem;
+  margin-top: 0.2rem;
+}
+
+.file-size,
+.downloaded-at {
+  font-size: 0.75rem;
+  color: var(--text-secondary, #9ca3af);
+}
+
+.quant-actions {
+  display: flex;
+  gap: 0.25rem;
+  flex-shrink: 0;
+  align-items: center;
+}
+
+/* ── Token dialog ─────────────────────────────────────── */
+.token-form { display: flex; flex-direction: column; gap: 0.75rem; }
+.token-desc { font-size: 0.875rem; color: var(--text-secondary, #9ca3af); margin: 0; }
+.form-field { display: flex; flex-direction: column; gap: 0.25rem; }
+.form-field label { font-size: 0.875rem; font-weight: 500; }
+
+.token-current {
+  display: flex;
+  align-items: center;
+  gap: 0.5rem;
+  font-size: 0.875rem;
+  background: rgba(34, 197, 94, 0.08);
+  border: 1px solid rgba(34, 197, 94, 0.2);
+  border-radius: var(--radius-md, 0.5rem);
+  padding: 0.5rem 0.75rem;
+}
+</style>
diff --git a/frontend/src/views/ModelSearch.vue b/frontend/src/views/ModelSearch.vue
index e42a4f9..15cde29 100644
--- a/frontend/src/views/ModelSearch.vue
+++ b/frontend/src/views/ModelSearch.vue
@@ -1,1762 +1,633 @@
 <template>
   <div class="model-search">
-    <div class="card">
-      <div class="card-header">
-        <h2 class="card-title">Model Search</h2>
-        <div class="header-actions">
-          <Button 
-            icon="pi pi-refresh" 
-            @click="refreshSearch"
-            :loading="modelStore.searchLoading"
-            severity="secondary"
-            text
-          />
-        </div>
-      </div>
 
-      <!-- HuggingFace Token Section -->
-      <div class="token-section">
-        <Accordion :multiple="false" :activeIndex="tokenAccordionIndex">
-          <AccordionTab>
-            <template #header>
-              <div class="token-header">
-                <i class="pi pi-key"></i>
-                <span>HuggingFace API Token</span>
-                <span v-if="modelStore.hasHuggingfaceToken" class="token-status-indicator">
-                  <i class="pi pi-check-circle"></i>
-                  <span v-if="modelStore.tokenFromEnvironment">Set via Environment</span>
-                  <span v-else>Configured</span>
-                </span>
-              </div>
-            </template>
-            <div v-if="!modelStore.hasHuggingfaceToken" class="token-setup">
-              <p class="token-info">
-                <i class="pi pi-info-circle"></i>
-                Set a HuggingFace API token to search for models. Without a token, model search will be disabled.
-              </p>
-              <div class="token-input">
-                <InputText 
-                  v-model="newToken"
-                  placeholder="Enter your HuggingFace API token"
-                  type="password"
-                  class="token-field"
-                />
-                <Button 
-                  label="Set Token" 
-                  icon="pi pi-key"
-                  @click="setToken"
-                  :loading="settingToken"
-                  :disabled="!newToken.trim()"
-                />
-              </div>
-              <p class="token-help">
-                Get your token from <a href="https://huggingface.co/settings/tokens" target="_blank">HuggingFace Settings</a>
-              </p>
-            </div>
-            <div v-else class="token-status">
-              <p class="token-success">
-                <i class="pi pi-check-circle"></i>
-                HuggingFace token is set ({{ modelStore.huggingfaceToken }})
-                <span v-if="modelStore.tokenFromEnvironment" class="env-badge">
-                  <i class="pi pi-cog"></i>
-                  From Environment Variable
-                </span>
-              </p>
-              <Button 
-                v-if="!modelStore.tokenFromEnvironment"
-                label="Clear Token" 
-                icon="pi pi-trash"
-                severity="danger"
-                outlined
-                @click="clearToken"
-                :loading="settingToken"
-              />
-              <p v-else class="env-info">
-                <i class="pi pi-info-circle"></i>
-                Token is set via HUGGINGFACE_API_KEY environment variable and cannot be modified via UI
-              </p>
-            </div>
-          </AccordionTab>
-        </Accordion>
+    <!-- Search bar -->
+    <div class="search-bar">
+      <div class="search-input-wrap">
+        <i class="pi pi-search search-icon" />
+        <InputText
+          v-model="query"
+          placeholder="Search HuggingFace models…"
+          class="search-input"
+          @keyup.enter="search"
+        />
+        <Button
+          v-if="query"
+          icon="pi pi-times"
+          text
+          severity="secondary"
+          class="clear-btn"
+          @click="query = ''; searchResults = []"
+        />
       </div>
 
-      <!-- Search Section -->
-      <div class="search-section">
-        <div class="search-bar">
-          <InputText 
-            v-model="searchQuery"
-          :placeholder="`Search HuggingFace for ${formatLabel} models...`"
-            @keyup.enter="performSearch"
-            class="search-input"
-          />
-        <Dropdown
-          v-model="selectedFormat"
-          :options="formatOptions"
-          optionLabel="label"
-          optionValue="value"
-          class="format-dropdown"
-        />
-          <Button 
-            label="Search" 
-            icon="pi pi-search"
-            @click="performSearch"
-            :loading="modelStore.searchLoading"
-            :disabled="!searchQuery.trim()"
-          />
-        </div>
+      <Dropdown
+        v-model="searchFormat"
+        :options="formatOptions"
+        optionLabel="label"
+        optionValue="value"
+        class="format-select"
+      />
+
+      <Button
+        label="Search"
+        icon="pi pi-search"
+        severity="success"
+        :loading="searching"
+        :disabled="!query.trim()"
+        @click="search"
+      />
+    </div>
+
+    <!-- Download progress -->
+    <ProgressTracker type="download" :show-completed="true" />
+
+    <!-- Results loading -->
+    <div v-if="searching" class="loading-row">
+      <ProgressSpinner style="width:40px;height:40px" strokeWidth="4" />
+      <span>Searching…</span>
+    </div>
+
+    <!-- Empty search state -->
+    <div v-else-if="!searchResults.length && hasSearched" class="empty-state">
+      <i class="pi pi-search" style="font-size:3rem;color:var(--text-secondary)" />
+      <h3>No results for "{{ lastQuery }}"</h3>
+      <p>Try different keywords or change the format filter.</p>
+    </div>
+
+    <!-- Initial prompt -->
+    <div v-else-if="!searchResults.length && !hasSearched" class="empty-state">
+      <i class="pi pi-search" style="font-size:3rem;color:var(--text-secondary)" />
+      <h3>Search for models</h3>
+      <p>Enter a model name or keyword above to find models on HuggingFace.</p>
+    </div>
+
+    <!-- Results table -->
+    <div v-else class="results-table">
+      <div class="results-header">
+        <span class="results-count">{{ searchResults.length }} result{{ searchResults.length !== 1 ? 's' : '' }}</span>
       </div>
 
-      <!-- Search Results -->
-      <div v-if="Array.isArray(modelStore.searchResults) && modelStore.searchResults.length > 0" class="search-results">
-        <h3>Search Results</h3>
-        <div class="model-grid">
-          <div 
-            v-for="model in modelStore.searchResults" 
-            :key="model.id"
-            class="model-card"
-          >
-            <div class="model-card-header">
-              <div>
-              <div class="model-name-row">
-                <div class="model-name">{{ model.name }}</div>
-                <span class="model-format-badge">{{ (model.model_format || 'gguf').toUpperCase() }}</span>
-              </div>
-                <div v-if="model.author || (typeof model.id === 'string' && model.id.includes('/'))" class="model-author">
-                  by {{ model.author || (typeof model.id === 'string' && model.id.includes('/') ? model.id.split('/')[0] : '') }}
-                </div>
-                <div class="model-meta" v-if="model.parameters || model.architecture || model.language?.length">
-                  <div class="model-meta-item" v-if="model.parameters">
-                    <span>Parameters:</span>
-                    <span>{{ model.parameters }}</span>
-                  </div>
-                  <div class="model-meta-item" v-if="model.architecture">
-                    <span>Architecture:</span>
-                    <span>{{ model.architecture }}</span>
-                  </div>
-                  <div class="model-meta-item" v-if="Array.isArray(model.language) && model.language.length">
-                    <span>Language:</span>
-                    <span>{{ model.language.join(', ') }}</span>
-                  </div>
-                  <div class="model-meta-item" v-if="model.license">
-                    <span>License:</span>
-                    <span>{{ model.license }}</span>
-                  </div>
-                </div>
-                <div class="model-pipeline" v-if="model.pipeline_tag">
-                  <span class="pipeline-badge">{{ formatPipelineLabel(model.pipeline_tag) }}</span>
-                </div>
-              </div>
-              <div class="model-stats">
-                <i class="pi pi-download"></i>
-                <span>{{ formatNumber(model.downloads) }}</span>
-                <i class="pi pi-heart" v-if="model.likes"></i>
-                <span v-if="model.likes">{{ formatNumber(model.likes) }}</span>
-              </div>
-            </div>
-            
-            <div class="model-description" v-if="model.description">
-              {{ truncateText(model.description, 100) }}
-            </div>
-            
-            <div class="model-links" v-if="model.readme_url">
-              <a :href="model.readme_url" target="_blank" class="readme-link">
-                <i class="pi pi-external-link"></i>
-                View README
-              </a>
-            </div>
-            
-            <div v-if="model.model_format === 'gguf'" class="quantizations">
-              <!-- Downloaded Quantizations Section -->
-              <div v-if="getDownloadedQuantizationsForModel(model.id).length > 0" class="downloaded-quantizations">
-                <h4>Downloaded Quantizations:</h4>
-                <div class="downloaded-list">
-                  <div 
-                    v-for="downloaded in getDownloadedQuantizationsForModel(model.id)" 
-                    :key="downloaded.quantization"
-                    class="downloaded-item"
-                  >
-                    <div class="downloaded-info">
-                      <span class="downloaded-name">{{ downloaded.quantization }}</span>
-                      <span class="downloaded-badge">
-                        <i class="pi pi-check"></i>
-                        Downloaded
-                      </span>
-                    </div>
-                    <div class="downloaded-details">
-                      <span class="downloaded-size">{{ formatFileSize(downloaded.file_size) }}</span>
-                      <span class="downloaded-date">{{ formatDate(downloaded.downloaded_at) }}</span>
-                    </div>
-                  </div>
-                </div>
-              </div>
-              
-              <!-- Available Quantizations Section -->
-              <h4>Available Quantizations:</h4>
-              <div class="quantization-selector">
-                <Dropdown 
-                  v-model="selectedQuantization[model.id]"
-                  :options="getQuantizationOptions(model.quantizations, model.id)"
-                  optionLabel="label"
-                  optionValue="value"
-                  :placeholder="loadingQuantizationSizes[model.id] ? 'Loading sizes...' : 'Select quantization'"
-                  class="quantization-dropdown"
-                  :loading="loadingQuantizationSizes[model.id]"
-                  @change="onQuantizationChange(model.id, $event.value)"
-                  @show="onDropdownOpen(model.id)"
-                />
-                <div v-if="loadingQuantizationSizes[model.id]" class="loading-indicator">
-                  <i class="pi pi-spin pi-spinner"></i>
-                  <span>Fetching file sizes...</span>
-                </div>
-              </div>
-              <div v-if="selectedQuantization[model.id]" class="selected-quantization-info">
-                <div class="quant-info">
-                  <div class="quant-name-row">
-                    <span class="quant-name">{{ selectedQuantization[model.id] }}</span>
-                  </div>
-                  <span v-if="getQuantizationSizeWithUnit(model.quantizations, selectedQuantization[model.id])" class="quant-size">{{ getQuantizationSizeWithUnit(model.quantizations, selectedQuantization[model.id]) }}</span>
-                </div>
-                <Button 
-                  :label="isModelDownloaded(model.id, selectedQuantization[model.id], model) ? 'Downloaded' : 'Download'"
-                  :icon="isModelDownloaded(model.id, selectedQuantization[model.id], model) ? 'pi pi-check' : 'pi pi-download'"
-                  @click="downloadSelectedQuantization(model.id)"
-                  :disabled="isModelDownloaded(model.id, selectedQuantization[model.id], model) || (downloadingModels[model.id]?.size > 0) || !selectedQuantization[model.id]"
-                  :loading="downloadingModels[model.id]?.size > 0"
-                  class="download-button"
-                  :severity="isModelDownloaded(model.id, selectedQuantization[model.id]) ? 'success' : 'success'"
-                />
-              </div>
-            </div>
-            <div v-else class="safetensors-section">
-              <div class="safetensors-header">
-                <div>
-                  <h4>Safetensors Files</h4>
-                  <p v-if="Array.isArray(model.safetensors_files) && model.safetensors_files.length">
-                    {{ model.safetensors_files.length }} files
-                  </p>
-                  <p v-else>No safetensors files found for this model.</p>
-                </div>
-                <template v-if="isSafetensorsDownloaded(model)">
-                  <span class="downloaded-badge">
-                    <i class="pi pi-check"></i>
-                    Downloaded
-                  </span>
-                </template>
-                <template v-else>
-                  <Button 
-                    label="Download"
-                    icon="pi pi-download"
-                    severity="success"
-                    :disabled="!Array.isArray(model.safetensors_files) || model.safetensors_files.length === 0 || (downloadingModels[model.id]?.size > 0) || isSafetensorsDownloaded(model)"
-                    :loading="downloadingModels[model.id]?.size > 0"
-                    @click="downloadSafetensorsBundle(model)"
-                  />
-                </template>
-              </div>
-              <div 
-                v-if="getDownloadedSafetensorsForModel(model.id).length > 0" 
-                class="safetensors-files-box"
-              >
-                <h4>Downloaded Safetensors ({{ getDownloadedSafetensorsForModel(model.id).length }})</h4>
-                <div class="safetensors-files-list">
-                  <div 
-                    v-for="file in getDownloadedSafetensorsForModel(model.id)" 
-                    :key="file.filename"
-                    class="safetensors-file-name"
-                  >
-                    {{ file.filename }}
-                  </div>
-                </div>
-              </div>
-              <Accordion 
-                :multiple="false" 
-                :activeIndex="safetensorsAccordionIndex[model.id] ?? null"
-                @tab-open="onSafetensorsAccordionOpen(model.id)"
-                @tab-close="onSafetensorsAccordionClose(model.id)"
-                class="safetensors-accordion"
+      <div
+        v-for="result in searchResults"
+        :key="result.modelId || result.id"
+        class="result-row"
+      >
+        <!-- Main row -->
+        <div class="result-main" @click="toggleExpand(result.modelId || result.id)">
+          <div class="result-expand-icon">
+            <i :class="['pi', expanded.has(result.modelId || result.id) ? 'pi-chevron-down' : 'pi-chevron-right']" />
+          </div>
+
+          <div class="result-info">
+            <div class="result-name">
+              <a
+                :href="`https://huggingface.co/${result.modelId || result.id}`"
+                target="_blank"
+                class="model-link"
+                @click.stop
               >
-                <AccordionTab header="Safetensors metadata">
-                  <div v-if="modelStore.safetensorsMetadataLoading[model.id]" class="loading-indicator">
-                    <i class="pi pi-spin pi-spinner"></i>
-                    <span>Loading tensor metadata...</span>
-                  </div>
-                  <div v-else-if="modelStore.safetensorsMetadata[model.id]" class="safetensors-metadata">
-                    <div v-if="modelStore.safetensorsMetadata[model.id].error" class="metadata-error">
-                      <i class="pi pi-exclamation-triangle"></i>
-                      <span>{{ modelStore.safetensorsMetadata[model.id].error }}</span>
-                    </div>
-                    <template v-else>
-                      <div v-if="modelStore.safetensorsMetadata[model.id].total_files === 0" class="metadata-empty">
-                        No safetensors files found in this repository
-                      </div>
-                      <template v-else>
-                        <div class="dtype-summary">
-                          <h5>Data types</h5>
-                          <div 
-                            v-for="(count, dtype) in modelStore.safetensorsMetadata[model.id].dtype_totals" 
-                            :key="dtype"
-                            class="dtype-row"
-                          >
-                            <span class="dtype-name">{{ dtype }}</span>
-                            <span class="dtype-count">{{ formatNumber(count) }}</span>
-                          </div>
-                        </div>
-                        <div class="metadata-files">
-                          <h5>Files</h5>
-                          <div 
-                            v-for="fileMeta in modelStore.safetensorsMetadata[model.id].files" 
-                            :key="fileMeta.filename"
-                            class="metadata-file-row"
-                          >
-                            <div class="metadata-file-name">{{ fileMeta.filename }} ({{ fileMeta.tensor_count }} tensors)</div>
-                            <div class="metadata-dtypes">
-                              <span 
-                                v-for="(count, dtype) in fileMeta.dtype_counts" 
-                                :key="dtype"
-                                class="dtype-chip"
-                              >
-                                {{ dtype }}: {{ formatNumber(count) }}
-                              </span>
-                            </div>
-                          </div>
-                        </div>
-                      </template>
-                    </template>
-                  </div>
-                  <div v-else class="metadata-empty">
-                    Expand to load tensor metadata
-                  </div>
-                </AccordionTab>
-              </Accordion>
+                {{ result.modelId || result.id }}
+              </a>
+              <Tag v-if="result.pipeline_tag" :value="result.pipeline_tag" severity="secondary" class="pipeline-tag" />
             </div>
-            
-            <!-- Download Progress - Multiple concurrent downloads -->
-            <div v-if="getModelDownloadProgress(model.id).length > 0" class="downloads-container">
-              <div 
-                v-for="progressData in getModelDownloadProgress(model.id)" 
-                :key="progressData.taskId"
-                class="download-progress"
-              >
-                <div class="progress-header">
-                  <span class="progress-filename">
-                    <template v-if="progressData.format === 'safetensors-bundle'">
-                      Safetensors Bundle — {{ progressData.current_filename || progressData.filename }}
-                    </template>
-                    <template v-else>
-                      {{ progressData.quantization }} - {{ progressData.filename }}
-                    </template>
-                  </span>
-                  <span class="progress-percentage">{{ progressData.progress }}%</span>
-                </div>
-                <div class="progress-bar-container">
-                  <div class="progress-bar" :style="{ width: progressData.progress + '%' }"></div>
-                </div>
-                <div class="progress-details">
-                  <div class="progress-row-1">
-                    <span class="progress-size">
-                      {{ formatBytes(progressData.bytes_downloaded) }} / {{ formatBytes(progressData.total_bytes) }}
-                    </span>
-                    <span v-if="progressData.speed_mbps > 0" class="progress-speed">
-                      {{ (progressData.speed_mbps || 0).toFixed(1) }} MB/s
-                    </span>
-                  </div>
-                  <div v-if="progressData.format === 'safetensors-bundle' || progressData.format === 'gguf-bundle'" class="progress-bundle-row">
-                    <span>File {{ progressData.files_completed }} / {{ progressData.files_total }}</span>
-                  </div>
-                  <div v-if="progressData.eta_seconds > 0" class="progress-eta-row">
-                    <span class="progress-eta">
-                      {{ formatTime(progressData.eta_seconds) }} remaining
-                    </span>
-                  </div>
-                </div>
-              </div>
+            <div class="result-meta">
+              <span v-if="result.author" class="meta-item">
+                <i class="pi pi-user" /> {{ result.author }}
+              </span>
+              <span v-if="result.downloads != null" class="meta-item">
+                <i class="pi pi-download" /> {{ formatNumber(result.downloads) }}
+              </span>
+              <span v-if="result.likes != null" class="meta-item">
+                <i class="pi pi-heart" /> {{ formatNumber(result.likes) }}
+              </span>
+              <span v-if="result.license" class="meta-item license">
+                {{ result.license }}
+              </span>
             </div>
           </div>
+
+          <div class="result-tags">
+            <Tag
+              v-for="tag in (result.tags || []).filter(t => interestingTag(t)).slice(0, 3)"
+              :key="tag"
+              :value="tag"
+              severity="info"
+              class="model-tag"
+            />
+          </div>
         </div>
-      </div>
 
-      <!-- Empty State -->
-      <div v-else-if="!modelStore.searchLoading && searchQuery" class="empty-state">
-        <i class="pi pi-search"></i>
-        <h3>No models found</h3>
-        <p>Try adjusting your search terms.</p>
-      </div>
+        <!-- Expanded: quantization files -->
+        <Transition name="row-expand">
+          <div v-if="expanded.has(result.modelId || result.id)" class="result-files">
+            <div v-if="loadingFiles.has(result.modelId || result.id)" class="files-loading">
+              <ProgressSpinner style="width:24px;height:24px" strokeWidth="4" />
+              <span>Loading files…</span>
+            </div>
+
+            <div v-else-if="getFiles(result.modelId || result.id).length === 0" class="files-empty">
+              <span>No downloadable files found for the selected format.</span>
+            </div>
 
-      <!-- Initial State -->
-      <div v-else class="empty-state">
-        <i class="pi pi-search"></i>
-        <h3>Search for Models</h3>
-        <p>Enter a search term above to find {{ formatLabel }} models on HuggingFace.</p>
+            <table v-else class="files-table">
+              <thead>
+                <tr>
+                  <th>File</th>
+                  <th>Size</th>
+                  <th>Status</th>
+                  <th></th>
+                </tr>
+              </thead>
+              <tbody>
+                <tr v-for="file in getFiles(result.modelId || result.id)" :key="file.filename">
+                  <td class="file-name">
+                    <code>{{ file.filename }}</code>
+                    <Tag v-if="file.quantization" :value="file.quantization" severity="info" />
+                  </td>
+                  <td class="file-size">{{ formatBytes(file.size) }}</td>
+                  <td class="file-status">
+                    <Tag v-if="file.downloaded" value="Downloaded" severity="success" />
+                    <span v-else class="not-downloaded">—</span>
+                  </td>
+                  <td class="file-action">
+                    <Button
+                      v-if="file.downloaded"
+                      label="Configure"
+                      icon="pi pi-cog"
+                      size="small"
+                      severity="secondary"
+                      text
+                      @click="configureDownloaded(result.modelId || result.id, file)"
+                    />
+                    <Button
+                      v-else
+                      label="Download"
+                      icon="pi pi-download"
+                      size="small"
+                      severity="success"
+                      outlined
+                      :loading="downloadingFiles.has(`${result.modelId || result.id}:${file.filename}`)"
+                      @click="downloadFile(result, file)"
+                    />
+                  </td>
+                </tr>
+              </tbody>
+            </table>
+          </div>
+        </Transition>
       </div>
     </div>
   </div>
 </template>
 
 <script setup>
-import { ref, onMounted, watch, onUnmounted, computed } from 'vue'
-import { useModelStore } from '@/stores/models'
-import { useWebSocketStore } from '@/stores/websocket'
-import { toast } from 'vue3-toastify'
-import { useConfirm } from 'primevue/useconfirm'
+import { ref, onMounted } from 'vue'
+import { useRouter } from 'vue-router'
+import { useToast } from 'primevue/usetoast'
 import Button from 'primevue/button'
+import Tag from 'primevue/tag'
 import InputText from 'primevue/inputtext'
 import Dropdown from 'primevue/dropdown'
-import Accordion from 'primevue/accordion'
-import AccordionTab from 'primevue/accordiontab'
-import { formatFileSize, formatDate } from '@/utils/formatting'
+import ProgressSpinner from 'primevue/progressspinner'
+import ProgressTracker from '@/components/common/ProgressTracker.vue'
+import { useModelStore } from '@/stores/models'
+import axios from 'axios'
 
+const router = useRouter()
+const toast = useToast()
 const modelStore = useModelStore()
-const wsStore = useWebSocketStore()
-const confirm = useConfirm()
-
-// Reactive state
-const searchQuery = ref('')
-const newToken = ref('')
-const settingToken = ref(false)
-const tokenAccordionIndex = ref(-1)
+
+// ── State ──────────────────────────────────────────────────
+const query = ref('')
+const lastQuery = ref('')
+const searchFormat = ref('gguf')
+const searching = ref(false)
+const hasSearched = ref(false)
+const searchResults = ref([])
+const expanded = ref(new Set())
+const loadingFiles = ref(new Set())
+const downloadingFiles = ref(new Set())
+const filesCache = ref({})   // modelId -> files[]
+
 const formatOptions = [
   { label: 'GGUF', value: 'gguf' },
-  { label: 'Safetensors', value: 'safetensors' }
+  { label: 'Safetensors', value: 'safetensors' },
 ]
-const selectedFormat = ref('gguf')
-const formatLabel = computed(() => selectedFormat.value === 'safetensors' ? 'Safetensors' : 'GGUF')
-const selectedQuantization = ref({})
-const downloadingModels = ref({}) // {[modelId]: Set of task_ids}
-const downloadProgress = ref({}) // {[task_id]: {modelId, quantization, progress, ...}}
-const loadingQuantizationSizes = ref({})
-const activeDownloadPolling = ref(null) // Polling interval ID
-const safetensorsAccordionIndex = ref({})
-
-const findModelByFilename = (filename) => {
-  if (!Array.isArray(modelStore.searchResults)) return null
-  return modelStore.searchResults.find(m => {
-    const quantizations = Object.values(m.quantizations || {})
-    if (quantizations.some(q => Array.isArray(q.files) && q.files.some(f => f.filename === filename))) {
-      return true
-    }
-    return Array.isArray(m.safetensors_files) && m.safetensors_files.some(file => file.filename === filename)
-  }) || null
-}
 
-onMounted(async () => {
-  await modelStore.fetchModels()
-  await modelStore.fetchSafetensorsModels()
-  await modelStore.fetchHuggingfaceTokenStatus()
-  selectedFormat.value = modelStore.searchFormat || 'gguf'
-  
-  // Subscribe to download progress updates
-  wsStore.subscribeToDownloadProgress((data) => {
-    const taskId = data.task_id
-    if (!taskId) return
-    
-    // First, try to find model by stored taskId in downloadingModels
-    let modelId = null
-    for (const [mid, taskSet] of Object.entries(downloadingModels.value)) {
-      if (taskSet.has(taskId)) {
-        modelId = mid
-        break
-      }
-    }
-    
-    // If not found by taskId, try to find by huggingface_id (most reliable)
-    let model = null
-    if (!modelId && data.huggingface_id) {
-      model = Array.isArray(modelStore.searchResults) 
-        ? modelStore.searchResults.find(m => m.id === data.huggingface_id)
-        : null
-      if (model) {
-        modelId = model.id
-      }
-    }
-    
-    // Fallback to filename matching if huggingface_id not available
-    if (!model && !modelId) {
-      model = findModelByFilename(data.filename)
-      if (model) {
-        modelId = model.id
-      }
-    }
-    
-    // If we have modelId but not model object, find it
-    if (modelId && !model) {
-      model = Array.isArray(modelStore.searchResults)
-        ? modelStore.searchResults.find(m => m.id === modelId)
-        : null
-    }
-    
-    const formatFromMessage = data.model_format || model?.model_format || 'gguf'
-    
-    if (modelId) {
-      // Ensure taskId is tracked for this model
-      if (!downloadingModels.value[modelId]) {
-        downloadingModels.value[modelId] = new Set()
-      }
-      downloadingModels.value[modelId].add(taskId)
-      
-      let quantization = data.filename
-      if (formatFromMessage === 'gguf' || formatFromMessage === 'gguf-bundle') {
-        const quantMatch = data.filename.match(/Q\d+[K_]?[A-Z]*|IQ\d+_[A-Z]+/)
-        quantization = quantMatch ? quantMatch[0] : 'unknown'
-      }
-      
-      const isBundle = formatFromMessage === 'safetensors-bundle' || formatFromMessage === 'gguf-bundle'
-      const currentFilename = data.current_filename || data.filename
-      downloadProgress.value[taskId] = {
-        modelId: modelId,
-        quantization: isBundle ? (formatFromMessage === 'gguf-bundle' ? 'GGUF Bundle' : 'Safetensors Bundle') : quantization,
-        progress: data.progress,
-        message: data.message,
-        bytes_downloaded: data.bytes_downloaded,
-        total_bytes: data.total_bytes,
-        speed_mbps: data.speed_mbps,
-        eta_seconds: data.eta_seconds,
-        filename: currentFilename,
-        current_filename: currentFilename,
-        format: formatFromMessage,
-        files_total: data.files_total || (isBundle ? (formatFromMessage === 'safetensors-bundle' ? model?.safetensors_files?.length || 1 : 1) : 1),
-        files_completed: data.files_completed || (isBundle ? 0 : 0)
-      }
-      
-      // Remove progress when download completes
-      if (data.progress >= 100) {
-        setTimeout(() => {
-          delete downloadProgress.value[taskId]
-          // Remove task_id from downloading models
-          if (downloadingModels.value[modelId]) {
-            downloadingModels.value[modelId].delete(taskId)
-            if (downloadingModels.value[modelId].size === 0) {
-              delete downloadingModels.value[modelId]
-            }
-          }
-        }, 3000)
-      }
-    }
-  })
-
-// Subscribe to download complete events
-wsStore.subscribeToDownloadComplete(async (data) => {
-  console.log('Download complete event received:', data)
-  
-  // Refresh models list to update downloaded status
-  await modelStore.fetchModels()
-  const format = (data.model_format || '').toLowerCase()
-  if (format === 'safetensors' || format === 'safetensors_bundle' || format === 'safetensors-bundle') {
-    await modelStore.fetchSafetensorsModels()
-  }
-  
-  // Force reactivity update on search results to refresh dropdown states
-  if (Array.isArray(modelStore.searchResults)) {
-    modelStore.searchResults = [...modelStore.searchResults]
-  }
-  
-  // Show success notification
-  toast.success(`Download completed: ${data.filename}`)
-})
-
-  // Watch for active downloads and start/stop polling
-  watch(() => Object.keys(downloadProgress.value).length, (activeCount) => {
-    if (activeCount > 0 && !activeDownloadPolling.value) {
-      // Start polling every 10 seconds while downloads are active
-      activeDownloadPolling.value = setInterval(async () => {
-        await modelStore.fetchModels()
-      }, 10000)
-    } else if (activeCount === 0 && activeDownloadPolling.value) {
-      // Stop polling when no active downloads
-      clearInterval(activeDownloadPolling.value)
-      activeDownloadPolling.value = null
-    }
-  })
-
-  // Cleanup on unmount
-  onUnmounted(() => {
-    if (activeDownloadPolling.value) {
-      clearInterval(activeDownloadPolling.value)
-    }
-  })
-})
-
-watch(() => modelStore.searchFormat, (format) => {
-  if (format && format !== selectedFormat.value) {
-    selectedFormat.value = format
-  }
-})
-
-watch(selectedFormat, async (newFormat, oldFormat) => {
-  if (!searchQuery.value.trim()) return
-  if (newFormat === oldFormat) return
+// ── Search ─────────────────────────────────────────────────
+async function search() {
+  if (!query.value.trim()) return
+  searching.value = true
+  hasSearched.value = true
+  lastQuery.value = query.value
+  expanded.value = new Set()
+  filesCache.value = {}
   try {
-    await modelStore.searchModels(searchQuery.value, 20, newFormat)
-  } catch (error) {
-    toast.error(`Failed to search for ${formatLabel.value} models`)
-  }
-})
-
-const performSearch = async () => {
-  if (!searchQuery.value.trim()) return
-  
-  try {
-    await modelStore.searchModels(searchQuery.value, 20, selectedFormat.value)
-    await modelStore.fetchSafetensorsModels()
-  } catch (error) {
-    toast.error('Failed to search for models')
+    searchResults.value = await modelStore.searchModels(query.value.trim(), 20, searchFormat.value)
+  } catch (e) {
+    toast.add({ severity: 'error', summary: 'Search failed', detail: e.message, life: 4000 })
+    searchResults.value = []
+  } finally {
+    searching.value = false
   }
 }
 
-const refreshSearch = () => {
-  if (searchQuery.value.trim()) {
-    performSearch()
+// ── Expand row & load files ────────────────────────────────
+async function toggleExpand(modelId) {
+  if (expanded.value.has(modelId)) {
+    expanded.value.delete(modelId)
+    expanded.value = new Set(expanded.value)
+    return
   }
-}
-
-const setToken = async () => {
-  if (!newToken.value.trim()) return
-  
-  settingToken.value = true
-  try {
-    await modelStore.setHuggingfaceToken(newToken.value)
-    newToken.value = ''
-    tokenAccordionIndex.value = -1
-    toast.success('HuggingFace API token has been configured')
-  } catch (error) {
-    toast.error('Failed to set HuggingFace token')
-  } finally {
-    settingToken.value = false
+  expanded.value.add(modelId)
+  expanded.value = new Set(expanded.value)
+  if (!filesCache.value[modelId]) {
+    await loadFiles(modelId)
   }
 }
 
-const clearToken = async () => {
-  settingToken.value = true
+async function loadFiles(modelId) {
+  loadingFiles.value.add(modelId)
+  loadingFiles.value = new Set(loadingFiles.value)
   try {
-    await modelStore.clearHuggingfaceToken()
-    toast.success('HuggingFace API token has been removed')
-  } catch (error) {
-    toast.error('Failed to clear HuggingFace token')
-  } finally {
-    settingToken.value = false
-  }
-}
-
-const getDownloadedQuantizations = (huggingfaceId) => {
-  return modelStore.downloadedModels
-    .filter(model => model.huggingface_id === huggingfaceId)
-    .map(model => model.quantization)
-}
+    const result = searchResults.value.find(r => (r.modelId || r.id) === modelId)
+    if (!result) return
+
+    let files = []
+    if (searchFormat.value === 'gguf') {
+      // Backend returns quantizations as a dict: { "Q4_K_M": { quantization, files: [{filename, size}], total_size, size_mb } }
+      const quantEntries = Object.values(result.quantizations || {})
+      // Flatten to individual files, keeping quant label
+      const allFiles = quantEntries.flatMap(entry =>
+        (entry.files || []).map(f => ({
+          filename: f.filename,
+          size: f.size || entry.total_size || 0,
+          quantization: entry.quantization || '',
+          variantPrefix: entry.variant_prefix || '',
+        }))
+      )
 
-const getDownloadedQuantizationsForModel = (huggingfaceId) => {
-  const grouped = {}
-  modelStore.downloadedModels
-    .filter(model => model.huggingface_id === huggingfaceId)
-    .forEach(model => {
-      const key = model.quantization || model.name
-      const existing = grouped[key]
-      if (!existing) {
-        grouped[key] = {
-          quantization: model.quantization,
-          name: model.name,
-          file_size: model.file_size || 0,
-          downloaded_at: model.downloaded_at
-        }
-      } else {
-        // Aggregate size across shards/duplicates for the same quant
-        const size = model.file_size || 0
-        grouped[key].file_size += size
-        // Keep the most recent download timestamp
-        if (model.downloaded_at && (!existing.downloaded_at || model.downloaded_at > existing.downloaded_at)) {
-          grouped[key].downloaded_at = model.downloaded_at
+      if (allFiles.length) {
+        // Try to get accurate sizes from the API
+        try {
+          const filenames = allFiles.map(f => f.filename).join(',')
+          const { data } = await axios.get(`/api/models/search/${encodeURIComponent(modelId)}/file-sizes`, {
+            params: { filenames },
+          })
+          const sizes = data.sizes || {}
+          files = allFiles.map(f => {
+            const downloaded = isDownloaded(modelId, f.filename)
+            return {
+              ...f,
+              size: sizes[f.filename] ?? f.size,
+              downloaded,
+              modelId: downloaded?.id,
+            }
+          })
+        } catch {
+          files = allFiles.map(f => {
+            const downloaded = isDownloaded(modelId, f.filename)
+            return { ...f, downloaded, modelId: downloaded?.id }
+          })
         }
       }
-    })
-  return Object.values(grouped)
-}
-
-const getDownloadedSafetensorsForModel = (huggingfaceId) => {
-  if (!huggingfaceId) return []
-  const group = (modelStore.safetensorsModels || []).find(entry => entry.huggingface_id === huggingfaceId)
-  return group?.files || []
-}
-
-// Shared helper to transform raw quantization metadata into dropdown options.
-// This keeps search results and any other quantization pickers consistent.
-const buildQuantizationOptions = (quantizations, downloadedQuantizations = []) => {
-  if (!quantizations || typeof quantizations !== 'object') return []
-
-  const options = Object.entries(quantizations).map(([name, data]) => {
-    let sizeText = ''
-    let statusText = ''
-    const variantPrefix = data.variant_prefix || ''
-    let displayBase = name
-    // If the key already includes the variant prefix (e.g. "i1-Q4_K_M"),
-    // don't prepend it again. Otherwise, build a prefixed display name.
-    if (variantPrefix && !name.startsWith(`${variantPrefix}-`)) {
-      displayBase = `${variantPrefix}-${name}`
-    }
-
-    // Prefer aggregated total_size/size_mb (may represent multiple shards)
-    const sizeMB = data.size_mb || (data.total_size ? data.total_size / (1024 * 1024) : 0)
-    if (sizeMB && sizeMB > 0) {
-      if (sizeMB >= 1024) {
-        sizeText = ` (${Math.round((sizeMB / 1024) * 100) / 100} GB)`
-      } else {
-        sizeText = ` (${Math.round(sizeMB * 100) / 100} MB)`
-      }
-    }
-
-    if (downloadedQuantizations.includes(name)) {
-      statusText = ' ✓ Downloaded'
-    }
-
-    return {
-      label: `${displayBase}${sizeText}${statusText}`,
-      value: name,
-      disabled: downloadedQuantizations.includes(name),
-      sizeMB: sizeMB || 0
-    }
-  })
-
-  // Sort by file size (increasing/smallest first)
-  return options.sort((a, b) => a.sizeMB - b.sizeMB)
-}
-
-const getQuantizationOptions = (quantizations, huggingfaceId) => {
-  const downloadedQuantizations = getDownloadedQuantizations(huggingfaceId)
-  return buildQuantizationOptions(quantizations, downloadedQuantizations)
-}
-
-const getQuantizationSizeWithUnit = (quantizations, quantizationName) => {
-  if (!quantizations || typeof quantizations !== 'object' || !quantizationName) return ''
-  
-  // Get size from object structure
-  const quant = quantizations[quantizationName]
-  if (!quant) return ''
-  
-  // Prefer aggregated total_size/size_mb which may represent multiple shards
-  let sizeMB = quant.size_mb
-  if (!sizeMB && quant.total_size && quant.total_size > 0) {
-    sizeMB = quant.total_size / (1024 * 1024)
-  }
-
-  if (sizeMB && sizeMB > 0) {
-    if (sizeMB >= 1024) {
-      // Convert to GB for large files
-      return `${Math.round((sizeMB / 1024) * 100) / 100} GB`
     } else {
-      return `${Math.round(sizeMB * 100) / 100} MB`
-    }
-  } else if (quant.size && quant.size > 0) {
-    // Convert bytes to appropriate unit
-    const sizeMB = quant.size / (1024 * 1024)
-    if (sizeMB >= 1024) {
-      return `${Math.round((sizeMB / 1024) * 100) / 100} GB`
-    } else {
-      return `${Math.round(sizeMB * 100) / 100} MB`
+      // Safetensors: backend returns safetensors_files: [{ filename }]
+      const stFiles = result.safetensors_files || []
+      files = stFiles.map(f => ({
+        filename: f.filename,
+        size: f.size || 0,
+        downloaded: false,
+      }))
     }
-  }
-  
-  // Fallback text if no size data available
-  return 'Unknown size'
-}
 
-const onDropdownOpen = async (modelId) => {
-  // Fetch actual file sizes from HuggingFace API when dropdown opens
-  const model = Array.isArray(modelStore.searchResults) ? 
-    modelStore.searchResults.find(m => m.id === modelId) : null
-  
-  if (model && model.model_format === 'gguf' && model.quantizations) {
-    // Check if we already have size data from API to avoid unnecessary API calls
-    const hasApiData = Object.values(model.quantizations).some(q => q.size_mb)
-    if (hasApiData) {
-      console.log(`📊 API sizes already available for ${model.id}, skipping API call`)
-      return
-    }
-    
-    try {
-      loadingQuantizationSizes.value[modelId] = true
-      console.log(`🔍 Fetching actual sizes for ${model.id} when dropdown opened...`)
-      console.log(`📊 Current quantizations:`, model.quantizations)
-      
-      const actualQuantizations = await modelStore.getQuantizationSizes(model.id, model.quantizations)
-      
-      console.log(`📊 API Response:`, actualQuantizations)
-      
-      // Update the model's quantizations with actual sizes while preserving
-      // existing metadata such as `variant_prefix` (e.g. the "i1-" prefix).
-      // We merge per-quantization instead of replacing the whole object so
-      // labels like "i1-Q4_K_M" don't lose their prefix after the API call.
-      Object.entries(actualQuantizations || {}).forEach(([quantName, apiData]) => {
-        const existing = model.quantizations[quantName] || {}
-        model.quantizations[quantName] = {
-          ...existing,
-          ...apiData
-        }
-      })
-      console.log(`✅ Updated quantizations with API sizes (preserving metadata):`, actualQuantizations)
-      
-      // Force reactivity update
-      modelStore.searchResults = [...modelStore.searchResults]
-    } catch (error) {
-      console.error('❌ Failed to fetch actual quantization sizes:', error)
-      // Continue with original sizes if API call fails
-    } finally {
-      loadingQuantizationSizes.value[modelId] = false
-    }
+    filesCache.value[modelId] = files
+  } catch (e) {
+    filesCache.value[modelId] = []
+    console.error('Failed to load files:', e)
+  } finally {
+    loadingFiles.value.delete(modelId)
+    loadingFiles.value = new Set(loadingFiles.value)
   }
 }
 
-const onQuantizationChange = async (modelId, quantization) => {
-  selectedQuantization.value[modelId] = quantization
+function getFiles(modelId) {
+  return filesCache.value[modelId] || []
 }
 
-const downloadSelectedQuantization = async (modelId) => {
-  const model = Array.isArray(modelStore.searchResults) ? 
-    modelStore.searchResults.find(m => m.id === modelId) : null
-  const quantization = selectedQuantization.value[modelId]
-  
-  if (!model || model.model_format !== 'gguf' || !quantization) return
-  
-  const quantizationData = model.quantizations?.[quantization]
-  if (!quantizationData) {
-    toast.error('Quantization data not found')
-    return
-  }
-  
+// ── Download ───────────────────────────────────────────────
+async function downloadFile(result, file) {
+  const modelId = result.modelId || result.id
+  const key = `${modelId}:${file.filename}`
+  downloadingFiles.value.add(key)
+  downloadingFiles.value = new Set(downloadingFiles.value)
   try {
-    // Initialize Set for this model if doesn't exist
-    if (!downloadingModels.value[modelId]) {
-      downloadingModels.value[modelId] = new Set()
-    }
-    
-    // If we have a bundle of files for this quantization, use GGUF bundle endpoint
-    if (Array.isArray(quantizationData.files) && quantizationData.files.length > 0) {
-      const filesPayload = quantizationData.files.map((file) => ({
-        filename: file.filename,
-        size: file.size || 0
-      }))
-
-      const response = await modelStore.downloadGgufBundle(
-        model.id,
-        quantization,
-        filesPayload,
-        model.pipeline_tag || null
-      )
-
-      const taskId = response.task_id
-      if (taskId) {
-        downloadingModels.value[modelId].add(taskId)
-      }
-      toast.success(`Downloading ${model.name} (${quantization})`)
-      return
-    }
-
-    // Fallback: legacy single-file behavior
-    let totalBytes = 0
-    const sizeMB = quantizationData.size_mb || (quantizationData.total_size ? quantizationData.total_size / (1024 * 1024) : 0)
-    if (sizeMB && sizeMB > 0) {
-      totalBytes = Math.round(sizeMB * 1024 * 1024)
-    } else if (quantizationData.size && quantizationData.size > 0) {
-      if (quantizationData.size > 1000000) {
-        totalBytes = quantizationData.size
-      } else {
-        totalBytes = Math.round(quantizationData.size * 1024 * 1024)
-      }
-    }
-    
-    console.log(`Downloading ${quantizationData.filename}: ${totalBytes} bytes`)
-    
-    const response = await modelStore.downloadModel(
-      model.id,
-      quantizationData.filename,
-      totalBytes,
-      model.model_format || 'gguf',
-      model.pipeline_tag || null
+    await modelStore.downloadModel(
+      modelId,
+      file.filename,
+      file.size || 0,
+      searchFormat.value,
+      result.pipeline_tag || null
     )
-    
-    // Store the task_id for tracking
-    const taskId = response.task_id
-    if (taskId) {
-      downloadingModels.value[modelId].add(taskId)
-    }
-    
-    toast.success(`Downloading ${model.name} (${quantization})`)
-  } catch (error) {
-    // Handle 409 Conflict - already downloading
-    if (error.response?.status === 409) {
-      toast.warning('This quantization is already being downloaded')
-    } else {
-      toast.error('Failed to start model download')
-    }
-    console.error('Download error:', error)
+    toast.add({ severity: 'success', summary: 'Download started', detail: 'Track progress above', life: 3000 })
+    // Refresh files to update downloaded status
+    delete filesCache.value[modelId]
+    await loadFiles(modelId)
+    await modelStore.fetchModels()
+  } catch (e) {
+    toast.add({ severity: 'error', summary: 'Download failed', detail: e.message, life: 4000 })
+  } finally {
+    downloadingFiles.value.delete(key)
+    downloadingFiles.value = new Set(downloadingFiles.value)
   }
 }
 
-const downloadSafetensorsBundle = async (model) => {
-  if (!model) {
-    toast.warning('Model details unavailable')
-    return
-  }
-
-  const repoFiles = Array.isArray(model.repo_files) && model.repo_files.length > 0
-    ? model.repo_files
-    : model.safetensors_files
-
-  if (!Array.isArray(repoFiles) || repoFiles.length === 0) {
-    toast.warning('No files available to download')
-    return
-  }
-
-  const filesPayload = repoFiles.map((file) => ({
-    filename: file.filename,
-    size: file.size || 0
-  }))
-
-  try {
-    if (!downloadingModels.value[model.id]) {
-      downloadingModels.value[model.id] = new Set()
-    }
-    const response = await modelStore.downloadSafetensorsBundle(model.id, filesPayload)
-    const taskId = response.task_id
-    if (taskId) {
-      downloadingModels.value[model.id].add(taskId)
-    }
-    toast.success('Downloading safetensors bundle')
-  } catch (error) {
-    if (error.response?.status === 409) {
-      toast.warning('Safetensors bundle already downloading')
-    } else {
-      toast.error('Failed to start safetensors bundle download')
-    }
-    console.error('Safetensors bundle download error:', error)
-  }
-}
 
-const ensureSafetensorsMetadata = async (modelId) => {
-  if (!modelId) return
-  if (modelStore.safetensorsMetadata?.[modelId]) return
-  try {
-    await modelStore.fetchSafetensorsMetadata(modelId)
-  } catch (error) {
-    console.error('Failed to load safetensors metadata:', error)
-    toast.error('Unable to load safetensors metadata')
+function configureDownloaded(modelId, file) {
+  const model = file.modelId
+    ? modelStore.allQuantizations.find(m => m.id === file.modelId)
+    : findDownloadedModel(modelId, file.filename)
+  if (model) {
+    router.push(`/models/${encodeURIComponent(model.id)}/config`)
   }
 }
 
-const onSafetensorsAccordionOpen = async (modelId) => {
-  safetensorsAccordionIndex.value[modelId] = 0
-  await ensureSafetensorsMetadata(modelId)
-}
-
-const onSafetensorsAccordionClose = (modelId) => {
-  safetensorsAccordionIndex.value[modelId] = null
-}
-
-const isModelDownloaded = (huggingfaceId, quantization, model = null) => {
-  if (!huggingfaceId || !quantization) return false
-
-  // All downloaded entries for this repo/quantization
-  const downloaded = modelStore.downloadedModels.filter(m =>
-    m.huggingface_id === huggingfaceId &&
-    m.quantization === quantization
+// ── Helpers ────────────────────────────────────────────────
+function isDownloaded(hfId, filename) {
+  return modelStore.allQuantizations.find(
+    m => m.huggingface_id === hfId &&
+    (m.filename === filename || (m.quantization && filename.includes(m.quantization)))
   )
-  if (downloaded.length === 0) return false
-
-  // If we don't know the expected files (single-file quant), "any" is enough
-  const quantMeta = model?.quantizations?.[quantization]
-  const expectedFiles = Array.isArray(quantMeta?.files)
-    ? quantMeta.files.map(f => f.filename).filter(Boolean)
-    : []
-  if (!expectedFiles.length) {
-    return true
-  }
-
-  // For bundles: only mark as downloaded if *all* expected shard filenames exist locally
-  const downloadedFilenames = downloaded.map(m => {
-    const path = (m.file_path || '').replace(/\\/g, '/')
-    const parts = path.split('/')
-    return parts[parts.length - 1]
-  })
-
-  return expectedFiles.every(fname => downloadedFilenames.includes(fname))
 }
 
-const formatNumber = (num) => {
-  if (num >= 1000000) {
-    return (num / 1000000).toFixed(1) + 'M'
-  } else if (num >= 1000) {
-    return (num / 1000).toFixed(1) + 'K'
-  }
-  return num.toString()
+function findDownloadedModel(hfId, filename) {
+  return modelStore.allQuantizations.find(
+    m => m.huggingface_id === hfId &&
+    (m.filename === filename || (m.quantization && filename.includes(m.quantization)))
+  )
 }
 
-const formatPipelineLabel = (tag) => {
-  if (!tag) return ''
-  const lower = tag.toLowerCase()
-  if (lower.includes('embed')) return 'Embedding'
-  if (lower.includes('feature')) return 'Feature Extraction'
-  return tag
+function extractQuantization(filename) {
+  if (!filename) return null
+  const match = filename.match(/[_-](Q\d[_A-Z0-9]*(?:_M|_S|_XS|_XL|_XXS)?|IQ\d_[A-Z]+|BF16|F16|F32)/i)
+  return match?.[1]?.toUpperCase() ?? null
 }
 
-const truncateText = (text, maxLength) => {
-  if (text.length <= maxLength) return text
-  return text.substring(0, maxLength) + '...'
-}
+const INTERESTING_TAGS = new Set([
+  'text-generation', 'chat', 'instruct', 'code', 'embedding', 'vision',
+  'multimodal', 'image-text-to-text', 'fill-mask', 'question-answering',
+])
 
-const formatBytes = (bytes) => {
-  if (!bytes || bytes === 0) return '0 Bytes'
-  if (typeof bytes !== 'number') return 'Unknown'
-  
-  const k = 1024
-  const sizes = ['Bytes', 'KB', 'MB', 'GB', 'TB']
-  const i = Math.floor(Math.log(bytes) / Math.log(k))
-  
-  // Ensure we don't go out of bounds
-  const sizeIndex = Math.min(i, sizes.length - 1)
-  const value = bytes / Math.pow(k, sizeIndex)
-  
-  // Format with appropriate decimal places
-  let formattedValue
-  if (sizeIndex === 0) {
-    formattedValue = Math.round(value) // Bytes - no decimals
-  } else if (sizeIndex === 1) {
-    formattedValue = Math.round(value * 10) / 10 // KB - 1 decimal
-  } else {
-    formattedValue = Math.round(value * 100) / 100 // MB+ - 2 decimals
-  }
-  
-  return formattedValue + ' ' + sizes[sizeIndex]
+function interestingTag(tag) {
+  return INTERESTING_TAGS.has(tag) || tag.startsWith('language:') || /^\d+[bBmM]$/.test(tag)
 }
 
-// formatFileSize and formatDate are now imported from @/utils/formatting
-
-const formatTime = (seconds) => {
-  if (seconds === 0) return '0s'
-  const h = Math.floor(seconds / 3600)
-  const m = Math.floor((seconds % 3600) / 60)
-  const s = Math.floor(seconds % 60)
-  
-  let result = ''
-  if (h > 0) result += `${h}h `
-  if (m > 0) result += `${m}m `
-  result += `${s}s`
-  return result.trim()
+// Use decimal (1000) so MB/GB match Hugging Face's display
+function formatBytes(bytes) {
+  if (!bytes) return '—'
+  const units = ['B', 'KB', 'MB', 'GB', 'TB']
+  let i = 0; let val = bytes
+  while (val >= 1000 && i < units.length - 1) { val /= 1000; i++ }
+  return `${val.toFixed(1)} ${units[i]}`
 }
 
-const getModelDownloadProgress = (modelId) => {
-  // Get all active progress entries for this model
-  // We no longer filter out progress >= 100 here; instead, entries are cleaned up
-  // a few seconds after completion so the user can briefly see the 100% state.
-  return Object.entries(downloadProgress.value)
-    .filter(([taskId, data]) => data.modelId === modelId)
-    .map(([taskId, data]) => ({
-      taskId,
-      ...data
-    }))
+function formatNumber(n) {
+  if (n == null) return ''
+  if (n >= 1_000_000) return `${(n / 1_000_000).toFixed(1)}M`
+  if (n >= 1_000) return `${(n / 1_000).toFixed(1)}K`
+  return String(n)
 }
 
-const isSafetensorsDownloaded = (model) => {
-  if (!model) return false
-  const huggingfaceId = model.huggingface_id || model.id
-  if (!huggingfaceId) return false
-  return getDownloadedSafetensorsForModel(huggingfaceId).length > 0
-}
+// ── Lifecycle ──────────────────────────────────────────────
+onMounted(async () => {
+  if (!modelStore.models.length) await modelStore.fetchModels()
+})
 </script>
 
 <style scoped>
 .model-search {
-  max-width: 1400px;
+  max-width: 960px;
   margin: 0 auto;
+  padding: var(--spacing-lg, 1.5rem);
+  display: flex;
+  flex-direction: column;
+  gap: var(--spacing-md, 0.75rem);
 }
 
-.search-section {
-  margin-bottom: var(--spacing-md);
-  padding: var(--spacing-md);
-  background: var(--bg-surface);
-  border-radius: var(--radius-md);
-  border: 1px solid var(--border-primary);
-}
-
+/* ── Search bar ───────────────────────────────────────── */
 .search-bar {
   display: flex;
-  gap: var(--spacing-sm);
+  gap: 0.5rem;
   align-items: center;
+  flex-wrap: wrap;
 }
 
-.format-dropdown {
-  width: 180px;
-}
-
-.search-input {
+.search-input-wrap {
   flex: 1;
-}
-
-.search-results {
-  margin-top: var(--spacing-md);
-}
-
-.search-results h3 {
-  margin-bottom: var(--spacing-md);
-  color: var(--text-primary);
-  font-weight: 600;
-}
-
-.model-grid {
-  display: grid;
-  grid-template-columns: repeat(auto-fill, minmax(280px, 1fr));
-  gap: var(--spacing-md);
-}
-
-.model-card {
-  background: var(--bg-card);
-  border: 1px solid var(--border-primary);
-  border-radius: var(--radius-md);
-  padding: var(--spacing-md);
-  transition: all var(--transition-normal);
-  box-shadow: var(--shadow-sm);
+  min-width: 200px;
   position: relative;
-  overflow: hidden;
-}
-
-.model-card:hover {
-  box-shadow: var(--shadow-md);
-  transform: translateY(-1px);
-  border-color: var(--accent-cyan);
-}
-
-.model-card-header {
-  display: flex;
-  justify-content: space-between;
-  align-items: flex-start;
-  margin-bottom: var(--spacing-sm);
 }
 
-.model-name-row {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-xs);
-}
-
-.model-format-badge {
-  padding: 2px 8px;
-  border-radius: var(--radius-sm);
-  background: var(--accent-cyan);
-  color: #fff;
-  font-size: 0.7rem;
-  font-weight: 600;
-  letter-spacing: 0.05em;
-}
-
-.model-pipeline {
-  margin-top: var(--spacing-xs);
-}
-
-.pipeline-badge {
-  display: inline-flex;
-  align-items: center;
-  gap: var(--spacing-xs);
-  padding: 2px 8px;
-  border-radius: var(--radius-sm);
-  background: rgba(14, 165, 233, 0.12);
-  color: var(--accent-cyan);
-  font-size: 0.7rem;
-  font-weight: 600;
-  text-transform: uppercase;
-  letter-spacing: 0.05em;
-  border: 1px solid rgba(14, 165, 233, 0.25);
-}
-
-.model-name {
-  font-weight: 700;
-  color: var(--text-primary);
-  margin-bottom: var(--spacing-xs);
-}
-
-.model-author {
-  font-size: 0.875rem;
-  color: var(--text-secondary);
-  margin-bottom: var(--spacing-xs);
-}
-
-.model-meta {
-  margin-top: var(--spacing-xs);
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-xs);
-}
-
-.model-meta-item {
-  display: flex;
-  justify-content: space-between;
-  font-size: 0.75rem;
-  color: var(--text-secondary);
-}
-
-.model-meta-item span:first-child {
-  font-weight: 600;
+.search-icon {
+  position: absolute;
+  left: 0.75rem;
+  top: 50%;
+  transform: translateY(-50%);
+  color: var(--text-secondary, #9ca3af);
+  pointer-events: none;
 }
 
-.model-stats {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-xs);
-  font-size: 0.75rem;
-  color: var(--text-secondary);
-}
-
-.model-description {
-  font-size: 0.875rem;
-  color: var(--text-secondary);
-  line-height: 1.4;
-  margin-bottom: var(--spacing-sm);
-}
-
-.model-links {
-  margin: var(--spacing-sm) 0;
-}
-
-.readme-link {
-  display: inline-flex;
-  align-items: center;
-  gap: var(--spacing-xs);
-  color: var(--accent-cyan);
-  text-decoration: none;
-  font-size: 0.875rem;
-}
-
-.readme-link:hover {
-  text-decoration: underline;
-}
-
-.quantizations {
-  margin-top: var(--spacing-sm);
-  padding-top: var(--spacing-sm);
-  border-top: 1px solid var(--border-primary);
-}
-
-.quantizations h4 {
-  margin-bottom: var(--spacing-sm);
-  font-size: 0.875rem;
-  color: var(--text-primary);
-  font-weight: 600;
-}
-
-.safetensors-section {
-  margin-top: var(--spacing-sm);
-  padding-top: var(--spacing-sm);
-  border-top: 1px solid var(--border-primary);
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-sm);
-}
-
-.safetensors-header {
-  display: flex;
-  justify-content: space-between;
-  align-items: center;
-  gap: var(--spacing-md);
-}
-
-.safetensors-header h4 {
-  margin: 0;
-}
-
-.safetensors-header p {
-  margin: 4px 0 0;
-  color: var(--text-secondary);
-  font-size: 0.85rem;
+.search-input {
+  width: 100%;
+  padding-left: 2.25rem !important;
 }
 
-.safetensors-files-box {
-  margin-top: var(--spacing-sm);
-  padding: var(--spacing-md);
-  background: var(--bg-surface);
-  border: 1px solid var(--border-primary);
-  border-radius: var(--radius-sm);
+.clear-btn {
+  position: absolute;
+  right: 0.25rem;
+  top: 50%;
+  transform: translateY(-50%);
 }
 
-.safetensors-files-box h4 {
-  margin: 0 0 var(--spacing-sm);
-  font-size: 0.9rem;
-  font-weight: 600;
-  color: var(--text-primary);
-}
+.format-select { width: 140px; }
 
-.safetensors-files-list {
+/* ── Loading ──────────────────────────────────────────── */
+.loading-row {
   display: flex;
-  flex-direction: column;
-  gap: 2px;
-  font-family: monospace;
-  font-size: 0.85rem;
-  color: var(--text-secondary);
-  line-height: 1.6;
-}
-
-.safetensors-file-name {
-  padding: 2px 0;
-  word-break: break-all;
-}
-
-.safetensors-file {
-  display: flex;
-  justify-content: space-between;
   align-items: center;
-  padding: var(--spacing-sm);
-  border: 1px solid var(--border-primary);
-  border-radius: var(--radius-sm);
-  background: var(--bg-surface);
-}
-
-.safetensors-file .file-info {
-  display: flex;
-  flex-direction: column;
-  gap: 2px;
-}
-
-.file-name {
-  font-weight: 600;
-  color: var(--text-primary);
-}
-
-.safetensors-accordion {
-  border: 1px solid var(--border-primary);
-  border-radius: var(--radius-sm);
+  gap: 0.75rem;
+  padding: 1.5rem 0;
+  color: var(--text-secondary, #9ca3af);
 }
 
-.safetensors-metadata {
+/* ── Empty state ──────────────────────────────────────── */
+.empty-state {
   display: flex;
   flex-direction: column;
-  gap: var(--spacing-sm);
-}
-
-.dtype-summary,
-.metadata-files {
-  background: var(--bg-surface);
-  padding: var(--spacing-sm);
-  border-radius: var(--radius-sm);
-  border: 1px solid var(--border-primary);
-}
-
-.dtype-row,
-.metadata-file-row {
-  display: flex;
-  justify-content: space-between;
-  align-items: center;
-  margin-top: 4px;
-  font-size: 0.85rem;
-}
-
-.dtype-chip {
-  display: inline-flex;
   align-items: center;
-  gap: 4px;
-  background: var(--bg-tertiary);
-  padding: 2px 6px;
-  border-radius: var(--radius-sm);
-  font-size: 0.75rem;
-  margin-right: var(--spacing-xs);
-}
-
-.metadata-empty,
-.empty-safetensors {
-  font-size: 0.85rem;
-  color: var(--text-secondary);
+  gap: 1rem;
+  padding: 4rem 0;
   text-align: center;
-  padding: var(--spacing-md);
+  color: var(--text-secondary, #9ca3af);
 }
 
-.metadata-error {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-sm);
-  padding: var(--spacing-sm);
-  background: var(--bg-warning);
-  border: 1px solid var(--border-warning);
-  border-radius: var(--radius-sm);
-  color: var(--text-warning);
-  font-size: 0.9rem;
-}
-
-.downloaded-quantizations {
-  margin-bottom: var(--spacing-md);
-  padding-bottom: var(--spacing-sm);
-  border-bottom: 1px solid var(--border-primary);
-}
+.empty-state h3 { margin: 0; font-size: 1.1rem; color: var(--text-primary, #f1f5f9); }
+.empty-state p  { margin: 0; font-size: 0.875rem; }
 
-.downloaded-list {
+/* ── Results table ────────────────────────────────────── */
+.results-table {
   display: flex;
   flex-direction: column;
-  gap: var(--spacing-xs);
+  gap: 0.375rem;
 }
 
-.downloaded-item {
-  display: flex;
-  justify-content: space-between;
-  align-items: center;
-  padding: var(--spacing-sm);
-  background: var(--bg-surface);
-  border: 1px solid var(--border-primary);
-  border-radius: var(--radius-sm);
-  transition: all var(--transition-normal);
+.results-header {
+  padding: 0.25rem 0;
 }
 
-.downloaded-item:hover {
-  border-color: var(--accent-green);
-  background: var(--bg-tertiary);
+.results-count {
+  font-size: 0.8rem;
+  color: var(--text-secondary, #9ca3af);
 }
 
-.downloaded-info {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-sm);
-}
-
-.downloaded-name {
-  font-weight: 600;
-  color: var(--text-primary);
+.result-row {
+  background: var(--bg-card, #161b2e);
+  border: 1px solid var(--border-primary, #2a2f45);
+  border-radius: var(--radius-lg, 0.75rem);
+  overflow: hidden;
 }
 
-.downloaded-badge {
+.result-main {
   display: flex;
   align-items: center;
-  gap: var(--spacing-xs);
-  padding: 2px 6px;
-  background: var(--accent-green);
-  color: white;
-  border-radius: var(--radius-xs);
-  font-size: 0.75rem;
-  font-weight: 500;
+  padding: 0.75rem 1rem;
+  cursor: pointer;
+  gap: 0.75rem;
+  transition: background 0.15s;
 }
 
-.downloaded-details {
-  display: flex;
-  flex-direction: column;
-  align-items: flex-end;
-  gap: 2px;
-}
+.result-main:hover { background: var(--bg-card-hover, #1e2235); }
 
-.downloaded-size {
+.result-expand-icon {
+  color: var(--text-secondary, #9ca3af);
+  width: 16px;
+  flex-shrink: 0;
   font-size: 0.75rem;
-  color: var(--text-secondary);
-  font-weight: 500;
-}
-
-.downloaded-date {
-  font-size: 0.7rem;
-  color: var(--text-tertiary);
-}
-
-.quantization-selector {
-  margin-bottom: var(--spacing-sm);
-}
-
-.quantization-dropdown {
-  width: 100%;
-}
-
-.selected-quantization-info {
-  display: flex;
-  justify-content: space-between;
-  align-items: center;
-  gap: var(--spacing-sm);
 }
 
-.quant-info {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-xs);
-}
+.result-info { flex: 1; min-width: 0; }
 
-.quant-name-row {
+.result-name {
   display: flex;
   align-items: center;
-  gap: var(--spacing-sm);
+  gap: 0.5rem;
+  flex-wrap: wrap;
+  margin-bottom: 0.3rem;
 }
 
-.quant-name {
+.model-link {
   font-weight: 600;
-  color: var(--text-primary);
-  font-size: 0.875rem;
-}
-
-.downloaded-badge {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-xs);
-  padding: 2px 6px;
-  background: rgba(16, 185, 129, 0.1);
-  color: var(--accent-green);
-  border: 1px solid rgba(16, 185, 129, 0.2);
-  border-radius: var(--radius-sm);
-  font-size: 0.75rem;
-  font-weight: 500;
-}
-
-.quant-size {
-  font-size: 0.75rem;
-  color: var(--text-secondary);
-}
-
-.download-button {
-  flex-shrink: 0;
+  font-size: 0.9rem;
+  color: var(--text-primary, #f1f5f9);
+  text-decoration: none;
+  font-family: monospace;
 }
 
-/* Download Progress Styles */
-.downloads-container {
-  margin-top: var(--spacing-md);
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-md);
-}
+.model-link:hover { color: var(--accent-cyan, #22d3ee); text-decoration: underline; }
 
-.downloads-container .download-progress {
-  margin-top: 0;
-}
+.pipeline-tag { flex-shrink: 0; }
 
-.download-progress {
-  margin-top: var(--spacing-lg);
-  padding: var(--spacing-lg);
-  background: var(--bg-surface);
-  border-radius: var(--radius-md);
-  border: 1px solid var(--border-secondary);
-  position: relative;
-  z-index: 1;
-  overflow: hidden;
-  box-shadow: var(--shadow-sm);
-  width: 100%;
-  display: block;
-}
-
-.progress-header {
+.result-meta {
   display: flex;
-  justify-content: space-between;
-  align-items: center;
-  margin-bottom: var(--spacing-xs);
-}
-
-.progress-filename {
-  font-size: 0.875rem;
-  font-weight: 500;
-  color: var(--text-primary);
-  flex: 1;
-  margin-right: var(--spacing-sm);
-  overflow: hidden;
-  text-overflow: ellipsis;
-  white-space: nowrap;
-}
-
-.progress-percentage {
-  font-size: 0.875rem;
-  font-weight: 600;
-  color: var(--accent-blue);
-  min-width: 3rem;
-  text-align: right;
+  gap: 0.875rem;
+  flex-wrap: wrap;
 }
 
-.progress-bar-container {
-  width: 100%;
-  height: 6px;
-  background: var(--bg-secondary);
-  border-radius: 3px;
-  overflow: hidden;
-  margin-bottom: var(--spacing-xs);
-  position: relative;
-}
-
-.progress-bar {
-  height: 100%;
-  background: linear-gradient(90deg, var(--accent-blue), var(--accent-cyan));
-  border-radius: 3px;
-  transition: width 0.3s ease;
-  position: relative;
-  z-index: 1;
-}
-
-.progress-details {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-xs);
+.meta-item {
   font-size: 0.75rem;
-  color: var(--text-secondary);
-}
-
-.progress-row-1 {
+  color: var(--text-secondary, #9ca3af);
   display: flex;
-  gap: var(--spacing-sm);
   align-items: center;
-  white-space: nowrap;
-}
-
-.progress-bundle-row {
-  font-size: 0.85rem;
-  color: var(--text-secondary);
-  margin-top: 4px;
-}
-
-.progress-size {
-  font-weight: 500;
-  min-width: 150px;
+  gap: 0.25rem;
 }
 
-.progress-speed {
-  color: var(--accent-green);
-  min-width: 70px;
-}
+.meta-item .pi { font-size: 0.7rem; }
+.license { font-style: italic; }
 
-.progress-eta-row {
+.result-tags {
   display: flex;
+  gap: 0.25rem;
+  flex-wrap: wrap;
+  flex-shrink: 0;
 }
 
-.progress-eta {
-  color: var(--accent-orange);
-}
-
-.token-section {
-  margin-bottom: var(--spacing-md);
-}
-
-.token-header {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-sm);
-}
+.model-tag { font-size: 0.7rem; }
 
-.token-status-indicator {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-xs);
-  margin-left: auto;
-  font-size: 0.75rem;
-  color: var(--accent-green);
-}
+/* ── Expanded files ───────────────────────────────────── */
+.row-expand-enter-active,
+.row-expand-leave-active { transition: all 0.2s ease; overflow: hidden; }
+.row-expand-enter-from,
+.row-expand-leave-to    { max-height: 0; opacity: 0; }
+.row-expand-enter-to,
+.row-expand-leave-from  { max-height: 600px; opacity: 1; }
 
-.token-setup {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-md);
+.result-files {
+  border-top: 1px solid var(--border-primary, #2a2f45);
+  padding: 0.75rem 1rem;
+  background: var(--bg-surface, #1e2235);
 }
 
-.token-info {
+.files-loading,
+.files-empty {
   display: flex;
   align-items: center;
-  gap: var(--spacing-sm);
-  color: var(--text-secondary);
+  gap: 0.5rem;
   font-size: 0.875rem;
+  color: var(--text-secondary, #9ca3af);
+  padding: 0.5rem 0;
 }
 
-.token-input {
-  display: flex;
-  gap: var(--spacing-sm);
-}
-
-.token-field {
-  flex: 1;
-}
-
-.token-help {
-  font-size: 0.75rem;
-  color: var(--text-muted);
-}
-
-.token-help a {
-  color: var(--accent-cyan);
-  text-decoration: none;
-}
-
-.token-help a:hover {
-  text-decoration: underline;
-}
-
-.token-status {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-sm);
-}
-
-.token-success {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-sm);
-  color: var(--accent-green);
+.files-table {
+  width: 100%;
+  border-collapse: collapse;
   font-size: 0.875rem;
 }
 
-.env-badge {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-xs);
-  background: var(--bg-surface);
-  padding: var(--spacing-xs) var(--spacing-sm);
-  border-radius: var(--radius-sm);
-  font-size: 0.75rem;
-  color: var(--text-secondary);
+.files-table th {
+  text-align: left;
+  font-size: 0.7rem;
+  text-transform: uppercase;
+  letter-spacing: 0.05em;
+  color: var(--text-secondary, #9ca3af);
+  padding: 0.375rem 0.5rem;
+  border-bottom: 1px solid var(--border-primary, #2a2f45);
 }
 
-.env-info {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-sm);
-  color: var(--text-secondary);
-  font-size: 0.75rem;
+.files-table td {
+  padding: 0.4rem 0.5rem;
+  border-bottom: 1px solid rgba(255, 255, 255, 0.04);
+  vertical-align: middle;
 }
 
-.empty-state {
-  text-align: center;
-  padding: var(--spacing-2xl) var(--spacing-md);
-  color: var(--text-secondary);
-  background: var(--bg-surface);
-  border-radius: var(--radius-md);
-  border: 2px dashed var(--border-secondary);
-  margin: var(--spacing-md) 0;
-}
+.files-table tr:last-child td { border-bottom: none; }
 
-.empty-state i {
-  font-size: 3rem !important;
-  color: var(--accent-cyan);
-  margin-bottom: var(--spacing-md);
-}
+.file-name { display: flex; align-items: center; gap: 0.4rem; }
+.file-name code { font-size: 0.8rem; }
+.file-size { color: var(--text-secondary, #9ca3af); white-space: nowrap; }
+.not-downloaded { color: var(--text-secondary, #9ca3af); }
 
-/* Loading indicator for quantization sizes */
-.loading-indicator {
+.safetensors-download {
   display: flex;
   align-items: center;
-  gap: var(--spacing-xs);
-  margin-top: var(--spacing-xs);
-  color: var(--text-secondary);
-  font-size: 0.875rem;
+  gap: 0.75rem;
+  margin-top: 0.75rem;
+  padding-top: 0.75rem;
+  border-top: 1px solid var(--border-primary, #2a2f45);
 }
 
-.loading-indicator i {
-  color: var(--accent-blue);
-}
-
-.empty-state h3 {
-  margin: var(--spacing-md) 0 var(--spacing-sm);
-  color: var(--text-primary);
-  font-weight: 700;
-  font-size: 1.1rem;
-}
-
-.empty-state p {
-  font-size: 0.875rem;
-  line-height: 1.6;
-  max-width: 400px;
-  margin: 0 auto;
-}
-
-/* Responsive */
-@media (max-width: 768px) {
-  .model-grid {
-    grid-template-columns: 1fr;
-  }
-  
-  .search-bar {
-    flex-direction: column;
-  }
-  
-  .token-input {
-    flex-direction: column;
-  }
-  
-  .selected-quantization-info {
-    flex-direction: column;
-    align-items: stretch;
-  }
+.safetensors-download small {
+  font-size: 0.75rem;
+  color: var(--text-secondary, #9ca3af);
 }
 </style>
diff --git a/frontend/src/views/System.vue b/frontend/src/views/System.vue
deleted file mode 100644
index 9c3a9c9..0000000
--- a/frontend/src/views/System.vue
+++ /dev/null
@@ -1,74 +0,0 @@
-<template>
-  <div class="system-page">
-    <TabView>
-      <TabPanel header="System">
-        <SystemTab />
-      </TabPanel>
-      <TabPanel header="llama.cpp">
-        <LlamaCppTab />
-      </TabPanel>
-      <TabPanel header="LMDeploy">
-        <LMDeployTab />
-      </TabPanel>
-    </TabView>
-  </div>
-</template>
-
-<script setup>
-import TabView from 'primevue/tabview'
-import TabPanel from 'primevue/tabpanel'
-import SystemTab from '@/components/system/SystemTab.vue'
-import LlamaCppTab from '@/components/system/LlamaCppTab.vue'
-import LMDeployTab from '@/components/system/LMDeployTab.vue'
-</script>
-
-<style scoped>
-.system-page {
-  max-width: 1400px;
-  margin: 0 auto;
-  padding: var(--spacing-lg);
-}
-
-:deep(.p-tabview-nav) {
-  background: var(--bg-surface);
-  border-radius: var(--radius-lg) var(--radius-lg) 0 0;
-  padding: var(--spacing-sm) var(--spacing-sm) 0;
-}
-
-:deep(.p-tabview-nav-link) {
-  padding: var(--spacing-md) var(--spacing-lg);
-  font-weight: 600;
-  transition: all var(--transition-fast);
-}
-
-:deep(.p-tabview-nav-link:hover) {
-  background: var(--bg-card);
-}
-
-:deep(.p-tabview-nav-link.p-highlight) {
-  background: var(--gradient-primary);
-  color: white;
-  border-radius: var(--radius-md) var(--radius-md) 0 0;
-}
-
-:deep(.p-tabview-panels) {
-  background: var(--bg-card);
-  border: 1px solid var(--border-primary);
-  border-top: none;
-  border-radius: 0 0 var(--radius-lg) var(--radius-lg);
-  padding: var(--spacing-xl);
-}
-</style>
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/frontend/src/views/SystemStatus.vue b/frontend/src/views/SystemStatus.vue
deleted file mode 100644
index aea3195..0000000
--- a/frontend/src/views/SystemStatus.vue
+++ /dev/null
@@ -1,723 +0,0 @@
-<template>
-  <div class="system-status">
-    <div class="card">
-      <div class="card-header">
-        <h2 class="card-title">System Status</h2>
-        <div class="header-actions">
-          <div class="connection-info">
-            <div class="live-indicator" v-if="wsStore.isConnected">
-              <i class="pi pi-circle-fill" style="color: #22d3ee; font-size: 0.5rem;"></i>
-              <span>Live</span>
-            </div>
-            <div class="connection-status" v-else>
-              <i class="pi pi-circle" style="color: #ef4444; font-size: 0.5rem;"></i>
-              <span>{{ wsStore.connectionStatus }}</span>
-            </div>
-          </div>
-          <Button 
-            icon="pi pi-refresh" 
-            @click="refreshStatus"
-            :loading="systemStore.loading"
-            severity="secondary"
-            text
-          />
-        </div>
-      </div>
-
-      <!-- System Overview -->
-      <div class="system-overview">
-        <div class="overview-grid">
-          <div class="overview-card">
-            <div class="overview-header">
-              <span style="color: #22d3ee; font-size: 1.75rem; font-weight: bold;">🖥️</span>
-              <h3>CPU</h3>
-            </div>
-            <div class="overview-content">
-              <div class="metric">
-                <span class="metric-label">Usage</span>
-                <span class="metric-value">{{ (systemStore.systemStatus.system?.cpu_percent || 0).toFixed(1) }}%</span>
-              </div>
-              <ProgressBar :value="systemStore.systemStatus.system?.cpu_percent || 0" />
-            </div>
-          </div>
-
-          <div class="overview-card">
-            <div class="overview-header">
-              <span style="color: #22d3ee; font-size: 1.75rem; font-weight: bold;">💾</span>
-              <h3>Memory</h3>
-            </div>
-            <div class="overview-content">
-              <div class="metric">
-                <span class="metric-label">Usage</span>
-                <span class="metric-value">{{ (systemStore.systemStatus.system?.memory?.percent || 0).toFixed(1) }}%</span>
-              </div>
-              <div class="metric">
-                <span class="metric-label">Available</span>
-                <span class="metric-value">{{ formatFileSize(systemStore.systemStatus.system?.memory?.available || 0) }}</span>
-              </div>
-              <ProgressBar :value="systemStore.systemStatus.system?.memory?.percent || 0" />
-            </div>
-          </div>
-
-          <div class="overview-card">
-            <div class="overview-header">
-              <span style="color: #22d3ee; font-size: 1.75rem; font-weight: bold;">💿</span>
-              <h3>Storage</h3>
-            </div>
-            <div class="overview-content">
-              <div class="metric">
-                <span class="metric-label">Usage</span>
-                <span class="metric-value">{{ (systemStore.systemStatus.system?.disk?.percent || 0).toFixed(1) }}%</span>
-              </div>
-              <div class="metric">
-                <span class="metric-label">Free</span>
-                <span class="metric-value">{{ formatFileSize(systemStore.systemStatus.system?.disk?.free || 0) }}</span>
-              </div>
-              <ProgressBar :value="systemStore.systemStatus.system?.disk?.percent || 0" />
-            </div>
-          </div>
-
-          <div class="overview-card">
-            <div class="overview-header">
-              <span style="color: #22d3ee; font-size: 1.75rem; font-weight: bold;">🎮</span>
-              <h3>GPU</h3>
-            </div>
-            <div class="overview-content">
-              <div class="metric">
-                <span class="metric-label">Count</span>
-                <span class="metric-value">{{ systemStore.gpuInfo.device_count || 0 }}</span>
-              </div>
-              <div class="metric">
-                <span class="metric-label">Total VRAM</span>
-                <span class="metric-value">{{ formatFileSize(systemStore.gpuInfo.total_vram || 0) }}</span>
-              </div>
-              <div class="metric">
-                <span class="metric-label">Available</span>
-                <span class="metric-value">{{ formatFileSize(systemStore.gpuInfo.available_vram || 0) }}</span>
-              </div>
-              <div v-if="systemStore.gpuInfo.nvlink_topology?.has_nvlink" class="metric">
-                <span class="metric-label">NVLink</span>
-                <span class="metric-value">{{ systemStore.gpuInfo.nvlink_topology.recommended_strategy }}</span>
-              </div>
-            </div>
-          </div>
-        </div>
-      </div>
-
-      <!-- GPU Details -->
-      <div v-if="systemStore.gpuInfo.gpus && systemStore.gpuInfo.gpus.length > 0" class="gpu-details">
-        <h3>GPU Details</h3>
-        <div class="gpu-list">
-          <div 
-            v-for="gpu in systemStore.gpuInfo.gpus" 
-            :key="gpu.index"
-            class="gpu-card"
-          >
-            <div class="gpu-header">
-              <h4>GPU {{ gpu.index }}: {{ gpu.name }}</h4>
-              <div class="gpu-status">
-                <span 
-                  :class="['status-indicator', gpu.utilization?.gpu ? 'status-running' : 'status-stopped']"
-                >
-                  <i :class="gpu.utilization?.gpu ? 'pi pi-play' : 'pi pi-pause'"></i>
-                  {{ gpu.utilization?.gpu ? `${gpu.utilization.gpu}%` : 'Idle' }}
-                </span>
-              </div>
-            </div>
-            
-            <div class="gpu-metrics">
-              <div class="metric-row">
-                <span class="metric-label">Memory Usage</span>
-                <div class="metric-bar">
-                  <ProgressBar 
-                    :value="(gpu.memory.used / gpu.memory.total) * 100"
-                    :showValue="false"
-                  />
-                  <span class="metric-text">
-                    {{ formatFileSize(gpu.memory.used) }} / {{ formatFileSize(gpu.memory.total) }}
-                  </span>
-                </div>
-              </div>
-              
-              <div class="metric-row">
-                <span class="metric-label">Compute Capability</span>
-                <span class="metric-value">{{ gpu.compute_capability }}</span>
-              </div>
-              
-              <div v-if="gpu.nvlink && gpu.nvlink.connections.length > 0" class="metric-row">
-                <span class="metric-label">NVLink</span>
-                <div class="nvlink-info">
-                  <span class="nvlink-version">v{{ gpu.nvlink.nvlink_version }}</span>
-                  <span class="nvlink-bandwidth">{{ gpu.nvlink.total_bandwidth }} GB/s</span>
-                  <span class="nvlink-connections">{{ gpu.nvlink.connections.length }} links</span>
-                </div>
-              </div>
-              
-              <div v-if="gpu.temperature" class="metric-row">
-                <span class="metric-label">Temperature</span>
-                <span class="metric-value">{{ gpu.temperature }}°C</span>
-              </div>
-            </div>
-          </div>
-        </div>
-      </div>
-
-      <!-- Proxy Status -->
-      <div v-if="systemStore.systemStatus.proxy_status" class="proxy-status">
-        <h3>Multi-Model Proxy</h3>
-        <div class="proxy-card">
-          <div class="proxy-header">
-            <i class="pi pi-share-alt"></i>
-            <h4>llama-swap Proxy</h4>
-            <span class="status-indicator status-running">
-              <i class="pi pi-check"></i>
-              Active
-            </span>
-          </div>
-          <div class="proxy-details">
-            <div class="detail-row">
-              <span class="detail-label">Port:</span>
-              <span class="detail-value">{{ systemStore.systemStatus.proxy_status.port }}</span>
-            </div>
-            <div class="detail-row">
-              <span class="detail-label">API Endpoint:</span>
-              <span class="detail-value">{{ systemStore.systemStatus.proxy_status.endpoint }}</span>
-            </div>
-            <div class="detail-row">
-              <span class="detail-label">Models Available:</span>
-              <span class="detail-value">{{ sortedRunningInstances.length }}</span>
-            </div>
-          </div>
-        </div>
-      </div>
-
-      <!-- Running Instances -->
-      <div v-if="sortedRunningInstances && sortedRunningInstances.length > 0" class="running-instances">
-        <h3>Running Models</h3>
-        <div class="instance-list">
-          <div 
-            v-for="instance in sortedRunningInstances" 
-            :key="instance.id"
-            class="instance-card"
-          >
-            <div class="instance-header">
-              <h4>{{ instance.proxy_model_name || `Model #${instance.model_id}` }}</h4>
-              <div class="instance-status">
-                <span class="status-indicator status-running">
-                  <i class="pi pi-play"></i>
-                  Running
-                </span>
-              </div>
-            </div>
-            
-            <div class="instance-metrics">
-              <div class="metric-row">
-                <span class="metric-label">Model ID</span>
-                <span class="metric-value">{{ instance.model_id }}</span>
-              </div>
-            </div>
-          </div>
-        </div>
-      </div>
-
-      <!-- Empty State -->
-      <div v-if="!sortedRunningInstances || sortedRunningInstances.length === 0" class="empty-instances">
-        <i class="pi pi-play-circle" style="font-size: 3rem; color: var(--text-secondary);"></i>
-        <h4>No Running Instances</h4>
-        <p>Start a model from the Model Library to see running instances here.</p>
-      </div>
-    </div>
-  </div>
-</template>
-
-<script setup>
-import { onMounted, onUnmounted, computed } from 'vue'
-import { useSystemStore } from '@/stores/system'
-import { useWebSocketStore } from '@/stores/websocket'
-import { toast } from 'vue3-toastify'
-import { formatFileSize, formatDate } from '@/utils/formatting'
-
-const systemStore = useSystemStore()
-const wsStore = useWebSocketStore()
-let unsubscribeUnifiedMonitoring = null
-
-// Computed property to sort running instances by model name (alphabetical)
-const sortedRunningInstances = computed(() => {
-  if (!systemStore.systemStatus.running_instances) return []
-  
-  return [...systemStore.systemStatus.running_instances].sort((a, b) => {
-    const nameA = (a.proxy_model_name || a.model_id || '').toLowerCase()
-    const nameB = (b.proxy_model_name || b.model_id || '').toLowerCase()
-    return nameA.localeCompare(nameB)
-  })
-})
-
-onMounted(() => {
-  refreshStatus()
-  
-  // Subscribe to unified monitoring updates for real-time status
-  unsubscribeUnifiedMonitoring = wsStore.subscribeToUnifiedMonitoring((data) => {
-    
-    // Update system status with real-time data
-    if (data.system) {
-      systemStore.updateSystemStatus({
-        system: data.system,
-        proxy_status: data.proxy_status
-      })
-    }
-    
-    // Update running instances from both database and llama-swap
-    if (data.models) {
-      const runningInstances = data.models.running_instances || []
-      const llamaSwapModels = data.models.llama_swap_models || []
-      
-      
-      // Combine both sources of running models
-      const allRunningInstances = [...runningInstances]
-      
-      // Add llama-swap models as additional instances
-      if (Array.isArray(llamaSwapModels)) {
-        llamaSwapModels.forEach(model => {
-          // Check if this model is already in running_instances
-          const exists = allRunningInstances.some(instance => 
-            instance.proxy_model_name === model.model || 
-            instance.model_id === model.model ||
-            instance.proxy_model_name === model.name
-          )
-          
-          if (!exists) {
-            allRunningInstances.push({
-              id: `llama-swap-${model.model}`,
-              model_id: model.model,
-              proxy_model_name: model.model,
-              port: 'N/A', // llama-swap doesn't provide port info in /running
-              started_at: new Date().toISOString(),
-              source: 'llama-swap',
-              state: model.state
-            })
-          }
-        })
-      }
-      
-      
-      systemStore.updateSystemStatus({
-        running_instances: allRunningInstances
-      })
-    }
-    
-    // Update GPU info if available
-    if (data.gpu) {
-      systemStore.updateGpuInfo(data.gpu)
-    }
-  })
-  
-})
-
-onUnmounted(() => {
-  if (unsubscribeUnifiedMonitoring) {
-    unsubscribeUnifiedMonitoring()
-  }
-})
-
-const refreshStatus = async () => {
-  try {
-    await systemStore.fetchSystemStatus()
-  } catch (error) {
-    toast.error('Failed to refresh system status')
-  }
-}
-
-// formatFileSize and formatDate are now imported from @/utils/formatting
-</script>
-
-<style scoped>
-.system-status {
-  max-width: 1400px;
-  margin: 0 auto;
-}
-
-.system-overview {
-  margin-bottom: 2rem;
-}
-
-.overview-grid {
-  display: grid;
-  grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
-  gap: 1rem;
-}
-
-.overview-card {
-  background: var(--gradient-card);
-  border: 1px solid var(--border-primary);
-  border-radius: var(--radius-xl);
-  padding: var(--spacing-xl);
-  box-shadow: var(--shadow-md);
-  transition: all var(--transition-normal);
-  position: relative;
-  overflow: hidden;
-  backdrop-filter: blur(10px);
-  animation: fadeIn 0.6s ease-out;
-}
-
-.overview-card::before {
-  content: '';
-  position: absolute;
-  top: 0;
-  left: 0;
-  right: 0;
-  height: 3px;
-  background: var(--gradient-primary);
-  opacity: 0;
-  transition: opacity var(--transition-normal);
-}
-
-.overview-card:hover {
-  transform: translateY(-5px) scale(1.02);
-  box-shadow: var(--shadow-lg), var(--glow-primary);
-}
-
-.overview-card:hover::before {
-  opacity: 1;
-}
-
-.overview-header {
-  display: flex;
-  align-items: center;
-  gap: 0.5rem;
-  margin-bottom: 1rem;
-}
-
-.overview-header i {
-  font-size: 1.75rem !important;
-  color: var(--accent-cyan) !important;
-  display: inline-block !important;
-  visibility: visible !important;
-  opacity: 1 !important;
-}
-
-.overview-header h3 {
-  margin: 0;
-  color: var(--text-primary);
-  font-weight: 700;
-  font-size: 1.2rem;
-}
-
-.overview-content {
-  display: flex;
-  flex-direction: column;
-  gap: 0.5rem;
-}
-
-.metric {
-  display: flex;
-  justify-content: space-between;
-  align-items: center;
-}
-
-.metric-label {
-  font-size: 0.9rem;
-  color: var(--text-secondary);
-  font-weight: 500;
-}
-
-.metric-value {
-  font-weight: 700;
-  color: var(--text-primary);
-  font-size: 1rem;
-}
-
-.gpu-details,
-.proxy-status {
-  margin-bottom: 2rem;
-}
-
-.proxy-card {
-  background: var(--gradient-card);
-  border: 1px solid var(--border-primary);
-  border-radius: var(--radius-xl);
-  padding: var(--spacing-xl);
-  box-shadow: var(--shadow-md);
-  transition: all var(--transition-normal);
-  position: relative;
-  overflow: hidden;
-}
-
-.proxy-card::before {
-  content: '';
-  position: absolute;
-  top: 0;
-  left: 0;
-  right: 0;
-  height: 3px;
-  background: var(--gradient-success);
-  opacity: 0;
-  transition: opacity var(--transition-normal);
-}
-
-.proxy-card:hover {
-  transform: translateY(-3px);
-  box-shadow: var(--shadow-lg);
-}
-
-.proxy-card:hover::before {
-  opacity: 1;
-}
-
-.proxy-header {
-  display: flex;
-  align-items: center;
-  gap: 0.5rem;
-  margin-bottom: 1rem;
-}
-
-.proxy-header i {
-  font-size: 1.75rem;
-  background: var(--gradient-success);
-  -webkit-background-clip: text;
-  -webkit-text-fill-color: transparent;
-  background-clip: text;
-}
-
-.proxy-header h4 {
-  margin: 0;
-  color: var(--text-primary);
-  font-weight: 700;
-  font-size: 1.2rem;
-}
-
-.proxy-details {
-  display: flex;
-  flex-direction: column;
-  gap: 0.5rem;
-}
-
-.detail-row {
-  display: flex;
-  justify-content: space-between;
-  align-items: center;
-}
-
-.detail-label {
-  font-weight: 600;
-  color: var(--text-secondary);
-  font-size: 0.9rem;
-}
-
-.detail-value {
-  font-weight: 600;
-  color: var(--text-primary);
-  font-family: 'Courier New', monospace;
-  background: var(--bg-surface);
-  padding: var(--spacing-xs) var(--spacing-sm);
-  border-radius: var(--radius-sm);
-  border: 1px solid var(--border-primary);
-}
-
-.running-instances {
-  margin-top: 2rem;
-}
-
-.gpu-list,
-.instance-list {
-  display: flex;
-  flex-direction: column;
-  gap: 1rem;
-  margin-top: 1rem;
-}
-
-.gpu-card,
-.instance-card {
-  background: var(--gradient-card);
-  border: 1px solid var(--border-primary);
-  border-radius: var(--radius-xl);
-  padding: var(--spacing-xl);
-  box-shadow: var(--shadow-md);
-  transition: all var(--transition-normal);
-  position: relative;
-  overflow: hidden;
-}
-
-.gpu-card::before,
-.instance-card::before {
-  content: '';
-  position: absolute;
-  top: 0;
-  left: 0;
-  right: 0;
-  height: 3px;
-  background: var(--gradient-primary);
-  opacity: 0;
-  transition: opacity var(--transition-normal);
-}
-
-.gpu-card:hover,
-.instance-card:hover {
-  transform: translateY(-3px);
-  box-shadow: var(--shadow-lg);
-}
-
-.gpu-card:hover::before,
-.instance-card:hover::before {
-  opacity: 1;
-}
-
-.gpu-header,
-.instance-header {
-  display: flex;
-  justify-content: space-between;
-  align-items: center;
-  margin-bottom: 1rem;
-}
-
-.gpu-header h4,
-.instance-header h4 {
-  margin: 0;
-  color: var(--text-primary);
-  font-weight: 700;
-  font-size: 1.1rem;
-}
-
-.gpu-metrics,
-.instance-metrics {
-  display: flex;
-  flex-direction: column;
-  gap: 0.75rem;
-}
-
-.metric-row {
-  display: flex;
-  justify-content: space-between;
-  align-items: center;
-}
-
-.metric-bar {
-  display: flex;
-  align-items: center;
-  gap: 0.5rem;
-  flex: 1;
-  margin-left: 1rem;
-}
-
-.metric-text {
-  font-size: 0.875rem;
-  color: var(--text-secondary);
-  min-width: 120px;
-  text-align: right;
-}
-
-.empty-instances {
-  text-align: center;
-  padding: var(--spacing-3xl) var(--spacing-xl);
-  color: var(--text-secondary);
-  background: var(--gradient-surface);
-  border-radius: var(--radius-xl);
-  border: 2px dashed var(--border-secondary);
-  margin: var(--spacing-xl) 0;
-  position: relative;
-  overflow: hidden;
-}
-
-.empty-instances::before {
-  content: '';
-  position: absolute;
-  top: 0;
-  left: 0;
-  right: 0;
-  height: 2px;
-  background: var(--gradient-primary);
-  opacity: 0.3;
-}
-
-.empty-instances i {
-  font-size: 3rem !important;
-  background: var(--gradient-primary);
-  -webkit-background-clip: text;
-  -webkit-text-fill-color: transparent;
-  background-clip: text;
-  margin-bottom: var(--spacing-lg);
-}
-
-.empty-instances h4 {
-  margin: var(--spacing-lg) 0 var(--spacing-md);
-  color: var(--text-primary);
-  font-weight: 700;
-  font-size: 1.3rem;
-}
-
-@media (max-width: 768px) {
-  .overview-grid {
-    grid-template-columns: 1fr;
-  }
-  
-  .gpu-header,
-  .instance-header {
-    flex-direction: column;
-    align-items: flex-start;
-    gap: 0.5rem;
-  }
-  
-  .metric-bar {
-    flex-direction: column;
-    align-items: flex-start;
-    margin-left: 0;
-    margin-top: 0.5rem;
-  }
-  
-  .metric-text {
-    text-align: left;
-    min-width: auto;
-  }
-}
-
-.nvlink-info {
-  display: flex;
-  flex-direction: column;
-  gap: 0.25rem;
-  font-size: 0.875rem;
-}
-
-.nvlink-version {
-  font-weight: 700;
-  color: var(--accent-cyan);
-}
-
-.nvlink-bandwidth {
-  color: var(--text-secondary);
-  font-weight: 500;
-}
-
-.nvlink-connections {
-  color: var(--text-secondary);
-  font-size: 0.8rem;
-  font-weight: 500;
-}
-
-.connection-info {
-  display: flex;
-  align-items: center;
-}
-
-.live-indicator,
-.connection-status {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-xs);
-  font-size: 0.875rem;
-  color: var(--text-secondary);
-  font-weight: 500;
-}
-
-.live-indicator i {
-  animation: pulse 2s infinite;
-}
-
-.connection-status {
-  color: #ef4444;
-}
-
-@keyframes pulse {
-  0% { opacity: 1; }
-  50% { opacity: 0.5; }
-  100% { opacity: 1; }
-}
-</style>
diff --git a/frontend/vite.config.js b/frontend/vite.config.js
index 7292d57..037bc25 100644
--- a/frontend/vite.config.js
+++ b/frontend/vite.config.js
@@ -1,40 +1,47 @@
-import { defineConfig } from 'vite'
-import vue from '@vitejs/plugin-vue'
-import { resolve, dirname } from 'path'
-import { fileURLToPath } from 'url'
-
-const __filename = fileURLToPath(import.meta.url)
-const __dirname = dirname(__filename)
-
-export default defineConfig({
-  plugins: [vue()],
-  root: resolve(__dirname, '.'),
-  resolve: {
-    alias: {
-      '@': resolve(__dirname, 'src'),
-    },
-  },
-  server: {
-    proxy: {
-      '/api': {
-        target: 'http://localhost:8080',
-        changeOrigin: true,
-      },
-      '/ws': {
-        target: 'ws://localhost:8080',
-        ws: true,
-      },
-    },
-  },
-  build: {
-    outDir: 'dist',
-    assetsDir: 'assets',
-    rollupOptions: {
-      output: {
-        entryFileNames: `assets/[name]-${Date.now()}.js`,
-        chunkFileNames: `assets/[name]-${Date.now()}.js`,
-        assetFileNames: `assets/[name]-${Date.now()}.[ext]`
-      }
-    }
-  },
-})
+import { defineConfig } from 'vite'
+import vue from '@vitejs/plugin-vue'
+import { resolve, dirname } from 'path'
+import { fileURLToPath } from 'url'
+import { readFileSync } from 'fs'
+
+const __filename = fileURLToPath(import.meta.url)
+const __dirname = dirname(__filename)
+const pkg = JSON.parse(readFileSync(resolve(__dirname, '../package.json'), 'utf-8'))
+
+export default defineConfig({
+  plugins: [vue()],
+  root: resolve(__dirname, '.'),
+  define: {
+    __APP_VERSION__: JSON.stringify(pkg.version || '0.0.0'),
+  },
+  resolve: {
+    alias: {
+      '@': resolve(__dirname, 'src'),
+    },
+  },
+  server: {
+    port: 5173,
+    strictPort: false, // use next port if 5173 in use
+    host: true,        // listen on 0.0.0.0 so reachable from host (e.g. WSL → Windows browser)
+    watch: {
+      usePolling: true,
+    },
+    proxy: {
+      '/api': {
+        target: 'http://localhost:8081',
+        changeOrigin: true,
+      },
+    },
+  },
+  build: {
+    outDir: 'dist',
+    assetsDir: 'assets',
+    rollupOptions: {
+      output: {
+        entryFileNames: `assets/[name]-${Date.now()}.js`,
+        chunkFileNames: `assets/[name]-${Date.now()}.js`,
+        assetFileNames: `assets/[name]-${Date.now()}.[ext]`
+      }
+    }
+  },
+})
diff --git a/migrate_db.py b/migrate_db.py
deleted file mode 100644
index 6b6eb74..0000000
--- a/migrate_db.py
+++ /dev/null
@@ -1,404 +0,0 @@
-#!/usr/bin/env python3
-"""
-Unified database migration and reset utility for llama-cpp-studio
-Handles all database migrations and provides a reset option
-"""
-
-import sqlite3
-import os
-import shutil
-import argparse
-from datetime import datetime
-from pathlib import Path
-import re
-
-
-def migrate_base_model_name(db_path: str):
-    """Add base_model_name column and populate it"""
-    print("📝 Migrating base_model_name column...")
-
-    conn = sqlite3.connect(db_path)
-    cursor = conn.cursor()
-
-    try:
-        # Check if base_model_name column exists
-        cursor.execute("PRAGMA table_info(models)")
-        columns = [col[1] for col in cursor.fetchall()]
-
-        if "base_model_name" not in columns:
-            print("  - Adding base_model_name column...")
-            cursor.execute("ALTER TABLE models ADD COLUMN base_model_name TEXT")
-            print("  ✓ Added base_model_name column")
-        else:
-            print("  ✓ base_model_name column already exists")
-
-        # Populate base_model_name for all existing models
-        cursor.execute(
-            "SELECT id, name FROM models WHERE base_model_name IS NULL OR base_model_name = ''"
-        )
-        models = cursor.fetchall()
-
-        if models:
-            print(f"  - Populating base_model_name for {len(models)} models...")
-            for model_id, name in models:
-                base_name = (
-                    name.replace(".gguf", "").split("-")[-1].split("_")[0]
-                )  # Simplified extraction
-                cursor.execute(
-                    "UPDATE models SET base_model_name = ? WHERE id = ?",
-                    (base_name, model_id),
-                )
-            print(f"  ✓ Populated {len(models)} models")
-        else:
-            print("  ✓ All models already have base_model_name")
-
-        conn.commit()
-    except Exception as e:
-        print(f"  ✗ Error: {e}")
-        conn.rollback()
-    finally:
-        try:
-            cursor.execute("PRAGMA foreign_keys=on")
-        except Exception:
-            pass
-        conn.close()
-
-
-def migrate_running_instances(db_path: str):
-    """Add proxy_model_name and runtime_type columns to running_instances"""
-    print("📝 Migrating running_instances table...")
-
-    conn = sqlite3.connect(db_path)
-    cursor = conn.cursor()
-
-    try:
-        cursor.execute("PRAGMA table_info(running_instances)")
-        columns = {col[1] for col in cursor.fetchall()}
-
-        if "proxy_model_name" not in columns:
-            print("  - Adding proxy_model_name column...")
-            cursor.execute(
-                "ALTER TABLE running_instances ADD COLUMN proxy_model_name TEXT"
-            )
-            print("  ✓ Added proxy_model_name column")
-        else:
-            print("  ✓ proxy_model_name column already exists")
-
-        if "runtime_type" not in columns:
-            print("  - Adding runtime_type column...")
-            cursor.execute(
-                "ALTER TABLE running_instances ADD COLUMN runtime_type VARCHAR"
-            )
-            cursor.execute(
-                "UPDATE running_instances SET runtime_type = 'llama_cpp' WHERE runtime_type IS NULL"
-            )
-            print("  ✓ Added runtime_type column")
-        else:
-            print("  ✓ runtime_type column already exists")
-
-        conn.commit()
-    except Exception as e:
-        print(f"  ✗ Error: {e}")
-        conn.rollback()
-    finally:
-        conn.close()
-
-
-def cleanup_legacy_running_instances(db_path: str):
-    """Remove deprecated columns from running_instances table"""
-    print("🧹 Cleaning legacy running_instances columns...")
-
-    conn = sqlite3.connect(db_path)
-    cursor = conn.cursor()
-
-    try:
-        cursor.execute("PRAGMA table_info(running_instances)")
-        columns = [col[1] for col in cursor.fetchall()]
-
-        if "process_id" not in columns and "port" not in columns:
-            print("  ✓ No legacy columns found")
-            return
-
-        print("  - Dropping deprecated process_id/port columns...")
-        cursor.execute("PRAGMA foreign_keys=off")
-        cursor.execute(
-            """
-            CREATE TABLE running_instances_new (
-                id INTEGER PRIMARY KEY,
-                model_id INTEGER,
-                llama_version TEXT,
-                proxy_model_name TEXT,
-                started_at DATETIME,
-                config TEXT
-            )
-        """
-        )
-        cursor.execute(
-            """
-            INSERT INTO running_instances_new (id, model_id, llama_version, proxy_model_name, started_at, config)
-            SELECT id, model_id, llama_version, proxy_model_name, started_at, config
-            FROM running_instances
-        """
-        )
-        cursor.execute("DROP TABLE running_instances")
-        cursor.execute("ALTER TABLE running_instances_new RENAME TO running_instances")
-        cursor.execute("PRAGMA foreign_keys=on")
-        conn.commit()
-        print("  ✓ Legacy columns removed")
-    except Exception as e:
-        print(f"  ✗ Error: {e}")
-        conn.rollback()
-    finally:
-        conn.close()
-
-
-def migrate_llama_versions(db_path: str):
-    """Add is_active column to llama_versions"""
-    print("📝 Migrating llama_versions table...")
-
-    conn = sqlite3.connect(db_path)
-    cursor = conn.cursor()
-
-    try:
-        cursor.execute("PRAGMA table_info(llama_versions)")
-        columns = [column[1] for column in cursor.fetchall()]
-
-        if "is_active" not in columns:
-            print("  - Adding is_active column...")
-            cursor.execute(
-                "ALTER TABLE llama_versions ADD COLUMN is_active BOOLEAN DEFAULT 0"
-            )
-
-            # Set first version as active if none are active
-            cursor.execute("SELECT COUNT(*) FROM llama_versions WHERE is_active = 1")
-            active_count = cursor.fetchone()[0]
-
-            if active_count == 0:
-                cursor.execute(
-                    "UPDATE llama_versions SET is_active = 1 WHERE id = (SELECT MIN(id) FROM llama_versions)"
-                )
-                print("  ✓ Set first llama-cpp version as active")
-
-            print("  ✓ Added is_active column")
-        else:
-            print("  ✓ is_active column already exists")
-
-        conn.commit()
-    except Exception as e:
-        print(f"  ✗ Error: {e}")
-        conn.rollback()
-    finally:
-        conn.close()
-
-
-def migrate_build_config(db_path: str):
-    """Add build_config column to llama_versions"""
-    print("📝 Migrating build_config column...")
-
-    conn = sqlite3.connect(db_path)
-    cursor = conn.cursor()
-
-    try:
-        cursor.execute("PRAGMA table_info(llama_versions)")
-        columns = [column[1] for column in cursor.fetchall()]
-
-        if "build_config" not in columns:
-            print("  - Adding build_config column...")
-            cursor.execute("ALTER TABLE llama_versions ADD COLUMN build_config TEXT")
-            print("  ✓ Added build_config column")
-        else:
-            print("  ✓ build_config column already exists")
-
-        conn.commit()
-    except Exception as e:
-        print(f"  ✗ Error: {e}")
-        conn.rollback()
-    finally:
-        conn.close()
-
-
-def migrate_safetensors_models(db_path: str):
-    """
-    Merge per-file safetensors Model rows into a single logical model per Hugging Face repo.
-
-    Older versions stored one Model row per .safetensors shard. The new design keeps a single
-    logical Model per huggingface_id and tracks shards in the safetensors manifest.
-    """
-    print("📝 Migrating safetensors models to single logical entries...")
-
-    conn = sqlite3.connect(db_path)
-    cursor = conn.cursor()
-
-    manifest_path = Path("data") / "models" / "safetensors" / "manifest.json"
-    manifest_data = []
-
-    try:
-        if manifest_path.exists():
-            with manifest_path.open("r", encoding="utf-8") as f:
-                loaded = f.read().strip()
-                if loaded:
-                    import json
-
-                    manifest_data = json.loads(loaded)
-        else:
-            print("  - No safetensors manifest found; skipping manifest migration")
-
-        # Group existing safetensors models by huggingface_id
-        cursor.execute(
-            """
-            SELECT id, huggingface_id, name, file_path, file_size, model_format
-            FROM models
-            WHERE model_format = 'safetensors'
-            """
-        )
-        rows = cursor.fetchall()
-        if not rows:
-            print("  ✓ No safetensors models found in database")
-        else:
-            by_repo = {}
-            for row in rows:
-                model_id, repo_id, name, file_path, file_size, model_format = row
-                if not repo_id:
-                    continue
-                by_repo.setdefault(repo_id, []).append(
-                    {
-                        "id": model_id,
-                        "huggingface_id": repo_id,
-                        "name": name,
-                        "file_path": file_path,
-                        "file_size": file_size,
-                    }
-                )
-
-            import json
-
-            # Group manifest entries by huggingface_id for convenience
-            manifest_by_repo = {}
-            for entry in manifest_data or []:
-                repo_id = entry.get("huggingface_id")
-                if not repo_id:
-                    continue
-                manifest_by_repo.setdefault(repo_id, []).append(entry)
-
-            for repo_id, models in by_repo.items():
-                if not models:
-                    continue
-                # Choose canonical model (smallest id) as the logical model
-                canonical = sorted(models, key=lambda m: m["id"])[0]
-                canonical_id = canonical["id"]
-                print(
-                    f"  - Repo {repo_id}: canonical model id {canonical_id}, merging {len(models)} rows"
-                )
-
-                # Update manifest entries for this repo to point to canonical_id
-                for entry in manifest_by_repo.get(repo_id, []):
-                    entry["model_id"] = canonical_id
-
-                # Recompute aggregate file_size from manifest entries for this repo
-                total_size = 0
-                for entry in manifest_by_repo.get(repo_id, []):
-                    size = entry.get("file_size") or 0
-                    try:
-                        total_size += int(size)
-                    except Exception:
-                        continue
-
-                if total_size <= 0:
-                    # Fallback: sum sizes from DB rows
-                    total_size = sum(int(m.get("file_size") or 0) for m in models)
-
-                cursor.execute(
-                    "UPDATE models SET file_size = ? WHERE id = ?",
-                    (total_size, canonical_id),
-                )
-
-                # Delete all non-canonical rows for this repo
-                stale_ids = [m["id"] for m in models if m["id"] != canonical_id]
-                if stale_ids:
-                    cursor.execute(
-                        f"DELETE FROM models WHERE id IN ({','.join('?' for _ in stale_ids)})",
-                        stale_ids,
-                    )
-
-            # Persist updated manifest if we loaded one
-            if manifest_path.exists():
-                with manifest_path.open("w", encoding="utf-8") as f:
-                    json.dump(manifest_data or [], f, indent=2)
-
-        conn.commit()
-        print("  ✓ Safetensors models migration completed")
-    except Exception as e:
-        print(f"  ✗ Error during safetensors models migration: {e}")
-        conn.rollback()
-    finally:
-        conn.close()
-
-
-def reset_database(db_path: str):
-    """Reset database by backing up and removing old one"""
-    if not os.path.exists(db_path):
-        print("No existing database found.")
-        return
-
-    # Create backup
-    backup_name = f"data/db.sqlite.backup.{datetime.now().strftime('%Y%m%d_%H%M%S')}"
-    print(f"📦 Backing up database to {backup_name}...")
-    shutil.copy2(db_path, backup_name)
-    print("  ✓ Backup created")
-
-    # Remove old database
-    print("🗑️  Removing old database...")
-    os.remove(db_path)
-    print("  ✓ Old database removed")
-
-    print("\n✅ Database reset complete!")
-    print(
-        "The new database will be created automatically when you start the application."
-    )
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description="Database migration utility for llama-cpp-studio"
-    )
-    parser.add_argument(
-        "action",
-        choices=["migrate", "reset"],
-        help="Action to perform: migrate (apply migrations) or reset (reset database)",
-    )
-
-    args = parser.parse_args()
-
-    db_path = "data/db.sqlite"
-
-    if args.action == "migrate":
-        if not os.path.exists(db_path):
-            print("❌ Database file not found at data/db.sqlite")
-            print("Run 'migrate_db.py migrate' after starting the application once.")
-            return
-
-        print("🚀 Starting database migrations...\n")
-
-        migrate_base_model_name(db_path)
-        migrate_running_instances(db_path)
-        cleanup_legacy_running_instances(db_path)
-        migrate_llama_versions(db_path)
-        migrate_build_config(db_path)
-        migrate_safetensors_models(db_path)
-
-        print("\n✅ All migrations completed successfully!")
-
-    elif args.action == "reset":
-        print("⚠️  WARNING: This will delete your database and create a backup.")
-        print("Your model files (.gguf) will NOT be deleted.\n")
-
-        response = input("Do you want to proceed? (yes/no): ")
-
-        if response.lower() in ["yes", "y"]:
-            reset_database(db_path)
-        else:
-            print("Database reset cancelled.")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/migrate_gguf_storage.py b/migrate_gguf_storage.py
deleted file mode 100644
index 3fe0dcf..0000000
--- a/migrate_gguf_storage.py
+++ /dev/null
@@ -1,96 +0,0 @@
-import os
-import shutil
-import asyncio
-from typing import Optional
-
-from sqlalchemy import or_
-
-from backend.database import SessionLocal, Model
-from backend.huggingface import create_gguf_manifest_entry
-from backend.routes.models import _apply_hf_defaults_to_model
-
-
-def _safe_repo_name(huggingface_id: Optional[str], fallback: str) -> str:
-    if huggingface_id:
-        safe = huggingface_id.replace("/", "_")
-    else:
-        safe = fallback
-    safe = safe.strip() or fallback
-    return safe
-
-
-async def migrate_gguf_models():
-    session = SessionLocal()
-    moved_count = 0
-    total = 0
-    try:
-        models = (
-            session.query(Model)
-            .filter(or_(Model.model_format.is_(None), Model.model_format == "gguf"))
-            .all()
-        )
-        total = len(models)
-        for model in models:
-            original_path = model.file_path or ""
-            normalized_old_path = (
-                os.path.normpath(original_path.replace("\\", os.sep))
-                if original_path
-                else ""
-            )
-            if not normalized_old_path or not os.path.exists(normalized_old_path):
-                print(f"Skipping model {model.id}: file missing ({original_path})")
-                continue
-
-            huggingface_id = model.huggingface_id or f"model_{model.id}"
-            safe_repo = _safe_repo_name(huggingface_id, f"model_{model.id}")
-            filename = os.path.basename(normalized_old_path)
-            new_dir = os.path.join("data", "models", "gguf", safe_repo)
-            os.makedirs(new_dir, exist_ok=True)
-            new_path = os.path.join(new_dir, filename)
-
-            if os.path.abspath(normalized_old_path) != os.path.abspath(new_path):
-                print(f"Moving {normalized_old_path} -> {new_path}")
-                shutil.move(normalized_old_path, new_path)
-                moved_count += 1
-
-            model.file_path = new_path
-            model.model_format = "gguf"
-            session.commit()
-
-            try:
-                file_size = os.path.getsize(new_path)
-            except OSError:
-                file_size = 0
-
-            manifest_entry = None
-            try:
-                manifest_entry = await create_gguf_manifest_entry(
-                    model.huggingface_id, new_path, file_size, model_id=model.id
-                )
-            except Exception as exc:
-                print(f"Warning: failed to record manifest for {model.id}: {exc}")
-            if manifest_entry:
-                try:
-                    _apply_hf_defaults_to_model(
-                        model, manifest_entry.get("metadata") or {}, session
-                    )
-                except Exception as exc:
-                    print(
-                        f"Warning: failed to apply HF defaults for model {model.id}: {exc}"
-                    )
-    finally:
-        session.close()
-
-    print(f"Processed {total} GGUF models. Moved {moved_count} files.")
-
-
-def remove_legacy_manifest():
-    legacy_manifest = os.path.join("data", "models", "gguf", "manifest.json")
-    if os.path.exists(legacy_manifest):
-        os.remove(legacy_manifest)
-        print("Removed legacy aggregated GGUF manifest.")
-
-
-if __name__ == "__main__":
-    asyncio.run(migrate_gguf_models())
-    remove_legacy_manifest()
diff --git a/package-lock.json b/package-lock.json
index a402380..d1a1f49 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -20,6 +20,7 @@
       "devDependencies": {
         "@types/node": "^20.9.0",
         "@vitejs/plugin-vue": "^4.5.0",
+        "concurrently": "^9.0.0",
         "eslint": "^9.39.2",
         "eslint-plugin-vue": "^10.6.2",
         "prettier": "^3.7.4",
@@ -1616,6 +1617,21 @@
         "url": "https://github.com/chalk/chalk?sponsor=1"
       }
     },
+    "node_modules/cliui": {
+      "version": "8.0.1",
+      "resolved": "https://registry.npmjs.org/cliui/-/cliui-8.0.1.tgz",
+      "integrity": "sha512-BSeNnyus75C4//NQ9gQt1/csTXyo/8Sb+afLAkzAptFuMsod9HFokGNudZpi/oQV73hnVK+sR+5PVRMd+Dr7YQ==",
+      "dev": true,
+      "license": "ISC",
+      "dependencies": {
+        "string-width": "^4.2.0",
+        "strip-ansi": "^6.0.1",
+        "wrap-ansi": "^7.0.0"
+      },
+      "engines": {
+        "node": ">=12"
+      }
+    },
     "node_modules/color-convert": {
       "version": "2.0.1",
       "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
@@ -1662,6 +1678,47 @@
       "dev": true,
       "license": "MIT"
     },
+    "node_modules/concurrently": {
+      "version": "9.2.1",
+      "resolved": "https://registry.npmjs.org/concurrently/-/concurrently-9.2.1.tgz",
+      "integrity": "sha512-fsfrO0MxV64Znoy8/l1vVIjjHa29SZyyqPgQBwhiDcaW8wJc2W3XWVOGx4M3oJBnv/zdUZIIp1gDeS98GzP8Ng==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "chalk": "4.1.2",
+        "rxjs": "7.8.2",
+        "shell-quote": "1.8.3",
+        "supports-color": "8.1.1",
+        "tree-kill": "1.2.2",
+        "yargs": "17.7.2"
+      },
+      "bin": {
+        "conc": "dist/bin/concurrently.js",
+        "concurrently": "dist/bin/concurrently.js"
+      },
+      "engines": {
+        "node": ">=18"
+      },
+      "funding": {
+        "url": "https://github.com/open-cli-tools/concurrently?sponsor=1"
+      }
+    },
+    "node_modules/concurrently/node_modules/supports-color": {
+      "version": "8.1.1",
+      "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-8.1.1.tgz",
+      "integrity": "sha512-MpUEN2OodtUzxvKQl72cUF7RQ5EiHsGvSsVG0ia9c5RbWGL2CI4C7EpPS8UTBIplnlzZiNuV56w+FuNxy3ty2Q==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "has-flag": "^4.0.0"
+      },
+      "engines": {
+        "node": ">=10"
+      },
+      "funding": {
+        "url": "https://github.com/chalk/supports-color?sponsor=1"
+      }
+    },
     "node_modules/cosmiconfig": {
       "version": "9.0.0",
       "resolved": "https://registry.npmjs.org/cosmiconfig/-/cosmiconfig-9.0.0.tgz",
@@ -1931,6 +1988,16 @@
         "@esbuild/win32-x64": "0.21.5"
       }
     },
+    "node_modules/escalade": {
+      "version": "3.2.0",
+      "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.2.0.tgz",
+      "integrity": "sha512-WUj2qlxaQtO4g6Pq5c29GTcWGDyd8itL8zTlipgECz3JesAiiOKotd8JU6otB3PACgG6xkJUyVhboMS+bje/jA==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=6"
+      }
+    },
     "node_modules/escape-string-regexp": {
       "version": "4.0.0",
       "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-4.0.0.tgz",
@@ -2348,6 +2415,16 @@
         "url": "https://github.com/sponsors/ljharb"
       }
     },
+    "node_modules/get-caller-file": {
+      "version": "2.0.5",
+      "resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-2.0.5.tgz",
+      "integrity": "sha512-DyFP3BM/3YHTQOCUL/w0OZHR0lpKeGrxotcHWcqNEdnltqFwXVfhEBQ94eIo34AfQpo0rGki4cyIiftY06h2Fg==",
+      "dev": true,
+      "license": "ISC",
+      "engines": {
+        "node": "6.* || 8.* || >= 10.*"
+      }
+    },
     "node_modules/get-intrinsic": {
       "version": "1.3.0",
       "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.3.0.tgz",
@@ -3300,6 +3377,16 @@
       ],
       "license": "MIT"
     },
+    "node_modules/require-directory": {
+      "version": "2.1.1",
+      "resolved": "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz",
+      "integrity": "sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=0.10.0"
+      }
+    },
     "node_modules/require-from-string": {
       "version": "2.0.2",
       "resolved": "https://registry.npmjs.org/require-from-string/-/require-from-string-2.0.2.tgz",
@@ -3397,6 +3484,16 @@
         "queue-microtask": "^1.2.2"
       }
     },
+    "node_modules/rxjs": {
+      "version": "7.8.2",
+      "resolved": "https://registry.npmjs.org/rxjs/-/rxjs-7.8.2.tgz",
+      "integrity": "sha512-dhKf903U/PQZY6boNNtAGdWbG85WAbjT/1xYoZIC7FAY0yWapOBQVsVrDl58W86//e1VpMNBtRV4MaXfdMySFA==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "dependencies": {
+        "tslib": "^2.1.0"
+      }
+    },
     "node_modules/semver": {
       "version": "7.7.3",
       "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.3.tgz",
@@ -3433,6 +3530,19 @@
         "node": ">=8"
       }
     },
+    "node_modules/shell-quote": {
+      "version": "1.8.3",
+      "resolved": "https://registry.npmjs.org/shell-quote/-/shell-quote-1.8.3.tgz",
+      "integrity": "sha512-ObmnIF4hXNg1BqhnHmgbDETF8dLPCggZWBjkQfhZpbszZnYur5DUljTcCHii5LC3J5E0yeO/1LIMyH+UvHQgyw==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">= 0.4"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/ljharb"
+      }
+    },
     "node_modules/signal-exit": {
       "version": "4.1.0",
       "resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-4.1.0.tgz",
@@ -3776,6 +3886,23 @@
         "node": ">=8.0"
       }
     },
+    "node_modules/tree-kill": {
+      "version": "1.2.2",
+      "resolved": "https://registry.npmjs.org/tree-kill/-/tree-kill-1.2.2.tgz",
+      "integrity": "sha512-L0Orpi8qGpRG//Nd+H90vFB+3iHnue1zSSGmNOOCh1GLJ7rUKVwV2HvijphGQS2UmhUZewS9VgvxYIdgr+fG1A==",
+      "dev": true,
+      "license": "MIT",
+      "bin": {
+        "tree-kill": "cli.js"
+      }
+    },
+    "node_modules/tslib": {
+      "version": "2.8.1",
+      "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz",
+      "integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==",
+      "dev": true,
+      "license": "0BSD"
+    },
     "node_modules/type-check": {
       "version": "0.4.0",
       "resolved": "https://registry.npmjs.org/type-check/-/type-check-0.4.0.tgz",
@@ -4022,6 +4149,24 @@
         "node": ">=0.10.0"
       }
     },
+    "node_modules/wrap-ansi": {
+      "version": "7.0.0",
+      "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz",
+      "integrity": "sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "ansi-styles": "^4.0.0",
+        "string-width": "^4.1.0",
+        "strip-ansi": "^6.0.0"
+      },
+      "engines": {
+        "node": ">=10"
+      },
+      "funding": {
+        "url": "https://github.com/chalk/wrap-ansi?sponsor=1"
+      }
+    },
     "node_modules/write-file-atomic": {
       "version": "5.0.1",
       "resolved": "https://registry.npmjs.org/write-file-atomic/-/write-file-atomic-5.0.1.tgz",
@@ -4046,6 +4191,45 @@
         "node": ">=12"
       }
     },
+    "node_modules/y18n": {
+      "version": "5.0.8",
+      "resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz",
+      "integrity": "sha512-0pfFzegeDWJHJIAmTLRP2DwHjdF5s7jo9tuztdQxAhINCdvS+3nGINqPd00AphqJR/0LhANUS6/+7SCb98YOfA==",
+      "dev": true,
+      "license": "ISC",
+      "engines": {
+        "node": ">=10"
+      }
+    },
+    "node_modules/yargs": {
+      "version": "17.7.2",
+      "resolved": "https://registry.npmjs.org/yargs/-/yargs-17.7.2.tgz",
+      "integrity": "sha512-7dSzzRQ++CKnNI/krKnYRV7JKKPUXMEh61soaHKg9mrWEhzFWhFnxPxGl+69cD1Ou63C13NUPCnmIcrvqCuM6w==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "cliui": "^8.0.1",
+        "escalade": "^3.1.1",
+        "get-caller-file": "^2.0.5",
+        "require-directory": "^2.1.1",
+        "string-width": "^4.2.3",
+        "y18n": "^5.0.5",
+        "yargs-parser": "^21.1.1"
+      },
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/yargs-parser": {
+      "version": "21.1.1",
+      "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-21.1.1.tgz",
+      "integrity": "sha512-tVpsJW7DdjecAiFpbIB1e3qxIQsE6NoPc5/eTdrbbIC4h0LVsWhnoa3g+m2HclBIujHzsxZ4VJVA+GUuc2/LBw==",
+      "dev": true,
+      "license": "ISC",
+      "engines": {
+        "node": ">=12"
+      }
+    },
     "node_modules/yocto-queue": {
       "version": "0.1.0",
       "resolved": "https://registry.npmjs.org/yocto-queue/-/yocto-queue-0.1.0.tgz",
diff --git a/package.json b/package.json
index b6ef3a9..4d092de 100644
--- a/package.json
+++ b/package.json
@@ -3,9 +3,13 @@
   "version": "1.0.0",
   "description": "llama.cpp Studio - Professional AI Model Management Platform",
   "scripts": {
-    "dev": "vite",
-    "build": "vite build",
-    "preview": "vite preview"
+    "dev": "cd frontend && vite",
+    "dev:frontend": "cd frontend && vite",
+    "dev:backend": "WATCHFILES_FORCE_POLLING=true python -m uvicorn main:app --host 0.0.0.0 --port 8081 --app-dir backend --reload --reload-dir backend",
+    "dev:all": "concurrently -n backend,frontend -c blue,green \"npm run dev:backend\" \"npm run dev:frontend\"",
+    "kill-ports": "powershell.exe -Command \"Get-NetTCPConnection -LocalPort 5173,8080 -ErrorAction SilentlyContinue | ForEach-Object { Stop-Process -Id $_.OwningProcess -Force -ErrorAction SilentlyContinue }\"",
+    "build": "cd frontend && vite build",
+    "preview": "cd frontend && vite preview"
   },
   "dependencies": {
     "@vueuse/core": "^10.5.0",
@@ -14,10 +18,10 @@
     "primeicons": "^6.0.0",
     "primevue": "^3.45.0",
     "vue": "^3.4.0",
-    "vue-router": "^4.2.0",
-    "vue3-toastify": "^0.1.0"
+    "vue-router": "^4.2.0"
   },
   "devDependencies": {
+    "concurrently": "^9.0.0",
     "@types/node": "^20.9.0",
     "@vitejs/plugin-vue": "^4.5.0",
     "eslint": "^9.39.2",
diff --git a/requirements.txt b/requirements.txt
index 016ff8d..c988650 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,18 +4,10 @@ huggingface-hub
 hf-transfer
 requests
 aiofiles
-sqlalchemy
-alembic
 pydantic
 python-multipart
-websockets
 psutil
 pyyaml
 nvidia-ml-py
 aiohttp
-httpx
-httpx-sse
-tqdm
-tokenizers
-transformers
-scikit-learn
\ No newline at end of file
+httpx
\ No newline at end of file