From 7edaf6fa34fdffecff3d18c59fd5f7dc63a633b5 Mon Sep 17 00:00:00 2001
From: lapy <vin.lapenta@gmail.com>
Date: Mon, 9 Mar 2026 17:46:59 +0000
Subject: [PATCH] Finish refactoring

---
 Dockerfile                                    |    8 +-
 README.md                                     |  778 +++++-----
 backend/data_store.py                         |   72 +
 backend/gguf_introspection_config.json        |   23 +
 backend/gguf_reader.py                        |   49 +-
 backend/huggingface.py                        |   65 +-
 backend/llama_manager.py                      |   18 +-
 backend/llama_swap_client.py                  |   20 +
 backend/llama_swap_config.py                  |  194 ++-
 backend/llama_swap_manager.py                 |    4 +-
 backend/lmdeploy_installer.py                 |  416 -----
 backend/lmdeploy_manager.py                   | 1274 ++++++----------
 backend/main.py                               |   39 +-
 backend/model_introspection.py                |  555 +++++++
 backend/param_registry.py                     |    2 +
 backend/routes/llama_version_manager.py       |  159 --
 backend/routes/llama_versions.py              |  427 ++++--
 backend/routes/lmdeploy.py                    |   82 -
 backend/routes/lmdeploy_versions.py           |   65 +
 backend/routes/models.py                      | 1344 +++++------------
 backend/routes/status.py                      |   25 +-
 backend/tests/test_lmdeploy_installer.py      |   10 +-
 backend/tests/test_model_introspection.py     |   52 +
 docker-compose.cuda.yml                       |    4 +-
 frontend/src/App.vue                          |   19 +-
 frontend/src/components/ModelRow.vue          |   90 ++
 frontend/src/components/ThemeToggle.vue       |   29 +-
 .../src/components/common/ProgressTracker.vue |   82 +-
 frontend/src/components/layout/AppHeader.vue  |   84 +-
 .../src/components/system/VersionTable.vue    |    5 +-
 frontend/src/stores/engines.js                |   31 +-
 frontend/src/stores/models.js                 |   30 +-
 frontend/src/stores/progress.js               |  220 ++-
 frontend/src/styles/_components.css           |   25 +-
 frontend/src/views/EnginesView.vue            |  853 +++++++----
 frontend/src/views/ModelConfig.vue            |  161 +-
 frontend/src/views/ModelLibrary.vue           |  252 ++--
 frontend/src/views/ModelSearch.vue            |  496 +++++-
 package-lock.json                             |   51 +-
 package.json                                  |    7 +-
 40 files changed, 4331 insertions(+), 3789 deletions(-)
 create mode 100644 backend/gguf_introspection_config.json
 delete mode 100644 backend/lmdeploy_installer.py
 create mode 100644 backend/model_introspection.py
 delete mode 100644 backend/routes/llama_version_manager.py
 delete mode 100644 backend/routes/lmdeploy.py
 create mode 100644 backend/routes/lmdeploy_versions.py
 create mode 100644 backend/tests/test_model_introspection.py
 create mode 100644 frontend/src/components/ModelRow.vue

diff --git a/Dockerfile b/Dockerfile
index 0b8bd9b..dcc4173 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -81,8 +81,8 @@ ENV DEBIAN_FRONTEND=noninteractive \
     CUDA_VISIBLE_DEVICES=all \
     NVIDIA_VISIBLE_DEVICES=all \
     NVIDIA_DRIVER_CAPABILITIES=compute,utility \
-    HF_HOME=/app/data/temp/.cache/huggingface \
-    HUGGINGFACE_HUB_CACHE=/app/data/temp/.cache/huggingface/hub \
+    HF_HOME=/app/data/hf-cache \
+    HUGGINGFACE_HUB_CACHE=/app/data/hf-cache/hub \
     VENV_PATH=/opt/venv \
     PYTHONPATH=/app \
     PATH="/app/data/cuda/current/bin:${PATH}" \
@@ -133,7 +133,7 @@ RUN curl -fsSL "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERS
     && cmake --version
 
 # Install llama-swap binary
-ARG LLAMA_SWAP_VERSION=179
+ARG LLAMA_SWAP_VERSION=197
 RUN curl -fsSL "https://github.com/mostlygeek/llama-swap/releases/download/v${LLAMA_SWAP_VERSION}/llama-swap_${LLAMA_SWAP_VERSION}_linux_amd64.tar.gz" -o /tmp/llama-swap.tar.gz && \
     tar -xzf /tmp/llama-swap.tar.gz -C /tmp && \
     mv /tmp/llama-swap /usr/local/bin/llama-swap && \
@@ -168,7 +168,7 @@ RUN ln -sf /usr/bin/python3 /usr/bin/python
 
 # Create non-root user and data directory structure
 RUN useradd -m -s /bin/bash appuser && \
-    mkdir -p /app/data/models /app/data/config /app/data/configs /app/data/logs /app/data/llama-cpp /app/data/temp/.cache/huggingface/hub && \
+    mkdir -p /app/data/models /app/data/config /app/data/configs /app/data/logs /app/data/llama-cpp /app/data/hf-cache/hub && \
     chown -R appuser:appuser /app && \
     # Ensure entrypoint script is accessible to appuser
     chmod 755 /usr/local/bin/docker-entrypoint.sh
diff --git a/README.md b/README.md
index 89567a4..cb1ecef 100644
--- a/README.md
+++ b/README.md
@@ -1,525 +1,499 @@
-# llama.cpp Studio
+# llama.cpp Studio
 
-A professional AI model management platform for llama.cpp models and versions, designed for modern AI workflows with comprehensive GPU support (NVIDIA CUDA, AMD Vulkan/ROCm, Metal, OpenBLAS).
+llama.cpp Studio is a web-based control plane for running and managing local LLMs on top of `llama.cpp`, `ik_llama.cpp`, and `LMDeploy` – all served through a single OpenAI-compatible endpoint powered by `llama-swap`.
+
+It is designed for **power users running models on a single machine or small server** (Docker or bare metal) with strong support for:
+
+- **CPU-only** inference (OpenBLAS)
+- **NVIDIA CUDA GPUs** (via the NVIDIA Container Toolkit)
+
+There is **no built-in support for Vulkan/ROCm/Metal backends** and **no Smart Auto feature** – configuration is explicit and predictable.
+
+### Key capabilities
+
+- **HuggingFace search (GGUF + safetensors)**: Search the Hub, inspect metadata, and plan downloads by quantization or safetensors bundle.
+- **Model library with multi-quantization support**: Manage multiple quantizations per base model in a grouped view with start/stop/delete actions.
+- **Per-model runtime configuration**: Configure engine (llama.cpp / ik_llama / LMDeploy), context length, GPU layers, batch sizes, and advanced flags.
+- **Unified multi-model serving**: Serve many GGUF quantizations at once via `llama-swap` on port `2000`.
+- **System & progress monitoring**: Live system stats, GPU information, and unified progress for downloads, builds, and CUDA/LMDeploy installs via SSE.
+
+---
+
+## Core concepts & architecture
+
+llama.cpp Studio is a single application composed of a Vue 3 SPA frontend and a FastAPI backend. The backend persists configuration to YAML files under `/app/data` and orchestrates runtimes through `llama-swap`.
+
+### High-level architecture
+
+```mermaid
+flowchart LR
+  userClient["User Client"] --> browserUI["Web UI (Vue 3 SPA)"]
+  browserUI --> fastapiBackend["FastAPI Backend"]
+  fastapiBackend --> dataStore["YAML DataStore (models, engines, settings)"]
+  fastapiBackend --> progressSSE["SSE /api/events"]
+  fastapiBackend --> llamaSwap["llama-swap Proxy :2000"]
+  llamaSwap --> llamaCpp["llama.cpp / ik_llama runtimes"]
+  llamaSwap --> lmdeploy["LMDeploy TurboMind (safetensors)"]
+```
+
+### Frontend (Vue 3 SPA)
+
+- `App.vue` provides the global shell:
+  - Header with llama-swap status and theme toggle
+  - Navigation between the main sections
+  - Central `<router-view>` for page content
+  - Global ConfirmDialog/Toast and SSE connection
+- Main views:
+  - **Model Library** (`/models`) – installed models grouped by base model and quantization.
+  - **Model Search** (`/search`) – HuggingFace search & download (GGUF and safetensors).
+  - **Model Config** (`/models/:id/config`) – per-quantization configuration.
+  - **Engines & System** (`/engines`) – llama.cpp / ik_llama builds, CUDA and LMDeploy status, system & GPU info.
+- State management:
+  - `useModelStore` – models, search, downloads, metadata, start/stop/config operations.
+  - `useEnginesStore` – engine versions, CUDA installer, system and GPU info.
+  - `useProgressStore` – EventSource connection to `/api/events`, normalized tasks, logs, and notifications.
+
+### Backend (FastAPI)
+
+- `backend/main.py`:
+  - Ensures the `/app/data` (or local `./data`) directory structure exists and is writable.
+  - Initializes the YAML-backed `DataStore` for models, engine versions, and settings.
+  - Loads `HUGGINGFACE_API_KEY` from the environment if present.
+  - Starts and manages the `llama-swap` proxy on port `2000` when a valid llama.cpp/ik_llama binary is active.
+  - Registers all known models with `llama-swap` at startup based on logical metadata (not hard-coded paths).
+  - Serves the built Vue app from `frontend/dist` and exposes a catch-all SPA route.
+- Key route groups:
+  - `/api/models` – model library, HuggingFace search, GGUF/safetensors downloads, configuration, start/stop.
+  - `/api/llama-versions` – llama.cpp/ik_llama build settings, builds, version listing, activation, deletion, CUDA installer.
+  - `/api/lmdeploy` – LMDeploy install/remove.
+  - `/api/status` & `/api/gpu-info` – system and GPU metrics plus `llama-swap` proxy health.
+  - `/api/events` – Server-Sent Events stream for unified progress and notifications.
+
+### Runtimes and `llama-swap`
+
+- `llama.cpp` and `ik_llama.cpp` versions are:
+  - Built from source under `/app/data/llama-cpp/...`
+  - Recorded in the DataStore with metadata and active version selection
+  - Exposed to the frontend via `/api/llama-versions`
+- `llama-swap`:
+  - Is downloaded and installed into the runtime image at build time.
+  - Runs a single proxy process on port `2000` and multiplexes multiple model backends.
+  - Reads its configuration from files generated by the backend based on stored models and the active engine.
+- LMDeploy:
+  - Is installed into `/app/data/lmdeploy/venv` from PyPI or source on demand.
+  - Serves safetensors checkpoints using TurboMind behind `llama-swap`.
+
+---
 
 ## Features
 
-### Model Management
-- **Search & Download**: Search HuggingFace for GGUF models with comprehensive metadata and size information for each quantization
-- **Multi-Quantization Support**: Download and manage multiple quantizations of the same model
-- **Model Library**: Manage downloaded models with start/stop/delete functionality
-- **Smart Configuration**: Auto-generate optimal llama.cpp parameters based on GPU capabilities
-- **VRAM Estimation**: Real-time VRAM usage estimation with warnings for memory constraints
-- **Metadata Extraction**: Rich model information including parameters, architecture, license, tags, and more
-- **Safetensors Runner**: Configure and run safetensors checkpoints via LMDeploy TurboMind with an OpenAI-compatible endpoint on port 2001
-
-### llama.cpp Version Management
-- **Release Installation**: Download and install pre-built binaries from GitHub releases
-- **Source Building**: Build from source with optional patches from GitHub PRs
-- **Custom Build Configuration**: Customize GPU backends (CUDA, Vulkan, Metal, OpenBLAS), build type, and compiler flags
-- **Update Checking**: Check for updates to both releases and source code
-- **Version Management**: Install, update, and delete multiple llama.cpp versions
-- **Build Validation**: Automatic validation of built binaries to ensure they work correctly
-
-### GPU Support
-- **Multi-GPU Support**: Automatic detection and configuration for NVIDIA, AMD, and other GPUs
-- **NVIDIA CUDA**: Full support for CUDA compute capabilities, flash attention, and multi-GPU
-- **AMD GPU Support**: Vulkan and ROCm support for AMD GPUs
-- **Apple Metal**: Support for Apple Silicon GPUs
-- **OpenBLAS**: CPU acceleration with optimized BLAS routines
-- **VRAM Monitoring**: Real-time GPU memory usage and temperature monitoring
-- **NVLink Detection**: Automatic detection of NVLink connections and topology analysis
-
-### Multi-Model Serving
-- **Concurrent Execution**: Run multiple models simultaneously via llama-swap proxy
-- **OpenAI-Compatible API**: Standard API format for easy integration
-- **Port 2000**: All models served through a single unified endpoint
-- **Automatic Lifecycle Management**: Seamless starting/stopping of models
-
-### Web Interface
-- **Modern UI**: Vue.js 3 with PrimeVue components
-- **Real-time Updates**: SSE-based progress tracking and system monitoring
-- **Responsive Design**: Works on desktop and mobile devices
-- **System Status**: CPU, memory, disk, and GPU monitoring
-- **LMDeploy Installer**: Dedicated UI to install/remove LMDeploy at runtime with live logs
-- **Dark Mode**: Built-in theme support
-
-## Quick Start
-
-### Using Docker Compose
-
-1. Clone the repository:
+### Model management
+
+- **Unified model library**
+  - Models are grouped by HuggingFace repo (e.g. `meta-llama/Meta-Llama-3-8B-Instruct`).
+  - Each group contains one or more quantizations (GGUF) and optional safetensors bundles.
+  - Per-quantization rows show size, download timestamp, runtime type, and running state.
+
+- **HuggingFace search (GGUF + safetensors)**
+  - Search by model name or keyword with a choice of:
+    - `gguf` – quantized GGUF files and bundles.
+    - `safetensors` – safetensors checkpoints.
+  - See metadata (file sizes, quantization names, tags) before you download.
+
+- **Downloads & bundles**
+  - GGUF:
+    - Download individual quantizations or full bundles.
+    - Optionally attach `.mmproj` projector files for multimodal models.
+  - Safetensors:
+    - Download full safetensors bundles.
+  - All downloads are tracked as long-running tasks via SSE and shown in the global progress panel.
+
+### Engine & version management
+
+- **llama.cpp and ik_llama.cpp**
+  - Multiple versions per engine are supported.
+  - Builds are always **from source**, configured using stored build settings (CUDA flags, flash attention, CPU variants, etc.).
+  - Versions can be activated/deactivated; activation updates `llama-swap` configuration automatically.
+  - Old versions can be removed to reclaim disk space.
+
+- **CUDA toolkit management (NVIDIA only)**
+  - An optional in-container CUDA installer can install or remove the CUDA Toolkit (plus optional cuDNN/TensorRT) under `/app/data/cuda`.
+  - Progress and logs for installs/uninstalls are surfaced in the Engines/System view and via SSE events.
+  - Only **NVIDIA CUDA + CPU** are documented and supported; other GPU backends are not part of this project’s supported surface.
+
+- **LMDeploy integration**
+  - Install LMDeploy from **PyPI** or from **source** into a dedicated virtual environment under `/app/data/lmdeploy/venv`.
+  - The backend exposes endpoints to:
+    - Check for the latest LMDeploy version.
+    - Install/update/remove LMDeploy.
+  - Once installed, safetensors models can be launched via LMDeploy TurboMind and are exposed through the same `llama-swap` endpoint.
+
+### Multi-model serving
+
+- **Single OpenAI-compatible endpoint**
+  - All models are served via `llama-swap` on `http://<host>:2000`.
+  - The proxy implements standard OpenAI-style `/v1/chat/completions` and `/v1/models`.
+
+- **Concurrent GGUF quantizations**
+  - Multiple GGUF quantizations can be active at once behind `llama-swap`.
+  - The System Status view shows running models and basic health information.
+
+- **Safetensors via LMDeploy**
+  - One LMDeploy runtime is supported at a time for safetensors models.
+  - It is exposed alongside GGUF models through the same `llama-swap` API.
+
+### Monitoring & progress
+
+- **System & GPU status**
+  - `/api/status` reports CPU, memory, disk utilization, running model instances, and `llama-swap` proxy health.
+  - `/api/gpu-info` reports detected GPUs and their capabilities (focused on NVIDIA/CUDA).
+
+- **Unified progress tracking**
+  - `/api/events` streams:
+    - Download progress and completion events.
+    - llama.cpp/ik_llama source build progress.
+    - CUDA toolkit installation/uninstallation status and logs.
+    - LMDeploy installation status and logs.
+    - Notifications related to long-running tasks.
+
+---
+
+## Quick start (Docker)
+
+The recommended way to run llama.cpp Studio is via Docker Compose. All examples assume you’ve cloned the repository.
+
+### 1. Clone the repo
+
 ```bash
 git clone <repository-url>
 cd llama-cpp-studio
 ```
 
-2. Start the application:
-```bash
-# CPU-only mode
-docker-compose -f docker-compose.cpu.yml up -d
-
-# GPU mode (NVIDIA CUDA)
-docker-compose -f docker-compose.cuda.yml up -d
+### 2. CPU-focused development (hot reload backend)
 
-# Vulkan/AMD GPU mode
-docker-compose -f docker-compose.vulkan.yml up -d
+Use the CPU compose file (`docker-compose.cpu.yml`) during development. It mounts the backend source and enables reload:
 
-# ROCm mode
-docker-compose -f docker-compose.rocm.yml up -d
+```bash
+docker-compose -f docker-compose.cpu.yml up --build
 ```
 
-3. Access the web interface at `http://localhost:8080`
-
-### Published Container Images
+This will:
 
-Prebuilt images are pushed to GitHub Container Registry whenever the `publish-docker` workflow runs.
+- Expose the web UI and API at `http://localhost:8080`.
+- Expose the `llama-swap` proxy at `http://localhost:2000`.
+- Mount `./data` to `/app/data` so models, configs, and logs persist between runs.
 
-- `ghcr.io/<org-or-user>/llama-cpp-studio:latest` – standard image based on `ubuntu:22.04` with GPU tooling installed at runtime
+### 3. GPU mode (NVIDIA CUDA)
 
-Pull the image from GHCR:
+For NVIDIA GPUs with the NVIDIA Container Toolkit installed on the host:
 
 ```bash
-docker pull ghcr.io/<org-or-user>/llama-cpp-studio:latest
+docker-compose -f docker-compose.cuda.yml up --build -d
 ```
 
-### Manual Docker Build
+This will:
+
+- Build the image from the current source tree.
+- Map:
+  - `8080:8080` – web UI + FastAPI backend
+  - `2000:2000` – `llama-swap` OpenAI-compatible endpoint
+- Mount `./data` to `/app/data`.
+- Reserve all GPUs for the container using the Compose `deploy.resources.reservations.devices` section.
+
+### 4. Manual Docker build and run
+
+You can also build and run the container without Compose:
 
-1. Build the image:
 ```bash
+# Build the image
 docker build -t llama-cpp-studio .
-```
 
-2. Run the container:
-```bash
-# With GPU support
+# GPU-capable run (NVIDIA)
 docker run -d \
   --name llama-cpp-studio \
   --gpus all \
   -p 8080:8080 \
+  -p 2000:2000 \
   -v ./data:/app/data \
   llama-cpp-studio
 
-# CPU-only
+# CPU-only run
 docker run -d \
-  --name llama-cpp-studio \
+  --name llama-cpp-studio-cpu \
   -p 8080:8080 \
+  -p 2000:2000 \
+  -e CUDA_VISIBLE_DEVICES="" \
   -v ./data:/app/data \
   llama-cpp-studio
 ```
 
+### 5. Published images
+
+If you prefer pulling from a registry, use the GitHub Container Registry image published by this project (replace `<org-or-user>` with the correct namespace):
+
+```bash
+docker pull ghcr.io/<org-or-user>/llama-cpp-studio:latest
+```
+
+Run it with the same ports and volume mapping as above.
+
+---
+
 ## Configuration
 
-### Environment Variables
-- `CUDA_VISIBLE_DEVICES`: GPU device selection (default: all, set to "" for CPU-only)
-- `PORT`: Web server port (default: 8080)
-- `HUGGINGFACE_API_KEY`: HuggingFace API token for model search and download (optional)
-- `LMDEPLOY_BIN`: Override path to the `lmdeploy` CLI (default: `lmdeploy` on PATH)
-- `LMDEPLOY_PORT`: Override the LMDeploy OpenAI port (default: 2001)
+### Environment variables
+
+Common environment variables for the backend:
+
+- **`HUGGINGFACE_API_KEY`** – HuggingFace token used for model search and download.
+  - When set via environment variable, the UI treats it as read-only and shows only a masked preview.
+- **`CUDA_VISIBLE_DEVICES`** – controls which GPUs are visible to the container:
+  - Default in Compose is `all`.
+  - Set to `""` (empty string) for CPU-only runs.
+- **`HF_HOME`** and **`HUGGINGFACE_HUB_CACHE`** – location for the HuggingFace cache:
+  - Default to `/app/data/hf-cache` and `/app/data/hf-cache/hub` so cache data is persisted in the volume.
+- **`BACKEND_CORS_ORIGINS`**, **`BACKEND_CORS_ALLOW_CREDENTIALS`** – advanced CORS options for custom setups.
+- **`RELOAD`** – when running the backend directly, controls uvicorn reload behavior (`true` in local dev, `false` in Docker).
+
+These can be set directly in the Compose file you use (e.g. `docker-compose.cuda.yml`) or via an `.env` file referenced by Compose.
+
+### Data & volumes
+
+The image expects a writable data directory at `/app/data`, typically mapped from `./data` on the host:
+
+- **Models** – GGUF files and safetensors bundles.
+- **Config** – YAML files for models, engines, and other settings.
+- **Logs** – backend logs, build logs, installer logs.
+- **llama.cpp builds** – source trees and build outputs.
+- **CUDA toolkit** – if installed, under `/app/data/cuda`.
+- **LMDeploy virtualenv** – under `/app/data/lmdeploy/venv`.
+
+Recommended Compose mapping:
+
+```yaml
+volumes:
+  - ./data:/app/data
+```
 
-### Volume Mounts
-- `/app/data`: Persistent storage for models, configurations, and database
+### HuggingFace token
 
-### HuggingFace API Key
+You can provide your HuggingFace token in multiple ways:
 
-To enable model search and download functionality, you need to set your HuggingFace API key. You can do this in several ways:
+- **Directly in Compose** (keep this private):
 
-#### Option 1: Docker Compose Environment Variable
-Uncomment and set the token in your `docker-compose.yml`:
 ```yaml
 environment:
-  - CUDA_VISIBLE_DEVICES=all
   - HUGGINGFACE_API_KEY=your_huggingface_token_here
 ```
 
-#### Option 2: .env File
-Create a `.env` file in your project root:
+- **`.env` file** (not committed to git):
+
 ```bash
 HUGGINGFACE_API_KEY=your_huggingface_token_here
 ```
 
-Then uncomment the `env_file` section in `docker-compose.yml`:
+Then in Compose:
+
 ```yaml
 env_file:
   - .env
 ```
 
-#### Option 3: System Environment Variable
-Set the environment variable before running Docker Compose:
-```bash
-export HUGGINGFACE_API_KEY=your_huggingface_token_here
-docker-compose up -d
-```
-
-#### Getting Your HuggingFace Token
-1. Go to [HuggingFace Settings](https://huggingface.co/settings/tokens)
-2. Create a new token with "Read" permissions
-3. Copy the token and use it in one of the methods above
-
-**Note**: When the API key is set via environment variable, it cannot be modified through the web UI for security reasons.
+Once configured, the Model Search UI will use this token transparently.
 
-### GPU Requirements
-- **NVIDIA**: NVIDIA GPU with CUDA support, NVIDIA Container Toolkit installed
-- **AMD**: AMD GPU with Vulkan/ROCm drivers
-- **Apple**: Apple Silicon with Metal support
-- **CPU**: OpenBLAS for CPU acceleration (included in Docker image)
-- Minimum 8GB VRAM recommended for most models
+---
 
-### LMDeploy Requirement
+## Using the web UI
 
-Safetensors execution relies on [LMDeploy](https://github.com/InternLM/lmdeploy), but the base image intentionally omits it to keep Docker builds lightweight (critical for GitHub Actions). Use the **LMDeploy** page in the UI to install or remove LMDeploy inside the running container—installs happen via `pip` at runtime and logs are streamed live. The installer creates a dedicated virtual environment under `/app/data/lmdeploy/venv`, so the package lives on the writable volume and can be removed by deleting that folder. If you are running outside the container, you can still `pip install lmdeploy` manually or point `LMDEPLOY_BIN` to a custom binary. The runtime uses `lmdeploy serve turbomind` to expose an OpenAI-compatible server on port `2001`.
+### Model search & download
 
-## Usage
+- Open the **Model Search** view.
+- Enter a HuggingFace repo name or search term, then choose a format:
+  - `gguf` – to browse GGUF quantizations and bundles.
+  - `safetensors` – to browse safetensors bundles.
+- Expand a result to:
+  - Inspect file sizes and quantization names.
+  - See optional projector (`.mmproj`) files for multimodal models.
+- Click **Download** to start a download; progress will appear in the global progress panel.
 
-### 1. Model Management
+### Model library
 
-#### Search Models
-- Use the search bar to find GGUF models on HuggingFace
-- Filter by tags, parameters, or model name
-- View comprehensive metadata including downloads, likes, tags, and file sizes
+- Open the **Model Library** view (`/models`).
+- Each card groups all quantizations for a base model:
+  - GGUF quantizations (different sizes and quant schemes).
+  - Safetensors bundles (if present).
+- Per-row actions:
+  - **Start** / **Stop** – launch or stop a model via `llama-swap`.
+  - **Configure** – open the per-quantization configuration screen.
+  - **Delete** – remove a specific quantization.
+- Group-level actions let you delete entire model groups to reclaim disk space.
 
-#### Download Models
-- Click download on any quantization to start downloading
-- Multiple quantizations of the same model are automatically grouped
-- Progress tracking with real-time updates via SSE
+### Per-model configuration
 
-#### Configure Models
-- Set llama.cpp parameters or use Smart Auto for optimal settings
-- View VRAM estimation before starting
-- Configure context size, batch sizes, temperature, and more
+- From the library, click **Configure** on a quantization.
+- Choose an engine:
+  - `llama.cpp` or `ik_llama.cpp` for GGUF.
+  - `LMDeploy` for safetensors.
+- Adjust:
+  - Context length.
+  - GPU layers (`-ngl`-style behavior).
+  - Batch sizes and other llama.cpp/LMDeploy flags.
+- Advanced options are rendered from a parameter registry maintained on the backend, allowing you to set engine-specific flags explicitly.
 
-#### Run Models
-- Start/stop models with one click
-- Multiple models can run simultaneously
-- View running instances and resource usage
+### Engines and CUDA
 
-### 2. llama.cpp Versions
+- Open the **Engines & System** view (`/engines`) to:
+  - View and manage **llama.cpp** and **ik_llama.cpp** versions:
+    - Build from source using saved build settings (e.g. CUDA on/off).
+    - Activate a version (updates `llama-swap` configuration).
+    - Delete non-active versions to free disk.
+  - Manage **CUDA toolkit** in the container:
+    - Install or uninstall specific CUDA versions.
+    - See status and detailed logs.
+  - Manage **LMDeploy**:
+    - Install from PyPI or a git branch.
+    - Remove LMDeploy and its virtualenv.
+    - Tail installer logs for debugging.
 
-#### Check Updates
-- View available releases and source updates
-- See commit history and release notes
+All of these actions surface their progress and logs in the unified progress UI.
 
-#### Install Release
-- Download pre-built binaries from GitHub
-- Automatic verification and installation
+### System status & monitoring
 
-#### Build from Source
-- Compile from source with custom configuration
-- Select GPU backends (CUDA, Vulkan, Metal, OpenBLAS)
-- Configure build type (Release, Debug, RelWithDebInfo)
-- Add custom CMake flags and compiler options
-- Apply patches from GitHub PRs
-- Automatic validation of built binaries
+- The header shows a concise llama-swap status indicator (health and port).
+- The System section displays:
+  - CPU, memory, and disk usage.
+  - Detected NVIDIA GPUs and key characteristics.
+  - Currently running models as reported by `llama-swap`.
 
-#### Manage Versions
-- Delete old versions to free up space
-- View installation details and build configuration
+---
 
-### 3. System Monitoring
-- **Overview**: CPU, memory, disk, and GPU usage
-- **GPU Details**: Individual GPU information and utilization
-- **Running Instances**: Active model instances with resource usage
-- **SSE**: Real-time updates for all metrics
+## OpenAI-compatible API (high level)
 
-## Multi-Model Serving
+Once at least one model is running, you can call the `llama-swap` proxy directly.
 
-llama-cpp-studio uses llama-swap to serve multiple models simultaneously on port 2000.
-
-### Starting Models
-
-Simply start any model from the Model Library. All models run on port 2000 simultaneously.
-
-### OpenAI-Compatible API
+- **Base URL**: `http://<host>:2000`
+- **Chat completions**:
 
 ```bash
 curl http://localhost:2000/v1/chat/completions \
   -H "Content-Type: application/json" \
   -d '{
-    "model": "llama-3-2-1b-instruct-iq2-xs",
+    "model": "your-model-name",
     "messages": [{"role": "user", "content": "Hello!"}]
   }'
 ```
 
-Model names are shown in System Status after starting a model.
-
-### Features
-
-- Multiple models run concurrently
-- No loading time - instant switching between models
-- Standard OpenAI API format
-- Automatic lifecycle management
-- Single unified endpoint
-
-### Troubleshooting
+- **Model listing**: `GET http://localhost:2000/v1/models`
+- **Health**: `GET http://localhost:2000/health`
 
-- Check available models: `http://localhost:2000/v1/models`
-- Check proxy health: `http://localhost:2000/health`
-- View logs: `docker logs llama-cpp-studio`
+Model IDs are shown in the System Status view and in the Model Library when a model is running.
 
-### LMDeploy TurboMind (Safetensors)
+---
 
-- Run exactly one safetensors checkpoint at a time via LMDeploy
-- Configure tensor/pipeline parallelism, context length, temperature, and other runtime flags from the Model Library
-- Serves an OpenAI-compatible endpoint at `http://localhost:2001/v1/chat/completions`
-- Install LMDeploy on demand from the LMDeploy page (or manually via `pip`) before starting safetensors runtimes
-- Start/stop directly from the Safetensors panel; status is reported in System Status and the LMDeploy status chip
+## Troubleshooting & logs
 
-## Build Customization
+### Common issues
 
-### GPU Backends
+- **GPU not detected**
+  - Ensure the NVIDIA Container Toolkit is installed and `nvidia-smi` works on the host.
+  - Use `--gpus all` (docker run) or the `deploy.resources.reservations.devices` stanza in Compose.
+  - Confirm `CUDA_VISIBLE_DEVICES` is not set to `""` when you intend to use the GPU.
 
-Enable specific GPU backends during source builds:
+- **Build failures (llama.cpp / ik_llama / CUDA)**
+  - Check that you have enough disk space (≥ 10 GB free is a good baseline).
+  - Verify CUDA and driver versions are compatible with the chosen build settings.
+  - Review build or installer logs (via the progress UI or log files in `/app/data/logs`).
 
-- **CUDA**: NVIDIA GPU acceleration with cuBLAS
-- **Vulkan**: AMD/Intel GPU acceleration with Vulkan compute
-- **Metal**: Apple Silicon GPU acceleration
-- **OpenBLAS**: CPU optimization with OpenBLAS routines
+- **Memory errors / out-of-memory**
+  - Reduce context length and/or batch size for the model configuration.
+  - For GPU runs, lower GPU layers or choose a smaller quantization.
 
-### Build Configuration
+- **Model download failures**
+  - Verify HuggingFace connectivity and model visibility (public/private).
+  - Ensure `HUGGINGFACE_API_KEY` is correctly configured for private models.
+  - Check free space under `/app/data`.
 
-Customize your build with:
+- **llama-swap**
+  - Hit `http://localhost:2000/health` and `http://localhost:2000/v1/models` to check proxy state.
 
-- **Build Type**: Release (optimal), Debug (development), RelWithDebInfo
-- **Custom CMake Flags**: Additional CMake configuration
-- **Compiler Flags**: CFLAGS and CXXFLAGS for optimization
-- **Git Patches**: Apply patches from GitHub PRs
-
-### Example Build Configuration
-
-```json
-{
-  "commit_sha": "master",
-  "patches": [
-    "https://github.com/ggerganov/llama.cpp/pull/1234.patch"
-  ],
-  "build_config": {
-    "build_type": "Release",
-    "enable_cuda": true,
-    "enable_vulkan": false,
-    "enable_metal": false,
-    "enable_openblas": true,
-    "custom_cmake_args": "-DGGML_CUDA_CUBLAS=ON",
-    "cflags": "-O3 -march=native",
-    "cxxflags": "-O3 -march=native"
-  }
-}
-```
+### Logs
 
-## Smart Auto Configuration
-
-The Smart Auto feature automatically generates optimal llama.cpp parameters based on:
-
-- **GPU Capabilities**: VRAM, compute capability, multi-GPU support
-- **NVLink Topology**: Automatic detection and optimization for NVLink clusters
-- **Model Architecture**: Detected from model name (Llama, Mistral, etc.)
-- **Available Resources**: CPU cores, memory, disk space
-- **Performance Optimization**: Flash attention, tensor parallelism, batch sizing
-
-### NVLink Optimization Strategies
-
-The system automatically detects NVLink topology and applies appropriate strategies:
-
-- **Unified NVLink**: All GPUs connected via NVLink - uses aggressive tensor splitting and higher parallelism
-- **Clustered NVLink**: Multiple NVLink clusters - optimizes for the largest cluster
-- **Partial NVLink**: Some GPUs connected via NVLink - uses hybrid approach
-- **PCIe Only**: No NVLink detected - uses conservative PCIe-based configuration
-
-### Supported Parameters
-- Context size, batch sizes, GPU layers
-- Temperature, top-k, top-p, repeat penalty
-- CPU threads, parallel sequences
-- RoPE scaling, YaRN factors
-- Multi-GPU tensor splitting
-- Custom arguments via YAML config
-
-## API Endpoints
-
-### Models
-- `GET /api/models` - List all models
-- `POST /api/models/search` - Search HuggingFace
-- `POST /api/models/download` - Download model
-- `GET /api/models/{id}/config` - Get model configuration
-- `PUT /api/models/{id}/config` - Update configuration
-- `POST /api/models/{id}/auto-config` - Generate smart configuration
-- `POST /api/models/{id}/start` - Start model
-- `POST /api/models/{id}/stop` - Stop model
-- `DELETE /api/models/{id}` - Delete model
-- `GET /api/models/safetensors/{model_id}/lmdeploy/config` - Get LMDeploy config for a safetensors download
-- `PUT /api/models/safetensors/{model_id}/lmdeploy/config` - Update LMDeploy config
-- `POST /api/models/safetensors/{model_id}/lmdeploy/start` - Start LMDeploy runtime
-- `POST /api/models/safetensors/{model_id}/lmdeploy/stop` - Stop LMDeploy runtime
-- `GET /api/models/safetensors/lmdeploy/status` - LMDeploy manager status
-
-### LMDeploy Installer
-- `GET /api/lmdeploy/status` - Installer status (version, binary path, current operation)
-- `POST /api/lmdeploy/install` - Install LMDeploy via pip at runtime
-- `POST /api/lmdeploy/remove` - Remove LMDeploy from the runtime environment
-- `GET /api/lmdeploy/logs` - Tail the LMDeploy installer log
-
-### llama.cpp Versions
-- `GET /api/llama-versions` - List installed versions
-- `GET /api/llama-versions/check-updates` - Check for updates
-- `GET /api/llama-versions/build-capabilities` - Get build capabilities
-- `POST /api/llama-versions/install-release` - Install release
-- `POST /api/llama-versions/build-source` - Build from source
-- `DELETE /api/llama-versions/{id}` - Delete version
-
-### System
-- `GET /api/status` - System status
-- `GET /api/gpu-info` - GPU information
-- `GET /api/events` - Server-Sent Events for real-time updates
-
-## Database Migration
-
-If upgrading from an older version, you may need to migrate your database:
+- **Container logs**:
 
 ```bash
-# Run migration to support multi-quantization
-python migrate_db.py
+docker logs llama-cpp-studio
 ```
 
-## Troubleshooting
-
-### Common Issues
-
-1. **GPU Not Detected**
-   - Ensure NVIDIA Container Toolkit is installed (for NVIDIA)
-   - Check `nvidia-smi` output
-   - Verify `--gpus all` flag in docker run
-   - For AMD: Check Vulkan/ROCm drivers
-
-2. **Build Failures**
-   - Check CUDA version compatibility (for NVIDIA)
-   - Ensure sufficient disk space (at least 10GB free)
-   - Verify internet connectivity for downloads
-   - For Vulkan builds: Ensure `glslang-tools` is installed
-   - Check build logs for specific errors
-
-3. **Memory Issues**
-   - Use Smart Auto configuration
-   - Reduce context size or batch size
-   - Enable memory mapping
-   - Check available system RAM and VRAM
-
-4. **Model Download Failures**
-   - Check HuggingFace connectivity
-   - Verify model exists and is public
-   - Ensure sufficient disk space
-   - Set HUGGINGFACE_API_KEY if using private models
-
-5. **Validation Failed**
-   - Binary exists and is executable
-   - Binary runs `--version` successfully
-   - Output contains "llama" or "version:" string
-
-### Logs
-- Application logs: `docker logs llama-cpp-studio`
-- Model logs: Available in the web interface
-- Build logs: Shown during source compilation
-- SSE event stream: GET /api/events for real-time progress and status
+- **Backend and task logs**: stored under `/app/data/logs` and surfaced via `/api/events`.
+- **CUDA installer logs**: available via CUDA log endpoints and the Engines/System view.
 
-## Development
+---
 
-### Backend
-- FastAPI with async support
-- YAML-backed data store (models, engines, settings)
-- SSE (GET /api/events) for real-time updates
-- Background tasks for long operations
-- Llama-swap integration for multi-model serving
+## Development & testing
 
-### Frontend
-- Vue.js 3 with Composition API
-- PrimeVue component library
-- Pinia for state management
-- Vite for build tooling
-- Dark mode support
+### Backend (FastAPI)
 
-### Testing
-- Backend tests: `pytest` (install deps first: `pip install -r requirements.txt pytest pytest-asyncio`)
-- Run from repo root: `PYTHONPATH=. pytest backend/tests/ -v`
-- Smoke tests in `backend/tests/test_app_smoke.py` verify the app starts and key API routes respond (`/api/status`, `/api/models/param-registry`, `/api/models/`, `/api/events`)
-- LMDeploy installer and config validation tests in `backend/tests/test_lmdeploy_*.py`
+- The backend code lives under `backend/`.
+- To run the backend directly in development:
 
-## Memory Estimation Model
-
-The studio’s capacity planning tooling is grounded in a three-component model for llama.cpp that provides a conservative upper bound on peak memory usage.
-
-- **Formula**: `M_total = M_weights + M_kv + M_compute`
-- **Model weights (`M_weights`)**: Treat the GGUF file size as the ground truth. When `--no-mmap` is disabled (default), the file is memory-mapped so only referenced pages touch physical RAM, but the virtual footprint still equals the file size.
-- **KV cache (`M_kv`)**: Uses the GQA-aware formula `n_ctx × N_layers × N_head_kv × (N_embd / N_head) × (p_a_k + p_a_v)`, where `p_a_*` are the bytes-per-value chosen via `--cache-type-k` / `--cache-type-v`.
-- **Compute buffers (`M_compute`)**: Approximate as a fixed CUDA overhead (~550 MB) plus a scratch buffer that scales with micro-batch size (`n_ubatch × 0.5 MB` by default).
-
-### RAM vs VRAM Allocation
+```bash
+cd backend
+pip install -r ../requirements.txt
+uvicorn main:app --reload --port 8080
+```
 
-- `-ngl 0` (CPU-only): All components stay in RAM.
-- `-ngl > 0` (hybrid/full GPU): Model weights split by layer between RAM and VRAM, while **both `M_kv` and `M_compute` move entirely to VRAM**—the “VRAM trap”.
-- Full offload avoids PCIe contention; hybrid splits suffer a “performance cliff” because activations bounce between CPU and GPU.
+### Frontend (Vue 3 + Vite)
 
-### Optimization Strategy
+- The frontend SPA lives under `frontend/`.
 
-1. Attempt full offload first (best throughput). If weights + compute fit, deduce `n_ctx_max` from remaining VRAM budget.
-2. When full offload fails, search decreasing `n_ngl` values that satisfy RAM limits while maximizing context length, accepting the hybrid performance penalty.
-3. Iterate quantization choices to find the smallest model that still enables full offload on the target hardware profile.
+```bash
+cd frontend
+npm install
+npm run dev
+```
 
-## Smart Auto Module Report
+The dev server (typically on port `5173`) is configured to proxy API calls to the backend.
 
-The Smart Auto subsystem applies the model above to recommend llama.cpp launch parameters. Priority 1 fixes are complete, eliminating prior memory underestimation bugs.
+### Backend tests
 
-- **Resolutions**:
-  - Corrected KV cache math to respect grouped-query attention head counts.
-  - Removed the dangerous 0.30 multiplier on cache size; estimates now use real memory.
-  - Ensured KV cache/compute buffers migrate to VRAM whenever GPU layers are in play.
-  - Modeled compute overhead as `550 MB + 0.5 MB × n_ubatch`.
-  - Improved GPU layer estimation using GGUF file size with a 20 % safety buffer.
-- **Open improvements**:
-  - Reorder calculations so KV cache quantization feeds batch/context sizing directly.
-  - Replace remaining heuristics with joint optimization across `n_ctx`, `n_ngl`, and `n_ubatch`.
+```bash
+pip install -r requirements.txt pytest pytest-asyncio
+PYTHONPATH=. pytest backend/tests -v
+```
 
-### Recommended Validation
+The test suite includes:
 
-- Benchmark against known examples (e.g., 13B @ 2 048 tokens → ~1.6 GB KV cache, 7B @ 4 096 tokens → ~6 GB total).
-- Stress-test large contexts, tight VRAM scenarios, MoE models, and hybrid modes.
-- Expand automated regression coverage around the estimator and Smart Auto flows.
+- Smoke tests to ensure the app boots and key routes (`/api/status`, `/api/models`, `/api/llama-versions`, `/api/events`) respond.
+- Tests for LMDeploy management and configuration.
+- Tests for CUDA installer flows and model introspection logic.
 
-## Memory Estimation Test Results
+---
 
-Empirical testing with `Llama-3.2-1B-Instruct.IQ1_M` demonstrates that the estimator acts as a safe upper bound.
+## License
 
-- **Setup**: `n_ctx ≈ 35 K`, batch 32, CPU-only run.
-- **Estimated peak**: 4.99 GB (weights 394 MB, KV cache 4.34 GB, batch 12 MB, llama.cpp overhead 256 MB).
-- **Observed deltas**:
-  - With mmap enabled: ~608 MB (11.9 % of estimate). Lower usage is expected because the KV cache grows as context fills and weights are paged on demand.
-  - With `--no-mmap`: ~1.16 GB (23 % of estimate). Weights load fully, but KV cache still expands progressively.
-- **Takeaways**:
-  - Estimates intentionally err on the high side to prevent OOM once the context window reaches capacity.
-  - Divergence between virtual and physical usage stems from memory mapping and lazy KV cache allocation.
-  - Additional GPU-focused measurements and long session traces are encouraged to correlate VRAM predictions with reality.
+This project is licensed under the MIT License – see the `LICENSE` file for details.
 
-## License
+---
 
-This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+## Contributing & support
 
-Copyright (c) 2024 llama.cpp Studio
+### Contributing
 
-## Contributing
+- Fork the repository.
+- Create a feature branch.
+- Make your changes and add tests where appropriate.
+- Open a pull request describing your changes and how you tested them.
 
-1. Fork the repository
-2. Create a feature branch
-3. Make your changes
-4. Add tests if applicable
-5. Submit a pull request
+### Support
 
-## Support
+- Open an issue on GitHub for bugs or feature requests.
+- Review this README and the troubleshooting section before filing.
 
-For issues and questions:
-- Create an issue on GitHub
-- Check the troubleshooting section
-- Review the API documentation
+### Acknowledgments
 
-## Acknowledgments
+- **llama.cpp** – core inference engine.
+- **llama-swap** – multi-model serving proxy.
+- **HuggingFace** – model hosting and search.
+- **Vue.js** – frontend framework.
+- **FastAPI** – backend framework.
 
-- [llama.cpp](https://github.com/ggerganov/llama.cpp) - The core inference engine
-- [llama-swap](https://github.com/mostlygeek/llama-swap) - Multi-model serving proxy
-- [HuggingFace](https://huggingface.co) - Model hosting and search
-- [Vue.js](https://vuejs.org) - Frontend framework
-- [FastAPI](https://fastapi.tiangolo.com) - Backend framework
diff --git a/backend/data_store.py b/backend/data_store.py
index 36aebdf..e75b57b 100644
--- a/backend/data_store.py
+++ b/backend/data_store.py
@@ -1,6 +1,8 @@
 """YAML-backed data store replacing SQLite."""
 
+import json
 import os
+import re
 import threading
 from typing import Any, Dict, List, Optional
 
@@ -31,6 +33,59 @@ def generate_proxy_name(huggingface_id: str, quantization: Optional[str] = None)
     return huggingface_slug
 
 
+def _coerce_config(config_value: Optional[Any]) -> Dict[str, Any]:
+    if not config_value:
+        return {}
+    if isinstance(config_value, dict):
+        return config_value
+    if isinstance(config_value, str):
+        try:
+            return json.loads(config_value)
+        except json.JSONDecodeError:
+            return {}
+    return {}
+
+
+def _model_value(model: Any, key: str, default: Any = None) -> Any:
+    if isinstance(model, dict):
+        return model.get(key, default)
+    return getattr(model, key, default)
+
+
+def normalize_proxy_alias(alias: Optional[str]) -> str:
+    """Normalize a user-provided model alias into a safe exposed engine ID."""
+    if alias is None:
+        return ""
+
+    normalized = str(alias).strip().lower()
+    if not normalized:
+        return ""
+
+    normalized = normalized.replace("/", "-").replace("\\", "-")
+    normalized = re.sub(r"\s+", "-", normalized)
+    normalized = re.sub(r"[^a-z0-9._-]", "-", normalized)
+    normalized = re.sub(r"-{2,}", "-", normalized)
+    normalized = normalized.strip("._-")
+    return normalized
+
+
+def resolve_proxy_name(model: Any) -> str:
+    """Return the exposed runtime model ID for a stored model."""
+    config = _coerce_config(_model_value(model, "config"))
+    alias = normalize_proxy_alias(config.get("model_alias"))
+    if alias:
+        return alias
+
+    existing = normalize_proxy_alias(_model_value(model, "proxy_name"))
+    if existing:
+        return existing
+
+    return generate_proxy_name(
+        _model_value(model, "huggingface_id", ""),
+        _model_value(model, "quantization"),
+    )
+
+
 class DataStore:
     """Thread-safe YAML-backed data store replacing SQLite."""
 
@@ -175,6 +230,23 @@ def delete_engine_version(self, engine: str, version: str) -> bool:
         self._save_yaml("engines.yaml", data)
         return True
 
+    def get_engine_build_settings(self, engine: str) -> Dict[str, Any]:
+        """Return persisted build settings for the given engine (or empty dict)."""
+        data = self._read_yaml("engines.yaml")
+        return data.get(engine, {}).get("build_settings", {}) or {}
+
+    def update_engine_build_settings(self, engine: str, settings: Dict[str, Any]) -> Dict[str, Any]:
+        """Merge and persist build settings for the given engine. Returns the stored settings."""
+        if not isinstance(settings, dict):
+            settings = {}
+        data = self._read_yaml("engines.yaml")
+        engine_data = data.setdefault(engine, {})
+        existing = engine_data.get("build_settings") or {}
+        merged = {**existing, **settings}
+        engine_data["build_settings"] = merged
+        self._save_yaml("engines.yaml", data)
+        return merged
+
     # --- LMDeploy ---
 
     def get_lmdeploy_status(self) -> dict:
diff --git a/backend/gguf_introspection_config.json b/backend/gguf_introspection_config.json
new file mode 100644
index 0000000..885f0cc
--- /dev/null
+++ b/backend/gguf_introspection_config.json
@@ -0,0 +1,23 @@
+{
+  "global": {
+    "context_length": {
+      "preferred_keys": [
+        "general.context_length",
+        "general.model_max_length",
+        "general.max_position_embeddings"
+      ]
+    }
+  },
+  "glm4": {
+    "match_arch": ["glm4", "glm4moe"],
+    "context_length": {
+      "preferred_keys": ["glm4.context_length", "glm4.model_max_length"],
+      "fallback_terms": ["context", "max_position_embeddings"]
+    },
+    "layer_count": {
+      "preferred_keys": ["glm4.num_hidden_layers"],
+      "fallback_terms": ["layer", "block"]
+    }
+  }
+}
+
diff --git a/backend/gguf_reader.py b/backend/gguf_reader.py
index 2ad9dd1..df47723 100644
--- a/backend/gguf_reader.py
+++ b/backend/gguf_reader.py
@@ -2,13 +2,14 @@
 GGUF file metadata reader for extracting model layer information
 """
 
-import struct
 import os
+import struct
 import mmap
 from enum import IntEnum
-from typing import Dict, Optional, Any, List, Tuple, BinaryIO
+from typing import Any, BinaryIO, Dict, List, Optional, Tuple
 
 from backend.logging_config import get_logger
+from backend.model_introspection import GgufIntrospector
 
 logger = get_logger(__name__)
 
@@ -1247,22 +1248,34 @@ def get_model_layer_info(model_path: str) -> Optional[Dict[str, Any]]:
             logger.error(f"Model file is not GGUF format: {model_path}")
             return None
 
-        metadata = read_gguf_metadata(model_path)
-        if metadata:
-            return {
-                "layer_count": metadata["layer_count"],
-                "architecture": metadata["architecture"],
-                "context_length": metadata["context_length"],
-                "vocab_size": 0,  # Not extracted from metadata
-                "embedding_length": metadata["embedding_length"],
-                "attention_head_count": metadata["attention_head_count"],
-                "attention_head_count_kv": metadata["attention_head_count_kv"],
-                "block_count": metadata["block_count"],
-                "is_moe": metadata["is_moe"],
-                "expert_count": metadata["expert_count"],
-                "experts_used_count": metadata["experts_used_count"],
-            }
-        return None
+        with GGUFReader(model_path) as reader:
+            metadata = reader.metadata
+            tensors = reader.tensors
+
+            introspector = GgufIntrospector(metadata=metadata, tensors=tensors)
+            info = introspector.build_model_info()
+
+        return {
+            "layer_count": int(info.layer_count) if info.layer_count else 0,
+            "architecture": metadata.get("general.architecture", ""),
+            "context_length": int(info.context_length) if info.context_length else 0,
+            "vocab_size": int(info.vocab_size) if info.vocab_size else 0,
+            "embedding_length": int(info.embedding_length)
+            if info.embedding_length
+            else 0,
+            "attention_head_count": int(info.attention_head_count)
+            if info.attention_head_count
+            else 0,
+            "attention_head_count_kv": int(info.attention_head_count_kv)
+            if info.attention_head_count_kv
+            else 0,
+            "block_count": int(info.block_count) if info.block_count else 0,
+            "is_moe": bool(info.is_moe),
+            "expert_count": int(info.expert_count) if info.expert_count else 0,
+            "experts_used_count": int(info.experts_used_count)
+            if info.experts_used_count
+            else 0,
+        }
     except Exception as e:
         logger.error(
             f"Failed to get model layer info from {model_path}: {e}", exc_info=True
diff --git a/backend/huggingface.py b/backend/huggingface.py
index 288ab57..077ffb4 100644
--- a/backend/huggingface.py
+++ b/backend/huggingface.py
@@ -863,9 +863,6 @@ async def _fetch_and_merge(repo_id: Optional[str]):
             metadata["tokenizer"] = tokenizer_json
 
     await _fetch_and_merge(huggingface_id)
-    if huggingface_id and huggingface_id.lower().endswith("-gguf"):
-        base_repo = huggingface_id[:-5]
-        await _fetch_and_merge(base_repo)
 
     try:
         layer_info = get_model_layer_info(file_path) or {}
@@ -1210,6 +1207,20 @@ async def process_model(model):
         if result is not None:
             valid_results.append(result)
 
+    if model_format == "gguf":
+        def _gguf_sort_key(item: Dict[str, Any]):
+            quantizations = item.get("quantizations") or {}
+            size_candidates = [
+                q.get("total_size") or 0
+                for q in quantizations.values()
+                if isinstance(q, dict)
+            ]
+            positive_sizes = [size for size in size_candidates if size > 0]
+            min_size = min(positive_sizes) if positive_sizes else float("inf")
+            return (min_size, -(item.get("downloads") or 0), item.get("id") or "")
+
+        valid_results.sort(key=_gguf_sort_key)
+
     return valid_results[:limit]
 
 
@@ -1219,20 +1230,18 @@ async def _process_single_model(model, model_format: str) -> Optional[Dict]:
         logger.info(f"Processing model: {model.id}")
 
         quantizations: Dict[str, Dict] = {}
+        mmproj_files: List[Dict[str, Any]] = []
         safetensors_files: List[Dict] = []
         repo_files: List[Dict[str, Any]] = []
 
         if hasattr(model, "siblings") and model.siblings:
             if model_format == "gguf":
-                # Group GGUF files by logical quantization, handling multi-part shards
-                # Accept both plain `.gguf` and multi-part patterns like `.gguf.part1of2`
-                # Exclude mmproj (vision/multimodal projection) files – they are extensions, not standalone quants
+                # Group GGUF files by logical quantization, handling multi-part shards.
                 gguf_siblings = [
                     s
                     for s in model.siblings
                     if isinstance(getattr(s, "rfilename", None), str)
                     and re.search(r"\.gguf(\.|$)", s.rfilename)
-                    and "mmproj" not in s.rfilename.lower()
                 ]
                 logger.info(f"Model {model.id}: {len(gguf_siblings)} GGUF files found")
                 if not gguf_siblings:
@@ -1240,6 +1249,14 @@ async def _process_single_model(model, model_format: str) -> Optional[Dict]:
 
                 for sibling in gguf_siblings:
                     filename = sibling.rfilename
+                    if "mmproj" in filename.lower():
+                        mmproj_files.append(
+                            {
+                                "filename": filename,
+                                "size": getattr(sibling, "size", 0) or 0,
+                            }
+                        )
+                        continue
                     # Normalize filename by stripping shard suffix patterns like:
                     #   -00001-of-00002.gguf (TheBloke-style)
                     #   .gguf.part1of2 (Hugging Face-style multi-part)
@@ -1298,25 +1315,9 @@ async def _process_single_model(model, model_format: str) -> Optional[Dict]:
                         else 0.0
                     )
 
-                # Siblings from list_models often have size=None; fetch accurate sizes from Hub
-                try:
-                    all_filenames = [s.rfilename for s in gguf_siblings]
-                    accurate_sizes = get_accurate_file_sizes(model.id, all_filenames)
-                    if accurate_sizes:
-                        for entry in quantizations.values():
-                            for f in entry["files"]:
-                                f["size"] = accurate_sizes.get(f["filename"]) or f["size"] or 0
-                            entry["total_size"] = sum(f["size"] for f in entry["files"])
-                            entry["size_mb"] = (
-                                round(entry["total_size"] / (1024 * 1024), 2)
-                                if entry["total_size"]
-                                else 0.0
-                            )
-                except Exception as size_err:
-                    logger.debug(f"Could not fetch accurate sizes for {model.id}: {size_err}")
-
-                # If no quantizations were detected after grouping, skip this model
-                if not quantizations:
+                # Keep search to a single HF API call; accurate file sizes are lazy-loaded on expand.
+                # If no downloadable GGUF entries were detected after grouping, skip this model.
+                if not quantizations and not mmproj_files:
                     return None
             else:
                 safetensors_files = []
@@ -1338,15 +1339,6 @@ async def _process_single_model(model, model_format: str) -> Optional[Dict]:
                 )
                 if not safetensors_files:
                     return None
-                # Fetch accurate sizes; list_models siblings often have size=None
-                try:
-                    st_filenames = [f["filename"] for f in safetensors_files]
-                    accurate_sizes = get_accurate_file_sizes(model.id, st_filenames)
-                    if accurate_sizes:
-                        for f in safetensors_files:
-                            f["size"] = accurate_sizes.get(f["filename"]) or 0
-                except Exception as size_err:
-                    logger.debug(f"Could not fetch accurate sizes for {model.id}: {size_err}")
         else:
             return None
 
@@ -1364,6 +1356,7 @@ async def _process_single_model(model, model_format: str) -> Optional[Dict]:
             "tags": model.tags or [],
             "model_format": model_format,
             "quantizations": quantizations if model_format == "gguf" else {},
+            "mmproj_files": mmproj_files if model_format == "gguf" else [],
             "safetensors_files": (
                 safetensors_files if model_format == "safetensors" else []
             ),
@@ -1668,7 +1661,7 @@ async def get_model_details(model_id: str) -> Dict:
                 config_path = hf_hub_download(
                     repo_id=model_id,
                     filename="config.json",
-                    local_dir="data/temp",
+                    local_dir="data/hf-cache",
                     local_dir_use_symlinks=False,
                 )
 
diff --git a/backend/llama_manager.py b/backend/llama_manager.py
index cbcb928..55f5d2e 100644
--- a/backend/llama_manager.py
+++ b/backend/llama_manager.py
@@ -2273,13 +2273,19 @@ def set_flag(flag: str, value: bool):
             logger.error(f"Build failed: {e}")
             if progress_manager and task_id:
                 try:
-                    await progress_manager.send_build_progress(
-                        task_id=task_id,
-                        stage="error",
-                        progress=0,
-                        message=f"Build failed: {str(e)}",
-                        log_lines=[f"Error: {str(e)}"],
+                    existing_task = progress_manager.get_task(task_id)
+                    existing_logs = (
+                        (existing_task or {}).get("metadata", {}).get("log_lines") or []
                     )
+                    error_text = str(e)
+                    if error_text not in existing_logs:
+                        await progress_manager.send_build_progress(
+                            task_id=task_id,
+                            stage="error",
+                            progress=0,
+                            message=f"Build failed: {error_text}",
+                            log_lines=[f"Error: {error_text}"],
+                        )
                 except Exception as ws_error:
                     logger.error(f"Failed to send error via SSE: {ws_error}")
             raise Exception(f"Failed to build from source {commit_sha}: {e}")
diff --git a/backend/llama_swap_client.py b/backend/llama_swap_client.py
index 454fe2a..3f3e5e3 100644
--- a/backend/llama_swap_client.py
+++ b/backend/llama_swap_client.py
@@ -149,3 +149,23 @@ async def get_model_info(self, model_id: str, upstream_path: str = "v1/models"):
         except Exception as e:
             logger.error(f"Failed to get model info for {model_id}: {e}")
             raise
+
+    async def load_model(self, model_name: str, retries: int = 20, delay: float = 0.5):
+        """Trigger on-demand model loading via llama-swap's upstream route."""
+        last_error = None
+        for _ in range(max(1, retries)):
+            try:
+                async with httpx.AsyncClient() as client:
+                    response = await client.get(
+                        f"{self.base_url}/upstream/{model_name}/v1/models", timeout=30
+                    )
+                    response.raise_for_status()
+                    self._loading_models.discard(model_name)
+                    return response.json()
+            except Exception as e:
+                last_error = e
+                self._loading_models.add(model_name)
+                await asyncio.sleep(delay)
+        self._loading_models.discard(model_name)
+        logger.error(f"Failed to load model {model_name}: {last_error}")
+        raise last_error
diff --git a/backend/llama_swap_config.py b/backend/llama_swap_config.py
index 3e071e7..6829449 100644
--- a/backend/llama_swap_config.py
+++ b/backend/llama_swap_config.py
@@ -510,44 +510,73 @@ def _model_attr(m: Any, key: str, default: Any = None) -> Any:
             return m.get(key, default)
         return getattr(m, key, default)
 
+    from backend.data_store import (
+        generate_proxy_name as _generate_proxy_name,
+        normalize_proxy_alias as _normalize_proxy_alias,
+        resolve_proxy_name as _resolve_proxy_name,
+    )
+
     # Resolve LMDeploy binary and build proxy->model map for overlay (used for both all_models and running overlay)
     lmdeploy_bin = None
     all_models_by_proxy: Dict[str, Any] = {}
+    all_models_by_legacy_proxy: Dict[str, Any] = {}
     try:
         from backend.data_store import get_store as _get_store
         store = _get_store()
-        lmdeploy_status = store.get_lmdeploy_status()
-        if lmdeploy_status.get("installed") and lmdeploy_status.get("venv_path"):
-            venv = lmdeploy_status["venv_path"]
-            lmdeploy_bin = os.path.join(venv, "bin", "lmdeploy")
-            if not os.path.isabs(lmdeploy_bin):
-                lmdeploy_bin = os.path.join("/app", lmdeploy_bin)
-            if not os.path.exists(lmdeploy_bin):
-                lmdeploy_bin = None
+        # Prefer the active versioned LMDeploy engine, same pattern as llama_cpp.
+        active_lmdeploy = store.get_active_engine_version("lmdeploy")
+        venv = active_lmdeploy.get("venv_path") if active_lmdeploy else None
+        # Fallback to legacy single-status layout if no active version is found.
+        if not venv:
+            legacy_status = store.get_lmdeploy_status()
+            if legacy_status.get("installed"):
+                venv = legacy_status.get("venv_path")
+        if venv:
+            # Make the venv path absolute and confirm it still exists before resolving the binary.
+            if not os.path.isabs(venv):
+                venv = os.path.join("/app", venv)
+            if os.path.isdir(venv):
+                candidate = os.path.join(venv, "bin", "lmdeploy")
+                if not os.path.isabs(candidate):
+                    candidate = os.path.join("/app", candidate)
+                if os.path.exists(candidate) and os.access(candidate, os.X_OK):
+                    lmdeploy_bin = candidate
+                else:
+                    logger.debug(
+                        f"LMDeploy binary not found or not executable at {candidate}; "
+                        "LMDeploy engine entries will be skipped in llama-swap config"
+                    )
+            else:
+                logger.debug(
+                    f"LMDeploy venv_path does not exist at {venv}; "
+                    "LMDeploy engine entries will be skipped in llama-swap config"
+                )
     except Exception as e:
         logger.debug(f"Could not resolve LMDeploy binary: {e}")
 
     # First, add all models from the data store (if provided)
     if all_models:
-        from backend.data_store import generate_proxy_name as _gen_proxy_name
-
         for model in all_models:
-            proxy_model_name = _model_attr(model, "proxy_name")
-            if not proxy_model_name:
-                proxy_model_name = _gen_proxy_name(
-                    _model_attr(model, "huggingface_id", ""),
-                    _model_attr(model, "quantization"),
-                )
+            proxy_model_name = _resolve_proxy_name(model)
             if not proxy_model_name:
                 logger.warning(
                     f"Model '{_model_attr(model, 'display_name') or _model_attr(model, 'name')}' does not have a proxy_name set, skipping"
                 )
                 continue
             all_models_by_proxy[proxy_model_name] = model
+            legacy_proxy_name = _normalize_proxy_alias(_model_attr(model, "proxy_name"))
+            if legacy_proxy_name and legacy_proxy_name != proxy_model_name:
+                all_models_by_legacy_proxy[legacy_proxy_name] = model
+            generated_proxy_name = _generate_proxy_name(
+                _model_attr(model, "huggingface_id", ""),
+                _model_attr(model, "quantization"),
+            )
+            if generated_proxy_name and generated_proxy_name != proxy_model_name:
+                all_models_by_legacy_proxy[generated_proxy_name] = model
 
             engine = _model_attr(model, "engine")
-            model_format = _model_attr(model, "format") or _model_attr(model, "model_format") or "gguf"
-            is_lmdeploy = engine == "lmdeploy" or model_format == "safetensors"
+            # LMDeploy-backed models are detected strictly by engine, not by format.
+            is_lmdeploy = engine == "lmdeploy"
             if is_lmdeploy and lmdeploy_bin:
                 config = _coerce_model_config(_model_attr(model, "config"))
                 try:
@@ -558,30 +587,33 @@ def _model_attr(m: Any, key: str, default: Any = None) -> Any:
                 continue
 
             hf_id = _model_attr(model, "huggingface_id")
-            filename = _model_attr(model, "filename") or (
-                os.path.basename(_model_attr(model, "file_path") or "") or None
-            )
-
-            # Resolve model path: HF cache first, then legacy file_path
+            quantization = _model_attr(model, "quantization")
+
+            # Prefer llama.cpp's native HF integration when we have a repo id and quant.
+            # This lets us use: --hf-repo <user>/<model>:<quant>, and llama.cpp will
+            # resolve/download the correct GGUF (including multi-shard) on its own.
+            hf_repo_arg = None
+            if hf_id and quantization:
+                hf_repo_arg = f"{hf_id}:{str(quantization).lower()}"
+
+            # For legacy/local models without huggingface_id+quantization, fall back
+            # to a stored file_path. New HF-backed models never rely on a specific
+            # filename or file path; llama.cpp pulls from Hugging Face via --hf-repo.
             model_path = None
-            if hf_id and filename:
-                from backend.huggingface import resolve_cached_model_path
-                model_path = resolve_cached_model_path(hf_id, filename)
-
-            if not model_path:
-                # Legacy fallback: stored file_path (old-style records)
+            if not hf_repo_arg:
                 legacy = _model_attr(model, "file_path")
                 if legacy:
                     model_path = legacy if os.path.isabs(legacy) else f"/app/{legacy}"
 
-            if not model_path:
+            # If we don't have either an HF repo+quant or a legacy path, skip.
+            if not hf_repo_arg and not model_path:
                 logger.warning(
-                    f"Model '{proxy_model_name}' path could not be resolved (hf_id={hf_id}, filename={filename}), skipping"
+                    f"Model '{proxy_model_name}' path could not be resolved (hf_id={hf_id}), skipping"
                 )
                 continue
 
-            # Ensure absolute path (HF cache returns absolute; legacy may not)
-            if not os.path.isabs(model_path):
+            # Ensure absolute path when we are in local-path mode.
+            if model_path and not os.path.isabs(model_path):
                 model_path = f"/app/{model_path}"
 
             # Get the working directory and build directory for LD_LIBRARY_PATH
@@ -610,20 +642,30 @@ def _model_attr(m: Any, key: str, default: Any = None) -> Any:
                     f"Model {proxy_model_name}: jinja={config.get('jinja')} (type: {type(config.get('jinja'))})"
                 )
 
-            # Build llama.cpp command arguments
-            # Quote model path if it contains spaces or special characters
-            quoted_model_path = _quote_arg_if_needed(model_path)
+            # Build llama.cpp command arguments (excluding the base launcher).
+            # The three leading None entries are placeholders for the old launcher,
+            # "--model" and path slots; only cmd_args[3:] ("--port" onward) is used.
             cmd_args = [
-                llama_server_path,
-                "--model",
-                quoted_model_path,
+                None,
+                None,
+                None,
                 "--port",
                 "${PORT}",
             ]
-            # Vision: if model has mmproj (multimodal projector), add --mmproj so vision is available
+
+            # If the user provided a model_alias in config, propagate it to llama.cpp
+            # via --alias so that /v1/models exposes this name.
+            alias_for_api = config.get("model_alias")
+            if isinstance(alias_for_api, str) and alias_for_api.strip():
+                cmd_args.extend(["--alias", alias_for_api.strip()])
+
+            # Vision: if model has mmproj (multimodal projector) and we're using a
+            # local model path, add --mmproj so vision is available. When using
+            # --hf-repo, llama.cpp will auto-download mmproj if available.
             mmproj_filename = _model_attr(model, "mmproj_filename")
-            if mmproj_filename and hf_id:
+            if mmproj_filename and hf_id and not hf_repo_arg:
                 from backend.huggingface import resolve_cached_model_path
+
                 mmproj_path = resolve_cached_model_path(hf_id, mmproj_filename)
                 if mmproj_path and os.path.exists(mmproj_path):
                     if not os.path.isabs(mmproj_path):
@@ -820,49 +862,75 @@ def _model_attr(m: Any, key: str, default: Any = None) -> Any:
             except Exception as e:
                 logger.debug(f"Could not get CUDA library path: {e}")
 
-            # Create the command with proper shell syntax for environment variables
+            # Create the command with proper shell syntax for environment variables.
+            # Prefer llama.cpp's HF integration when we have an HF repo id + quant;
+            # otherwise fall back to a direct local GGUF path.
+            if hf_repo_arg:
+                launcher = f"./{binary_name} --hf-repo {hf_repo_arg}"
+            else:
+                quoted_model_path = _quote_arg_if_needed(model_path)
+                launcher = f"./{binary_name} --model {quoted_model_path}"
+
             cmd_with_env = (
-                f"bash -c 'cd {working_dir} && LD_LIBRARY_PATH={library_path} ./{binary_name} --model {model_path} "
+                f"bash -c 'cd {working_dir} && LD_LIBRARY_PATH={library_path} {launcher} "
                 + " ".join(cmd_args[3:])
                 + "'"
-            )  # Skip llama_server_path, --model, model_path, --port
+            )
 
             config_data["models"][proxy_model_name] = {"cmd": cmd_with_env}
 
     # Then, add/update with running models (these take precedence for active models)
     for proxy_model_name, model_data in models.items():
-        overlay_model = all_models_by_proxy.get(proxy_model_name)
+        overlay_model = all_models_by_proxy.get(proxy_model_name) or all_models_by_legacy_proxy.get(proxy_model_name)
+        resolved_proxy_model_name = (
+            _resolve_proxy_name(overlay_model)
+            if overlay_model
+            else _normalize_proxy_alias(model_data.get("config", {}).get("model_alias")) or proxy_model_name
+        )
         engine = _model_attr(overlay_model, "engine") if overlay_model else None
-        model_format = _model_attr(overlay_model, "format") or _model_attr(overlay_model, "model_format") if overlay_model else None
-        is_lmdeploy_overlay = (engine == "lmdeploy" or model_format == "safetensors") and lmdeploy_bin and overlay_model
+        # For overlay models, also rely solely on the engine flag to detect LMDeploy.
+        is_lmdeploy_overlay = engine == "lmdeploy" and lmdeploy_bin and overlay_model
         if is_lmdeploy_overlay:
             config = _coerce_model_config(model_data.get("config"))
             try:
                 cmd_with_env = _build_lmdeploy_cmd(overlay_model, config, lmdeploy_bin, _model_attr)
-                config_data["models"][proxy_model_name] = {"cmd": cmd_with_env}
+                config_data["models"].pop(proxy_model_name, None)
+                config_data["models"][resolved_proxy_model_name] = {"cmd": cmd_with_env}
             except Exception as e:
-                logger.warning(f"Failed to build LMDeploy overlay cmd for {proxy_model_name}: {e}")
+                logger.warning(f"Failed to build LMDeploy overlay cmd for {resolved_proxy_model_name}: {e}")
             continue
 
         model_path = model_data["model_path"]
         llama_cpp_config = model_data["config"]
 
-        # Build llama.cpp command arguments (using full path to llama-server)
-        # Quote model path if it contains spaces or special characters
+        # Build llama.cpp command arguments (the ./<binary> launcher prefix is added below).
+        # For overlay models, also prefer HF repo + quant when available.
+        hf_id_overlay = _model_attr(overlay_model, "huggingface_id") if overlay_model else None
+        quantization_overlay = _model_attr(overlay_model, "quantization") if overlay_model else None
+        hf_repo_arg_overlay = None
+        if hf_id_overlay and quantization_overlay:
+            hf_repo_arg_overlay = f"{hf_id_overlay}:{str(quantization_overlay).lower()}"
+
+        # Quote model path if it contains spaces or special characters (local-path mode).
         quoted_model_path = _quote_arg_if_needed(model_path)
         cmd_args = [
-            llama_server_path,
-            "--model",
-            quoted_model_path,
+            None,
+            None,
+            None,
             "--port",
             "${PORT}",
         ]
+        # Propagate model_alias from the live llama_cpp_config if present so that
+        # llama.cpp exposes this name via /v1/models.
+        alias_for_api_overlay = llama_cpp_config.get("model_alias")
+        if isinstance(alias_for_api_overlay, str) and alias_for_api_overlay.strip():
+            cmd_args.extend(["--alias", alias_for_api_overlay.strip()])
         # Vision: add --mmproj if model has mmproj_filename
-        if overlay_model:
+        if overlay_model and not hf_repo_arg_overlay:
             mmproj_fn = _model_attr(overlay_model, "mmproj_filename")
-            hf_id_overlay = _model_attr(overlay_model, "huggingface_id")
             if mmproj_fn and hf_id_overlay:
                 from backend.huggingface import resolve_cached_model_path
+
                 mmproj_path = resolve_cached_model_path(hf_id_overlay, mmproj_fn)
                 if mmproj_path and os.path.exists(mmproj_path):
                     if not os.path.isabs(mmproj_path):
@@ -1052,14 +1120,20 @@ def _model_attr(m: Any, key: str, default: Any = None) -> Any:
         # The shared libraries are in the same directory as the binary
         library_path = build_dir
 
-        # Create the command with proper shell syntax for environment variables
+        # Create the command with proper shell syntax for environment variables.
+        if hf_repo_arg_overlay:
+            launcher = f"./{binary_name} --hf-repo {hf_repo_arg_overlay}"
+        else:
+            launcher = f"./{binary_name} --model {quoted_model_path}"
+
         cmd_with_env = (
-            f"bash -c 'cd {working_dir} && LD_LIBRARY_PATH={library_path} ./{binary_name} --model {model_path} "
+            f"bash -c 'cd {working_dir} && LD_LIBRARY_PATH={library_path} {launcher} "
             + " ".join(cmd_args[3:])
             + "'"
-        )  # Skip llama_server_path, --model, model_path, --port
+        )
 
-        config_data["models"][proxy_model_name] = {"cmd": cmd_with_env}
+        config_data["models"].pop(proxy_model_name, None)
+        config_data["models"][resolved_proxy_model_name] = {"cmd": cmd_with_env}
 
     # Add groups configuration to allow multiple models to run simultaneously
     # Note: This means models won't be unloaded when new ones start - user must manage memory
diff --git a/backend/llama_swap_manager.py b/backend/llama_swap_manager.py
index d08a0c1..7b75507 100644
--- a/backend/llama_swap_manager.py
+++ b/backend/llama_swap_manager.py
@@ -345,7 +345,6 @@ async def register_model(self, model: Any, config: Dict[str, Any]) -> str:
         """
         Registers a model with llama-swap by storing its configuration.
         Returns the proxy_model_name used by llama-swap.
-        Note: This only stores the model info, config is written separately.
         model can be a dict or an object with proxy_name, file_path, display_name/name.
         """
         proxy_name = model.get("proxy_name") if isinstance(model, dict) else getattr(model, "proxy_name", None)
@@ -365,6 +364,9 @@ async def register_model(self, model: Any, config: Dict[str, Any]) -> str:
             "config": config,
         }
 
+        # Persist the updated model registry immediately so llama-swap can watch and reload it.
+        await self._write_config()
+
         logger.info(
             f"Model '{name}' registered as '{proxy_name}' with llama-swap"
         )
diff --git a/backend/lmdeploy_installer.py b/backend/lmdeploy_installer.py
deleted file mode 100644
index 875b2f7..0000000
--- a/backend/lmdeploy_installer.py
+++ /dev/null
@@ -1,416 +0,0 @@
-import asyncio
-import json
-import os
-import shutil
-import subprocess
-import sys
-from asyncio.subprocess import PIPE, STDOUT
-from datetime import datetime, timezone
-from typing import Any, Awaitable, Dict, Optional
-
-from backend.logging_config import get_logger
-from backend.progress_manager import get_progress_manager
-
-
-def _utcnow() -> str:
-    return datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
-
-
-logger = get_logger(__name__)
-
-_installer_instance: Optional["LMDeployInstaller"] = None
-
-
-def get_lmdeploy_installer() -> "LMDeployInstaller":
-    global _installer_instance
-    if _installer_instance is None:
-        _installer_instance = LMDeployInstaller()
-    return _installer_instance
-
-
-class LMDeployInstaller:
-    """Install or remove LMDeploy inside the runtime environment on demand."""
-
-    def __init__(
-        self,
-        *,
-        log_path: Optional[str] = None,
-        state_path: Optional[str] = None,
-        base_dir: Optional[str] = None,
-    ) -> None:
-        self._lock = asyncio.Lock()
-        self._operation: Optional[str] = None
-        self._operation_started_at: Optional[str] = None
-        self._current_task: Optional[asyncio.Task] = None
-        self._last_error: Optional[str] = None
-        data_root = os.path.abspath("data")
-        base_path = base_dir or os.path.join(data_root, "lmdeploy")
-        self._base_dir = os.path.abspath(base_path)
-        self._venv_path = os.path.join(self._base_dir, "venv")
-        log_path = log_path or os.path.join(data_root, "logs", "lmdeploy_install.log")
-        state_path = state_path or os.path.join(
-            data_root, "configs", "lmdeploy_installer.json"
-        )
-        self._log_path = os.path.abspath(log_path)
-        self._state_path = os.path.abspath(state_path)
-        self._ensure_directories()
-
-    def _ensure_directories(self) -> None:
-        os.makedirs(self._base_dir, exist_ok=True)
-        os.makedirs(os.path.dirname(self._log_path), exist_ok=True)
-        os.makedirs(os.path.dirname(self._state_path), exist_ok=True)
-
-    def _venv_bin(self, executable: str) -> str:
-        if os.name == "nt":
-            exe = (
-                executable
-                if executable.lower().endswith(".exe")
-                else f"{executable}.exe"
-            )
-            return os.path.join(self._venv_path, "Scripts", exe)
-        return os.path.join(self._venv_path, "bin", executable)
-
-    def _venv_python(self) -> str:
-        return self._venv_bin("python")
-
-    def _ensure_venv(self) -> None:
-        python_path = self._venv_python()
-        if os.path.exists(python_path):
-            return
-        os.makedirs(self._base_dir, exist_ok=True)
-        try:
-            subprocess.run([sys.executable, "-m", "venv", self._venv_path], check=True)
-        except subprocess.CalledProcessError as exc:
-            raise RuntimeError(
-                f"Failed to create LMDeploy virtual environment: {exc}"
-            ) from exc
-
-    def _load_state(self) -> Dict[str, Any]:
-        if not os.path.exists(self._state_path):
-            return {}
-        try:
-            with open(self._state_path, "r", encoding="utf-8") as handle:
-                data = json.load(handle)
-                return data if isinstance(data, dict) else {}
-        except Exception as exc:
-            logger.warning(f"Failed to load LMDeploy installer state: {exc}")
-            return {}
-
-    def _save_state(self, state: Dict[str, Any]) -> None:
-        tmp_path = f"{self._state_path}.tmp"
-        with open(tmp_path, "w", encoding="utf-8") as handle:
-            json.dump(state, handle, indent=2)
-        os.replace(tmp_path, self._state_path)
-
-    def _detect_installed_version(self) -> Optional[str]:
-        python_exe = self._venv_python()
-        if not os.path.exists(python_exe):
-            return None
-        script = (
-            "import importlib, sys\n"
-            "try:\n"
-            "    from importlib import metadata\n"
-            "except ImportError:\n"
-            "    import importlib_metadata as metadata\n"
-            "try:\n"
-            "    print(metadata.version('lmdeploy'))\n"
-            "except metadata.PackageNotFoundError:\n"
-            "    sys.exit(1)\n"
-        )
-        try:
-            output = subprocess.check_output(
-                [python_exe, "-c", script], text=True
-            ).strip()
-            return output or None
-        except subprocess.CalledProcessError:
-            return None
-        except Exception as exc:  # pragma: no cover
-            logger.debug(f"Unable to determine LMDeploy version: {exc}")
-            return None
-
-    def _resolve_binary_path(self) -> Optional[str]:
-        override = os.getenv("LMDEPLOY_BIN")
-        if override:
-            override_path = os.path.abspath(os.path.expanduser(override))
-            if os.path.exists(override_path):
-                return override_path
-            resolved_override = shutil.which(override)
-            if resolved_override:
-                return resolved_override
-
-        candidate = self._venv_bin("lmdeploy")
-        if os.path.exists(candidate) and os.access(candidate, os.X_OK):
-            return os.path.abspath(candidate)
-
-        resolved = shutil.which("lmdeploy")
-        return resolved
-
-    def _update_installed_state(
-        self, installed: bool, version: Optional[str] = None
-    ) -> None:
-        state = self._load_state()
-        if installed:
-            state["installed_at"] = _utcnow()
-            if version:
-                state["installed_version"] = version
-            state["venv_path"] = self._venv_path
-        else:
-            state["installed_version"] = None
-            state["installed_at"] = None
-            state["removed_at"] = _utcnow()
-            state["venv_path"] = self._venv_path
-        self._save_state(state)
-
-    def _refresh_state_from_environment(self) -> None:
-        state = self._load_state()
-        version = self._detect_installed_version()
-        state["installed_version"] = version
-        if version is None:
-            state["removed_at"] = _utcnow()
-        state["venv_path"] = self._venv_path
-        self._save_state(state)
-
-    async def _run_pip(
-        self,
-        args: list[str],
-        operation: str,
-        ensure_venv: bool = True,
-        cwd: Optional[str] = None,
-    ) -> int:
-        if ensure_venv:
-            self._ensure_venv()
-        python_exe = self._venv_python()
-        if not os.path.exists(python_exe):
-            raise RuntimeError(
-                "LMDeploy virtual environment is missing; cannot run pip."
-            )
-        header = (
-            f"[{_utcnow()}] Starting LMDeploy {operation} via pip {' '.join(args)}\n"
-        )
-        with open(self._log_path, "w", encoding="utf-8") as log_file:
-            log_file.write(header)
-        process = await asyncio.create_subprocess_exec(
-            python_exe,
-            "-m",
-            "pip",
-            *args,
-            stdout=PIPE,
-            stderr=STDOUT,
-            cwd=cwd,
-        )
-
-        async def _stream_output() -> None:
-            if process.stdout is None:
-                return
-            with open(self._log_path, "a", encoding="utf-8", buffering=1) as log_file:
-                while True:
-                    chunk = await process.stdout.readline()
-                    if not chunk:
-                        break
-                    text = chunk.decode("utf-8", errors="replace")
-                    log_file.write(text)
-                    await self._broadcast_log_line(text.rstrip("\n"))
-
-        await asyncio.gather(process.wait(), _stream_output())
-        return process.returncode or 0
-
-    async def _broadcast_log_line(self, line: str) -> None:
-        try:
-            await get_progress_manager().broadcast(
-                {
-                    "type": "lmdeploy_install_log",
-                    "line": line,
-                    "timestamp": _utcnow(),
-                }
-            )
-        except Exception as exc:  # pragma: no cover
-            logger.debug(f"Failed to broadcast LMDeploy log line: {exc}")
-
-    async def _set_operation(self, operation: str) -> None:
-        self._operation = operation
-        self._operation_started_at = _utcnow()
-        self._last_error = None
-        await get_progress_manager().broadcast(
-            {
-                "type": "lmdeploy_install_status",
-                "status": operation,
-                "started_at": self._operation_started_at,
-            }
-        )
-
-    async def _finish_operation(self, success: bool, message: str = "") -> None:
-        payload = {
-            "type": "lmdeploy_install_status",
-            "status": "completed" if success else "failed",
-            "operation": self._operation,
-            "message": message,
-            "ended_at": _utcnow(),
-        }
-        await get_progress_manager().broadcast(payload)
-        self._operation = None
-        self._operation_started_at = None
-
-    def _create_task(self, coro: Awaitable[Any]) -> None:
-        loop = asyncio.get_running_loop()
-        task = loop.create_task(coro)
-        self._current_task = task
-
-        def _cleanup(fut: asyncio.Future) -> None:
-            try:
-                fut.result()
-            except Exception as exc:  # pragma: no cover - surfaced via status
-                logger.error(f"LMDeploy installer task error: {exc}")
-            finally:
-                self._current_task = None
-
-        task.add_done_callback(_cleanup)
-
-    async def install(
-        self, version: Optional[str] = None, force_reinstall: bool = False
-    ) -> Dict[str, Any]:
-        async with self._lock:
-            if self._operation:
-                raise RuntimeError(
-                    "Another LMDeploy installer operation is already running"
-                )
-            await self._set_operation("install")
-            args = ["install", "--upgrade"]
-            if force_reinstall:
-                args.append("--force-reinstall")
-            package = "lmdeploy"
-            if version:
-                package = f"lmdeploy=={version}"
-            args.append(package)
-
-            async def _runner():
-                try:
-                    code = await self._run_pip(args, "install")
-                    if code != 0:
-                        raise RuntimeError(f"pip exited with status {code}")
-                    detected_version = self._detect_installed_version()
-                    self._update_installed_state(True, detected_version)
-                    await self._finish_operation(True, "LMDeploy installed")
-                except Exception as exc:
-                    self._last_error = str(exc)
-                    self._refresh_state_from_environment()
-                    await self._finish_operation(False, str(exc))
-
-            self._create_task(_runner())
-            return {"message": "LMDeploy installation started"}
-
-    async def install_from_source(
-        self,
-        repo_url: str = "https://github.com/InternLM/lmdeploy.git",
-        branch: str = "main",
-    ) -> Dict[str, Any]:
-        """Install LMDeploy from a git repo and branch (for development)."""
-        async with self._lock:
-            if self._operation:
-                raise RuntimeError(
-                    "Another LMDeploy installer operation is already running"
-                )
-            await self._set_operation("install_source")
-            clone_dir = os.path.join(self._base_dir, "source")
-            async def _runner():
-                try:
-                    self._ensure_venv()
-                    if os.path.exists(clone_dir):
-                        shutil.rmtree(clone_dir)
-                    os.makedirs(clone_dir, exist_ok=True)
-                    proc = await asyncio.create_subprocess_exec(
-                        "git", "clone", "--depth", "1", "--branch", branch, repo_url, clone_dir,
-                        stdout=PIPE, stderr=STDOUT,
-                    )
-                    await proc.wait()
-                    if proc.returncode != 0:
-                        raise RuntimeError(f"git clone failed with code {proc.returncode}")
-                    code = await self._run_pip(
-                        ["install", "-e", "."],
-                        "install_source",
-                        cwd=clone_dir,
-                    )
-                    if code != 0:
-                        raise RuntimeError(f"pip install -e . failed with code {code}")
-                    detected = self._detect_installed_version()
-                    self._update_installed_state(True, detected)
-                    from backend.data_store import get_store
-                    get_store().update_lmdeploy({
-                        "install_type": "source",
-                        "source_repo": repo_url,
-                        "source_branch": branch,
-                    })
-                    await self._finish_operation(True, f"Installed from {branch}")
-                except Exception as exc:
-                    self._last_error = str(exc)
-                    self._refresh_state_from_environment()
-                    await self._finish_operation(False, str(exc))
-            self._create_task(_runner())
-            return {"message": "LMDeploy install from source started", "repo": repo_url, "branch": branch}
-
-    async def remove(self) -> Dict[str, Any]:
-        async with self._lock:
-            if self._operation:
-                raise RuntimeError(
-                    "Another LMDeploy installer operation is already running"
-                )
-            await self._set_operation("remove")
-            args = ["uninstall", "-y", "lmdeploy"]
-
-            async def _runner():
-                try:
-                    python_exists = os.path.exists(self._venv_python())
-                    if python_exists:
-                        code = await self._run_pip(args, "remove", ensure_venv=False)
-                        if code != 0:
-                            raise RuntimeError(f"pip exited with status {code}")
-                    shutil.rmtree(self._venv_path, ignore_errors=True)
-                    self._update_installed_state(False)
-                    await self._finish_operation(True, "LMDeploy removed")
-                except Exception as exc:
-                    self._last_error = str(exc)
-                    self._refresh_state_from_environment()
-                    await self._finish_operation(False, str(exc))
-
-            self._create_task(_runner())
-            return {"message": "LMDeploy removal started"}
-
-    def status(self) -> Dict[str, Any]:
-        version = self._detect_installed_version()
-        binary_path = self._resolve_binary_path()
-        installed = version is not None and binary_path is not None
-        state = self._load_state()
-        return {
-            "installed": installed,
-            "version": version,
-            "binary_path": binary_path,
-            "venv_path": state.get("venv_path") or self._venv_path,
-            "installed_at": state.get("installed_at"),
-            "removed_at": state.get("removed_at"),
-            "operation": self._operation,
-            "operation_started_at": self._operation_started_at,
-            "last_error": self._last_error,
-            "log_path": self._log_path,
-        }
-
-    async def _broadcast_status(self) -> None:
-        """Broadcast current status via SSE."""
-        try:
-            status_data = self.status()
-            get_progress_manager().emit("lmdeploy_status", {**status_data, "timestamp": _utcnow()})
-        except Exception as exc:
-            logger.debug(f"Failed to broadcast LMDeploy status: {exc}")
-
-    def is_operation_running(self) -> bool:
-        return self._operation is not None
-
-    def read_log_tail(self, max_bytes: int = 8192) -> str:
-        if not os.path.exists(self._log_path):
-            return ""
-        with open(self._log_path, "rb") as log_file:
-            log_file.seek(0, os.SEEK_END)
-            size = log_file.tell()
-            log_file.seek(max(0, size - max_bytes))
-            data = log_file.read().decode("utf-8", errors="replace")
-            if size > max_bytes:
-                data = data.split("\n", 1)[-1]
-            return data.strip()
diff --git a/backend/lmdeploy_manager.py b/backend/lmdeploy_manager.py
index 6328d71..c6aa4c9 100644
--- a/backend/lmdeploy_manager.py
+++ b/backend/lmdeploy_manager.py
@@ -1,841 +1,479 @@
 import asyncio
 import json
 import os
-import shlex
 import shutil
-from datetime import datetime
-from typing import Optional, Dict, Any, List
-
-import httpx
-import psutil
-from asyncio.subprocess import Process, STDOUT
+import subprocess
+import sys
+from asyncio.subprocess import PIPE, STDOUT
+from datetime import datetime, timezone
+from typing import Any, Awaitable, Dict, Optional
 
 from backend.logging_config import get_logger
-from backend.data_store import get_store
-from backend.huggingface import DEFAULT_LMDEPLOY_CONTEXT, MAX_LMDEPLOY_CONTEXT
 from backend.progress_manager import get_progress_manager
+from backend.data_store import get_store
+
+
+def _utcnow() -> str:  # current UTC time, ISO-8601 with a trailing "Z"
+  return datetime.now(tz=timezone.utc).isoformat().removesuffix("+00:00") + "Z"
+
 
 logger = get_logger(__name__)
 
-_lmdeploy_manager_instance: Optional["LMDeployManager"] = None
+_manager_instance: Optional["LMDeployManager"] = None
 
 
 def get_lmdeploy_manager() -> "LMDeployManager":
-    """Return singleton LMDeploy manager."""
-    global _lmdeploy_manager_instance
-    if _lmdeploy_manager_instance is None:
-        _lmdeploy_manager_instance = LMDeployManager()
-    return _lmdeploy_manager_instance
+  """Return the shared LMDeployManager (llama manager pattern), built on first call."""
+  global _manager_instance
+  if _manager_instance is None:
+    _manager_instance = LMDeployManager()
+  return _manager_instance
 
 
 class LMDeployManager:
-    """Manage LMDeploy TurboMind runtime lifecycle."""
-
-    def __init__(
-        self,
-        binary_path: Optional[str] = None,
-        host: str = "0.0.0.0",
-        port: int = 2001,
-    ):
-        self.binary_path = binary_path or os.getenv("LMDEPLOY_BIN", "lmdeploy")
-        self.host = host
-        self.port = int(os.getenv("LMDEPLOY_PORT", port))
-        self._process: Optional[Process] = None
-        self._log_file = None
-        self._lock = asyncio.Lock()
-        self._current_instance: Optional[Dict[str, Any]] = None
-        self._started_at: Optional[str] = None
-        self._log_path = os.path.join("data", "logs", "lmdeploy.log")
-        self._health_timeout = 180  # seconds
-        self._last_health_status: Optional[Dict[str, Any]] = None
-        self._last_detected_external: Optional[Dict[str, Any]] = None
-        self._last_broadcast_log_position = 0
-
-    async def start(
-        self, model_entry: Dict[str, Any], config: Dict[str, Any]
-    ) -> Dict[str, Any]:
-        """Start LMDeploy serving the provided model. Only one model may run at once."""
-        async with self._lock:
-            if self._process and self._process.returncode is None:
-                raise RuntimeError("LMDeploy runtime is already running")
-
-            model_path = model_entry.get("file_path")
-            if not model_path or not os.path.exists(model_path):
-                raise FileNotFoundError(f"Model file not found at {model_path}")
-            model_dir = model_entry.get("model_dir") or os.path.dirname(model_path)
-            if not os.path.isdir(model_dir):
-                raise FileNotFoundError(f"Model directory not found at {model_dir}")
-            model_dir_abs = os.path.abspath(model_dir)
-
-            # Derive a stable model name for LMDeploy's --model-name flag.
-            # Preference order:
-            # 1) Explicit model_name passed in model_entry
-            # 2) Base model / display name from model_entry
-            # 3) Hugging Face repo id
-            # 4) Directory name
-            model_name = (
-                model_entry.get("model_name")
-                or model_entry.get("display_name")
-                or model_entry.get("huggingface_id")
-                or os.path.basename(model_dir_abs.rstrip(os.sep))
-            )
-
-            # Inject model_name into config passed to LMDeploy so the command builder
-            # can add --model-name and we persist it in status/config reflection.
-            effective_config = dict(config or {})
-            if model_name and not effective_config.get("model_name"):
-                effective_config["model_name"] = model_name
-
-            binary = self._resolve_binary()
-            command = self._build_command(binary, model_dir_abs, effective_config)
-            env = os.environ.copy()
-            env.setdefault("LMDEPLOY_LOG_DIR", os.path.dirname(self._log_path))
-            os.makedirs(os.path.dirname(self._log_path), exist_ok=True)
-            self._log_file = open(self._log_path, "ab", buffering=0)
-
-            logger.info(f"Starting LMDeploy with command: {' '.join(command)}")
-            self._process = await asyncio.create_subprocess_exec(
-                *command,
-                stdout=self._log_file,
-                stderr=STDOUT,
-                cwd=model_dir_abs,
-                env=env,
-            )
-            self._started_at = datetime.utcnow().isoformat() + "Z"
-            self._current_instance = {
-                "model_id": model_entry.get("model_id"),
-                "huggingface_id": model_entry.get("huggingface_id"),
-                "file_path": model_path,
-                "config": effective_config,
-                "pid": self._process.pid,
-            }
-
+  """
+  Manage LMDeploy installation into its own venv, similar in spirit to LlamaManager.
+
+  Responsibilities:
+  - Create a dedicated venv under data/lmdeploy
+  - Install LMDeploy from PyPI (release) or from a git source checkout
+  - Track install status, version, binary path and venv path
+  - Emit progress events so the UI can show logs and status
+  """
+
+  def __init__(
+    self,
+    *,
+    log_path: Optional[str] = None,
+    state_path: Optional[str] = None,
+    base_dir: Optional[str] = None,
+  ) -> None:
+    """Resolve install/log/state paths (defaults under ./data) and start idle."""
+    data_dir = os.path.abspath("data")
+    if not log_path:
+      log_path = os.path.join(data_dir, "logs", "lmdeploy_install.log")
+    if not state_path:
+      state_path = os.path.join(data_dir, "configs", "lmdeploy_manager.json")
+    # Versioned installs are created beneath _root_dir; until one is prepared,
+    # _base_dir/_venv_path point at the unversioned fallback location.
+    self._root_dir = os.path.abspath(base_dir or os.path.join(data_dir, "lmdeploy"))
+    self._base_dir = self._root_dir
+    self._venv_path = os.path.join(self._base_dir, "venv")
+    self._log_path = os.path.abspath(log_path)
+    self._state_path = os.path.abspath(state_path)
+
+    self._lock = asyncio.Lock()
+    self._operation: Optional[str] = None
+    self._operation_started_at: Optional[str] = None
+    self._current_task: Optional[asyncio.Task] = None
+    self._last_error: Optional[str] = None
+    self._ensure_directories()
+
+  # --- Venv and filesystem helpers -------------------------------------------------
+
+  def _ensure_directories(self) -> None:  # idempotent; safe to call repeatedly
+    parents = [os.path.dirname(p) for p in (self._log_path, self._state_path)]
+    for directory in (self._base_dir, *parents):
+      os.makedirs(directory, exist_ok=True)
+
+  def _venv_bin(self, executable: str) -> str:
+    if os.name != "nt":  # POSIX venv layout: <venv>/bin/<name>
+      return os.path.join(self._venv_path, "bin", executable)
+    suffix = "" if executable.lower().endswith(".exe") else ".exe"
+    return os.path.join(self._venv_path, "Scripts", f"{executable}{suffix}")
+
+  def _venv_python(self) -> str:  # path of the venv's own interpreter
+    return self._venv_bin("python")
+
+  def _prepare_versioned_paths(self, label: str = "") -> str:
+    """
+    Point this manager at a fresh timestamped directory below the root.
+
+    Rebinds `_base_dir`/`_venv_path` and creates the directory. Returns the
+    directory name, e.g. '20250309-123456-pip' (UTC stamp plus optional label).
+    """
+    stamp = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
+    dir_name = f"{stamp}-{label}" if label else stamp
+    # The venv helpers read these attributes, so later calls hit the new dir.
+    self._base_dir = os.path.join(self._root_dir, dir_name)
+    self._venv_path = os.path.join(self._base_dir, "venv")
+    self._ensure_directories()
+    return dir_name
+
+  def _ensure_venv(self) -> None:
+    """Create the venv with the running interpreter unless its python exists."""
+    if not os.path.exists(self._venv_python()):
+      os.makedirs(self._base_dir, exist_ok=True)
+      venv_cmd = [sys.executable, "-m", "venv", self._venv_path]
+      try:
+        subprocess.run(venv_cmd, check=True)
+      except subprocess.CalledProcessError as exc:
+        raise RuntimeError(f"Failed to create LMDeploy virtual environment: {exc}") from exc
+
+  # --- State persistence -----------------------------------------------------------
+
+  def _load_state(self) -> Dict[str, Any]:
+    """Read the JSON state file; missing, unreadable or non-dict content gives {}."""
+    loaded: Any = None
+    if os.path.exists(self._state_path):
+      try:
+        with open(self._state_path, "r", encoding="utf-8") as state_file:
+          loaded = json.load(state_file)
+      except Exception as exc:
+        logger.warning(f"Failed to load LMDeploy manager state: {exc}")
+    return loaded if isinstance(loaded, dict) else {}
+
+  def _save_state(self, state: Dict[str, Any]) -> None:
+    staging = self._state_path + ".tmp"  # write a sibling file, then swap it in
+    with open(staging, mode="w", encoding="utf-8") as out:
+      json.dump(state, out, indent=2)
+    os.replace(staging, self._state_path)
+
+  def _detect_installed_version(self) -> Optional[str]:
+    """Ask the venv's python for lmdeploy's package version; None if unavailable."""
+    interpreter = self._venv_python()
+    if os.path.exists(interpreter):
+      probe = (
+        "import importlib, sys\n"
+        "try:\n"
+        "    from importlib import metadata\n"
+        "except ImportError:\n"
+        "    import importlib_metadata as metadata\n"
+        "try:\n"
+        "    print(metadata.version('lmdeploy'))\n"
+        "except metadata.PackageNotFoundError:\n"
+        "    sys.exit(1)\n"
+      )
+      try:
+        reported = subprocess.check_output([interpreter, "-c", probe], text=True)
+        return reported.strip() or None
+      except subprocess.CalledProcessError:
+        pass  # probe exited non-zero (e.g. package not installed)
+      except Exception as exc:  # pragma: no cover
+        logger.debug(f"Unable to determine LMDeploy version: {exc}")
+    return None
+
+  def _resolve_binary_path(self) -> Optional[str]:
+    """Locate the lmdeploy executable: $LMDEPLOY_BIN, then the venv, then PATH."""
+    configured = os.environ.get("LMDEPLOY_BIN")
+    if configured:
+      expanded = os.path.abspath(os.path.expanduser(configured))
+      # A path that exists wins; otherwise treat the value as a command name.
+      found = expanded if os.path.exists(expanded) else shutil.which(configured)
+      if found:
+        return found
+
+    venv_binary = self._venv_bin("lmdeploy")
+    if os.path.exists(venv_binary) and os.access(venv_binary, os.X_OK):
+      return os.path.abspath(venv_binary)
+
+    return shutil.which("lmdeploy")
+
+  def _update_installed_state(self, installed: bool, version: Optional[str]) -> None:
+    """Persist an install (timestamp + version) or a removal in the state file."""
+    state = self._load_state()
+    now = _utcnow()
+    if installed:
+      changes = {"installed_at": now, "installed_version": version}
+    else:
+      # Removal clears the install fields and stamps when it happened.
+      changes = {"installed_version": None, "installed_at": None, "removed_at": now}
+    state.update(changes)
+    state["venv_path"] = self._venv_path
+    self._save_state(state)
+
+  def _refresh_state_from_environment(self) -> None:
+    """Re-probe the venv and record what it reports; None also stamps removed_at."""
+    state = self._load_state()
+    state["installed_version"] = detected = self._detect_installed_version()
+    if detected is None:
+      state["removed_at"] = _utcnow()
+    state["venv_path"] = self._venv_path
+    self._save_state(state)
+
+  # --- PIP helpers and progress broadcasting --------------------------------------
+
+  async def _run_pip(
+    self,
+    args: list[str],
+    operation: str,
+    ensure_venv: bool = True,
+    cwd: Optional[str] = None,
+  ) -> int:
+    """Run `python -m pip <args>` with the venv interpreter and stream its output.
+
+    The install log is truncated and restarted with a header naming `operation`;
+    every output line is appended to it and broadcast. `cwd` is the working
+    directory for pip. Returns pip's exit status (0 when none is reported).
+    Raises RuntimeError if the venv cannot be created or has no interpreter.
+    """
+    if ensure_venv:
+      # Venv creation is a blocking subprocess; keep it off the event loop.
+      await asyncio.to_thread(self._ensure_venv)
+    python_exe = self._venv_python()
+    if not os.path.exists(python_exe):
+      raise RuntimeError("LMDeploy virtual environment is missing; cannot run pip.")
+
+    header = f"[{_utcnow()}] Starting LMDeploy {operation} via pip {' '.join(args)}\n"
+    with open(self._log_path, "w", encoding="utf-8") as log_file:
+      log_file.write(header)
+
+    process = await asyncio.create_subprocess_exec(
+      python_exe, "-m", "pip", *args, stdout=PIPE, stderr=STDOUT, cwd=cwd
+    )
+
+    async def _stream_output() -> None:
+      if process.stdout is None:
+        return
+      with open(self._log_path, "a", encoding="utf-8", buffering=1) as log_file:
+        while chunk := await process.stdout.readline():
+          text = chunk.decode("utf-8", errors="replace")
+          log_file.write(text)
+          await self._broadcast_log_line(text.rstrip("\n"))
+
+    # Drain stdout while waiting so pip never stalls on a full pipe.
+    await asyncio.gather(process.wait(), _stream_output())
+    return process.returncode or 0
+
+  async def _broadcast_log_line(self, line: str) -> None:
+    """Best-effort broadcast of one install-log line via the progress manager."""
+    event = {"type": "lmdeploy_install_log", "line": line, "timestamp": _utcnow()}
+    try:
+      await get_progress_manager().broadcast(event)
+    except Exception as exc:  # pragma: no cover
+      logger.debug(f"Failed to broadcast LMDeploy log line: {exc}")
+
+  async def _set_operation(self, operation: str) -> None:
+    """Mark `operation` as running and announce it; roll back if that fails."""
+    self._operation = operation
+    self._operation_started_at = started = _utcnow()
+    self._last_error = None
+    event = {"type": "lmdeploy_install_status", "status": operation, "started_at": started}
+    try:
+      await get_progress_manager().broadcast(event)
+    except BaseException:
+      self._operation = self._operation_started_at = None  # else stuck "busy" forever
+      raise
+
+  async def _finish_operation(self, success: bool, message: str = "") -> None:
+    """Broadcast the outcome of the current operation, then always go idle."""
+    status = "completed" if success else "failed"
+    payload: Dict[str, Any] = {"type": "lmdeploy_install_status", "status": status}
+    payload.update(operation=self._operation, message=message, ended_at=_utcnow())
+    try:
+      await get_progress_manager().broadcast(payload)
+    finally:
+      # Reset even when the broadcast fails, or the manager stays "busy" forever.
+      self._operation = None
+      self._operation_started_at = None
+
+  def _create_task(self, coro: Awaitable[Any]) -> None:
+    """Schedule `coro` on the running loop and track it in `_current_task`."""
+    task = asyncio.get_running_loop().create_task(coro)
+    self._current_task = task
+
+    def _cleanup(fut: asyncio.Future) -> None:
+      self._current_task = None
+      if fut.cancelled():
+        # result() would raise CancelledError here, which is not an Exception.
+        logger.debug("LMDeploy manager task was cancelled")
+      elif (exc := fut.exception()) is not None:
+        logger.error(f"LMDeploy manager task error: {exc}")
+
+    task.add_done_callback(_cleanup)
+
+  # --- Public interface -----------------------------------------------------------
+
+  async def install_release(
+    self, version: Optional[str] = None, force_reinstall: bool = False
+  ) -> Dict[str, Any]:
+    """Install LMDeploy from PyPI into its own venv."""
+    async with self._lock:
+      if self._operation:
+        raise RuntimeError("Another LMDeploy operation is already running")
+      await self._set_operation("install")
+      # Create a fresh, versioned install directory for this LMDeploy release.
+      self._prepare_versioned_paths(label="pip")
+      args = ["install", "--upgrade"]
+      if force_reinstall:
+        args.append("--force-reinstall")
+      package = "lmdeploy"
+      if version:
+        package = f"lmdeploy=={version}"
+      args.append(package)
+
+      async def _runner():
         try:
-            await self._wait_for_ready()
+          code = await self._run_pip(args, "install")
+          if code != 0:
+            raise RuntimeError(f"pip exited with status {code}")
+          detected_version = await asyncio.to_thread(self._detect_installed_version)
+          self._update_installed_state(True, detected_version)
+          # Register LMDeploy in engines.yaml like llama_cpp (read by llama-swap config).
+          try:
+            store = get_store()
+            version_name = detected_version or f"pip-{_utcnow()}"
+            meta: Dict[str, Any] = {
+              "version": version_name,
+              "install_type": "pip",
+              "venv_path": self._venv_path,
+              "installed_at": _utcnow(),
+            }
+            store.add_engine_version("lmdeploy", meta)
+            store.set_active_engine_version("lmdeploy", version_name)
+          except Exception as exc:
+            # Best effort, but visible: the install is still reported as successful.
+            logger.warning(f"Failed to persist LMDeploy engine metadata: {exc}")
+          await self._finish_operation(True, "LMDeploy installed")
         except Exception as exc:
-            await self.stop(force=True)
-            raise exc
-
-        return self.status()
-
-    async def stop(self, force: bool = False) -> None:
-        """Stop LMDeploy process if running."""
-        async with self._lock:
-            if not self._process:
-                return
-            if self._process.returncode is None:
-                try:
-                    self._process.terminate()
-                    await asyncio.wait_for(self._process.wait(), timeout=30)
-                except asyncio.TimeoutError:
-                    logger.warning(
-                        "LMDeploy did not terminate gracefully; killing process"
-                    )
-                    self._process.kill()
-                    await self._process.wait()
-                except ProcessLookupError:
-                    logger.debug("LMDeploy process already stopped")
-            elif force:
-                try:
-                    self._process.kill()
-                except ProcessLookupError:
-                    pass
-            self._cleanup_process_state()
-
-    async def restart(
-        self, model_entry: Dict[str, Any], config: Dict[str, Any]
-    ) -> Dict[str, Any]:
-        """Restart LMDeploy with a new model/config."""
-        await self.stop()
-        return await self.start(model_entry, config)
-
-    def status(self) -> Dict[str, Any]:
-        """Return status payload describing the running instance."""
-        running = bool(self._process and self._process.returncode is None)
-        detection = None
-        if not running:
-            detection = self._detect_external_process()
-            if detection:
-                running = True
-                self._last_detected_external = detection
-                if not self._current_instance:
-                    self._current_instance = detection.get("instance")
-                if not self._started_at:
-                    self._started_at = detection.get("started_at")
-            else:
-                self._last_detected_external = None
-        else:
-            self._last_detected_external = None
-
-        return {
-            "running": running,
-            "port": self.port,
-            "host": self.host,
-            "process_id": self._process.pid if running else None,
-            "started_at": self._started_at,
-            "current_instance": self._current_instance if running else None,
-            "health": self._last_health_status,
-            "binary_path": self._current_binary_path(),
-            "log_path": self._log_path,
-            "auto_detected": bool(detection),
-            "detection": detection,
-        }
-
-    def _current_binary_path(self) -> Optional[str]:
-        try:
-            return self._resolve_binary()
-        except FileNotFoundError:
-            return None
-
-    def _resolve_binary(self) -> str:
+          self._last_error = str(exc)
+          self._refresh_state_from_environment()
+          await self._finish_operation(False, str(exc))
+
+      self._create_task(_runner())
+      return {"message": "LMDeploy installation started"}
+
+  async def install_from_source(
+    self,
+    repo_url: str = "https://github.com/InternLM/lmdeploy.git",
+    branch: str = "main",
+  ) -> Dict[str, Any]:
+    """Install LMDeploy from a git repo and branch (for development)."""
+    async with self._lock:
+      if self._operation:
+        raise RuntimeError("Another LMDeploy operation is already running")
+      await self._set_operation("install_source")
+      # Create a fresh, versioned install directory for this LMDeploy source build.
+      self._prepare_versioned_paths(label="source")
+      clone_dir = os.path.join(self._base_dir, "source")
+
+      async def _runner():
         try:
-            from backend.lmdeploy_installer import get_lmdeploy_installer
-
-            installer_binary = get_lmdeploy_installer().status().get("binary_path")
-            if installer_binary and os.path.exists(installer_binary):
-                return installer_binary
+          await asyncio.to_thread(self._ensure_venv)
+          if os.path.exists(clone_dir):
+            shutil.rmtree(clone_dir)
+          os.makedirs(clone_dir, exist_ok=True)
+          proc = await asyncio.create_subprocess_exec(
+            "git",
+            "clone",
+            "--depth",
+            "1",
+            "--branch",
+            branch,
+            "--",  # end of options: the URL below is never parsed as a flag
+            repo_url,
+            clone_dir,
+            stdout=PIPE,
+            stderr=STDOUT,
+          )
+          # communicate() drains the pipe; wait() alone can deadlock once it fills.
+          await proc.communicate()
+          if proc.returncode != 0:
+            raise RuntimeError(f"git clone failed with code {proc.returncode}")
+          code = await self._run_pip(["install", "-e", "."], "install_source", cwd=clone_dir)
+          if code != 0:
+            raise RuntimeError(f"pip install -e . failed with code {code}")
+          detected = await asyncio.to_thread(self._detect_installed_version)
+          self._update_installed_state(True, detected)
+          try:
+            store = get_store()
+            base_version = detected or branch or "source"
+            version_name = f"{base_version}-{_utcnow()}"
+            meta: Dict[str, Any] = {
+              "version": version_name,
+              "install_type": "source",
+              "source_repo": repo_url,
+              "source_branch": branch,
+              "venv_path": self._venv_path,
+              "installed_at": _utcnow(),
+            }
+            store.add_engine_version("lmdeploy", meta)
+            store.set_active_engine_version("lmdeploy", version_name)
+          except Exception as exc:
+            logger.warning(f"Failed to persist LMDeploy engine metadata (source): {exc}")
+          await self._finish_operation(True, f"Installed from {branch}")
         except Exception as exc:
-            logger.debug(
-                f"Failed to resolve LMDeploy binary via installer status: {exc}"
-            )
-
-        resolved = shutil.which(self.binary_path)
-        if resolved:
-            return resolved
-
-        candidate = os.path.expanduser(self.binary_path)
-        if os.path.isabs(candidate) and os.path.exists(candidate):
-            return candidate
-        raise FileNotFoundError(
-            "LMDeploy binary not found in PATH. Install LMDeploy from the LMDeploy page or set LMDEPLOY_BIN."
-        )
-
-    def _build_command(
-        self, binary: str, model_dir: str, config: Dict[str, Any]
-    ) -> list:
-        """Convert stored config into lmdeploy CLI arguments."""
-        tensor_parallel = max(1, int(config.get("tensor_parallel") or 1))
-        base_session_len = max(
-            1024,
-            int(
-                config.get("session_len")
-                or config.get("context_length")
-                or DEFAULT_LMDEPLOY_CONTEXT
-            ),
-        )
-        rope_scaling_mode = str(config.get("rope_scaling_mode") or "disabled").lower()
-        rope_scaling_factor = float(config.get("rope_scaling_factor") or 1.0)
-        scaling_enabled = (
-            rope_scaling_mode not in {"", "none", "disabled"}
-            and rope_scaling_factor > 1.0
-        )
-        effective_session_len = base_session_len
-        if scaling_enabled:
-            scaled = int(base_session_len * rope_scaling_factor)
-            effective_session_len = max(
-                base_session_len, min(scaled, MAX_LMDEPLOY_CONTEXT)
-            )
-        max_batch_size = max(1, int(config.get("max_batch_size") or 4))
-        base_prefill = int(
-            config.get("max_prefill_token_num")
-            or config.get("max_batch_tokens")
-            or (base_session_len * 2)
-        )
-        if scaling_enabled:
-            scaled_prefill = int(base_prefill * rope_scaling_factor)
-            max_prefill_token_num = scaled_prefill
-        else:
-            max_prefill_token_num = base_prefill
-
-        command = [
-            binary,
-            "serve",
-            "api_server",
-            model_dir,
-            "--backend",
-            "turbomind",
-            "--server-name",
-            self.host,
-            "--server-port",
-            str(self.port),
-            "--tp",
-            str(tensor_parallel),
-            "--session-len",
-            str(effective_session_len),
-            "--max-batch-size",
-            str(max_batch_size),
-        ]
-
-        # Optional model identity for OpenAI-style /v1/models listing
-        model_name = config.get("model_name")
-        if model_name and str(model_name).strip():
-            command.extend(["--model-name", str(model_name).strip()])
-
-        # Optional inference settings
-        dtype = config.get("dtype")
-        if dtype and str(dtype).strip():
-            command.extend(["--dtype", str(dtype).strip()])
-        if max_prefill_token_num:
-            command.extend(["--max-prefill-token-num", str(max_prefill_token_num)])
-        cache_max_entry_count = config.get("cache_max_entry_count")
-        if cache_max_entry_count is not None:
-            command.extend(["--cache-max-entry-count", str(cache_max_entry_count)])
-        cache_block_seq_len = config.get("cache_block_seq_len")
-        if cache_block_seq_len:
-            command.extend(["--cache-block-seq-len", str(cache_block_seq_len)])
-        if config.get("enable_prefix_caching"):
-            command.append("--enable-prefix-caching")
-        quant_policy = config.get("quant_policy")
-        if quant_policy is not None:
-            command.extend(["--quant-policy", str(quant_policy)])
-        model_format = config.get("model_format")
-        if model_format and str(model_format).strip():
-            command.extend(["--model-format", str(model_format).strip()])
-        hf_overrides = config.get("hf_overrides")
-        if isinstance(hf_overrides, dict) and hf_overrides:
-
-            def _flatten(prefix: str, value: Any):
-                if isinstance(value, dict):
-                    for key, nested in value.items():
-                        if not isinstance(key, str) or not key:
-                            continue
-                        new_prefix = f"{prefix}.{key}" if prefix else key
-                        yield from _flatten(new_prefix, nested)
-                else:
-                    yield prefix, value
-
-            def _format_override_value(val: Any) -> str:
-                if isinstance(val, bool):
-                    return "true" if val else "false"
-                if val is None:
-                    return "null"
-                return str(val)
-
-            for path, value in _flatten("", hf_overrides):
-                if not path:
-                    continue
-                command.extend(
-                    [f"--hf-overrides.{path}", _format_override_value(value)]
-                )
-        elif isinstance(hf_overrides, str) and hf_overrides.strip():
-            command.extend(["--hf-overrides", hf_overrides.strip()])
-        # LMDeploy uses --disable-metrics (inverted logic)
-        # When enable_metrics=false, send --disable-metrics
-        # When enable_metrics=true (default), don't send anything (metrics enabled by default)
-        if not config.get("enable_metrics", True):
-            command.append("--disable-metrics")
-        if scaling_enabled:
-            command.extend(["--rope-scaling-factor", str(rope_scaling_factor)])
-        num_tokens_per_iter = config.get("num_tokens_per_iter")
-        if num_tokens_per_iter:
-            command.extend(["--num-tokens-per-iter", str(num_tokens_per_iter)])
-        max_prefill_iters = config.get("max_prefill_iters")
-        if max_prefill_iters:
-            command.extend(["--max-prefill-iters", str(max_prefill_iters)])
-        communicator = config.get("communicator")
-        if communicator and str(communicator).strip():
-            command.extend(["--communicator", str(communicator).strip()])
-
-        # Server configuration parameters
-        allow_origins = config.get("allow_origins")
-        if allow_origins:
-            if isinstance(allow_origins, list):
-                command.extend(
-                    ["--allow-origins"] + [str(origin) for origin in allow_origins]
-                )
-            elif isinstance(allow_origins, str):
-                command.extend(["--allow-origins", allow_origins])
-        if config.get("allow_credentials"):
-            command.append("--allow-credentials")
-        allow_methods = config.get("allow_methods")
-        if allow_methods:
-            if isinstance(allow_methods, list):
-                command.extend(
-                    ["--allow-methods"] + [str(method) for method in allow_methods]
-                )
-            elif isinstance(allow_methods, str):
-                command.extend(["--allow-methods", allow_methods])
-        allow_headers = config.get("allow_headers")
-        if allow_headers:
-            if isinstance(allow_headers, list):
-                command.extend(
-                    ["--allow-headers"] + [str(header) for header in allow_headers]
-                )
-            elif isinstance(allow_headers, str):
-                command.extend(["--allow-headers", allow_headers])
-        proxy_url = config.get("proxy_url")
-        if proxy_url and str(proxy_url).strip():
-            command.extend(["--proxy-url", str(proxy_url).strip()])
-        max_concurrent_requests = config.get("max_concurrent_requests")
-        if max_concurrent_requests is not None:
-            command.extend(
-                ["--max-concurrent-requests", str(int(max_concurrent_requests))]
-            )
-        log_level = config.get("log_level")
-        if log_level and str(log_level).strip():
-            command.extend(["--log-level", str(log_level).strip()])
-        api_keys = config.get("api_keys")
-        if api_keys:
-            if isinstance(api_keys, list):
-                command.extend(["--api-keys"] + [str(key) for key in api_keys])
-            elif isinstance(api_keys, str):
-                command.extend(["--api-keys", api_keys])
-        if config.get("ssl"):
-            command.append("--ssl")
-        max_log_len = config.get("max_log_len")
-        if max_log_len is not None:
-            command.extend(["--max-log-len", str(int(max_log_len))])
-        if config.get("disable_fastapi_docs"):
-            command.append("--disable-fastapi-docs")
-        if config.get("allow_terminate_by_client"):
-            command.append("--allow-terminate-by-client")
-        if config.get("enable_abort_handling"):
-            command.append("--enable-abort-handling")
-
-        # Model configuration parameters
-        chat_template = config.get("chat_template")
-        if chat_template and str(chat_template).strip():
-            command.extend(["--chat-template", str(chat_template).strip()])
-        tool_call_parser = config.get("tool_call_parser")
-        if tool_call_parser and str(tool_call_parser).strip():
-            command.extend(["--tool-call-parser", str(tool_call_parser).strip()])
-        reasoning_parser = config.get("reasoning_parser")
-        if reasoning_parser and str(reasoning_parser).strip():
-            command.extend(["--reasoning-parser", str(reasoning_parser).strip()])
-        revision = config.get("revision")
-        if revision and str(revision).strip():
-            command.extend(["--revision", str(revision).strip()])
-        download_dir = config.get("download_dir")
-        if download_dir and str(download_dir).strip():
-            command.extend(["--download-dir", str(download_dir).strip()])
-        adapters = config.get("adapters")
-        if adapters:
-            if isinstance(adapters, list):
-                command.extend(["--adapters"] + [str(adapter) for adapter in adapters])
-            elif isinstance(adapters, str):
-                command.extend(["--adapters", adapters])
-        device = config.get("device")
-        if device and str(device).strip():
-            command.extend(["--device", str(device).strip()])
-        if config.get("eager_mode"):
-            command.append("--eager-mode")
-        if config.get("disable_vision_encoder"):
-            command.append("--disable-vision-encoder")
-        logprobs_mode = config.get("logprobs_mode")
-        if logprobs_mode is not None:
-            command.extend(["--logprobs-mode", str(logprobs_mode)])
-
-        # DLLM parameters
-        dllm_block_length = config.get("dllm_block_length")
-        if dllm_block_length is not None:
-            command.extend(["--dllm-block-length", str(int(dllm_block_length))])
-        dllm_unmasking_strategy = config.get("dllm_unmasking_strategy")
-        if dllm_unmasking_strategy and str(dllm_unmasking_strategy).strip():
-            command.extend(
-                ["--dllm-unmasking-strategy", str(dllm_unmasking_strategy).strip()]
-            )
-        dllm_denoising_steps = config.get("dllm_denoising_steps")
-        if dllm_denoising_steps is not None:
-            command.extend(["--dllm-denoising-steps", str(int(dllm_denoising_steps))])
-        dllm_confidence_threshold = config.get("dllm_confidence_threshold")
-        if dllm_confidence_threshold is not None:
-            command.extend(
-                ["--dllm-confidence-threshold", str(float(dllm_confidence_threshold))]
-            )
-
-        # Distributed/Multi-node parameters
-        dp = config.get("dp")
-        if dp is not None:
-            command.extend(["--dp", str(int(dp))])
-        ep = config.get("ep")
-        if ep is not None:
-            command.extend(["--ep", str(int(ep))])
-        if config.get("enable_microbatch"):
-            command.append("--enable-microbatch")
-        if config.get("enable_eplb"):
-            command.append("--enable-eplb")
-        role = config.get("role")
-        if role and str(role).strip():
-            command.extend(["--role", str(role).strip()])
-        migration_backend = config.get("migration_backend")
-        if migration_backend and str(migration_backend).strip():
-            command.extend(["--migration-backend", str(migration_backend).strip()])
-        node_rank = config.get("node_rank")
-        if node_rank is not None:
-            command.extend(["--node-rank", str(int(node_rank))])
-        nnodes = config.get("nnodes")
-        if nnodes is not None:
-            command.extend(["--nnodes", str(int(nnodes))])
-        cp = config.get("cp")
-        if cp is not None:
-            command.extend(["--cp", str(int(cp))])
-        if config.get("enable_return_routed_experts"):
-            command.append("--enable-return-routed-experts")
-        distributed_executor_backend = config.get("distributed_executor_backend")
-        if distributed_executor_backend and str(distributed_executor_backend).strip():
-            command.extend(
-                [
-                    "--distributed-executor-backend",
-                    str(distributed_executor_backend).strip(),
-                ]
-            )
-
-        # Vision parameters
-        vision_max_batch_size = config.get("vision_max_batch_size")
-        if vision_max_batch_size is not None:
-            command.extend(["--vision-max-batch-size", str(int(vision_max_batch_size))])
-
-        # Speculative decoding parameters
-        speculative_algorithm = config.get("speculative_algorithm")
-        if speculative_algorithm and str(speculative_algorithm).strip():
-            command.extend(
-                ["--speculative-algorithm", str(speculative_algorithm).strip()]
-            )
-        speculative_draft_model = config.get("speculative_draft_model")
-        if speculative_draft_model and str(speculative_draft_model).strip():
-            command.extend(
-                ["--speculative-draft-model", str(speculative_draft_model).strip()]
-            )
-        speculative_num_draft_tokens = config.get("speculative_num_draft_tokens")
-        if speculative_num_draft_tokens is not None:
-            command.extend(
-                [
-                    "--speculative-num-draft-tokens",
-                    str(int(speculative_num_draft_tokens)),
-                ]
-            )
-
-        additional_args = config.get("additional_args")
-        if isinstance(additional_args, str) and additional_args.strip():
-            command.extend(shlex.split(additional_args.strip()))
-
-        return command
-
-    async def _wait_for_ready(self) -> None:
-        """Poll LMDeploy server until healthy or timeout."""
-        start_time = asyncio.get_event_loop().time()
-        url = f"http://{self.host}:{self.port}/v1/models"
-        async with httpx.AsyncClient(timeout=5.0) as client:
-            while True:
-                if self._process and self._process.returncode not in (None, 0):
-                    self._raise_with_logs(
-                        f"LMDeploy exited unexpectedly with code {self._process.returncode}"
-                    )
-                try:
-                    response = await client.get(url)
-                    if response.status_code == 200:
-                        self._last_health_status = {
-                            "status": "ready",
-                            "checked_at": datetime.utcnow().isoformat() + "Z",
-                        }
-                        return
-                except Exception as exc:
-                    logger.debug(f"LMDeploy health check pending: {exc}")
-                if asyncio.get_event_loop().time() - start_time > self._health_timeout:
-                    self._raise_with_logs(
-                        "Timed out waiting for LMDeploy server to become ready"
-                    )
-                await asyncio.sleep(2)
-
-    def _cleanup_process_state(self) -> None:
-        if self._log_file:
-            try:
-                self._log_file.close()
-            except Exception:
-                pass
-            self._log_file = None
-        self._process = None
-        self._current_instance = None
-        self._started_at = None
-        self._last_health_status = {
-            "status": "stopped",
-            "checked_at": datetime.utcnow().isoformat() + "Z",
-        }
-
-    def read_log_tail(self, max_bytes: int = 8192) -> str:
-        """Return the tail of the lmdeploy log file for debugging."""
+          self._last_error = str(exc)
+          self._refresh_state_from_environment()
+          await self._finish_operation(False, str(exc))
+
+      self._create_task(_runner())
+      return {
+        "message": "LMDeploy install from source started",
+        "repo": repo_url,
+        "branch": branch,
+      }
+
+  async def remove(self) -> Dict[str, Any]:
+    """Begin LMDeploy removal: pip-uninstall it, delete the venv, clear stored state."""
+    async with self._lock:
+      if self._operation:  # refuse to start while another operation is recorded
+        raise RuntimeError("Another LMDeploy operation is already running")
+      await self._set_operation("remove")
+      args = ["uninstall", "-y", "lmdeploy"]
+
+      async def _runner():
         try:
-            with open(self._log_path, "rb") as log_file:
-                log_file.seek(0, os.SEEK_END)
-                file_size = log_file.tell()
-                seek_pos = max(0, file_size - max_bytes)
-                log_file.seek(seek_pos)
-                data = log_file.read().decode("utf-8", errors="replace")
-                if seek_pos > 0:
-                    # Remove potential partial first line
-                    data = data.split("\n", 1)[-1]
-                return data.strip()
+          from backend.data_store import get_store
+
+          store = get_store()
+          active = store.get_active_engine_version("lmdeploy")
+          venv_path = active.get("venv_path") if active else self._venv_path  # no active record -> manager's own venv path
+
+          python_exists = os.path.exists(self._venv_python())  # NOTE(review): may not be the same venv as venv_path above — confirm
+          if python_exists:
+            code = await self._run_pip(args, "remove", ensure_venv=False)
+            if code != 0:
+              raise RuntimeError(f"pip exited with status {code}")
+          if venv_path:
+            shutil.rmtree(venv_path, ignore_errors=True)  # deletion errors are ignored, not raised
+          if active and active.get("version"):
+            try:
+              store.delete_engine_version("lmdeploy", active["version"])
+            except Exception as exc:  # pragma: no cover
+              logger.debug(f"Failed to delete LMDeploy engine version metadata: {exc}")
+          self._update_installed_state(False, None)
+          await self._finish_operation(True, "LMDeploy removed")
         except Exception as exc:
-            logger.error(f"Failed to read LMDeploy log tail: {exc}")
-            return ""
+          self._last_error = str(exc)
+          self._refresh_state_from_environment()
+          await self._finish_operation(False, str(exc))
+
+      self._create_task(_runner())  # NOTE(review): presumably schedules _runner without awaiting it — confirm
+      return {"message": "LMDeploy removal started"}
+
+  # --- Introspection --------------------------------------------------------------
+
+  def status(self) -> Dict[str, Any]:
+    """Snapshot of install state, persisted metadata, and any running operation."""
+    detected_version = self._detect_installed_version()
+    resolved_binary = self._resolve_binary_path()
+    persisted = self._load_state()
+    report: Dict[str, Any] = {
+      # Installed only when both a version and a binary were found.
+      "installed": not (detected_version is None or resolved_binary is None),
+      "version": detected_version,
+      "binary_path": resolved_binary,
+      "venv_path": persisted.get("venv_path") or self._venv_path,
+      "installed_at": persisted.get("installed_at"),
+      "removed_at": persisted.get("removed_at"),
+    }
+    runtime_attrs = ("operation", "operation_started_at", "last_error", "log_path")
+    report.update({name: getattr(self, f"_{name}") for name in runtime_attrs})
+    return report
+
+  def is_operation_running(self) -> bool:
+    return self._operation is not None  # any non-None self._operation counts as running
+
+  def read_log_tail(self, max_bytes: int = 8192) -> str:
+    """Return up to the last ``max_bytes`` of the log as text ("" if no log file)."""
+    try:
+      with open(self._log_path, "rb") as log_file:
+        size = log_file.seek(0, os.SEEK_END)  # seek() returns the new absolute position
+        log_file.seek(max(0, size - max_bytes))
+        data = log_file.read().decode("utf-8", errors="replace")
+    except FileNotFoundError:  # EAFP: no exists()/open() race if the log vanishes
+      return ""
+    # When the read started mid-file, the first line is likely partial: drop it.
+    return (data.split("\n", 1)[-1] if size > max_bytes else data).strip()
 
-    async def _broadcast_runtime_logs(self) -> None:
-        """Broadcast new runtime log lines via SSE."""
-        try:
-            if not os.path.exists(self._log_path):
-                return
-            
-            # Read new content since last broadcast
-            current_size = os.path.getsize(self._log_path)
-            if current_size <= self._last_broadcast_log_position:
-                return  # No new content
-            
-            # Read only new content
-            with open(self._log_path, "rb") as log_file:
-                log_file.seek(self._last_broadcast_log_position)
-                new_content = log_file.read().decode("utf-8", errors="replace")
-                self._last_broadcast_log_position = current_size
-            
-            if new_content:
-                # Split into lines and broadcast each non-empty line via SSE
-                lines = new_content.split('\n')
-                for line in lines:
-                    if line.strip():
-                        get_progress_manager().emit("lmdeploy_runtime_log", {"line": line.strip(), "timestamp": datetime.utcnow().isoformat()})
-        except Exception as exc:
-            logger.debug(f"Failed to broadcast LMDeploy runtime logs: {exc}")
-
-    def _read_log_tail(self, max_bytes: int = 8192) -> str:
-        """Private alias for backward compatibility."""
-        return self.read_log_tail(max_bytes)
-
-    def _raise_with_logs(self, message: str) -> None:
-        """Raise a runtime error that includes the recent LMDeploy logs."""
-        log_tail = self.read_log_tail()
-        if log_tail:
-            logger.error(
-                f"{message}\n--- LMDeploy log tail ---\n{log_tail}\n--- end ---"
-            )
-            raise RuntimeError(f"{message}. See logs for details.\n{log_tail}")
-        raise RuntimeError(message)
-
-    def _detect_external_process(self) -> Optional[Dict[str, Any]]:
-        """Scan system processes for an LMDeploy server launched outside the manager."""
-        try:
-            for proc in psutil.process_iter(attrs=["pid", "cmdline", "create_time"]):
-                cmdline: List[str] = proc.info.get("cmdline") or []
-                if not cmdline:
-                    continue
-                lowered = " ".join(cmdline).lower()
-                if "lmdeploy" not in lowered:
-                    continue
-                if "serve" not in lowered or "api_server" not in lowered:
-                    continue
-
-                try:
-                    api_server_idx = cmdline.index("api_server")
-                except ValueError:
-                    continue
-                model_dir = (
-                    cmdline[api_server_idx + 1]
-                    if len(cmdline) > api_server_idx + 1
-                    else None
-                )
-                detection = {
-                    "pid": proc.info["pid"],
-                    "cmdline": cmdline,
-                    "model_dir": model_dir,
-                    "detected_at": datetime.utcnow().isoformat() + "Z",
-                }
-
-                config = self._config_from_cmdline(cmdline)
-                model_entry = (
-                    self._lookup_model_by_dir(model_dir) if model_dir else None
-                )
-                if model_entry:
-                    self._ensure_running_instance_record(model_entry.get("id"), config)
-                    detection["instance"] = {
-                        "model_id": model_entry.get("id"),
-                        "huggingface_id": model_entry.get("huggingface_id"),
-                        "file_path": model_entry.get("file_path"),
-                        "config": config,
-                        "pid": proc.info["pid"],
-                        "auto_detected": True,
-                    }
-                    detection["model_id"] = model_entry.get("id")
-                    detection["huggingface_id"] = model_entry.get("huggingface_id")
-                else:
-                    detection["instance"] = {
-                        "model_id": None,
-                        "huggingface_id": None,
-                        "file_path": model_dir,
-                        "config": config,
-                        "pid": proc.info["pid"],
-                        "auto_detected": True,
-                    }
-
-                started_at = proc.info.get("create_time")
-                if started_at:
-                    detection["started_at"] = (
-                        datetime.utcfromtimestamp(started_at).isoformat() + "Z"
-                    )
-                else:
-                    detection["started_at"] = datetime.utcnow().isoformat() + "Z"
-                return detection
-        except Exception as exc:
-            logger.debug(f"LMDeploy external scan failed: {exc}")
-        return None
-
-    def _config_from_cmdline(self, cmdline: List[str]) -> Dict[str, Any]:
-        """Reconstruct a minimal config dict from lmdeploy CLI arguments."""
-
-        def _extract(flag: str, cast, default=None):
-            if flag in cmdline:
-                idx = cmdline.index(flag)
-                if idx + 1 < len(cmdline):
-                    try:
-                        return cast(cmdline[idx + 1])
-                    except (ValueError, TypeError):
-                        return default
-            return default
-
-        def _extract_list(flag: str, default=None):
-            """Extract list of values for flags that accept multiple arguments."""
-            if flag not in cmdline:
-                return default
-            idx = cmdline.index(flag)
-            result = []
-            i = idx + 1
-            while i < len(cmdline) and not cmdline[i].startswith("--"):
-                result.append(cmdline[i])
-                i += 1
-            return result if result else default
-
-        session_len = _extract("--session-len", int, DEFAULT_LMDEPLOY_CONTEXT)
-        max_prefill = _extract("--max-prefill-token-num", int, session_len)
-        # Note: --max-context-token-num doesn't exist in LMDeploy, so derive from session_len
-        max_context = session_len
-
-        rope_scaling_factor = _extract("--rope-scaling-factor", float, 1.0)
-        rope_scaling_mode = "disabled"
-        if rope_scaling_factor and rope_scaling_factor > 1.0:
-            rope_scaling_mode = "detected"
-
-        hf_overrides: Dict[str, Any] = {}
-
-        def _assign_nested(target: Dict[str, Any], path: List[str], value: Any) -> None:
-            current = target
-            for segment in path[:-1]:
-                current = current.setdefault(segment, {})
-            current[path[-1]] = value
-
-        def _coerce_override_value(raw: str) -> Any:
-            lowered = raw.lower()
-            if lowered in {"true", "false"}:
-                return lowered == "true"
-            if lowered == "null":
-                return None
-            try:
-                if "." in raw:
-                    return float(raw)
-                return int(raw)
-            except ValueError:
-                return raw
-
-        i = 0
-        while i < len(cmdline):
-            token = cmdline[i]
-            if token.startswith("--hf-overrides."):
-                path_str = token[len("--hf-overrides.") :]
-                if path_str and i + 1 < len(cmdline):
-                    value = _coerce_override_value(cmdline[i + 1])
-                    _assign_nested(hf_overrides, path_str.split("."), value)
-                    i += 2
-                    continue
-            i += 1
-
-        config = {
-            "session_len": session_len,
-            "tensor_parallel": _extract("--tp", int, 1),
-            "max_batch_size": _extract("--max-batch-size", int, 4),
-            "max_prefill_token_num": max_prefill,
-            "max_context_token_num": max_context,
-            "dtype": _extract("--dtype", str, "auto"),
-            "cache_max_entry_count": _extract("--cache-max-entry-count", float, 0.8),
-            "cache_block_seq_len": _extract("--cache-block-seq-len", int, 64),
-            "enable_prefix_caching": "--enable-prefix-caching" in cmdline,
-            "quant_policy": _extract("--quant-policy", int, 0),
-            "model_format": _extract("--model-format", str, ""),
-            "hf_overrides": hf_overrides or _extract("--hf-overrides", str, ""),
-            # LMDeploy uses --disable-metrics, so enable_metrics=True when flag is NOT present
-            "enable_metrics": "--disable-metrics" not in cmdline,
-            "rope_scaling_factor": rope_scaling_factor,
-            "rope_scaling_mode": rope_scaling_mode,
-            "num_tokens_per_iter": _extract("--num-tokens-per-iter", int, 0),
-            "max_prefill_iters": _extract("--max-prefill-iters", int, 1),
-            "communicator": _extract("--communicator", str, "nccl"),
-            "model_name": _extract("--model-name", str, ""),
-            # Server configuration
-            "allow_origins": _extract_list("--allow-origins"),
-            "allow_credentials": "--allow-credentials" in cmdline,
-            "allow_methods": _extract_list("--allow-methods"),
-            "allow_headers": _extract_list("--allow-headers"),
-            "proxy_url": _extract("--proxy-url", str, ""),
-            "max_concurrent_requests": _extract("--max-concurrent-requests", int),
-            "log_level": _extract("--log-level", str, ""),
-            "api_keys": _extract_list("--api-keys"),
-            "ssl": "--ssl" in cmdline,
-            "max_log_len": _extract("--max-log-len", int),
-            "disable_fastapi_docs": "--disable-fastapi-docs" in cmdline,
-            "allow_terminate_by_client": "--allow-terminate-by-client" in cmdline,
-            "enable_abort_handling": "--enable-abort-handling" in cmdline,
-            # Model configuration
-            "chat_template": _extract("--chat-template", str, ""),
-            "tool_call_parser": _extract("--tool-call-parser", str, ""),
-            "reasoning_parser": _extract("--reasoning-parser", str, ""),
-            "revision": _extract("--revision", str, ""),
-            "download_dir": _extract("--download-dir", str, ""),
-            "adapters": _extract_list("--adapters"),
-            "device": _extract("--device", str, ""),
-            "eager_mode": "--eager-mode" in cmdline,
-            "disable_vision_encoder": "--disable-vision-encoder" in cmdline,
-            "logprobs_mode": _extract("--logprobs-mode", str),
-            # DLLM parameters
-            "dllm_block_length": _extract("--dllm-block-length", int),
-            "dllm_unmasking_strategy": _extract("--dllm-unmasking-strategy", str, ""),
-            "dllm_denoising_steps": _extract("--dllm-denoising-steps", int),
-            "dllm_confidence_threshold": _extract("--dllm-confidence-threshold", float),
-            # Distributed/Multi-node parameters
-            "dp": _extract("--dp", int),
-            "ep": _extract("--ep", int),
-            "enable_microbatch": "--enable-microbatch" in cmdline,
-            "enable_eplb": "--enable-eplb" in cmdline,
-            "role": _extract("--role", str, ""),
-            "migration_backend": _extract("--migration-backend", str, ""),
-            "node_rank": _extract("--node-rank", int),
-            "nnodes": _extract("--nnodes", int),
-            "cp": _extract("--cp", int),
-            "enable_return_routed_experts": "--enable-return-routed-experts" in cmdline,
-            "distributed_executor_backend": _extract(
-                "--distributed-executor-backend", str, ""
-            ),
-            # Vision parameters
-            "vision_max_batch_size": _extract("--vision-max-batch-size", int),
-            # Speculative decoding parameters
-            "speculative_algorithm": _extract("--speculative-algorithm", str, ""),
-            "speculative_draft_model": _extract("--speculative-draft-model", str, ""),
-            "speculative_num_draft_tokens": _extract(
-                "--speculative-num-draft-tokens", int
-            ),
-            "additional_args": "",
-        }
-
-        return config
-
-    def _lookup_model_by_dir(self, model_dir: Optional[str]) -> Optional[Dict[str, Any]]:
-        if not model_dir:
-            return None
-        store = get_store()
-        for candidate in store.list_models():
-            if (candidate.get("format") or candidate.get("model_format")) != "safetensors":
-                continue
-            fp = candidate.get("file_path")
-            if fp and os.path.dirname(fp) == model_dir:
-                return candidate
-        return None
-
-    def _ensure_running_instance_record(
-        self, model_id: Optional[Any], config: Dict[str, Any]
-    ) -> None:
-        # No-op: running state is not persisted to DB (Phase 1 YAML store)
-        pass
diff --git a/backend/main.py b/backend/main.py
index c692ee4..d182e60 100644
--- a/backend/main.py
+++ b/backend/main.py
@@ -15,13 +15,10 @@
     llama_versions,
     status,
     gpu_info,
-    llama_version_manager,
-    lmdeploy,
+    lmdeploy_versions,
 )
 from backend.huggingface import set_huggingface_token
 from backend.logging_config import setup_logging, get_logger
-from backend.lmdeploy_installer import get_lmdeploy_installer
-from backend.lmdeploy_manager import get_lmdeploy_manager
 
 # Set up logging
 setup_logging(level="INFO")
@@ -133,32 +130,9 @@ async def register_all_models_with_llama_swap():
         logger.warning("llama-server not found, skipping model registration")
         return
 
-    from backend.routes.models import _get_model_file_path
-    from backend.data_store import generate_proxy_name
-
-    for model in model_list:
-        file_path = _get_model_file_path(model)
-        if not file_path or not os.path.exists(file_path):
-            logger.debug(f"Model '{model.get('id')}' not found in HF cache, skipping")
-            continue
-        try:
-            proxy_name = generate_proxy_name(
-                model.get("huggingface_id", ""),
-                model.get("quantization"),
-            )
-            config = (model.get("config") or {}).copy()
-            config.setdefault("host", "0.0.0.0")
-            config.setdefault("ctx_size", 2048)
-            config.setdefault("batch_size", 512)
-            config.setdefault("threads", 4)
-            model_with_proxy = dict(model, proxy_name=proxy_name)
-            await llama_swap_manager.register_model(model_with_proxy, config)
-            logger.info(
-                f"Registered model '{model.get('display_name', model.get('id'))}' as '{proxy_name}' with llama-swap"
-            )
-        except Exception as e:
-            logger.error(f"Failed to register model '{model.get('id')}' with llama-swap: {e}")
-
+    # Legacy auto-registration based on local file paths has been removed.
+    # llama-swap configuration is now generated purely from logical models
+    # (Hugging Face repo + quantization) via generate_llama_swap_config.
     await llama_swap_manager.regenerate_config_with_active_version()
 
 
@@ -266,12 +240,9 @@ async def lifespan(app: FastAPI):
 app.include_router(
     llama_versions.router, prefix="/api/llama-versions", tags=["llama-versions"]
 )
-app.include_router(
-    llama_version_manager.router, prefix="/api", tags=["llama-version-manager"]
-)
 app.include_router(status.router, prefix="/api", tags=["status"])
 app.include_router(gpu_info.router, prefix="/api", tags=["gpu"])
-app.include_router(lmdeploy.router, prefix="/api", tags=["lmdeploy"])
+app.include_router(lmdeploy_versions.router, prefix="/api", tags=["lmdeploy"])
 
 # SSE endpoint for progress tracking
 from backend.progress_manager import get_progress_manager
diff --git a/backend/model_introspection.py b/backend/model_introspection.py
new file mode 100644
index 0000000..ec365b4
--- /dev/null
+++ b/backend/model_introspection.py
@@ -0,0 +1,555 @@
+from __future__ import annotations
+
+import json
+import os
+from dataclasses import dataclass
+from typing import Any, Dict, Iterable, List, Optional, Tuple
+
+from backend.logging_config import get_logger
+
+logger = get_logger(__name__)
+
+
+@dataclass
+class ModelInfo:
+    """Normalized, high-level view of a GGUF model."""
+
+    architecture: str  # as filled by GgufIntrospector: lowercased "general.architecture", "" if absent
+    layer_count: int
+    block_count: int
+    context_length: int
+    parameter_count_display: Optional[str]  # display form only; presumably like "7B" — TODO confirm
+    vocab_size: Optional[int]
+    embedding_length: Optional[int]
+    attention_head_count: Optional[int]
+    attention_head_count_kv: Optional[int]
+    is_moe: bool
+    expert_count: Optional[int]
+    experts_used_count: Optional[int]
+    raw_metadata: Dict[str, Any]  # GgufIntrospector passes its own metadata dict here (not a copy)
+
+
+@dataclass
+class TensorInfo:
+    """Lightweight description of a tensor from GGUF metadata."""
+
+    name: str
+    shape: Tuple[int, ...]  # one entry per dimension; ordering not shown here — TODO confirm
+    type_id: int  # presumably the GGML tensor type code — TODO confirm against the reader
+    offset: int  # presumably the tensor data offset within the file — TODO confirm
+
+
+def _parse_numeric_with_suffix(value: Any) -> Optional[int]:
+    """
+    Parse human-readable numeric strings like '7B', '1.7M', or plain integers.
+
+    Ints, floats, and strings are accepted. In strings, underscores and commas
+    are ignored and a trailing K/M/B (any case) scales by 1e3/1e6/1e9.
+
+    Returns the truncated positive integer, or None when the value is not a
+    number, is not positive, or is not finite (NaN / infinity).
+    """
+    multipliers = {"K": 1_000, "M": 1_000_000, "B": 1_000_000_000}
+
+    if isinstance(value, (int, float)):
+        number, multiplier = value, 1
+    elif isinstance(value, str):
+        # Drop separators before looking at the suffix: a string made only of
+        # separators (e.g. "_") becomes empty and must not be indexed.
+        text = value.strip().replace("_", "").replace(",", "")
+        if not text:
+            return None
+
+        multiplier = multipliers.get(text[-1].upper(), 1)
+        number_part = text[:-1] if multiplier != 1 else text
+        try:
+            number = float(number_part)
+        except ValueError:
+            return None
+    else:
+        return None
+
+    try:
+        if not number > 0:
+            return None
+        # int() raises ValueError for NaN and OverflowError for infinity
+        # (including finite inputs that overflow once scaled).
+        return int(number * multiplier)
+    except (ValueError, OverflowError):
+        return None
+
+
+def _format_human_readable(value: Optional[int]) -> Optional[str]:
+    """Render a count with a K/M/B suffix (one decimal unless whole); None stays None."""
+    if value is None:
+        return None
+    # Largest unit first so the first threshold reached decides the suffix.
+    for threshold, suffix in ((1_000_000_000, "B"), (1_000_000, "M"), (1_000, "K")):
+        if value < threshold:
+            continue
+        scaled = value / threshold
+        if scaled.is_integer():
+            return f"{int(scaled)}{suffix}"
+        return f"{scaled:.1f}{suffix}"
+    # Below one thousand (including zero and negatives): plain decimal digits.
+    return str(value)
+
+
+def _find_numeric_candidates(
+    metadata: Dict[str, Any],
+    include_terms: Iterable[str],
+    exclude_terms: Iterable[str] | None = None,
+    max_value: Optional[int] = None,
+) -> List[Tuple[str, int]]:
+    """
+    Collect (key, parsed value) pairs from ``metadata``, in iteration order.
+
+    A key qualifies when its lowercased form contains every include term and
+    no exclude term. Its value must parse via ``_parse_numeric_with_suffix``
+    and, when ``max_value`` is given, must not exceed it.
+    """
+    forbidden = tuple(exclude_terms or ())
+    required = tuple(include_terms)
+
+    def _key_matches(lowered: str) -> bool:
+        has_all = all(term in lowered for term in required)
+        return has_all and not any(term in lowered for term in forbidden)
+
+    matches: List[Tuple[str, int]] = []
+    for name, raw in metadata.items():
+        number = _parse_numeric_with_suffix(raw) if _key_matches(name.lower()) else None
+        if number is not None and (max_value is None or number <= max_value):
+            matches.append((name, number))
+    return matches
+
+
+_INTROSPECTION_CONFIG: Optional[Dict[str, Any]] = None
+
+
+def _load_introspection_config() -> Dict[str, Any]:
+    """
+    Return the architecture-specific GGUF introspection rules, loading them once.
+
+    Rules are read from ``gguf_introspection_config.json`` located next to this
+    module. The result (an empty dict when the file is missing, unreadable, or
+    not a JSON object) is cached in ``_INTROSPECTION_CONFIG`` for later calls.
+    """
+    global _INTROSPECTION_CONFIG
+    if _INTROSPECTION_CONFIG is not None:
+        return _INTROSPECTION_CONFIG
+
+    # Start from the empty fallback; only a well-formed JSON object replaces it.
+    loaded: Dict[str, Any] = {}
+    cfg_path = os.path.join(os.path.dirname(__file__), "gguf_introspection_config.json")
+    if os.path.exists(cfg_path):
+        try:
+            with open(cfg_path, "r", encoding="utf-8") as handle:
+                parsed = json.load(handle)
+            if isinstance(parsed, dict):
+                loaded = parsed
+            else:
+                logger.warning(
+                    "gguf_introspection_config.json must contain a JSON object; got %s",
+                    type(parsed),
+                )
+        except Exception as exc:
+            # Best effort: a broken config must not prevent introspection.
+            logger.warning("Failed to load gguf_introspection_config.json: %s", exc)
+
+    _INTROSPECTION_CONFIG = loaded
+    return _INTROSPECTION_CONFIG
+
+
+class GgufIntrospector:
+    """
+    Data-driven GGUF model introspector.
+
+    Consumes raw GGUF metadata and tensor descriptors and produces a normalized
+    ModelInfo structure using generic key-pattern matching and simple heuristics.
+    """
+
+    # Sanity limits to defend against corrupted or adversarial metadata
+    MAX_CONTEXT = 1_000_000_000
+    MAX_LAYERS = 4096
+    MAX_HEADS = 8192
+
+    def __init__(
+        self,
+        metadata: Dict[str, Any],
+        tensors: Dict[str, Dict[str, Any]] | None = None,
+    ):
+        """Keep the raw inputs (empty dicts when falsy) and derive the arch name."""
+        self.metadata = metadata if metadata else {}
+        self.tensors = tensors if tensors else {}
+        arch_value = self.metadata.get("general.architecture", "") or ""
+        self.architecture = str(arch_value).lower()
+        self._config = _load_introspection_config()
+
+    # Public orchestration -------------------------------------------------
+
+    def build_model_info(self) -> ModelInfo:  # runs every extractor and packs the results into a ModelInfo
+        context_length = self._extract_context_length()
+        block_count, layer_count = self._extract_layer_and_block_counts()
+        param_count_int, param_display = self._extract_parameter_count()  # NOTE(review): param_count_int is never used below — confirm intended
+        (
+            attention_head_count,
+            attention_head_count_kv,
+        ) = self._extract_attention_heads()
+        is_moe, expert_count, experts_used_count = self._extract_moe_info()
+        embedding_length = self._extract_embedding_length()
+        vocab_size = self._extract_vocab_size()
+
+        return ModelInfo(
+            architecture=self.architecture,
+            layer_count=layer_count,
+            block_count=block_count,
+            context_length=context_length,
+            parameter_count_display=param_display,
+            vocab_size=vocab_size,
+            embedding_length=embedding_length,
+            attention_head_count=attention_head_count,
+            attention_head_count_kv=attention_head_count_kv,
+            is_moe=is_moe,
+            expert_count=expert_count,
+            experts_used_count=experts_used_count,
+            raw_metadata=self.metadata,  # the same dict object held by this instance, not a copy
+        )
+
+    # Property extractors --------------------------------------------------
+
+    def _get_property_configs(self, prop: str) -> List[Dict[str, Any]]:
+        """
+        Return a list of config sections relevant for the given property.
+
+        Order of precedence:
+        1. Global section
+        2. Architecture-specific sections whose ``match_arch`` entries are
+           contained in the lowercased architecture string.
+        """
+        cfg = self._config or {}
+        results: List[Dict[str, Any]] = []
+
+        global_cfg = cfg.get("global")
+        if isinstance(global_cfg, dict):
+            prop_cfg = global_cfg.get(prop)
+            if isinstance(prop_cfg, dict):  # non-dict property entries are ignored silently
+                results.append(prop_cfg)
+
+        for name, section in cfg.items():  # remaining sections are visited in the config dict's own order
+            if name == "global" or not isinstance(section, dict):
+                continue
+            match_arch = section.get("match_arch") or []  # a section without match_arch never matches
+            if not isinstance(match_arch, list):
+                continue
+            if not any(
+                isinstance(token, str) and token.lower() in self.architecture  # substring test; NOTE(review): an empty-string token matches every architecture
+                for token in match_arch
+            ):
+                continue
+            prop_cfg = section.get(prop)
+            if isinstance(prop_cfg, dict):
+                results.append(prop_cfg)
+
+        return results
+
+    def _extract_context_length(self) -> int:  # largest accepted context-length candidate, or 0 when nothing matches
+        candidates: List[int] = []
+
+        # 1) Config-driven preferred keys
+        for cfg in self._get_property_configs("context_length"):
+            preferred = cfg.get("preferred_keys") or []
+            for key in preferred:
+                if key in self.metadata:
+                    parsed = _parse_numeric_with_suffix(self.metadata[key])
+                    if parsed is None or parsed <= 0 or parsed > self.MAX_CONTEXT:  # drop unparsable, non-positive or over-limit values
+                        continue
+                    candidates.append(parsed)
+
+            if candidates:
+                break  # this section's preferred keys produced a value; later sections are not consulted
+
+            fallback_terms = cfg.get("fallback_terms") or []
+            if fallback_terms:
+                for _, value in _find_numeric_candidates(
+                    self.metadata,
+                    include_terms=tuple(fallback_terms),
+                    exclude_terms=("generation", "prefill"),
+                    max_value=self.MAX_CONTEXT,
+                ):
+                    candidates.append(value)
+
+            if candidates:
+                break  # this section's fallback terms produced a value
+
+        # 2) Generic terms for context length (if config did not resolve it)
+        if not candidates:
+            terms_sets = [
+                ("context",),
+                ("model_max_length",),
+                ("max_position_embeddings",),
+                ("max_seq_len",),
+                ("max_sequence_length",),
+            ]
+
+            for terms in terms_sets:  # every term set is searched; results are pooled, not short-circuited
+                for _, value in _find_numeric_candidates(
+                    self.metadata,
+                    include_terms=terms,
+                    exclude_terms=("generation", "prefill"),
+                    max_value=self.MAX_CONTEXT,
+                ):
+                    candidates.append(value)
+
+        if not candidates:
+            # As a last resort, look for any key that mentions both "max" and "length"
+            for _, value in _find_numeric_candidates(
+                self.metadata,
+                include_terms=("max", "length"),
+                max_value=self.MAX_CONTEXT,
+            ):
+                candidates.append(value)
+
+        if not candidates:
+            return 0  # nothing found anywhere
+
+        best = max(candidates)  # when sources disagree, the largest value is used
+        if len(set(candidates)) > 1:
+            logger.debug(
+                "Multiple context length candidates detected %s, using max=%s",
+                candidates,
+                best,
+            )
+        return best
+
+    def _extract_layer_and_block_counts(self) -> Tuple[int, int]:  # returns (block_count, layer_count)
+        numeric_candidates: List[int] = []
+
+        # 1) Config-driven preferred keys
+        for cfg in self._get_property_configs("layer_count"):
+            preferred = cfg.get("preferred_keys") or []
+            for key in preferred:
+                if key in self.metadata:
+                    parsed = _parse_numeric_with_suffix(self.metadata[key])
+                    if parsed is None or parsed <= 0 or parsed > self.MAX_LAYERS:  # drop unparsable, non-positive or over-limit values
+                        continue
+                    numeric_candidates.append(parsed)
+
+            if numeric_candidates:
+                break  # this section's preferred keys produced a value
+
+            fallback_terms = cfg.get("fallback_terms") or []
+            if fallback_terms:
+                for _, value in _find_numeric_candidates(
+                    self.metadata,
+                    include_terms=tuple(fallback_terms),
+                    max_value=self.MAX_LAYERS,
+                ):
+                    numeric_candidates.append(value)
+
+            if numeric_candidates:
+                break  # this section's fallback terms produced a value
+
+        # 2) Generic key-based candidates
+        if not numeric_candidates:
+            key_terms = [
+                ("block_count",),
+                ("layer_count",),
+                ("n_layer",),
+                ("num_layers",),
+                ("num_hidden_layers",),
+            ]
+
+            for terms in key_terms:  # every term set is searched; results are pooled
+                for _, value in _find_numeric_candidates(
+                    self.metadata,
+                    include_terms=terms,
+                    max_value=self.MAX_LAYERS,
+                ):
+                    numeric_candidates.append(value)
+
+        block_count = layer_count = 0
+        if numeric_candidates:
+            layer_count = max(numeric_candidates)
+            block_count = layer_count  # metadata path: both counts are reported as the same number
+            if len(set(numeric_candidates)) > 1:
+                logger.debug(
+                    "Multiple layer/block candidates detected %s, using max=%s",
+                    numeric_candidates,
+                    layer_count,
+                )
+        else:
+            # Tensor-based heuristic: count distinct block indices if names contain ".block."
+            block_indices = self._infer_blocks_from_tensors()
+            if block_indices:
+                block_count = len(block_indices)  # number of distinct indices seen, not the highest index + 1
+                layer_count = block_count + 1  # usually add output head
+            else:
+                # Fallback default for unknown models
+                layer_count = 32
+                block_count = 32
+                logger.debug(
+                    "No explicit layer/block metadata found; using default=%s", layer_count
+                )
+
+        return block_count, layer_count
+
+    def _infer_blocks_from_tensors(self) -> List[int]:  # sorted distinct indices parsed from tensor names
+        indices: set[int] = set()
+        for name in self.tensors.keys():
+            lower = name.lower()
+            # Common patterns: layers.N., layer.N., blk.N., block.N.
+            for marker in ("layers.", "layer.", "blk.", "block."):  # all markers are tried for every name
+                if marker in lower:
+                    try:
+                        after = lower.split(marker, 1)[1]  # text following the first occurrence of the marker
+                        num_str = ""
+                        for ch in after:  # collect the run of leading digits only
+                            if ch.isdigit():
+                                num_str += ch
+                            else:
+                                break
+                        if num_str:  # marker not followed by a digit contributes nothing
+                            indices.add(int(num_str))
+                    except Exception:
+                        continue
+        return sorted(indices)
+
+    def _extract_parameter_count(self) -> Tuple[Optional[int], Optional[str]]:  # (count, display string) or (None, None)
+        # Look for any key mentioning parameters
+        raw_candidates: List[int] = []
+        for key, value in self.metadata.items():
+            key_lower = key.lower()
+            if "param" not in key_lower:  # plain substring test on the lowercased key
+                continue
+            parsed = _parse_numeric_with_suffix(value)
+            if parsed is not None and parsed > 0:  # no upper sanity bound is applied here
+                raw_candidates.append(parsed)
+
+        if not raw_candidates:
+            return None, None
+
+        best = max(raw_candidates)  # when several keys qualify, the largest value is used
+        if len(set(raw_candidates)) > 1:
+            logger.debug(
+                "Multiple parameter count candidates detected %s, using max=%s",
+                raw_candidates,
+                best,
+            )
+
+        return best, _format_human_readable(best)
+
+    def _extract_attention_heads(self) -> Tuple[Optional[int], Optional[int]]:  # (head_count, head_count_kv); None when not found
+        # Attention heads
+        att_candidates: List[int] = []
+        for _, value in _find_numeric_candidates(
+            self.metadata,
+            include_terms=("attention", "head"),  # NOTE(review): if all terms must match, KV-head keys qualify here too — confirm in _find_numeric_candidates
+            max_value=self.MAX_HEADS,
+        ):
+            att_candidates.append(value)
+
+        head_count = max(att_candidates) if att_candidates else None  # largest candidate wins
+
+        # KV heads (GQA)
+        kv_candidates: List[int] = []
+        for _, value in _find_numeric_candidates(
+            self.metadata,
+            include_terms=("attention", "head", "kv"),
+            max_value=self.MAX_HEADS,
+        ):
+            kv_candidates.append(value)
+
+        head_count_kv = max(kv_candidates) if kv_candidates else None  # stays None when no KV key exists; no fallback to head_count
+
+        return head_count, head_count_kv
+
+    def _extract_moe_info(self) -> Tuple[bool, Optional[int], Optional[int]]:
+        """Return (is_moe, expert_count, experts_used_count) derived from expert-related metadata keys."""
+        is_moe = "moe" in self.architecture or "experts" in self.architecture
+        # Expert keys that carry sizes, scales or shared/group bookkeeping, not the routed-expert count
+        non_count_terms = ("length", "scale", "norm", "gating", "shared", "group")
+
+        expert_candidates: List[int] = []
+        experts_used_candidates: List[int] = []
+
+        for key, value in self.metadata.items():
+            key_lower = key.lower()
+            if "expert" not in key_lower or any(t in key_lower for t in non_count_terms):
+                continue  # e.g. "*.expert_feed_forward_length" must not be mistaken for an expert count
+
+            parsed = _parse_numeric_with_suffix(value)
+            if parsed is None or parsed <= 0:
+                continue
+
+            if any(term in key_lower for term in ("per_tok", "used", "active")):
+                experts_used_candidates.append(parsed)  # experts activated per token
+            else:
+                expert_candidates.append(parsed)  # total routed experts
+
+        expert_count = max(expert_candidates) if expert_candidates else None
+        experts_used_count = max(experts_used_candidates) if experts_used_candidates else None
+
+        if expert_count:  # an explicit expert count marks the model as MoE regardless of its name
+            is_moe = True
+
+        # Default active experts if only total experts is known
+        if is_moe and experts_used_count is None and expert_count:
+            if expert_count >= 64:
+                experts_used_count = 8
+            elif expert_count >= 32:
+                experts_used_count = 4
+            else:
+                experts_used_count = 2
+
+        return is_moe, expert_count, experts_used_count
+
+    def _extract_embedding_length(self) -> Optional[int]:  # metadata value if any, else a tensor-shape guess, else None
+        # First try explicit metadata
+        candidates: List[int] = []
+        for _, value in _find_numeric_candidates(
+            self.metadata,
+            include_terms=("embedding",),  # NOTE(review): no max_value bound here, unlike the context/layer/head extractors
+        ):
+            candidates.append(value)
+
+        if candidates:
+            return max(candidates)  # largest candidate wins
+
+        # Fallback: use tensor shapes for token embeddings
+        best: Optional[int] = None
+        for name, info in self.tensors.items():
+            lower = name.lower()
+            if not any(term in lower for term in ("token_emb", "embed_tokens", "tok_embeddings", "tok_embed")):
+                continue
+            shape = info.get("shape") or []
+            if len(shape) >= 2:  # tensors with fewer than two dimensions are ignored
+                dim = int(shape[-1])  # NOTE(review): assumes the embedding width is the last dimension — confirm against the reader's dimension order
+                if best is None or dim > best:
+                    best = dim
+        return best
+
+    def _extract_vocab_size(self) -> Optional[int]:  # metadata value if any, else a tensor-shape guess, else None
+        # Prefer scalar vocab size keys
+        candidates: List[int] = []
+        for _, value in _find_numeric_candidates(
+            self.metadata,
+            include_terms=("vocab_size",),  # no max_value bound is passed here
+        ):
+            candidates.append(value)
+
+        if candidates:
+            return max(candidates)  # largest candidate wins
+
+        # Fallback: derive from embedding matrix first dimension
+        best: Optional[int] = None
+        for name, info in self.tensors.items():
+            lower = name.lower()
+            if not any(term in lower for term in ("token_emb", "embed_tokens", "tok_embeddings", "tok_embed")):
+                continue
+            shape = info.get("shape") or []
+            if len(shape) >= 2:  # tensors with fewer than two dimensions are ignored
+                size = int(shape[0])  # NOTE(review): assumes the vocabulary axis comes first — confirm against the reader's dimension order
+                if best is None or size > best:
+                    best = size
+        return best
+
diff --git a/backend/param_registry.py b/backend/param_registry.py
index 9b8e5bb..b8c5cbd 100644
--- a/backend/param_registry.py
+++ b/backend/param_registry.py
@@ -12,6 +12,7 @@
 # Basic params shown by default (most common for chat/embedding)
 # Host and port are not included: they are managed by llama-swap (--port ${PORT}, host default 0.0.0.0)
 LLAMA_CPP_BASIC: List[ParamDef] = [
+    {"key": "model_alias", "label": "Model alias", "type": "string", "default": "", "description": "Expose this model under a custom runtime ID instead of the default Hugging Face-derived name"},
     {"key": "ctx_size", "label": "Context size", "type": "int", "default": 2048, "min": 512, "max": 1_000_000, "description": "Maximum context length in tokens"},
     {"key": "n_gpu_layers", "label": "GPU layers", "type": "int", "default": -1, "min": -1, "max": 1000, "description": "Number of layers to offload to GPU (-1 = all)"},
     {"key": "batch_size", "label": "Batch size", "type": "int", "default": 512, "min": 1, "max": 2048, "description": "Batch size for prompt processing"},
@@ -71,6 +72,7 @@
 
 # LMDeploy (safetensors / TurboMind)
 LMDEPLOY_BASIC: List[ParamDef] = [
+    {"key": "model_alias", "label": "Model alias", "type": "string", "default": "", "description": "Expose this model under a custom runtime ID instead of the default Hugging Face-derived name"},
     {"key": "session_len", "label": "Session length", "type": "int", "default": 2048, "min": 512, "max": 1_000_000, "description": "Maximum session length"},
     {"key": "max_batch_size", "label": "Max batch size", "type": "int", "default": 128, "min": 1, "max": 1024, "description": "Maximum batch size"},
     {"key": "tensor_parallel", "label": "Tensor parallel", "type": "int", "default": 1, "min": 1, "max": 8, "description": "Tensor parallelism degree"},
diff --git a/backend/routes/llama_version_manager.py b/backend/routes/llama_version_manager.py
deleted file mode 100644
index 6b9ee9a..0000000
--- a/backend/routes/llama_version_manager.py
+++ /dev/null
@@ -1,159 +0,0 @@
-from fastapi import APIRouter, HTTPException
-import os
-import shutil
-import stat
-import time
-
-from backend.data_store import get_store
-from backend.logging_config import get_logger
-
-logger = get_logger(__name__)
-router = APIRouter()
-
-
-def _remove_readonly(func, path, exc):
-    try:
-        os.chmod(path, stat.S_IWRITE)
-        func(path)
-    except Exception as e:
-        logger.warning(f"Could not remove {path}: {e}")
-
-
-def _robust_rmtree(path: str, max_retries: int = 3) -> None:
-    if not os.path.exists(path):
-        return
-    for attempt in range(max_retries):
-        try:
-            shutil.rmtree(path, onerror=_remove_readonly)
-            logger.info(f"Successfully deleted directory: {path}")
-            return
-        except (PermissionError, OSError) as e:
-            if attempt < max_retries - 1:
-                time.sleep(0.5)
-            else:
-                logger.error(f"Failed to delete {path} after {max_retries} attempts: {e}")
-                raise
-
-
-def _resolve_binary_path(binary_path: str) -> str:
-    if not binary_path:
-        return ""
-    if os.path.isabs(binary_path):
-        return binary_path
-    return os.path.join("/app", binary_path)
-
-
-@router.get("/llama-versions")
-async def list_llama_versions():
-    """List all installed llama-cpp versions (llama_cpp engine)."""
-    store = get_store()
-    versions = store.get_engine_versions("llama_cpp")
-    result = []
-    for i, v in enumerate(versions):
-        binary_path = _resolve_binary_path(v.get("binary_path"))
-        result.append({
-            "id": i,
-            "version": v.get("version"),
-            "install_type": v.get("type", "source"),
-            "source_commit": v.get("source_commit"),
-            "is_active": store.get_active_engine_version("llama_cpp") and store.get_active_engine_version("llama_cpp").get("version") == v.get("version"),
-            "installed_at": v.get("installed_at"),
-            "binary_path": v.get("binary_path"),
-            "exists": os.path.exists(binary_path) if binary_path else False,
-        })
-    return {"versions": result}
-
-
-@router.post("/llama-versions/{version_id}/activate")
-async def activate_llama_version(version_id: str):
-    """Activate a specific llama-cpp version (version_id can be index, version string, or "llama_cpp:version")."""
-    store = get_store()
-    versions = store.get_engine_versions("llama_cpp")
-    # Frontend may send id from list endpoint: "llama_cpp:version_str"
-    lookup_id = version_id
-    if ":" in str(version_id):
-        parts = str(version_id).split(":", 1)
-        if parts[0] == "llama_cpp":
-            lookup_id = parts[1]
-    version_entry = None
-    try:
-        idx = int(lookup_id)
-        if 0 <= idx < len(versions):
-            version_entry = versions[idx]
-    except ValueError:
-        pass
-    if not version_entry:
-        version_entry = next((v for v in versions if str(v.get("version")) == str(lookup_id)), None)
-    if not version_entry:
-        raise HTTPException(status_code=404, detail="Version not found")
-    binary_path = _resolve_binary_path(version_entry.get("binary_path"))
-    if not os.path.exists(binary_path):
-        raise HTTPException(status_code=400, detail="Binary file does not exist")
-    version_str = str(version_entry.get("version"))
-    store.set_active_engine_version("llama_cpp", version_str)
-    try:
-        from backend.llama_swap_manager import get_llama_swap_manager
-        llama_swap_manager = get_llama_swap_manager()
-        await llama_swap_manager._ensure_correct_binary_path()
-        await llama_swap_manager.regenerate_config_with_active_version()
-        try:
-            await llama_swap_manager.start_proxy()
-        except Exception as e:
-            logger.warning(f"Failed to start llama-swap after version activation: {e}")
-    except Exception as e:
-        logger.error(f"Failed to regenerate llama-swap config: {e}")
-    logger.info(f"Activated llama-cpp version: {version_str}")
-    return {"message": f"Activated llama-cpp version {version_str}"}
-
-
-@router.delete("/llama-versions/{version_id}")
-async def delete_llama_version(version_id: str):
-    """Delete a llama-cpp version (version_id can be index or version string)."""
-    store = get_store()
-    versions = store.get_engine_versions("llama_cpp")
-    version_entry = None
-    try:
-        idx = int(version_id)
-        if 0 <= idx < len(versions):
-            version_entry = versions[idx]
-    except ValueError:
-        pass
-    if not version_entry:
-        version_entry = next((v for v in versions if str(v.get("version")) == str(version_id)), None)
-    if not version_entry:
-        raise HTTPException(status_code=404, detail="Version not found")
-    version_str = str(version_entry.get("version"))
-    active = store.get_active_engine_version("llama_cpp")
-    if active and str(active.get("version")) == version_str:
-        raise HTTPException(status_code=400, detail="Cannot delete active version")
-    binary_path = _resolve_binary_path(version_entry.get("binary_path"))
-    version_dir = os.path.dirname(os.path.dirname(binary_path)) if binary_path else None
-    if version_dir and os.path.exists(version_dir):
-        try:
-            _robust_rmtree(version_dir)
-        except Exception as e:
-            logger.error(f"Failed to delete directory {version_dir}: {e}")
-            raise HTTPException(status_code=500, detail=f"Failed to delete directory: {e}")
-    store.delete_engine_version("llama_cpp", version_str)
-    logger.info(f"Deleted llama-cpp version: {version_str}")
-    return {"message": f"Deleted llama-cpp version {version_str}"}
-
-
-@router.get("/llama-versions/active")
-async def get_active_llama_version():
-    """Get the currently active llama-cpp version."""
-    store = get_store()
-    active_version = store.get_active_engine_version("llama_cpp")
-    if not active_version:
-        return {"active_version": None}
-    binary_path = _resolve_binary_path(active_version.get("binary_path"))
-    return {
-        "active_version": {
-            "id": 0,
-            "version": active_version.get("version"),
-            "install_type": active_version.get("type"),
-            "source_commit": active_version.get("source_commit"),
-            "binary_path": active_version.get("binary_path"),
-            "exists": os.path.exists(binary_path) if binary_path else False,
-        }
-    }
diff --git a/backend/routes/llama_versions.py b/backend/routes/llama_versions.py
index 4941c5c..deb69ac 100644
--- a/backend/routes/llama_versions.py
+++ b/backend/routes/llama_versions.py
@@ -7,6 +7,7 @@
 import requests
 import time
 import platform
+import re
 import shutil
 import stat
 from datetime import datetime
@@ -84,6 +85,8 @@ async def list_llama_versions():
                 "install_type": v.get("type", "source"),
                 "binary_path": v.get("binary_path"),
                 "source_commit": v.get("source_commit"),
+                "source_ref": v.get("source_ref"),
+                "source_ref_type": v.get("source_ref_type"),
                 "patches": [],  # No longer storing patches in YAML
                 "installed_at": v.get("installed_at"),
                 "is_active": v.get("version") == active_version,
@@ -93,28 +96,178 @@ async def list_llama_versions():
     return result
 
 
+def _default_build_settings() -> dict:
+    """Default build-settings payload for engines when nothing is saved yet."""
+    return {  # a fresh dict on every call, so callers may mutate the result
+        "cuda": False,
+        "flash_attention": False,
+        "native": True,  # the only flag that defaults to on
+        "backend_dl": False,
+        "cpu_all_variants": False,
+        "cuda_architectures": "",  # the only string-valued setting; the rest are booleans
+    }
+
+
+def _coerce_build_settings(settings: Optional[dict]) -> dict:  # normalize arbitrary input into the full, typed settings shape
+    base = _default_build_settings()
+    if not isinstance(settings, dict):
+        return base  # None or any non-dict input yields the defaults
+
+    def _bool(v):
+        if isinstance(v, bool):
+            return v
+        if isinstance(v, str):
+            return v.strip().lower() in ("1", "true", "yes", "on")  # every other string, including "", is False
+        return bool(v)  # numbers, None and other objects use plain truthiness
+
+    return {
+        "cuda": _bool(settings.get("cuda", base["cuda"])),
+        "flash_attention": _bool(settings.get("flash_attention", base["flash_attention"])),
+        "native": _bool(settings.get("native", base["native"])),
+        "backend_dl": _bool(settings.get("backend_dl", base["backend_dl"])),
+        "cpu_all_variants": _bool(settings.get("cpu_all_variants", base["cpu_all_variants"])),
+        "cuda_architectures": str(settings.get("cuda_architectures") or ""),  # missing or falsy becomes ""
+    }
+
+
+def _build_config_from_settings(settings: Optional[dict]) -> BuildConfig:  # settings dict (or None) -> BuildConfig
+    normalized = _coerce_build_settings(settings)  # guarantees every key below exists with the expected type
+    return BuildConfig(
+        enable_cuda=normalized["cuda"],
+        enable_flash_attention=normalized["flash_attention"],
+        enable_native=normalized["native"],
+        enable_backend_dl=normalized["backend_dl"],
+        enable_cpu_all_variants=normalized["cpu_all_variants"],
+        cuda_architectures=normalized["cuda_architectures"],
+    )
+
+
+def _source_ref_slug(source_ref: str) -> str:
+    """Return a lowercase slug of at most 32 chars from [a-z0-9._-] with no edge separators; "source" if empty."""
+    value = re.sub(r"[^a-z0-9._-]+", "-", str(source_ref or "").strip().lower())  # squash disallowed runs into "-"
+    value = re.sub(r"-{2,}", "-", value).strip("-._")
+    return value[:32].strip("-._") or "source"  # strip again: truncation can expose a trailing separator
+
+
+def _resolve_engine_build_target(engine: str) -> tuple[str, str]:  # returns (repository_source, repository_url)
+    if engine == "ik_llama":
+        repository_source = "ik_llama.cpp"
+    elif engine == "llama_cpp":
+        repository_source = "llama.cpp"
+    else:
+        raise HTTPException(status_code=400, detail="engine must be 'llama_cpp' or 'ik_llama'")
+
+    repository_url = llama_manager.REPOSITORY_SOURCES.get(repository_source)  # URL lookup keyed by the repository name
+    if not repository_url:  # missing or empty entry is reported as a client error (400)
+        raise HTTPException(status_code=400, detail=f"Unknown repository source: {repository_source}")
+    return repository_source, repository_url
+
+
+def _fetch_latest_release(repository_source: str) -> Optional[dict]:
+    """Return the first non-draft release from the repo's GitHub releases listing, or None when there is none."""
+    if repository_source == "ik_llama.cpp":
+        releases_url = "https://api.github.com/repos/ikawrakow/ik_llama.cpp/releases?per_page=10"
+    else:
+        releases_url = "https://api.github.com/repos/ggerganov/llama.cpp/releases?per_page=10"
+
+    response = requests.get(releases_url, allow_redirects=True, timeout=15)  # bounded: requests waits forever by default
+    if response.status_code == 404:
+        return None
+    response.raise_for_status()  # other HTTP errors surface as requests.exceptions.HTTPError
+
+    releases = response.json()
+    if isinstance(releases, dict):
+        return releases  # defensive fallback in case GitHub changes shape or a proxy returns a single object
+    if not isinstance(releases, list):
+        return None
+
+    for release in releases:
+        if isinstance(release, dict) and not release.get("draft"):
+            return release
+    return None
+
+
+@router.get("/build-settings")
+async def get_build_settings(engine: str = "llama_cpp"):
+    """Get persisted build settings for an engine ('llama_cpp' or 'ik_llama')."""
+    if engine not in ("llama_cpp", "ik_llama"):
+        raise HTTPException(status_code=400, detail="engine must be 'llama_cpp' or 'ik_llama'")
+    store = get_store()
+    settings = store.get_engine_build_settings(engine) or {}  # nothing persisted yet -> defaults only
+    # Always return a full shape so the frontend can rely on defaults.
+    base = _default_build_settings()
+    base.update({k: v for k, v in settings.items() if k in base})  # stored values are merged as-is (no type coercion); unknown keys are dropped
+    return base
+
+
+@router.put("/build-settings")
+async def update_build_settings(engine: str = "llama_cpp", settings: dict = Body(...)):
+    """Persist build settings for an engine ('llama_cpp' or 'ik_llama')."""
+    if engine not in ("llama_cpp", "ik_llama"):
+        raise HTTPException(status_code=400, detail="engine must be 'llama_cpp' or 'ik_llama'")
+    if not isinstance(settings, dict):
+        raise HTTPException(status_code=400, detail="settings must be an object")
+    store = get_store()
+    # Only persist known build keys; ignore extras.
+    allowed = _default_build_settings().keys()
+    filtered = {k: v for k, v in settings.items() if k in allowed}  # values are stored as received, without type coercion
+    stored = store.update_engine_build_settings(engine, filtered)  # presumably returns the settings as persisted — verify in data_store
+    base = _default_build_settings()
+    base.update({k: v for k, v in stored.items() if k in base})  # response always carries the full set of keys
+    return base
+
+
+@router.post("/update")
+async def update_engine(request: dict):  # body keys read: "engine" (default "llama_cpp") and "version_suffix"
+    """Build the latest source release for an engine using persisted build settings, then auto-activate it."""
+    engine = (request or {}).get("engine", "llama_cpp")
+    version_suffix = (request or {}).get("version_suffix")
+    repository_source, repository_url = _resolve_engine_build_target(engine)  # raises HTTPException(400) for an unsupported engine
+    store = get_store()
+    settings = store.get_engine_build_settings(engine) or {}
+    build_config = _build_config_from_settings(settings)
+
+    try:
+        latest_release = _fetch_latest_release(repository_source)  # NOTE(review): synchronous HTTP call inside an async handler
+        if not latest_release or not latest_release.get("tag_name"):
+            raise HTTPException(status_code=404, detail="No release found for this engine")  # not a requests error, so it passes the handlers below
+        source_ref = latest_release["tag_name"]
+    except requests.exceptions.HTTPError as e:
+        if e.response.status_code == 403:  # 403 is treated as rate limiting and surfaced to the client as 429
+            raise HTTPException(status_code=429, detail="GitHub API rate limit exceeded. Please try again later.")
+        if e.response.status_code == 404:
+            raise HTTPException(status_code=404, detail="GitHub repository or release not found")
+        raise HTTPException(status_code=500, detail=f"GitHub API error: {str(e)}")
+    except requests.exceptions.RequestException as e:  # connection-level failures
+        raise HTTPException(status_code=500, detail=f"Network error: {str(e)}")
+
+    return _schedule_source_build(
+        source_ref=source_ref,  # the release tag is used as the source ref to build
+        patches=[],
+        build_config=build_config,
+        repository_source=repository_source,
+        repository_url=repository_url,
+        version_suffix=version_suffix,
+        auto_activate=True,
+        source_ref_type="release",
+    )
+
+
 @router.get("/check-updates")
 async def check_updates(source: str | None = None):
-    """Check for llama.cpp or ik_llama.cpp updates (releases and/or source).
+    """Check for llama.cpp or ik_llama.cpp source releases and latest commit.
     source: None or 'llama_cpp' for ggerganov/llama.cpp; 'ik_llama' for ikawrakow/ik_llama.cpp.
     """
     try:
         is_ik = source == "ik_llama"
         if is_ik:
-            commits_url = (
-                "https://api.github.com/repos/ikawrakow/ik_llama.cpp/commits?per_page=1"
-            )
-            latest_release = None
+            repository_source = "ik_llama.cpp"
+            commits_url = "https://api.github.com/repos/ikawrakow/ik_llama.cpp/commits?per_page=1"
         else:
-            # ai-dock/llama.cpp-cuda: pre-built releases with CUDA support
-            releases_url = "https://api.github.com/repos/ai-dock/llama.cpp-cuda/releases"
-            commits_url = (
-                "https://api.github.com/repos/ggerganov/llama.cpp/commits?per_page=1"
-            )
-            releases_response = requests.get(releases_url, allow_redirects=True)
-            releases_response.raise_for_status()
-            releases = releases_response.json()
-            latest_release = releases[0] if releases else None
+            repository_source = "llama.cpp"
+            commits_url = "https://api.github.com/repos/ggerganov/llama.cpp/commits?per_page=1"
+
+        latest_release = _fetch_latest_release(repository_source)
 
         commits_response = requests.get(commits_url, allow_redirects=True)
         commits_response.raise_for_status()
@@ -125,8 +278,8 @@ async def check_updates(source: str | None = None):
             "latest_release": (
                 {
                     "tag_name": latest_release["tag_name"],
-                    "published_at": latest_release["published_at"],
-                    "html_url": latest_release["html_url"],
+                    "published_at": latest_release.get("published_at"),
+                    "html_url": latest_release.get("html_url"),
                 }
                 if latest_release
                 else None
@@ -159,26 +312,10 @@ async def check_updates(source: str | None = None):
 
 @router.get("/releases/{tag_name}/assets")
 async def get_release_assets(tag_name: str):
-    """List compatible release artifacts for a given tag."""
-    try:
-        assets = llama_manager.get_release_assets(tag_name)
-        return assets
-    except requests.exceptions.HTTPError as e:
-        if e.response.status_code == 403:
-            raise HTTPException(
-                status_code=429,
-                detail="GitHub API rate limit exceeded. Please try again later.",
-            )
-        elif e.response.status_code == 404:
-            raise HTTPException(status_code=404, detail=f"Release {tag_name} not found")
-        else:
-            raise HTTPException(status_code=500, detail=f"GitHub API error: {str(e)}")
-    except requests.exceptions.RequestException as e:
-        raise HTTPException(status_code=500, detail=f"Network error: {str(e)}")
-    except Exception as e:
-        raise HTTPException(
-            status_code=500, detail=f"Failed to fetch release assets: {str(e)}"
-        )
+    raise HTTPException(
+        status_code=410,
+        detail="Prebuilt llama.cpp release installation has been removed. Build from source instead.",
+    )
 
 
 @router.get("/build-capabilities")
@@ -215,75 +352,10 @@ async def get_build_capabilities_endpoint():
 
 @router.post("/install-release")
 async def install_release(request: dict):
-    """Install llama.cpp from ai-dock/llama.cpp-cuda release (CUDA builds)."""
-    try:
-        tag_name = request.get("tag_name")
-        if not tag_name:
-            raise HTTPException(status_code=400, detail="tag_name is required")
-
-        raw_asset_id = request.get("asset_id")
-        asset_id = None
-        if raw_asset_id is not None:
-            try:
-                asset_id = int(raw_asset_id)
-            except (TypeError, ValueError):
-                raise HTTPException(
-                    status_code=400, detail="asset_id must be an integer"
-                )
-
-        try:
-            preview = llama_manager.get_release_install_preview(tag_name, asset_id)
-        except requests.exceptions.HTTPError as e:
-            if e.response.status_code == 403:
-                raise HTTPException(
-                    status_code=429,
-                    detail="GitHub API rate limit exceeded. Please try again later.",
-                )
-            elif e.response.status_code == 404:
-                raise HTTPException(
-                    status_code=404, detail=f"Release {tag_name} not found"
-                )
-            else:
-                raise HTTPException(
-                    status_code=500, detail=f"GitHub API error: {str(e)}"
-                )
-        except requests.exceptions.RequestException as e:
-            raise HTTPException(status_code=500, detail=f"Network error: {str(e)}")
-        except Exception as e:
-            raise HTTPException(status_code=500, detail=str(e))
-
-        version_name = preview.get("version_name")
-
-        store = get_store()
-        existing_versions = store.get_engine_versions("llama_cpp")
-        existing = next(
-            (v for v in existing_versions if v.get("version") in (version_name, tag_name)),
-            None,
-        )
-        if existing:
-            detail = "400: Version already installed"
-            if version_name:
-                detail = f"{detail} ({version_name})"
-            raise HTTPException(status_code=400, detail=detail)
-
-        # Generate task ID for tracking
-        task_id = f"install_release_{tag_name}_{int(time.time())}"
-
-        # Start installation in background (asyncio.create_task so it runs regardless of middleware)
-        pm = get_progress_manager()
-        pm.create_task("install_release", f"Install {tag_name}", {"tag_name": tag_name}, task_id=task_id)
-        asyncio.create_task(install_release_task(tag_name, pm, task_id, asset_id))
-
-        return {
-            "message": f"Installing release {tag_name}",
-            "task_id": task_id,
-            "status": "started",
-            "progress": 0,
-            "asset_id": asset_id,
-            "version_name": version_name,
-        }
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
+    raise HTTPException(
+        status_code=410,
+        detail="Prebuilt llama.cpp release installation has been removed. Build from source instead.",
+    )
 
 
 async def install_release_task(
@@ -361,29 +433,17 @@ async def build_source(request: dict):
         build_config_dict = request.get("build_config")
         repository_source = request.get("repository_source", "llama.cpp")
         version_suffix = request.get("version_suffix")
+        auto_activate = bool(request.get("auto_activate"))
+        source_ref_type = request.get("source_ref_type", "ref")
 
         if not commit_sha:
             raise HTTPException(status_code=400, detail="commit_sha is required")
 
-        commit_short = commit_sha[:8]
-        if version_suffix:
-            version_name = f"source-{commit_short}-{version_suffix}"
+        if repository_source == "ik_llama.cpp":
+            _, repository_url = _resolve_engine_build_target("ik_llama")
+        elif repository_source == "llama.cpp":
+            _, repository_url = _resolve_engine_build_target("llama_cpp")
         else:
-            timestamp = int(time.time())
-            version_name = f"source-{commit_short}-{timestamp}"
-
-        store = get_store()
-        engine = "ik_llama" if repository_source == "ik_llama.cpp" else "llama_cpp"
-        existing_versions = store.get_engine_versions(engine)
-        existing = next((v for v in existing_versions if v.get("version") == version_name), None)
-        if existing:
-            raise HTTPException(
-                status_code=400, detail=f"Version '{version_name}' already installed"
-            )
-
-        # Get repository URL from source name
-        repository_url = llama_manager.REPOSITORY_SOURCES.get(repository_source)
-        if not repository_url:
             raise HTTPException(
                 status_code=400,
                 detail=f"Unknown repository source: {repository_source}",
@@ -414,33 +474,16 @@ def _bool(v):
                 logger.warning("BuildConfig from request failed (%s), using defaults", e)
                 build_config = BuildConfig()
 
-        # Generate task ID for tracking
-        task_id = f"build_{version_name}_{int(time.time())}"
-
-        # Start build in background (asyncio.create_task so it runs regardless of middleware)
-        pm = get_progress_manager()
-        pm.create_task("build", f"Build {repository_source} {commit_sha[:8]}", {"version_name": version_name}, task_id=task_id)
-        asyncio.create_task(
-            build_source_task(
-                commit_sha,
-                patches,
-                build_config or BuildConfig(),
-                version_name,
-                repository_source,
-                repository_url,
-                pm,
-                task_id,
-            )
+        return _schedule_source_build(
+            source_ref=commit_sha,
+            patches=patches,
+            build_config=build_config or BuildConfig(),
+            repository_source=repository_source,
+            repository_url=repository_url,
+            version_suffix=version_suffix,
+            auto_activate=auto_activate,
+            source_ref_type=source_ref_type,
         )
-
-        return {
-            "message": f"Building from source {commit_sha[:8]}",
-            "task_id": task_id,
-            "status": "started",
-            "progress": 0,
-            "version_name": version_name,
-            "repository_source": repository_source,
-        }
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
 
@@ -454,6 +497,8 @@ async def build_source_task(
     repository_url: str,
     progress_manager=None,
     task_id: str = None,
+    auto_activate: bool = False,
+    source_ref_type: str = "ref",
 ):
     """Background task to build from source with SSE progress"""
     logger.info(
@@ -485,12 +530,23 @@ async def build_source_task(
             "type": "patched" if patches else "source",
             "binary_path": binary_path,
             "source_commit": commit_sha,
+            "source_ref": commit_sha,
+            "source_ref_type": source_ref_type,
             "build_config": build_config_dict,
             "repository_source": repository_source,
             "installed_at": datetime.utcnow().isoformat() + "Z",
         }
         store.add_engine_version(engine, version_data)
 
+        if auto_activate:
+            try:
+                # Reuse the existing activation flow (includes llama-swap handling).
+                await _do_activate_version(f"{engine}:{version_name}")
+            except HTTPException as e:
+                logger.error("Auto-activation failed for %s:%s: %s", engine, version_name, e.detail)
+            except Exception as e:
+                logger.exception("Auto-activation failed for %s:%s: %s", engine, version_name, e)
+
         from backend.llama_swap_manager import get_llama_swap_manager
         active_version = store.get_active_engine_version(engine)
         if active_version and active_version.get("binary_path") and os.path.exists(active_version.get("binary_path", "")):
@@ -521,14 +577,6 @@ async def build_source_task(
                     message=f"Failed to build llama.cpp from source: {str(e)}",
                     type="error",
                 )
-                if task_id:
-                    await progress_manager.send_build_progress(
-                        task_id=task_id,
-                        stage="error",
-                        progress=0,
-                        message=f"Build task failed: {str(e)}",
-                        log_lines=[f"Task error: {str(e)}", f"Error type: {type(e).__name__}"],
-                    )
             except Exception as ws_error:
                 logger.error(f"Failed to send build failure notification: {ws_error}")
 
@@ -611,6 +659,71 @@ def _find_version_entry(store, version_id: str):
     return version_entry, engine
 
 
+def _schedule_source_build(
+    source_ref: str,
+    patches: List[str],
+    build_config: BuildConfig,
+    repository_source: str,
+    repository_url: str,
+    version_suffix: Optional[str] = None,
+    auto_activate: bool = False,
+    source_ref_type: str = "ref",
+):
+    """Register a "build" progress task and start ``build_source_task`` in the background.
+
+    The version is named ``source-<ref slug>-<version_suffix>``; when no suffix is given the
+    current Unix timestamp is used. Raises HTTPException(400) if the engine already has a
+    version with that name. Returns the "started" summary dict for the scheduled build.
+    """
+    store = get_store()
+    # Anything other than "ik_llama.cpp" is filed under the llama_cpp engine.
+    engine = "llama_cpp" if repository_source != "ik_llama.cpp" else "ik_llama"
+    ref_slug = _source_ref_slug(source_ref)
+    # The clock is only consulted when the caller supplied no suffix.
+    version_name = f"source-{ref_slug}-{version_suffix or int(time.time())}"
+
+    if any(v.get("version") == version_name for v in store.get_engine_versions(engine)):
+        raise HTTPException(status_code=400, detail=f"Version '{version_name}' already installed")
+
+    task_id = f"build_{version_name}_{int(time.time())}"
+    pm = get_progress_manager()
+    task_metadata = {
+        "version_name": version_name,
+        "engine": engine,
+        "repository_source": repository_source,
+        "auto_activate": auto_activate,
+        "source_ref": source_ref,
+        "source_ref_type": source_ref_type,
+    }
+    pm.create_task("build", f"Build {repository_source} {ref_slug}", task_metadata, task_id=task_id)
+
+    # Fire-and-forget: the task receives pm/task_id for progress reporting; nothing is awaited here.
+    build_coro = build_source_task(
+        source_ref,
+        patches,
+        build_config or BuildConfig(),
+        version_name,
+        repository_source,
+        repository_url,
+        pm,
+        task_id,
+        auto_activate=auto_activate,
+        source_ref_type=source_ref_type,
+    )
+    asyncio.create_task(build_coro)
+
+    return {
+        "message": f"Building from source {ref_slug}",
+        "task_id": task_id,
+        "status": "started",
+        "progress": 0,
+        "version_name": version_name,
+        "repository_source": repository_source,
+        "source_ref": source_ref,
+        "source_ref_type": source_ref_type,
+    }
+
+
 @router.post("/versions/activate")
 async def activate_version_body(payload: dict = Body(...)):
     """Activate a version; body: { \"version_id\": \"llama_cpp:version\" or \"version\" }."""
diff --git a/backend/routes/lmdeploy.py b/backend/routes/lmdeploy.py
deleted file mode 100644
index d24e5cf..0000000
--- a/backend/routes/lmdeploy.py
+++ /dev/null
@@ -1,82 +0,0 @@
-from typing import Dict, Optional
-
-import httpx
-from fastapi import APIRouter, HTTPException
-
-from backend.lmdeploy_installer import get_lmdeploy_installer
-from backend.lmdeploy_manager import get_lmdeploy_manager
-
-router = APIRouter()
-
-
-@router.get("/lmdeploy/check-updates")
-async def lmdeploy_check_updates() -> Dict:
-    """Check PyPI for latest LMDeploy version."""
-    try:
-        async with httpx.AsyncClient() as client:
-            r = await client.get("https://pypi.org/pypi/lmdeploy/json", timeout=10.0)
-            r.raise_for_status()
-            data = r.json()
-            info = data.get("info", {})
-            return {
-                "latest_version": info.get("version"),
-                "releases": list(data.get("releases", {}).keys()),
-            }
-    except Exception as exc:
-        raise HTTPException(status_code=500, detail=f"Failed to check PyPI: {exc}")
-
-
-@router.get("/lmdeploy/status")
-async def lmdeploy_installer_status() -> Dict:
-    installer = get_lmdeploy_installer()
-    return installer.status()
-
-
-@router.post("/lmdeploy/install")
-async def lmdeploy_install(request: Optional[Dict[str, str]] = None) -> Dict:
-    installer = get_lmdeploy_installer()
-    payload = request or {}
-    version = payload.get("version")
-    force_reinstall = bool(payload.get("force_reinstall"))
-    try:
-        return await installer.install(version=version, force_reinstall=force_reinstall)
-    except RuntimeError as exc:
-        raise HTTPException(status_code=409, detail=str(exc))
-
-
-@router.post("/lmdeploy/install-source")
-async def lmdeploy_install_source(request: Optional[Dict[str, str]] = None) -> Dict:
-    """Install LMDeploy from a git repo and branch (for development)."""
-    installer = get_lmdeploy_installer()
-    payload = request or {}
-    repo_url = payload.get("repo_url", "https://github.com/InternLM/lmdeploy.git")
-    branch = payload.get("branch", "main")
-    try:
-        return await installer.install_from_source(repo_url=repo_url, branch=branch)
-    except RuntimeError as exc:
-        raise HTTPException(status_code=409, detail=str(exc))
-
-
-@router.post("/lmdeploy/remove")
-async def lmdeploy_remove() -> Dict:
-    installer = get_lmdeploy_installer()
-    try:
-        return await installer.remove()
-    except RuntimeError as exc:
-        raise HTTPException(status_code=409, detail=str(exc))
-
-
-@router.get("/lmdeploy/logs")
-async def lmdeploy_logs(max_bytes: int = 8192) -> Dict[str, str]:
-    """Get LMDeploy installer logs."""
-    installer = get_lmdeploy_installer()
-    max_bytes = max(1024, min(max_bytes, 1024 * 1024))
-    return {"log": installer.read_log_tail(max_bytes)}
-
-
-@router.get("/lmdeploy/runtime-logs")
-async def lmdeploy_runtime_logs(max_bytes: int = 8192) -> Dict[str, str]:
-    """Get LMDeploy runtime logs (from running server instances)."""
-    manager = get_lmdeploy_manager()
-    max_bytes = max(1024, min(max_bytes, 1024 * 1024))
-    return {"log": manager.read_log_tail(max_bytes)}
diff --git a/backend/routes/lmdeploy_versions.py b/backend/routes/lmdeploy_versions.py
new file mode 100644
index 0000000..944ec15
--- /dev/null
+++ b/backend/routes/lmdeploy_versions.py
@@ -0,0 +1,65 @@
+from typing import Dict, Optional
+
+import httpx
+from fastapi import APIRouter, HTTPException
+
+from backend.lmdeploy_manager import get_lmdeploy_manager
+
+router = APIRouter()
+
+
+@router.get("/lmdeploy/check-updates")
+async def lmdeploy_check_updates() -> Dict:
+    """Report the latest LMDeploy version and every release name listed on PyPI."""
+    try:
+        async with httpx.AsyncClient() as client:
+            response = await client.get("https://pypi.org/pypi/lmdeploy/json", timeout=10.0)
+            response.raise_for_status()
+            data = response.json()
+        # Missing "info"/"releases" keys degrade to None / [] rather than raising.
+        return {
+            "latest_version": data.get("info", {}).get("version"),
+            "releases": list(data.get("releases", {})),
+        }
+    except Exception as exc:
+        raise HTTPException(status_code=500, detail=f"Failed to check PyPI: {exc}") from exc
+
+
+@router.get("/lmdeploy/status")
+async def lmdeploy_installer_status() -> Dict:
+    """Return the status mapping reported by the LMDeploy manager."""
+    return get_lmdeploy_manager().status()
+
+
+@router.post("/lmdeploy/install")
+async def lmdeploy_install(request: Optional[Dict[str, str]] = None) -> Dict:
+    """Install an LMDeploy release; a RuntimeError from the manager is reported as HTTP 409."""
+    manager = get_lmdeploy_manager()
+    payload = request or {}
+    force_reinstall = bool(payload.get("force_reinstall"))  # NOTE(review): "false" is truthy too
+    try:
+        return await manager.install_release(version=payload.get("version"), force_reinstall=force_reinstall)
+    except RuntimeError as exc:
+        raise HTTPException(status_code=409, detail=str(exc)) from exc
+
+
+@router.post("/lmdeploy/install-source")
+async def lmdeploy_install_source(request: Optional[Dict[str, str]] = None) -> Dict:
+    """Install LMDeploy from a git repo and branch (for development); 409 on RuntimeError."""
+    manager = get_lmdeploy_manager()
+    payload = request or {}
+    repo_url = payload.get("repo_url", "https://github.com/InternLM/lmdeploy.git")
+    branch = payload.get("branch", "main")
+    try:
+        return await manager.install_from_source(repo_url=repo_url, branch=branch)
+    except RuntimeError as exc:
+        raise HTTPException(status_code=409, detail=str(exc)) from exc
+
+
+@router.post("/lmdeploy/remove")
+async def lmdeploy_remove() -> Dict:  # delegates to the LMDeploy manager's remove()
+    manager = get_lmdeploy_manager()
+    try:
+        return await manager.remove()
+    except RuntimeError as exc:  # RuntimeError from the manager -> HTTP 409
+        raise HTTPException(status_code=409, detail=str(exc)) from exc
diff --git a/backend/routes/models.py b/backend/routes/models.py
index 5859db4..cf739a2 100644
--- a/backend/routes/models.py
+++ b/backend/routes/models.py
@@ -8,7 +8,7 @@
 import re
 from datetime import datetime
 
-from backend.data_store import get_store, generate_proxy_name
+from backend.data_store import get_store, generate_proxy_name, resolve_proxy_name
 from backend.progress_manager import get_progress_manager
 from backend.huggingface import (
     search_models,
@@ -23,11 +23,8 @@
     list_safetensors_downloads,
     delete_safetensors_download,
     record_safetensors_download,
-    get_default_lmdeploy_config,
-    update_lmdeploy_config,
     list_grouped_safetensors_downloads,
     create_gguf_manifest_entry,
-    get_gguf_manifest_entry,
     get_safetensors_manifest_entries,
     save_safetensors_manifest_entries,
     DEFAULT_LMDEPLOY_CONTEXT,
@@ -35,7 +32,7 @@
     MAX_ROPE_SCALING_FACTOR,
     get_model_disk_size,
     get_accurate_file_sizes,
-    get_mmproj_f16_filename,
+    resolve_cached_model_path,
 )
 from backend.gpu_detector import get_gpu_info
 from backend.gguf_reader import get_model_layer_info
@@ -43,8 +40,6 @@
 
 logger = get_logger(__name__)
 from backend.llama_swap_config import get_supported_flags
-from backend.lmdeploy_manager import get_lmdeploy_manager
-from backend.lmdeploy_installer import get_lmdeploy_installer
 import psutil
 
 router = APIRouter()
@@ -68,6 +63,22 @@
     "minilm",
 ]
 
+
+def _is_mmproj_filename(filename: Optional[str]) -> bool:
+    lowered = (filename or "").strip().lower()  # None/blank collapses to "" and fails both checks
+    return "mmproj" in lowered and lowered.endswith(".gguf")
+
+
+async def _regenerate_llama_swap_config(reason: str):
+    """Regenerate the llama-swap config; any Exception is logged as a warning, not propagated."""
+    try:
+        from backend.llama_swap_manager import get_llama_swap_manager
+
+        await get_llama_swap_manager().regenerate_config_with_active_version()
+        logger.info("Regenerated llama-swap config after %s", reason)
+    except Exception as swap_error:
+        logger.warning("Failed to regenerate llama-swap config after %s: %s", reason, swap_error)
+
 # Lightweight cache for GPU info to avoid repeated NVML calls during rapid estimate requests
 _gpu_info_cache: Dict[str, Any] = {"data": None, "timestamp": 0.0}
 GPU_INFO_CACHE_TTL = 2.0  # seconds
@@ -115,7 +126,9 @@ def _get_actual_file_size(file_path: Optional[str]) -> Optional[int]:
     """Return actual file size in bytes from disk, or None if not available."""
     if not file_path:
         return None
-    path = _normalize_model_path(file_path)
+    # For new HF-backed models we do not store paths; this helper is only used for
+    # legacy/local models that still reference concrete filesystem locations.
+    path = file_path.replace("\\", "/")
     if not path or not os.path.exists(path):
         return None
     try:
@@ -125,54 +138,6 @@ def _get_actual_file_size(file_path: Optional[str]) -> Optional[int]:
         return None
 
 
-def _get_model_filename(model: dict) -> Optional[str]:
-    """Return the filename for a model record.
-
-    Prefers the dedicated ``filename`` field (new records). Falls back to
-    deriving it from the legacy ``file_path`` field (old records).
-    """
-    fname = model.get("filename")
-    if fname:
-        return fname
-    return _extract_filename(model.get("file_path")) or None
-
-
-def _get_model_file_path(model: dict) -> Optional[str]:
-    """Return the actual filesystem path for a model file.
-
-    Resolution order:
-    1. HF cache via huggingface_id + filename (new records).
-    2. Stored file_path (legacy records that still reference custom storage).
-    """
-    from backend.huggingface import resolve_cached_model_path
-
-    hf_id = model.get("huggingface_id")
-    filename = _get_model_filename(model)
-
-    if hf_id and filename:
-        cached = resolve_cached_model_path(hf_id, filename)
-        if cached:
-            return cached
-
-    return _normalize_model_path(model.get("file_path")) or None
-
-
-def _normalize_model_path(file_path: Optional[str]) -> Optional[str]:
-    if not file_path:
-        return None
-    normalized = file_path.replace("\\", "/")
-    normalized = os.path.normpath(normalized)
-    return normalized
-
-
-def _extract_filename(file_path: Optional[str]) -> str:
-    if not file_path:
-        return ""
-    normalized = file_path.replace("\\", "/")
-    parts = normalized.split("/")
-    return parts[-1] if parts else normalized
-
-
 def normalize_architecture(raw_architecture: str) -> str:
     """Normalize GGUF architecture string (stub after smart_auto removal)."""
     if not raw_architecture or not isinstance(raw_architecture, str):
@@ -233,25 +198,6 @@ def _assign_numeric(src_key: str, dest_keys):
     return defaults
 
 
-def _apply_hf_defaults_to_model(model: dict, metadata: Dict[str, Any], store) -> None:
-    if not metadata:
-        return
-    defaults = _derive_hf_defaults(metadata)
-    if not defaults:
-        return
-    config = _coerce_model_config(model.get("config"))
-    changed = False
-    for key, value in defaults.items():
-        if value is None:
-            continue
-        existing = config.get(key)
-        if existing in (None, "", 0):
-            config[key] = value
-            changed = True
-    if changed:
-        store.update_model(model["id"], {"config": config})
-
-
 def _coerce_model_config(config_value: Optional[Any]) -> Dict[str, Any]:
     """Return a dict regardless of whether config is stored as dict or JSON string."""
     if not config_value:
@@ -272,7 +218,11 @@ def _refresh_model_metadata_from_file(model: dict, store) -> Dict[str, Any]:
     Re-read GGUF metadata from disk and update the model record.
     Returns metadata details for downstream consumers.
     """
-    normalized_path = _get_model_file_path(model)
+    # Only supported for legacy/local models that still carry a concrete file_path.
+    file_path = model.get("file_path")
+    if not file_path:
+        raise FileNotFoundError("Model file not found on disk")
+    normalized_path = file_path.replace("\\", "/")
     if not normalized_path or not os.path.exists(normalized_path):
         raise FileNotFoundError("Model file not found on disk")
 
@@ -458,18 +408,21 @@ async def _save_safetensors_download(
 
     if not model_record:
         from datetime import timezone as _tz
+        # Safetensors-backed models are treated as a single logical entity per
+        # Hugging Face repo. Derive base name and type from the repo id, not the
+        # shard filename.
+        repo_name = huggingface_id.split("/")[-1] if isinstance(huggingface_id, str) else ""
+        base_model_name = repo_name or extract_base_model_name(filename)
+        model_type = extract_model_type(huggingface_id or repo_name or filename)
         model_record = {
             "id": model_id,
             "huggingface_id": huggingface_id,
-            "filename": filename,
-            "display_name": filename.replace(".safetensors", ""),
-            "base_model_name": extract_base_model_name(filename),
+            "display_name": base_model_name,
+            "base_model_name": base_model_name,
             "file_size": file_size,
-            "quantization": os.path.splitext(filename)[0],
-            "model_type": extract_model_type(filename),
+            "model_type": model_type,
             "downloaded_at": datetime.now(_tz.utc).isoformat(),
             "format": "safetensors",
-            "model_format": "safetensors",
             "pipeline_tag": detected_pipeline,
             "config": {"embedding": True} if is_embedding_like else {},
         }
@@ -498,7 +451,6 @@ async def _save_safetensors_download(
             store.update_model(model_id, updates)
         model_record = store.get_model(model_id) or model_record
 
-    lmdeploy_config = get_default_lmdeploy_config(max_context)
     record_safetensors_download(
         huggingface_id=huggingface_id,
         filename=filename,
@@ -506,7 +458,6 @@ async def _save_safetensors_download(
         file_size=file_size,
         metadata=safetensors_metadata,
         tensor_summary=tensor_summary,
-        lmdeploy_config=lmdeploy_config,
         model_id=model_record.get("id"),
     )
     logger.info(f"Safetensors download recorded for {huggingface_id}/{filename} (model_id={model_record.get('id')})")
@@ -518,12 +469,9 @@ def _get_safetensors_model(store, model_id: str) -> dict:
     model_format = (model.get("model_format") or model.get("format") or "gguf").lower()
     if model_format != "safetensors":
         raise HTTPException(status_code=400, detail="Model is not a safetensors download")
-    resolved_path = _get_model_file_path(model)
-    if not resolved_path or not os.path.exists(resolved_path):
-        raise HTTPException(status_code=400, detail="Model file not found on disk")
-    model = dict(model)
-    model["file_path"] = resolved_path
-    return model
+    # Safetensors models are treated as repo-level entities; concrete file paths
+    # are tracked in the safetensors manifest, not on the model record itself.
+    return dict(model)
 
 
 def _load_manifest_entry_for_model(model: dict) -> Dict[str, Any]:
@@ -658,472 +606,6 @@ def _sanitize(obj: Any) -> Any:
         status_code=400, detail="hf_overrides must be an object or JSON string"
     )
 
-
-def _validate_lmdeploy_config(
-    new_config: Optional[Dict[str, Any]], manifest_entry: Dict[str, Any]
-) -> Dict[str, Any]:
-    """
-    Merge and validate LMDeploy configuration.
-    """
-    if new_config is not None and not isinstance(new_config, dict):
-        raise HTTPException(status_code=400, detail="Config payload must be an object")
-
-    base_context_limit = _resolve_context_limit(manifest_entry)
-    stored_config = (manifest_entry.get("lmdeploy") or {}).get("config")
-    baseline = stored_config or get_default_lmdeploy_config(base_context_limit)
-    merged = dict(baseline)
-    if new_config:
-        merged.update(new_config)
-
-    def _as_int(key: str, minimum: int = 1, maximum: Optional[int] = None) -> int:
-        value = merged.get(key, minimum)
-        try:
-            value = int(value)
-        except (TypeError, ValueError):
-            raise HTTPException(status_code=400, detail=f"{key} must be an integer")
-        if value < minimum:
-            value = minimum
-        if maximum is not None and value > maximum:
-            value = maximum
-        return value
-
-    def _as_float(key: str, minimum: float, maximum: float) -> float:
-        value = merged.get(key, minimum)
-        try:
-            value = float(value)
-        except (TypeError, ValueError):
-            raise HTTPException(status_code=400, detail=f"{key} must be a number")
-        if value < minimum:
-            value = minimum
-        if value > maximum:
-            value = maximum
-        return value
-
-    legacy_keys = {
-        "context_length": "session_len",
-        "max_batch_tokens": "max_prefill_token_num",
-    }
-    for legacy, target in legacy_keys.items():
-        if legacy in merged and target not in merged:
-            merged[target] = merged[legacy]
-
-    session_len = _as_int("session_len", minimum=1024, maximum=base_context_limit)
-
-    raw_scaling_mode = str(
-        merged.get("rope_scaling_mode") or merged.get("rope_scaling_type") or "disabled"
-    ).lower()
-    if raw_scaling_mode in {"", "none", "disabled"}:
-        scaling_mode = "disabled"
-    else:
-        scaling_mode = raw_scaling_mode
-
-    scaling_factor_value = merged.get("rope_scaling_factor", 1.0)
-    try:
-        scaling_factor = float(scaling_factor_value)
-    except (TypeError, ValueError):
-        raise HTTPException(
-            status_code=400, detail="rope_scaling_factor must be a number"
-        )
-    if scaling_factor < 1.0:
-        scaling_factor = 1.0
-    if scaling_factor > MAX_ROPE_SCALING_FACTOR:
-        scaling_factor = MAX_ROPE_SCALING_FACTOR
-
-    if scaling_mode == "disabled" or scaling_factor <= 1.0:
-        scaling_mode = "disabled"
-        scaling_factor = 1.0
-    else:
-        # Scaling only makes sense when we know the base context; otherwise reject it.
-        if not base_context_limit:
-            raise HTTPException(
-                status_code=400,
-                detail="RoPE scaling cannot be enabled without a known base context length",
-            )
-
-        # Check if model_max_length > max_position_embeddings (means rope scaling can achieve model_max_length)
-        metadata = manifest_entry.get("metadata") or {}
-        config_data = (
-            metadata.get("config", {})
-            if isinstance(metadata.get("config"), dict)
-            else {}
-        )
-        model_max_length = _coerce_positive_int(metadata.get("model_max_length"))
-        max_position_embeddings = _coerce_positive_int(
-            config_data.get("max_position_embeddings")
-        )
-
-        if (
-            model_max_length
-            and max_position_embeddings
-            and model_max_length > max_position_embeddings
-        ):
-            # Adapt base context to model_max_length / 4 for scaling
-            # This allows 4x scaling to reach model_max_length
-            adapted_base = int(model_max_length / 4)
-            if adapted_base >= 1024:
-                session_len = adapted_base
-            else:
-                # If adapted base is too small, use base context limit
-                session_len = base_context_limit
-        else:
-            # Use base context limit (max_position_embeddings is used for clamping, not for scaling decisions)
-            session_len = base_context_limit
-
-    effective_session_len = session_len
-    if scaling_mode != "disabled":
-        effective_session_len = int(session_len * scaling_factor)
-        # Clamp to model_max_length if available, otherwise max_position_embeddings
-        metadata = manifest_entry.get("metadata") or {}
-        config_data = (
-            metadata.get("config", {})
-            if isinstance(metadata.get("config"), dict)
-            else {}
-        )
-        model_max_length = _coerce_positive_int(metadata.get("model_max_length"))
-        max_position_embeddings = _coerce_positive_int(
-            config_data.get("max_position_embeddings")
-        )
-        if model_max_length:
-            effective_session_len = min(effective_session_len, model_max_length)
-        elif max_position_embeddings:
-            effective_session_len = min(effective_session_len, max_position_embeddings)
-        # Also clamp to LMDeploy's maximum
-        effective_session_len = max(
-            session_len, min(effective_session_len, MAX_LMDEPLOY_CONTEXT)
-        )
-
-    merged["session_len"] = session_len
-    merged["effective_session_len"] = effective_session_len
-    merged["rope_scaling_mode"] = scaling_mode
-    merged["rope_scaling_factor"] = scaling_factor
-
-    max_context_token_num = _as_int(
-        "max_context_token_num",
-        minimum=session_len,
-        maximum=base_context_limit,
-    )
-    merged["max_context_token_num"] = max(max_context_token_num, session_len)
-
-    max_prefill_token_num = _as_int(
-        "max_prefill_token_num",
-        minimum=1,
-        maximum=None,
-    )
-    merged["max_prefill_token_num"] = max_prefill_token_num
-
-    merged["tensor_parallel"] = _as_int("tensor_parallel", minimum=1)
-    merged["max_batch_size"] = _as_int("max_batch_size", minimum=1)
-
-    merged["temperature"] = _as_float("temperature", 0.0, 2.0)
-    merged["top_p"] = _as_float("top_p", 0.0, 1.0)
-    merged["top_k"] = _as_int("top_k", minimum=1)
-    merged["kv_cache_percent"] = _as_float("kv_cache_percent", 0.0, 100.0)
-
-    # Note: tensor_split is kept for backward compatibility but not sent to LMDeploy (--tp-split doesn't exist)
-    tensor_split = merged.get("tensor_split") or []
-    if isinstance(tensor_split, str):
-        tensor_split = [
-            part.strip() for part in tensor_split.split(",") if part.strip()
-        ]
-    if tensor_split:
-        cleaned_split = []
-        for part in tensor_split:
-            try:
-                cleaned_split.append(float(part))
-            except (TypeError, ValueError):
-                raise HTTPException(
-                    status_code=400, detail="tensor_split values must be numbers"
-                )
-        merged["tensor_split"] = cleaned_split
-    else:
-        merged["tensor_split"] = []
-
-    # Server configuration validation
-    def _as_list(key: str) -> list:
-        value = merged.get(key)
-        if value is None:
-            return []
-        if isinstance(value, list):
-            return [str(v) for v in value]
-        if isinstance(value, str):
-            return [v.strip() for v in value.split(",") if v.strip()]
-        return [str(value)]
-
-    merged["allow_origins"] = _as_list("allow_origins")
-    merged["allow_credentials"] = bool(merged.get("allow_credentials", False))
-    merged["allow_methods"] = _as_list("allow_methods")
-    merged["allow_headers"] = _as_list("allow_headers")
-    merged["proxy_url"] = str(merged.get("proxy_url", "")).strip()
-    max_concurrent_requests = merged.get("max_concurrent_requests")
-    if max_concurrent_requests is not None:
-        merged["max_concurrent_requests"] = _as_int(
-            "max_concurrent_requests", minimum=1
-        )
-    log_level = merged.get("log_level")
-    if log_level is not None:
-        log_level = str(log_level).strip().upper()
-        valid_log_levels = {
-            "CRITICAL",
-            "FATAL",
-            "ERROR",
-            "WARN",
-            "WARNING",
-            "INFO",
-            "DEBUG",
-            "NOTSET",
-        }
-        if log_level and log_level not in valid_log_levels:
-            raise HTTPException(
-                status_code=400,
-                detail=f"log_level must be one of {sorted(valid_log_levels)}",
-            )
-        merged["log_level"] = log_level if log_level else None
-    else:
-        merged["log_level"] = None
-    merged["api_keys"] = _as_list("api_keys")
-    merged["ssl"] = bool(merged.get("ssl", False))
-    max_log_len = merged.get("max_log_len")
-    if max_log_len is not None:
-        merged["max_log_len"] = _as_int("max_log_len", minimum=1)
-    merged["disable_fastapi_docs"] = bool(merged.get("disable_fastapi_docs", False))
-    merged["allow_terminate_by_client"] = bool(
-        merged.get("allow_terminate_by_client", False)
-    )
-    merged["enable_abort_handling"] = bool(merged.get("enable_abort_handling", False))
-
-    # Model configuration validation
-    merged["chat_template"] = str(merged.get("chat_template", "")).strip()
-    merged["tool_call_parser"] = str(merged.get("tool_call_parser", "")).strip()
-    merged["reasoning_parser"] = str(merged.get("reasoning_parser", "")).strip()
-    merged["revision"] = str(merged.get("revision", "")).strip()
-    merged["download_dir"] = str(merged.get("download_dir", "")).strip()
-    merged["adapters"] = _as_list("adapters")
-    device = merged.get("device")
-    if device is not None:
-        device = str(device).strip().lower()
-        valid_devices = {"cuda", "ascend", "maca", "camb"}
-        if device and device not in valid_devices:
-            raise HTTPException(
-                status_code=400, detail=f"device must be one of {sorted(valid_devices)}"
-            )
-        merged["device"] = device if device else None
-    else:
-        merged["device"] = None
-    merged["eager_mode"] = bool(merged.get("eager_mode", False))
-    merged["disable_vision_encoder"] = bool(merged.get("disable_vision_encoder", False))
-    logprobs_mode = merged.get("logprobs_mode")
-    if logprobs_mode is not None:
-        logprobs_mode = str(logprobs_mode).strip()
-        valid_logprobs_modes = {"None", "raw_logits", "raw_logprobs"}
-        if logprobs_mode and logprobs_mode not in valid_logprobs_modes:
-            raise HTTPException(
-                status_code=400,
-                detail=f"logprobs_mode must be one of {sorted(valid_logprobs_modes)}",
-            )
-        merged["logprobs_mode"] = logprobs_mode if logprobs_mode else None
-    else:
-        merged["logprobs_mode"] = None
-
-    # DLLM parameters validation
-    dllm_block_length = merged.get("dllm_block_length")
-    if dllm_block_length is not None:
-        merged["dllm_block_length"] = _as_int("dllm_block_length", minimum=1)
-    dllm_unmasking_strategy = merged.get("dllm_unmasking_strategy")
-    if dllm_unmasking_strategy is not None:
-        dllm_unmasking_strategy = str(dllm_unmasking_strategy).strip()
-        valid_dllm_strategies = {
-            "low_confidence_dynamic",
-            "low_confidence_static",
-            "sequential",
-        }
-        if (
-            dllm_unmasking_strategy
-            and dllm_unmasking_strategy not in valid_dllm_strategies
-        ):
-            raise HTTPException(
-                status_code=400,
-                detail=f"dllm_unmasking_strategy must be one of {sorted(valid_dllm_strategies)}",
-            )
-        merged["dllm_unmasking_strategy"] = (
-            dllm_unmasking_strategy if dllm_unmasking_strategy else None
-        )
-    else:
-        merged["dllm_unmasking_strategy"] = None
-    dllm_denoising_steps = merged.get("dllm_denoising_steps")
-    if dllm_denoising_steps is not None:
-        merged["dllm_denoising_steps"] = _as_int("dllm_denoising_steps", minimum=1)
-    dllm_confidence_threshold = merged.get("dllm_confidence_threshold")
-    if dllm_confidence_threshold is not None:
-        merged["dllm_confidence_threshold"] = _as_float(
-            "dllm_confidence_threshold", 0.0, 1.0
-        )
-
-    # Distributed/Multi-node parameters validation
-    dp = merged.get("dp")
-    if dp is not None:
-        merged["dp"] = _as_int("dp", minimum=1)
-    ep = merged.get("ep")
-    if ep is not None:
-        merged["ep"] = _as_int("ep", minimum=1)
-    merged["enable_microbatch"] = bool(merged.get("enable_microbatch", False))
-    merged["enable_eplb"] = bool(merged.get("enable_eplb", False))
-    role = merged.get("role")
-    if role is not None:
-        role = str(role).strip()
-        valid_roles = {"Hybrid", "Prefill", "Decode"}
-        if role and role not in valid_roles:
-            raise HTTPException(
-                status_code=400, detail=f"role must be one of {sorted(valid_roles)}"
-            )
-        merged["role"] = role if role else None
-    else:
-        merged["role"] = None
-    migration_backend = merged.get("migration_backend")
-    if migration_backend is not None:
-        migration_backend = str(migration_backend).strip()
-        valid_migration_backends = {"DLSlime", "Mooncake"}
-        if migration_backend and migration_backend not in valid_migration_backends:
-            raise HTTPException(
-                status_code=400,
-                detail=f"migration_backend must be one of {sorted(valid_migration_backends)}",
-            )
-        merged["migration_backend"] = migration_backend if migration_backend else None
-    else:
-        merged["migration_backend"] = None
-    node_rank = merged.get("node_rank")
-    if node_rank is not None:
-        merged["node_rank"] = _as_int("node_rank", minimum=0)
-    nnodes = merged.get("nnodes")
-    if nnodes is not None:
-        merged["nnodes"] = _as_int("nnodes", minimum=1)
-    cp = merged.get("cp")
-    if cp is not None:
-        merged["cp"] = _as_int("cp", minimum=1)
-    merged["enable_return_routed_experts"] = bool(
-        merged.get("enable_return_routed_experts", False)
-    )
-    distributed_executor_backend = merged.get("distributed_executor_backend")
-    if distributed_executor_backend is not None:
-        distributed_executor_backend = str(distributed_executor_backend).strip()
-        valid_executor_backends = {"uni", "mp", "ray"}
-        if (
-            distributed_executor_backend
-            and distributed_executor_backend not in valid_executor_backends
-        ):
-            raise HTTPException(
-                status_code=400,
-                detail=f"distributed_executor_backend must be one of {sorted(valid_executor_backends)}",
-            )
-        merged["distributed_executor_backend"] = (
-            distributed_executor_backend if distributed_executor_backend else None
-        )
-    else:
-        merged["distributed_executor_backend"] = None
-
-    # Vision parameters validation
-    vision_max_batch_size = merged.get("vision_max_batch_size")
-    if vision_max_batch_size is not None:
-        merged["vision_max_batch_size"] = _as_int("vision_max_batch_size", minimum=1)
-
-    # Speculative decoding parameters validation
-    speculative_algorithm = merged.get("speculative_algorithm")
-    if speculative_algorithm is not None:
-        speculative_algorithm = str(speculative_algorithm).strip()
-        valid_speculative_algorithms = {"eagle", "eagle3", "deepseek_mtp"}
-        if (
-            speculative_algorithm
-            and speculative_algorithm not in valid_speculative_algorithms
-        ):
-            raise HTTPException(
-                status_code=400,
-                detail=f"speculative_algorithm must be one of {sorted(valid_speculative_algorithms)}",
-            )
-        merged["speculative_algorithm"] = (
-            speculative_algorithm if speculative_algorithm else None
-        )
-    else:
-        merged["speculative_algorithm"] = None
-    speculative_draft_model = merged.get("speculative_draft_model")
-    if speculative_draft_model is not None:
-        speculative_draft_model = str(speculative_draft_model).strip()
-        merged["speculative_draft_model"] = (
-            speculative_draft_model if speculative_draft_model else None
-        )
-    else:
-        merged["speculative_draft_model"] = None
-    speculative_num_draft_tokens = merged.get("speculative_num_draft_tokens")
-    if speculative_num_draft_tokens is not None:
-        merged["speculative_num_draft_tokens"] = _as_int(
-            "speculative_num_draft_tokens", minimum=1
-        )
-
-    # Boolean/style cleanups
-    merged["use_streaming"] = bool(merged.get("use_streaming", True))
-    additional_args = merged.get("additional_args")
-    if additional_args is None:
-        merged["additional_args"] = ""
-    elif not isinstance(additional_args, str):
-        raise HTTPException(status_code=400, detail="additional_args must be a string")
-
-    # Build hf_overrides from individual fields or use provided hf_overrides
-    hf_overrides_dict = _normalize_hf_overrides(merged.get("hf_overrides"))
-
-    # If scaling is enabled and model_max_length > max_position_embeddings,
-    # automatically set original_max_position_embeddings in HF overrides
-    if scaling_mode != "disabled":
-        metadata = manifest_entry.get("metadata") or {}
-        config_data = (
-            metadata.get("config", {})
-            if isinstance(metadata.get("config"), dict)
-            else {}
-        )
-        model_max_length = _coerce_positive_int(metadata.get("model_max_length"))
-        max_position_embeddings = _coerce_positive_int(
-            config_data.get("max_position_embeddings")
-        )
-
-        if (
-            model_max_length
-            and max_position_embeddings
-            and model_max_length > max_position_embeddings
-        ):
-            # Set original_max_position_embeddings to adapted base (model_max_length / 4)
-            adapted_base = int(model_max_length / 4)
-            if adapted_base >= 1024:
-                hf_overrides_dict.setdefault("rope_scaling", {})
-                hf_overrides_dict["rope_scaling"][
-                    "original_max_position_embeddings"
-                ] = adapted_base
-                # Also set rope_type if not already set and scaling mode is yarn
-                if (
-                    scaling_mode == "yarn"
-                    and "rope_type" not in hf_overrides_dict["rope_scaling"]
-                ):
-                    hf_overrides_dict["rope_scaling"]["rope_type"] = "yarn"
-                # Set factor if not already set
-                if "factor" not in hf_overrides_dict["rope_scaling"]:
-                    hf_overrides_dict["rope_scaling"]["factor"] = scaling_factor
-        elif max_position_embeddings and max_position_embeddings >= 1024:
-            # Fallback: use max_position_embeddings directly
-            hf_overrides_dict.setdefault("rope_scaling", {})
-            hf_overrides_dict["rope_scaling"][
-                "original_max_position_embeddings"
-            ] = max_position_embeddings
-            # Also set rope_type if not already set and scaling mode is yarn
-            if (
-                scaling_mode == "yarn"
-                and "rope_type" not in hf_overrides_dict["rope_scaling"]
-            ):
-                hf_overrides_dict["rope_scaling"]["rope_type"] = "yarn"
-            # Set factor if not already set
-            if "factor" not in hf_overrides_dict["rope_scaling"]:
-                hf_overrides_dict["rope_scaling"]["factor"] = scaling_factor
-
-    merged["hf_overrides"] = hf_overrides_dict
-
-    return merged
-
-
 class BundleProgressProxy:
     """Proxy progress manager that converts per-file progress into bundle-level updates."""
 
@@ -1228,12 +710,6 @@ async def get_cached_gpu_info() -> Dict[str, Any]:
 download_lock = asyncio.Lock()
 
 
-class EstimationRequest(BaseModel):
-    model_id: str  # YAML model id
-    config: dict
-    usage_mode: Optional[str] = "single_user"
-
-
 class SafetensorsBundleRequest(BaseModel):
     huggingface_id: str
     model_id: Optional[int] = None
@@ -1254,7 +730,10 @@ async def list_models():
     from backend.llama_swap_client import LlamaSwapClient
 
     store = get_store()
-    models = [m for m in store.list_models() if (m.get("format") or m.get("model_format") or "gguf") == "gguf"]
+    # Include all stored models (GGUF and safetensors). GGUF entries appear as
+    # individual quantizations; safetensors entries appear as a single logical
+    # quantization per repo with format "safetensors".
+    models = list(store.list_models())
     try:
         running_data = await LlamaSwapClient().get_running_models()
         running_list = running_data.get("running") or []
@@ -1266,7 +745,7 @@ async def list_models():
     for model in models:
         hf_id = model.get("huggingface_id") or ""
         base_name = model.get("base_model_name") or (hf_id.split("/")[-1] if hf_id else model.get("display_name") or "unknown")
-        proxy_name = generate_proxy_name(hf_id, model.get("quantization"))
+        proxy_name = resolve_proxy_name(model)
         is_active = proxy_name in running_names
         is_embedding = _model_is_embedding(model)
         key = f"{hf_id}_{base_name}"
@@ -1287,14 +766,21 @@ async def list_models():
             if is_embedding and not grouped_models[key].get("is_embedding_model"):
                 grouped_models[key]["is_embedding_model"] = True
 
-        # Resolve actual disk size: prefer HF cache, fall back to stored value
-        resolved_path = _get_model_file_path(model)
-        file_size = _get_actual_file_size(resolved_path) or model.get("file_size") or 0
+        # Resolve the file size to report:
+        # - For HF-backed GGUF models (identified by huggingface_id + quantization),
+        #   trust the aggregated file_size stored on the model record.
+        # - For all other records (safetensors, legacy/local), try file_path, then the stored file_size.
+        if (model.get("format") or model.get("model_format") or "gguf") == "gguf" and model.get("huggingface_id") and model.get("quantization"):
+            file_size = model.get("file_size") or 0
+        else:
+            legacy_path = model.get("file_path")
+            file_size = _get_actual_file_size(legacy_path) or model.get("file_size") or 0
 
         grouped_models[key]["quantizations"].append({
             "id": model.get("id"),
             "name": model.get("display_name") or model.get("name"),
-            "filename": _get_model_filename(model),
+            # No filename persisted for GGUF models; a model is a single logical
+            # entity per (huggingface_id, quantization).
             "file_size": file_size,
             "quantization": model.get("quantization"),
             "format": model.get("format") or model.get("model_format") or "gguf",
@@ -1302,6 +788,7 @@ async def list_models():
             "downloaded_at": model.get("downloaded_at"),
             "is_active": is_active,
             "has_config": bool(model.get("config")),
+            "mmproj_filename": model.get("mmproj_filename"),
             "huggingface_id": hf_id,
             "base_model_name": base_name,
             "model_type": model.get("model_type"),
@@ -1433,15 +920,9 @@ async def delete_safetensors_model(request: dict):
         if not target_model or (target_model.get("format") or target_model.get("model_format")) != "safetensors":
             raise HTTPException(status_code=404, detail="Safetensors model not found")
 
-        manager = get_lmdeploy_manager()
-        status = manager.status()
-        if status.get("running"):
-            current = status.get("current_instance") or {}
-            if str(current.get("model_id")) == str(model_id):
-                raise HTTPException(
-                    status_code=400,
-                    detail="Cannot delete a model currently served by LMDeploy",
-                )
+        # LMDeploy runtime is now managed via llama-swap; safetensors models
+        # are served through the same generic start/stop flow, so we don't
+        # need to special-case LMDeploy here.
 
         from backend.huggingface import (
             get_safetensors_manifest_entries,
@@ -1471,16 +952,8 @@ async def reload_safetensors_from_disk():
         from backend.huggingface import (
             SAFETENSORS_DIR,
             record_safetensors_download,
-            get_default_lmdeploy_config,
         )
 
-        manager = get_lmdeploy_manager()
-        if manager.status().get("running"):
-            raise HTTPException(
-                status_code=400,
-                detail="Cannot reload safetensors models while LMDeploy runtime is active. Please stop the runtime first.",
-            )
-
         store = get_store()
         safetensors_models = [
             m for m in store.list_models()
@@ -1587,46 +1060,6 @@ async def reload_safetensors_from_disk():
         raise HTTPException(status_code=500, detail=str(e))
 
 
-@router.get("/safetensors/{model_id:path}/lmdeploy/config")
-async def get_lmdeploy_config_endpoint(model_id: str):
-    """Return stored LMDeploy config and metadata for a safetensors model."""
-    store = get_store()
-    model = _get_safetensors_model(store, model_id)
-    manifest_entry = _load_manifest_entry_for_model(model)
-    metadata = manifest_entry.get("metadata") or {}
-    tensor_summary = manifest_entry.get("tensor_summary") or {}
-    max_context = manifest_entry.get("max_context_length") or metadata.get(
-        "max_context_length"
-    )
-    config = (manifest_entry.get("lmdeploy") or {}).get(
-        "config"
-    ) or get_default_lmdeploy_config(max_context)
-    manager_status = get_lmdeploy_manager().status()
-    installer_status = get_lmdeploy_installer().status()
-    return {
-        "config": config,
-        "metadata": metadata,
-        "tensor_summary": tensor_summary,
-        "max_context_length": max_context,
-        "manager": manager_status,
-        "installer": installer_status,
-    }
-
-
-@router.put("/safetensors/{model_id:path}/lmdeploy/config")
-async def update_lmdeploy_config_endpoint(model_id: str, request: Dict[str, Any]):
-    """Persist LMDeploy configuration changes for a safetensors model."""
-    store = get_store()
-    model = _get_safetensors_model(store, model_id)
-    manifest_entry = _load_manifest_entry_for_model(model)
-    validated_config = _validate_lmdeploy_config(request, manifest_entry)
-    updated_entry = update_lmdeploy_config(model.get("huggingface_id"), validated_config)
-    return {
-        "config": updated_entry.get("lmdeploy", {}).get("config", validated_config),
-        "updated_at": updated_entry.get("lmdeploy", {}).get("updated_at"),
-    }
-
-
 @router.post("/safetensors/{model_id:path}/metadata/regenerate")
 async def regenerate_safetensors_metadata_endpoint(model_id: str):
     """Refresh safetensors metadata/manifest entries without redownloading files."""
@@ -1705,11 +1138,6 @@ async def regenerate_safetensors_metadata_endpoint(model_id: str):
     if max_context:
         manifest["max_context_length"] = max_context
 
-    manifest.setdefault("lmdeploy", {})
-    manifest["lmdeploy"].setdefault(
-        "config", get_default_lmdeploy_config(manifest.get("max_context_length"))
-    )
-
     save_safetensors_manifest_entries(huggingface_id, manifest)
     return {
         "message": f"Metadata regenerated for {huggingface_id}",
@@ -1718,153 +1146,6 @@ async def regenerate_safetensors_metadata_endpoint(model_id: str):
     }
 
 
-@router.get("/safetensors/lmdeploy/status")
-async def get_lmdeploy_status():
-    """Return LMDeploy runtime status and running instance info."""
-    installer = get_lmdeploy_installer()
-    installer_status = installer.status()
-    if not installer_status.get("installed"):
-        raise HTTPException(
-            status_code=400,
-            detail="LMDeploy is not installed. Install it from the LMDeploy page before starting a runtime.",
-        )
-    if installer_status.get("operation"):
-        raise HTTPException(
-            status_code=409,
-            detail="An LMDeploy install/remove operation is still running. Try again once it finishes.",
-        )
-
-    manager = get_lmdeploy_manager()
-    manager_status = manager.status()
-
-    # Use manager's in-memory current_instance (no DB)
-    instance_payload = None
-    if manager_status.get("running"):
-        current_instance = manager_status.get("current_instance")
-        if current_instance:
-            instance_payload = {
-                "model_id": current_instance.get("model_id"),
-                "started_at": current_instance.get("started_at"),
-                "config": current_instance.get("config") if isinstance(current_instance.get("config"), dict) else {},
-            }
-
-    return {
-        "manager": manager_status,
-        "installer": installer.status(),
-        "running_instance": instance_payload,
-    }
-
-
-@router.post("/safetensors/{model_id:path}/lmdeploy/start")
-async def start_lmdeploy_runtime(
-    model_id: str,
-    request: Optional[Dict[str, Any]] = None,
-):
-    """Start LMDeploy runtime for a safetensors model."""
-    store = get_store()
-    model = _get_safetensors_model(store, model_id)
-    manifest_entry = _load_manifest_entry_for_model(model)
-    requested_config = (
-        (request or {}).get("config") if isinstance(request, dict) else None
-    )
-    validated_config = _validate_lmdeploy_config(requested_config, manifest_entry)
-
-    manager = get_lmdeploy_manager()
-    status = manager.status()
-    current_instance = status.get("current_instance") or {}
-    if status.get("running"):
-        if current_instance.get("model_id") == model.get("id"):
-            raise HTTPException(
-                status_code=400, detail="LMDeploy is already running for this model"
-            )
-        raise HTTPException(
-            status_code=400,
-            detail="Another safetensors model is already running via LMDeploy",
-        )
-
-    update_lmdeploy_config(model.get("huggingface_id"), validated_config)
-
-    try:
-        pm = get_progress_manager()
-        await pm.send_model_status_update(
-            model_id=model.get("id"),
-            status="starting",
-            details={
-                "runtime": "lmdeploy",
-                "message": f"Starting LMDeploy for {model.get('display_name') or model.get('name')}",
-            },
-        )
-    except Exception:
-        pass
-
-    try:
-        display_name = model.get("huggingface_id") or model.get("base_model_name") or model.get("display_name") or model.get("name")
-        resolved_file_path = _get_model_file_path(model)
-        model_dir = os.path.dirname(resolved_file_path or "")
-        runtime_status = await manager.start(
-            {
-                "model_id": model.get("id"),
-                "huggingface_id": model.get("huggingface_id"),
-                "file_path": resolved_file_path,
-                "model_dir": model_dir,
-                "model_name": display_name,
-                "display_name": display_name,
-            },
-            validated_config,
-        )
-    except Exception as exc:
-        try:
-            await get_progress_manager().send_model_status_update(
-                model_id=model.get("id"),
-                status="error",
-                details={"runtime": "lmdeploy", "message": str(exc)},
-            )
-        except Exception:
-            pass
-        raise HTTPException(status_code=500, detail=str(exc))
-
-    try:
-        await get_progress_manager().send_model_status_update(
-            model_id=model.get("id"),
-            status="running",
-            details={"runtime": "lmdeploy", "message": "LMDeploy is ready"},
-        )
-    except Exception:
-        pass
-
-    return {"manager": runtime_status, "config": validated_config}
-
-
-@router.post("/safetensors/{model_id:path}/lmdeploy/stop")
-async def stop_lmdeploy_runtime(model_id: str):
-    """Stop the LMDeploy runtime if it is running."""
-    manager = get_lmdeploy_manager()
-    status = manager.status()
-    if not status.get("running"):
-        raise HTTPException(status_code=404, detail="No LMDeploy runtime is active")
-    current_instance = status.get("current_instance") or {}
-    if str(current_instance.get("model_id")) != str(model_id):
-        raise HTTPException(
-            status_code=400, detail="A different model is currently running in LMDeploy"
-        )
-
-    try:
-        await manager.stop()
-    except Exception as exc:
-        raise HTTPException(status_code=500, detail=str(exc))
-
-    try:
-        await get_progress_manager().send_model_status_update(
-            model_id=model_id,
-            status="stopped",
-            details={"runtime": "lmdeploy", "message": "LMDeploy runtime stopped"},
-        )
-    except Exception:
-        pass
-
-    return {"message": "LMDeploy runtime stopped"}
-
-
 @router.post("/download")
 async def download_huggingface_model(
     request: dict, background_tasks: BackgroundTasks
@@ -1900,8 +1181,9 @@ async def download_huggingface_model(
             )
 
         store = get_store()
+        is_mmproj_download = model_format == "gguf" and "mmproj" in filename.lower()
         # Check if this specific quantization already exists
-        if model_format == "gguf":
+        if model_format == "gguf" and not is_mmproj_download:
             quantization = _extract_quantization(filename)
             model_id = f"{huggingface_id.replace('/', '--')}--{quantization}"
             if store.get_model(model_id):
@@ -1911,7 +1193,9 @@ async def download_huggingface_model(
 
         # Extract quantization for better task_id (use same function as search results)
         quantization = (
-            _extract_quantization(filename)
+            os.path.splitext(os.path.basename(filename))[0]
+            if is_mmproj_download
+            else _extract_quantization(filename)
             if model_format == "gguf"
             else os.path.splitext(filename)[0]
         )
@@ -2023,6 +1307,7 @@ async def download_model_task(
     try:
         model_record = None
         metadata_result = None
+        is_mmproj_download = model_format == "gguf" and "mmproj" in filename.lower()
 
         if progress_manager and task_id:
             file_path, file_size = await download_model_with_progress(
@@ -2039,7 +1324,7 @@ async def download_model_task(
                 huggingface_id, filename, model_format
             )
 
-        if model_format == "gguf":
+        if model_format == "gguf" and not is_mmproj_download:
             model_record, metadata_result = await _record_gguf_download_post_fetch(
                 store,
                 huggingface_id,
@@ -2047,29 +1332,10 @@ async def download_model_task(
                 file_path,
                 file_size,
                 pipeline_tag=pipeline_tag,
+                aggregate_size=True,
             )
-            # If vision (mmproj) is available, download F16 projector so the model can run with vision
-            if model_record:
-                mmproj_filename = get_mmproj_f16_filename(huggingface_id)
-                if mmproj_filename:
-                    try:
-                        await download_model(
-                            huggingface_id, mmproj_filename, "gguf"
-                        )
-                        store.update_model(
-                            model_record["id"], {"mmproj_filename": mmproj_filename}
-                        )
-                        model_record = store.get_model(model_record["id"]) or model_record
-                        if progress_manager and task_id:
-                            await progress_manager.send_notification(
-                                title="Vision extension",
-                                message=f"Downloaded {mmproj_filename} for vision support",
-                                type="info",
-                            )
-                    except Exception as mmproj_err:
-                        logger.warning(
-                            f"Could not download vision projector {mmproj_filename} for {huggingface_id}: {mmproj_err}"
-                        )
+        elif model_format == "gguf":
+            logger.info("Downloaded standalone mmproj file for %s: %s", huggingface_id, filename)
         else:
             model_record = await _save_safetensors_download(
                 store,
@@ -2137,13 +1403,20 @@ async def _record_gguf_download_post_fetch(
     file_path: str,
     file_size: int,
     pipeline_tag: Optional[str] = None,
+    aggregate_size: bool = True,
 ) -> Tuple[dict, Optional[Dict[str, Any]]]:
     """
     Shared helper to create GGUF model entries and manifest after a file has been downloaded.
     Returns (model_record dict, metadata_result).
     """
     quantization = _extract_quantization(filename)
-    base_model_name = extract_base_model_name(filename)
+    # Derive the base model name from the Hugging Face repo id instead of any
+    # specific filename. For typical repos like "unsloth/Qwen3.5-0.8B-GGUF",
+    # this yields "Qwen3.5-0.8B".
+    repo_name = huggingface_id.split("/")[-1] if isinstance(huggingface_id, str) else ""
+    base_model_name = repo_name
+    if repo_name.endswith("-GGUF"):
+        base_model_name = repo_name[: -len("-GGUF")]
     detected_pipeline = pipeline_tag
     is_embedding_like = _looks_like_embedding_model(
         detected_pipeline,
@@ -2160,18 +1433,20 @@ async def _record_gguf_download_post_fetch(
 
     if not model_record:
         from datetime import timezone as _tz
+        # New GGUF records do not persist any per-file name. The model is a single
+        # logical entity identified by (huggingface_id, quantization).
         model_record = {
             "id": model_id,
             "huggingface_id": huggingface_id,
-            "filename": filename,
-            "display_name": filename.replace(".gguf", ""),
+            "display_name": f"{base_model_name}-{quantization}",
             "base_model_name": base_model_name,
-            "file_size": file_size,
+            "file_size": file_size if aggregate_size else 0,
             "quantization": quantization,
             "model_type": extract_model_type(filename),
             "proxy_name": generate_proxy_name(huggingface_id, quantization),
+            # Persist only the canonical "format" field. "model_format" is still
+            # read for backward compatibility but no longer written for new records.
             "format": "gguf",
-            "model_format": "gguf",
             "downloaded_at": datetime.now(_tz.utc).isoformat(),
             "pipeline_tag": detected_pipeline,
             "config": {"embedding": True} if is_embedding_like else {},
@@ -2179,7 +1454,7 @@ async def _record_gguf_download_post_fetch(
         store.add_model(model_record)
     else:
         updates = {}
-        if file_size and file_size > 0:
+        if aggregate_size and file_size and file_size > 0:
             current_size = model_record.get("file_size") or 0
             updates["file_size"] = current_size + file_size
         if not model_record.get("pipeline_tag") and detected_pipeline:
@@ -2211,12 +1486,6 @@ async def _record_gguf_download_post_fetch(
         )
     except Exception as manifest_exc:
         logger.warning(f"Failed to record GGUF manifest entry for {filename}: {manifest_exc}")
-    if manifest_entry:
-        metadata_for_defaults = manifest_entry.get("metadata") or {}
-        try:
-            _apply_hf_defaults_to_model(model_record, metadata_for_defaults, store)
-        except Exception as default_exc:
-            logger.warning(f"Failed to apply HF defaults for model {model_record.get('id')}: {default_exc}")
 
     return model_record, metadata_result
 
@@ -2336,16 +1605,21 @@ async def download_gguf_bundle_task(
     task_id: str,
     total_bundle_bytes: int = 0,
     pipeline_tag: Optional[str] = None,
+    projector: Optional[Dict[str, Any]] = None,
 ):
     store = get_store()
     try:
-        total_files = len(files)
+        total_files = len(files) + (1 if projector and projector.get("filename") else 0)
         bytes_completed = 0
         aggregate_total = total_bundle_bytes or sum(
             max(f.get("size") or 0, 0) for f in files
         )
         aggregate_total = aggregate_total or None
 
+        # Track the total on-disk size of all GGUF shards for this quantization only
+        # (projector size is stored separately on the model record).
+        bundle_model_bytes = 0
+
         for index, file_info in enumerate(files):
             filename = file_info["filename"]
             size_hint = max(file_info.get("size") or 0, 0)
@@ -2372,6 +1646,9 @@ async def download_gguf_bundle_task(
             )
 
             try:
+                # For bundles, record manifest/metadata per shard but do not
+                # increment the model's stored file_size here. We will set the
+                # final aggregated size once at the end of the bundle download.
                 await _record_gguf_download_post_fetch(
                     store,
                     huggingface_id,
@@ -2379,11 +1656,61 @@ async def download_gguf_bundle_task(
                     file_path,
                     file_size,
                     pipeline_tag=pipeline_tag,
+                    aggregate_size=False,
                 )
             except Exception as exc:
                 logger.error(f"Failed to record GGUF download for {filename}: {exc}")
 
             bytes_completed += file_size
+            bundle_model_bytes += file_size
+
+        model_id = f"{huggingface_id.replace('/', '--')}--{quantization}"
+        model_record = store.get_model(model_id)
+
+        projector_filename = (projector or {}).get("filename")
+        if projector_filename and model_record:
+            projector_size_hint = max(int((projector or {}).get("size") or 0), 0)
+            cached_projector = resolve_cached_model_path(huggingface_id, projector_filename)
+            if cached_projector and os.path.exists(cached_projector):
+                try:
+                    bytes_completed += os.path.getsize(cached_projector)
+                except OSError:
+                    bytes_completed += projector_size_hint
+            else:
+                proxy = BundleProgressProxy(
+                    progress_manager,
+                    task_id,
+                    bytes_completed,
+                    aggregate_total or 0,
+                    len(files),
+                    total_files,
+                    projector_filename,
+                    huggingface_id,
+                    "gguf-bundle",
+                )
+                _, projector_file_size = await download_model_with_progress(
+                    huggingface_id,
+                    projector_filename,
+                    proxy,
+                    task_id,
+                    projector_size_hint,
+                    "gguf",
+                    huggingface_id,
+                )
+                bytes_completed += projector_file_size
+
+            store.update_model(model_id, {"mmproj_filename": projector_filename})
+
+        # Persist the aggregated GGUF shard size on the model record once,
+        # after all shards have been downloaded.
+        if model_record and bundle_model_bytes > 0:
+            try:
+                store.update_model(model_id, {"file_size": bundle_model_bytes})
+                model_record = store.get_model(model_id) or model_record
+            except Exception as size_exc:
+                logger.warning(
+                    f"Failed to update aggregated GGUF size for {model_id}: {size_exc}"
+                )
 
         final_total = aggregate_total or bytes_completed
         await progress_manager.send_download_progress(
@@ -2410,6 +1737,8 @@ async def download_gguf_bundle_task(
                 "model_format": "gguf-bundle",
                 "quantization": quantization,
                 "filenames": [f["filename"] for f in files],
+                "mmproj_filename": projector_filename,
+                "model_id": model_id,
                 "timestamp": datetime.utcnow().isoformat(),
             }
         )
@@ -2508,6 +1837,8 @@ async def download_gguf_bundle(
     quantization = request.get("quantization")
     files = request.get("files") or []
     pipeline_tag = request.get("pipeline_tag")
+    projector_filename = (request.get("mmproj_filename") or "").strip()
+    projector_size = max(int(request.get("mmproj_size") or 0), 0)
 
     if not huggingface_id:
         raise HTTPException(status_code=400, detail="huggingface_id is required")
@@ -2515,6 +1846,8 @@ async def download_gguf_bundle(
         raise HTTPException(status_code=400, detail="quantization is required")
     if not files:
         raise HTTPException(status_code=400, detail="Repository file list is required")
+    if projector_filename and not _is_mmproj_filename(projector_filename):
+        raise HTTPException(status_code=400, detail="Invalid projector filename")
 
     sanitized_files = []
     declared_total = 0
@@ -2529,6 +1862,11 @@ async def download_gguf_bundle(
     if not sanitized_files:
         raise HTTPException(status_code=400, detail="No valid files to download")
 
+    projector_payload = None
+    if projector_filename:
+        declared_total += projector_size
+        projector_payload = {"filename": projector_filename, "size": projector_size}
+
     task_id = f"download_gguf_bundle_{huggingface_id.replace('/', '_')}_{quantization}_{int(time.time() * 1000)}"
 
     async with download_lock:
@@ -2560,6 +1898,7 @@ async def download_gguf_bundle(
         task_id,
         declared_total,
         pipeline_tag,
+        projector_payload,
     )
 
     return {
@@ -2573,6 +1912,150 @@ async def download_gguf_bundle(
 # Removed duplicate extract_quantization; use `_extract_quantization` from backend.huggingface
 
 
+async def download_model_projector_task(
+    model_id: str,
+    mmproj_filename: str,
+    progress_manager,
+    task_id: str,
+    total_bytes: int = 0,
+):
+    """
+    Background task that attaches the projector `mmproj_filename` to a model.
+    Reuses the file when `resolve_cached_model_path` finds it on disk, otherwise
+    downloads it, then stores the filename on the model record and calls
+    `_regenerate_llama_swap_config`. Errors are logged and reported through
+    `progress_manager` (when given) instead of being raised.
+    """
+    store = get_store()
+    try:
+        model = store.get_model(model_id)
+        if not model:
+            raise RuntimeError("Model no longer exists")
+
+        huggingface_id = model.get("huggingface_id")
+        if not huggingface_id:
+            raise RuntimeError("Model is missing huggingface_id")
+
+        cached_path = resolve_cached_model_path(huggingface_id, mmproj_filename)
+        if cached_path and os.path.exists(cached_path):
+            file_path = cached_path
+            try:
+                file_size = os.path.getsize(cached_path)
+            except OSError:
+                file_size = max(int(total_bytes or 0), 0)
+        else:
+            file_path, file_size = await download_model_with_progress(
+                huggingface_id, mmproj_filename, progress_manager, task_id,
+                total_bytes, "gguf", huggingface_id,
+            )
+
+        store.update_model(model_id, {"mmproj_filename": mmproj_filename})
+        await _regenerate_llama_swap_config(f"projector update for {model_id}")
+
+        if progress_manager:
+            progress_manager.complete_task(task_id, f"Applied projector {mmproj_filename}")
+            await progress_manager.broadcast({
+                "type": "download_complete",
+                "huggingface_id": huggingface_id,
+                "model_format": "gguf-projector",
+                "model_id": model_id,
+                "filename": mmproj_filename,
+                "mmproj_filename": mmproj_filename,
+                "file_size": file_size,
+                "file_path": file_path,
+                "timestamp": datetime.utcnow().isoformat(),
+            })
+            await progress_manager.send_notification(
+                title="Projector Ready",
+                message=f"Applied projector {mmproj_filename}",
+                type="success",
+            )
+    except Exception as exc:
+        # The error is not re-raised, so without this log it would be lost when no progress_manager is given.
+        logger.exception("Projector update failed for %s (%s)", model_id, mmproj_filename)
+        if progress_manager:
+            progress_manager.fail_task(task_id, str(exc))
+            await progress_manager.send_notification(
+                title="Projector Update Failed", message=str(exc), type="error"
+            )
+    finally:
+        if task_id:
+            async with download_lock:
+                active_downloads.pop(task_id, None)
+
+
+@router.post("/{model_id:path}/projector")
+async def update_model_projector(
+    model_id: str,
+    request: dict,
+    background_tasks: BackgroundTasks,
+):
+    """
+    Set, change or clear the mmproj projector of a GGUF model.
+    Body: `mmproj_filename` (empty or missing clears the projector) and an
+    optional integer `total_bytes` size hint. A file that `resolve_cached_model_path`
+    finds on disk is applied immediately; otherwise a background download task is
+    queued and the response carries its `task_id` with `applied` set to False.
+    """
+    store = get_store()
+    model = _get_model_or_404(store, model_id)
+    if (model.get("format") or model.get("model_format")) != "gguf":
+        raise HTTPException(status_code=400, detail="Projectors are only supported for GGUF models")
+
+    mmproj_filename = (request.get("mmproj_filename") or "").strip() or None
+    try:
+        total_bytes = max(int(request.get("total_bytes") or 0), 0)
+    except (TypeError, ValueError):
+        raise HTTPException(status_code=400, detail="total_bytes must be an integer")
+
+    if mmproj_filename and not _is_mmproj_filename(mmproj_filename):
+        raise HTTPException(status_code=400, detail="Invalid projector filename")
+
+    current_projector = model.get("mmproj_filename")
+    if mmproj_filename == current_projector:
+        return {"message": "Projector already selected", "applied": True}
+
+    if not mmproj_filename:
+        store.update_model(model_id, {"mmproj_filename": None})
+        await _regenerate_llama_swap_config(f"projector cleared for {model_id}")
+        return {"message": "Projector cleared", "applied": True}
+
+    huggingface_id = model.get("huggingface_id")
+    cached_path = resolve_cached_model_path(huggingface_id, mmproj_filename)
+    if cached_path and os.path.exists(cached_path):
+        store.update_model(model_id, {"mmproj_filename": mmproj_filename})
+        await _regenerate_llama_swap_config(f"projector update for {model_id}")
+        return {"message": "Projector applied", "applied": True}
+
+    task_id = f"download_projector_{model_id.replace('/', '_')}_{int(time.time() * 1000)}"
+    async with download_lock:
+        is_downloading = any(
+            d.get("model_id") == model_id
+            and d.get("filename") == mmproj_filename
+            and d.get("model_format") == "gguf-projector"
+            for d in active_downloads.values()
+        )
+        if is_downloading:
+            raise HTTPException(status_code=409, detail="This projector is already being applied")
+        active_downloads[task_id] = {
+            "huggingface_id": huggingface_id,
+            "model_id": model_id,
+            "filename": mmproj_filename,
+            "model_format": "gguf-projector",
+        }
+
+    pm = get_progress_manager()
+    pm.create_task(
+        "download",
+        f"Projector {mmproj_filename}",
+        {"huggingface_id": huggingface_id, "filename": mmproj_filename, "model_id": model_id},
+        task_id=task_id,
+    )
+    background_tasks.add_task(
+        download_model_projector_task, model_id, mmproj_filename, pm, task_id, total_bytes
+    )
+    return {"message": "Projector download started", "task_id": task_id, "applied": False}
+
+
 def extract_model_type(filename: str) -> str:
     """Extract model type from filename"""
     filename_lower = filename.lower()
@@ -2584,6 +2067,10 @@ def extract_model_type(filename: str) -> str:
         return "codellama"
     elif "gemma" in filename_lower:
         return "gemma"
+    # Heuristic: treat any Qwen-family filename as "qwen" unless a more
+    # specific architecture is provided by GGUF metadata later.
+    elif "qwen" in filename_lower:
+        return "qwen"
     return "unknown"
 
 
@@ -2612,6 +2099,40 @@ def extract_base_model_name(filename: str) -> str:
     return name if name else filename
 
 
+@router.get("/{model_id:path}/limits")
+async def get_model_limits(model_id: str):
+    """
+    Return model limits in an engine-agnostic way, based on what
+    get_model_details() reports for the model's Hugging Face repo id.
+    - max_context_length: model_max_length, else config max_position_embeddings.
+    - layer_count: first positive of num_hidden_layers / n_layer / num_layers.
+    A limit that cannot be determined is None; lookup failures are logged, not raised.
+    """
+    store = get_store()
+    model = _get_model_or_404(store, model_id)
+    hf_id = model.get("huggingface_id")
+    if not hf_id:
+        return {"max_context_length": None, "layer_count": None}
+
+    max_ctx = None
+    layer_count = None
+    try:
+        details = await get_model_details(hf_id)
+        config = details.get("config") or {}
+        raw_ctx = details.get("model_max_length") or config.get("max_position_embeddings")
+        if isinstance(raw_ctx, (int, float)) and raw_ctx > 0:
+            max_ctx = int(raw_ctx)
+        for key in ("num_hidden_layers", "n_layer", "num_layers"):
+            val = config.get(key)
+            if isinstance(val, (int, float)) and val > 0:
+                layer_count = int(val)
+                break
+    except Exception as exc:
+        # Best-effort endpoint: still answer with whatever was resolved, but leave a trace.
+        logger.warning("Failed to resolve limits for %s (%s): %s", model_id, hf_id, exc)
+    return {"max_context_length": max_ctx, "layer_count": layer_count}
+
+
 @router.get("/{model_id:path}/config")
 async def get_model_config(model_id: str):
     """Get model's llama.cpp configuration"""
@@ -2640,43 +2161,6 @@ async def update_model_config(model_id: str, config: dict):
     return {"message": "Configuration updated"}
 
 
-# DEPRECATED: remove with ModelConfig.vue rewrite
-@router.post("/{model_id:path}/auto-config")
-async def generate_auto_config(model_id: str):
-    """Stub: return current config (Smart Auto removed). Optionally apply defaults."""
-    store = get_store()
-    model = _get_model_or_404(store, model_id)
-    config = (model.get("config") or {}).copy()
-    config.setdefault("ctx_size", 2048)
-    config.setdefault("batch_size", 512)
-    config.setdefault("threads", 4)
-    config.setdefault("n_gpu_layers", -1)
-    store.update_model(model_id, {"config": config})
-    return config
-
-
-# DEPRECATED: remove with ModelConfig.vue rewrite
-@router.post("/{model_id:path}/smart-auto")
-async def generate_smart_auto_config(
-    model_id: str,
-    preset: Optional[str] = None,
-    usage_mode: str = "single_user",
-    speed_quality: Optional[int] = None,
-    use_case: Optional[str] = None,
-    debug: Optional[bool] = False,
-):
-    """Stub: apply defaults (Smart Auto removed)."""
-    store = get_store()
-    model = _get_model_or_404(store, model_id)
-    config = (model.get("config") or {}).copy()
-    config.setdefault("ctx_size", 2048)
-    config.setdefault("batch_size", 512)
-    config.setdefault("threads", 4)
-    config.setdefault("n_gpu_layers", -1)
-    store.update_model(model_id, {"config": config})
-    return config
-
-
 @router.post("/{model_id:path}/start")
 async def start_model(model_id: str):
     """Start model via llama-swap"""
@@ -2684,9 +2168,7 @@ async def start_model(model_id: str):
 
     store = get_store()
     model = _get_model_or_404(store, model_id)
-    proxy_model_name = model.get("proxy_name") or generate_proxy_name(
-        model.get("huggingface_id"), model.get("quantization")
-    )
+    proxy_model_name = resolve_proxy_name(model)
 
     try:
         running_data = await LlamaSwapClient().get_running_models()
@@ -2717,6 +2199,9 @@ async def start_model(model_id: str):
         await llama_swap_manager.regenerate_config_with_active_version()
         model_with_proxy = {**(model or {}), "proxy_name": proxy_model_name}
         await llama_swap_manager.register_model(model_with_proxy, config)
+        client = LlamaSwapClient()
+        client.mark_model_loading(proxy_model_name)
+        await client.load_model(proxy_model_name)
     except Exception as e:
         try:
             await get_progress_manager().send_model_status_update(
@@ -2748,9 +2233,7 @@ async def stop_model(model_id: str):
 
     store = get_store()
     model = _get_model_or_404(store, model_id)
-    proxy_name = model.get("proxy_name") or generate_proxy_name(
-        model.get("huggingface_id"), model.get("quantization")
-    )
+    proxy_name = resolve_proxy_name(model)
 
     try:
         running_data = await LlamaSwapClient().get_running_models()
@@ -2862,7 +2345,7 @@ async def delete_model_group(request: DeleteGroupRequest):
 
     deleted_count = 0
     for model in models:
-        proxy_name = model.get("proxy_name") or generate_proxy_name(model.get("huggingface_id"), model.get("quantization"))
+        proxy_name = resolve_proxy_name(model)
         if proxy_name in running_names:
             try:
                 from backend.llama_swap_manager import get_llama_swap_manager
@@ -2870,15 +2353,6 @@ async def delete_model_group(request: DeleteGroupRequest):
             except Exception as e:
                 logger.warning(f"Failed to stop model {proxy_name}: {e}")
 
-        fname = _get_model_filename(model)
-        if model.get("huggingface_id") and fname:
-            from backend.huggingface import delete_cached_model_file
-            deleted_file = delete_cached_model_file(model.get("huggingface_id"), fname)
-            if not deleted_file:
-                legacy_path = _normalize_model_path(model.get("file_path"))
-                if legacy_path and os.path.exists(legacy_path):
-                    os.remove(legacy_path)
-
         store.delete_model(model.get("id"))
         deleted_count += 1
 
@@ -2892,7 +2366,7 @@ async def delete_model(model_id: str):
 
     store = get_store()
     model = _get_model_or_404(store, model_id)
-    proxy_name = model.get("proxy_name") or generate_proxy_name(model.get("huggingface_id"), model.get("quantization"))
+    proxy_name = resolve_proxy_name(model)
 
     try:
         running_data = await LlamaSwapClient().get_running_models()
@@ -2907,142 +2381,10 @@ async def delete_model(model_id: str):
         except Exception as e:
             logger.warning(f"Failed to stop model {proxy_name}: {e}")
 
-    huggingface_id = model.get("huggingface_id")
-    filename = _get_model_filename(model)
-
-    if huggingface_id and filename:
-        from backend.huggingface import delete_cached_model_file
-        deleted = delete_cached_model_file(huggingface_id, filename)
-        if not deleted:
-            # Fall back to direct removal for legacy records with file_path
-            legacy_path = _normalize_model_path(model.get("file_path"))
-            if legacy_path and os.path.exists(legacy_path):
-                os.remove(legacy_path)
-                logger.info(f"Removed legacy model file: {legacy_path}")
-
     store.delete_model(model_id)
     return {"message": "Model quantization deleted"}
 
 
-# DEPRECATED: remove with ModelConfig.vue rewrite
-@router.get("/{model_id:path}/layer-info")
-async def get_model_layer_info_endpoint(model_id: str):
-    """Get model layer information from GGUF metadata"""
-    store = get_store()
-    model = _get_model_or_404(store, model_id)
-
-    layer_info = None
-    normalized_path = _get_model_file_path(model)
-    if normalized_path and os.path.exists(normalized_path):
-        try:
-            layer_info = get_model_layer_info(normalized_path)
-        except Exception as e:
-            logger.error(f"Failed to get layer info for model {model_id}: {e}")
-    if layer_info:
-        return {
-            "layer_count": layer_info["layer_count"],
-            "architecture": layer_info["architecture"],
-            "context_length": layer_info["context_length"],
-            "parameter_count": layer_info.get(
-                "parameter_count"
-            ),  # Formatted as "32B", "36B", etc.
-            "vocab_size": layer_info["vocab_size"],
-            "embedding_length": layer_info["embedding_length"],
-            "attention_head_count": layer_info["attention_head_count"],
-            "attention_head_count_kv": layer_info["attention_head_count_kv"],
-            "block_count": layer_info["block_count"],
-            "is_moe": layer_info.get("is_moe", False),
-            "expert_count": layer_info.get("expert_count", 0),
-            "experts_used_count": layer_info.get("experts_used_count", 0),
-        }
-    # Fallback to default values if metadata unavailable
-    logger.warning(
-        f"Using default layer info fallback (32 layers) for model_id={model_id}; "
-        "GGUF metadata could not be read or did not provide layer information."
-    )
-    return {
-        "layer_count": 32,
-        "architecture": "unknown",
-        "context_length": 0,
-        "vocab_size": 0,
-        "embedding_length": 0,
-        "attention_head_count": 0,
-        "attention_head_count_kv": 0,
-        "block_count": 0,
-        "is_moe": False,
-        "expert_count": 0,
-        "experts_used_count": 0,
-    }
-
-
-# DEPRECATED: remove with ModelConfig.vue rewrite
-@router.get("/{model_id:path}/recommendations")
-async def get_model_recommendations_endpoint(model_id: str):
-    """Stub: recommendations removed with smart_auto. Returns empty defaults."""
-    return {"gpu_layers": None, "context_size": None, "batch_size": None}
-
-
-# DEPRECATED: remove with ModelConfig.vue rewrite
-@router.get("/{model_id:path}/architecture-presets")
-async def get_architecture_presets_endpoint(model_id: str):
-    """Stub: presets removed. Returns minimal structure."""
-    return {"architecture": "unknown", "presets": {}, "available_presets": []}
-
-
-# DEPRECATED: remove with ModelConfig.vue rewrite
-@router.post("/vram-estimate")
-async def estimate_vram_usage(request: EstimationRequest):
-    """Stub: simple VRAM estimate (smart_auto removed)."""
-    store = get_store()
-    _get_model_or_404(store, request.model_id)
-    cfg = request.config or {}
-    ngl = int(cfg.get("n_gpu_layers") or -1)
-    ctx = int(cfg.get("ctx_size") or 2048)
-    # Very rough: ~1GB base + per-layer and context
-    estimate_mb = 1024 + (abs(ngl) * 50 if ngl != -1 else 2000) + (ctx // 64)
-    return {"vram_estimate_mb": min(estimate_mb, 96 * 1024), "vram_estimate_gb": round(estimate_mb / 1024, 2)}
-
-
-# DEPRECATED: remove with ModelConfig.vue rewrite
-@router.post("/ram-estimate")
-async def estimate_ram_usage(request: EstimationRequest):
-    """Stub: simple RAM estimate (smart_auto removed)."""
-    store = get_store()
-    _get_model_or_404(store, request.model_id)
-    cfg = request.config or {}
-    ctx = int(cfg.get("ctx_size") or 2048)
-    estimate_mb = 512 + (ctx // 32)
-    return {"ram_estimate_mb": estimate_mb, "ram_estimate_gb": round(estimate_mb / 1024, 2)}
-
-
-@router.get("/{model_id:path}/hf-metadata")
-async def get_model_hf_metadata(model_id: str):
-    store = get_store()
-    model = _get_model_or_404(store, model_id)
-
-    metadata_entry = None
-    if (model.get("model_format") or model.get("format") or "gguf").lower() == "safetensors":
-        metadata_entry = _load_manifest_entry_for_model(model)
-    else:
-        filename = _get_model_filename(model)
-        if not filename:
-            raise HTTPException(status_code=400, detail="Model filename is not set")
-        metadata_entry = get_gguf_manifest_entry(model.get("huggingface_id"), filename)
-
-    if not metadata_entry:
-        raise HTTPException(status_code=404, detail="Metadata not found for model")
-
-    metadata = metadata_entry.get("metadata") or {}
-    defaults = _derive_hf_defaults(metadata)
-
-    return {
-        "metadata": metadata,
-        "gguf_layer_info": metadata_entry.get("gguf_layer_info"),
-        "max_context_length": metadata_entry.get("max_context_length"),
-        "hf_defaults": defaults,
-    }
-
-
 @router.post("/{model_id:path}/regenerate-info")
 async def regenerate_model_info_endpoint(model_id: str):
     """
diff --git a/backend/routes/status.py b/backend/routes/status.py
index c4211d0..62d7b36 100644
--- a/backend/routes/status.py
+++ b/backend/routes/status.py
@@ -3,13 +3,10 @@
 import os
 
 from backend.llama_swap_client import LlamaSwapClient
-from backend.lmdeploy_manager import get_lmdeploy_manager
-from backend.lmdeploy_installer import get_lmdeploy_installer
 
 router = APIRouter()
 
 DEFAULT_PROXY_PORT = 2000
-LMDEPLOY_PORT = 2001
 
 
 @router.get("/status")
@@ -25,12 +22,15 @@ async def get_system_status():
     else:
         running_list = running_data.get("running") or []
 
+    proxy_health = await client.check_health()
+
     active_instances = []
     for i, item in enumerate(running_list):
         proxy_model_name = item.get("model", "")
         state = item.get("state", "")
         runtime_type = "lmdeploy" if state == "lmdeploy" else "llama_cpp"
-        port = LMDEPLOY_PORT if runtime_type == "lmdeploy" else DEFAULT_PROXY_PORT
+        # All traffic is served via the unified llama-swap proxy on DEFAULT_PROXY_PORT.
+        port = DEFAULT_PROXY_PORT
         active_instances.append(
             {
                 "id": i,
@@ -50,10 +50,6 @@ async def get_system_status():
     except FileNotFoundError:
         disk = psutil.disk_usage("/")
 
-    lmdeploy_manager = get_lmdeploy_manager()
-    lmdeploy_status = lmdeploy_manager.status()
-    installer_status = get_lmdeploy_installer().status()
-
     return {
         "system": {
             "cpu_percent": cpu_percent,
@@ -72,15 +68,10 @@ async def get_system_status():
         "running_instances": active_instances,
         "proxy_status": {
             "enabled": True,
-            "port": 2000,
+            "port": DEFAULT_PROXY_PORT,
             "endpoint": "http://localhost:2000/v1/chat/completions",
-        },
-        "lmdeploy_status": {
-            "enabled": True,
-            "port": 2001,
-            "endpoint": "http://localhost:2001/v1/chat/completions",
-            "running": lmdeploy_status.get("running"),
-            "current_instance": lmdeploy_status.get("current_instance"),
-            "installer": installer_status,
+            "healthy": proxy_health.get("healthy", False),
+            "status_code": proxy_health.get("status_code"),
+            "loading_models": proxy_health.get("loading_models", []),
         },
     }
diff --git a/backend/tests/test_lmdeploy_installer.py b/backend/tests/test_lmdeploy_installer.py
index 23bdcb8..9f1d6c8 100644
--- a/backend/tests/test_lmdeploy_installer.py
+++ b/backend/tests/test_lmdeploy_installer.py
@@ -2,12 +2,12 @@
 
 import pytest
 
-from backend.lmdeploy_installer import LMDeployInstaller
+from backend.lmdeploy_manager import LMDeployManager
 
 
 @pytest.mark.asyncio
 async def test_install_prevents_parallel_operations(tmp_path: Path, monkeypatch):
-    installer = LMDeployInstaller(
+    installer = LMDeployManager(
         log_path=str(tmp_path / "lmdeploy.log"),
         state_path=str(tmp_path / "lmdeploy_state.json"),
         base_dir=str(tmp_path / "lmdeploy"),
@@ -19,15 +19,15 @@ def prevent_task(coro):
 
     monkeypatch.setattr(installer, "_create_task", prevent_task)
 
-    result = await installer.install()
+    result = await installer.install_release()
     assert result["message"].startswith("LMDeploy installation started")
 
     with pytest.raises(RuntimeError):
-        await installer.install()
+        await installer.install_release()
 
 
 def test_status_reflects_detection(tmp_path: Path, monkeypatch):
-    installer = LMDeployInstaller(
+    installer = LMDeployManager(
         log_path=str(tmp_path / "lmdeploy.log"),
         state_path=str(tmp_path / "lmdeploy_state.json"),
         base_dir=str(tmp_path / "lmdeploy"),
diff --git a/backend/tests/test_model_introspection.py b/backend/tests/test_model_introspection.py
new file mode 100644
index 0000000..da0c81b
--- /dev/null
+++ b/backend/tests/test_model_introspection.py
@@ -0,0 +1,52 @@
+from backend.model_introspection import GgufIntrospector
+
+
+def test_context_length_prefers_largest_and_uses_config_global():
+    # Global config prefers general.context_length / model_max_length / max_position_embeddings; the largest candidate (8192) must win.
+    metadata = {
+        "general.context_length": 4096,
+        "general.model_max_length": 8192,
+        "qwen.context_length": 2048,
+    }
+    introspector = GgufIntrospector(metadata=metadata, tensors={})
+    info = introspector.build_model_info()
+    assert info.context_length == 8192
+
+
+def test_parameter_count_parses_formatted_and_raw_values():
+    """The formatted "7B" value should win over the raw 6_000_000_000 count for display."""
+    metadata = {
+        "general.parameters": "7B",
+        "general.parameter_count": 6_000_000_000,
+    }
+    introspector = GgufIntrospector(metadata=metadata, tensors={})
+    info = introspector.build_model_info()
+    assert info.parameter_count_display in {"7B", "7.0B"}
+
+
+def test_moe_detection_from_expert_keys():
+    """Expert-count keys in the metadata mark the model as MoE and surface both counts."""
+    gguf_metadata = {
+        "general.architecture": "glm4moe",
+        "ffn.expert_count": 64,
+        "ffn.num_experts_per_tok": 8,
+    }
+    info = GgufIntrospector(metadata=gguf_metadata, tensors={}).build_model_info()
+    assert info.is_moe is True
+    assert info.expert_count == 64
+    assert info.experts_used_count == 8
+
+
+def test_vocab_and_embedding_from_tensors_when_metadata_missing():
+    """With empty metadata, vocab size and embedding length come from the embedding tensor shape."""
+    embedding_only = {
+        "tok_embeddings.weight": {
+            "shape": [32000, 4096],
+            "type": 0,
+            "offset": 0,
+        }
+    }
+    info = GgufIntrospector(metadata={}, tensors=embedding_only).build_model_info()
+    assert info.vocab_size == 32000
+    assert info.embedding_length == 4096
+
diff --git a/docker-compose.cuda.yml b/docker-compose.cuda.yml
index 57d5cc0..ae578a0 100644
--- a/docker-compose.cuda.yml
+++ b/docker-compose.cuda.yml
@@ -13,8 +13,8 @@ services:
     environment:
       - CUDA_VISIBLE_DEVICES=all
       - HF_HUB_ENABLE_HF_TRANSFER=1
-      - HF_HOME=/app/data/temp/.cache/huggingface
-      - HUGGINGFACE_HUB_CACHE=/app/data/temp/.cache/huggingface/hub
+      - HF_HOME=/app/data/hf-cache
+      - HUGGINGFACE_HUB_CACHE=/app/data/hf-cache/hub
       - RELOAD=false
       # Uncomment and set your HuggingFace API key to enable model search and download
       # - HUGGINGFACE_API_KEY=your_huggingface_token_here
diff --git a/frontend/src/App.vue b/frontend/src/App.vue
index b4b0326..e78ed46 100644
--- a/frontend/src/App.vue
+++ b/frontend/src/App.vue
@@ -5,9 +5,7 @@
     <div class="layout-wrapper">
       <!-- Header -->
       <AppHeader 
-        :status-loading="statusLoading"
-        @refresh-status="refreshStatus"
-        @show-system-info="showSystemInfo"
+        :llama-swap-status="systemStore.systemStatus?.proxy_status || null"
       />
 
       <!-- Navigation -->
@@ -27,12 +25,10 @@
 <script setup>
 // Vue
 import { ref, onMounted, onUnmounted } from 'vue'
-import { useRouter } from 'vue-router'
 
 // PrimeVue
 import ConfirmDialog from 'primevue/confirmdialog'
 import Toast from 'primevue/toast'
-import { useConfirm } from 'primevue/useconfirm'
 import { useToast } from 'primevue/usetoast'
 
 // Stores
@@ -47,8 +43,6 @@ import AppHeader from '@/components/layout/AppHeader.vue'
 import AppNavigation from '@/components/layout/AppNavigation.vue'
 import AppFooter from '@/components/layout/AppFooter.vue'
 
-const router = useRouter()
-const confirm = useConfirm()
 const toast = useToast()
 const systemStore = useEnginesStore()
 const progressStore = useProgressStore()
@@ -77,17 +71,6 @@ const refreshStatus = async () => {
   }
 }
 
-const showSystemInfo = () => {
-  confirm.require({
-    message: `GPU Count: ${systemStore.gpuInfo.device_count || 0}\nTotal VRAM: ${((systemStore.gpuInfo.total_vram || 0) / 1024**3).toFixed(1)} GB\nCUDA Version: ${systemStore.gpuInfo.cuda_version || 'Unknown'}`,
-    header: 'System Information',
-    icon: 'pi pi-info-circle',
-    rejectLabel: 'Close',
-    acceptLabel: '',
-    accept: () => {},
-    reject: () => {}
-  })
-}
 </script>
 
 <style scoped>
diff --git a/frontend/src/components/ModelRow.vue b/frontend/src/components/ModelRow.vue
new file mode 100644
index 0000000..8acaf20
--- /dev/null
+++ b/frontend/src/components/ModelRow.vue
@@ -0,0 +1,90 @@
+<template>
+  <div class="quant-row" :class="{ 'is-active': quant.is_active }">
+    <div class="quant-info">
+      <div class="quant-main">
+        <code class="quant-name">{{ quant.quantization || quant.name }}</code>
+        <Tag v-if="quant.is_active" value="Running" severity="success" />
+      </div>
+      <div class="quant-sub">
+        <span v-if="quant.file_size" class="file-size">
+          {{ props.formatBytes(quant.file_size) }}
+        </span>
+        <span v-if="quant.downloaded_at" class="downloaded-at">
+          Downloaded {{ props.formatDate(quant.downloaded_at) }}
+        </span>
+      </div>
+    </div>
+
+    <div class="quant-actions">
+      <Button
+        v-if="!quant.is_active"
+        label="Start"
+        icon="pi pi-play"
+        size="small"
+        severity="success"
+        outlined
+        :loading="isStarting"
+        @click="emit('start', quant.id)"
+      />
+      <Button
+        v-else
+        label="Stop"
+        icon="pi pi-stop"
+        size="small"
+        severity="warning"
+        outlined
+        :loading="isStopping"
+        @click="emit('stop', quant.id)"
+      />
+      <Button
+        icon="pi pi-cog"
+        text
+        severity="secondary"
+        size="small"
+        v-tooltip.top="'Configure'"
+        @click="emit('configure', quant.id)"
+      />
+      <Button
+        icon="pi pi-trash"
+        text
+        severity="danger"
+        size="small"
+        v-tooltip.top="'Delete'"
+        @click="emit('delete', quant.id)"
+      />
+    </div>
+  </div>
+</template>
+
+<script setup>
+import Button from 'primevue/button'
+import Tag from 'primevue/tag'
+
+const props = defineProps({
+  quant: { // quantization entry rendered by this row (id, name, file_size, is_active, ...)
+    type: Object,
+    required: true,
+  },
+  isStarting: { // drives the loading state of the Start button
+    type: Boolean,
+    default: false,
+  },
+  isStopping: { // drives the loading state of the Stop button
+    type: Boolean,
+    default: false,
+  },
+  formatBytes: { // formatter applied to quant.file_size
+    type: Function,
+    required: true,
+  },
+  formatDate: { // formatter applied to quant.downloaded_at
+    type: Function,
+    required: true,
+  },
+})
+
+// Intentionally no `const { ... } = props`: that copies plain values once, and the copies shadow the reactive props in the template.
+
+const emit = defineEmits(['start', 'stop', 'configure', 'delete'])
+</script>
+
diff --git a/frontend/src/components/ThemeToggle.vue b/frontend/src/components/ThemeToggle.vue
index c7fdf04..7c0c974 100644
--- a/frontend/src/components/ThemeToggle.vue
+++ b/frontend/src/components/ThemeToggle.vue
@@ -5,7 +5,7 @@
     severity="secondary"
     text
     :title="isDark ? 'Switch to Light Mode' : 'Switch to Dark Mode'"
-    class="theme-toggle"
+    :class="['theme-toggle', { 'is-dark': isDark }]"
   />
 </template>
 
@@ -18,9 +18,12 @@ const { isDark, toggleTheme } = useTheme()
 
 <style scoped>
 .theme-toggle {
-  transition: all var(--transition-normal);
+  width: 2.4rem;
+  height: 2.4rem;
+  border-radius: 999px;
   position: relative;
   overflow: hidden;
+  transition: transform var(--transition-normal), box-shadow var(--transition-normal), background-color var(--transition-normal), border-color var(--transition-normal);
 }
 
 .theme-toggle::before {
@@ -40,8 +43,8 @@ const { isDark, toggleTheme } = useTheme()
 }
 
 .theme-toggle:hover {
-  transform: rotate(180deg) scale(1.05);
-  box-shadow: var(--shadow-lg), var(--glow-primary);
+  transform: translateY(-1px);
+  box-shadow: var(--shadow-md);
 }
 
 .theme-toggle:hover::before {
@@ -49,6 +52,22 @@ const { isDark, toggleTheme } = useTheme()
 }
 
 .theme-toggle:active {
-  transform: rotate(180deg) scale(0.95);
+  transform: scale(0.96);
+}
+
+.theme-toggle :deep(.p-button-icon) {
+  transition: transform var(--transition-normal), color var(--transition-normal);
+}
+
+.theme-toggle:hover :deep(.p-button-icon) {
+  transform: rotate(12deg) scale(1.08);
+}
+
+.theme-toggle.is-dark :deep(.p-button-icon) {
+  color: #fbbf24;
+}
+
+.theme-toggle:not(.is-dark) :deep(.p-button-icon) {
+  color: var(--accent-cyan);
 }
 </style>
diff --git a/frontend/src/components/common/ProgressTracker.vue b/frontend/src/components/common/ProgressTracker.vue
index 31fcf45..d8d1cf4 100644
--- a/frontend/src/components/common/ProgressTracker.vue
+++ b/frontend/src/components/common/ProgressTracker.vue
@@ -13,18 +13,29 @@
           <i class="pi pi-times-circle text-danger" v-else-if="task.status === 'failed'" />
           <span class="task-description">{{ task.description }}</span>
         </div>
-        <span class="progress-percent">{{ Math.round(task.progress) }}%</span>
+        <div class="progress-meta">
+          <button
+            v-if="getTaskLogs(task).length > 0"
+            type="button"
+            class="logs-toggle"
+            @click="toggleLogs(task.task_id)"
+          >
+            {{ isExpanded(task.task_id) ? 'Hide logs' : 'Show logs' }}
+          </button>
+          <span class="progress-percent">{{ Math.round(task.progress) }}%</span>
+        </div>
       </div>
       <ProgressBar :value="task.progress" :class="task.status === 'failed' ? 'p-progressbar-danger' : ''" />
       <small v-if="task.message" class="task-message" :class="task.status === 'failed' ? 'text-danger' : 'text-muted'">
         {{ task.message }}
       </small>
+      <pre v-if="isExpanded(task.task_id) && getTaskLogs(task).length > 0" class="task-logs">{{ getTaskLogs(task).join('\n') }}</pre>
     </div>
   </div>
 </template>
 
 <script setup>
-import { computed } from 'vue'
+import { computed, ref } from 'vue'
 import ProgressBar from 'primevue/progressbar'
 import { useProgressStore } from '@/stores/progress'
 
@@ -34,6 +45,14 @@ const props = defineProps({
     type: [String, Array],
     default: null,
   },
+  metadataKey: {
+    type: String,
+    default: null,
+  },
+  metadataValue: {
+    type: [String, Number, Boolean],
+    default: null,
+  },
   showCompleted: {
     type: Boolean,
     default: false,
@@ -41,6 +60,7 @@ const props = defineProps({
 })
 
 const progressStore = useProgressStore()
+const expandedLogs = ref({})
 
 const activeTasks = computed(() => {
   const allTasks = Object.values(progressStore.tasks)
@@ -51,10 +71,26 @@ const activeTasks = computed(() => {
       : [props.type]
   return allTasks.filter((t) => {
     const typeMatch = !types || types.length === 0 || types.includes(t.type)
+    const metadataMatch = !props.metadataKey || t?.metadata?.[props.metadataKey] === props.metadataValue
     const statusMatch = t.status === 'running' || (props.showCompleted && t.status === 'completed') || t.status === 'failed'
-    return typeMatch && statusMatch
+    return typeMatch && metadataMatch && statusMatch
   })
 })
+
+function getTaskLogs(task) {
+  return progressStore.getTaskLogs(task.task_id) // log lines the progress store holds for this task
+}
+
+function isExpanded(taskId) {
+  return Boolean(expandedLogs.value[taskId]) // ids that were never toggled count as collapsed
+}
+
+// Flip the expanded/collapsed state of one task's log panel.
+function toggleLogs(taskId) {
+  const wasOpen = expandedLogs.value[taskId]
+  // Assign a fresh object rather than mutating the current map in place.
+  expandedLogs.value = { ...expandedLogs.value, [taskId]: !wasOpen }
+}
 </script>
 
 <style scoped>
@@ -87,6 +123,13 @@ const activeTasks = computed(() => {
   gap: var(--spacing-sm, 0.5rem);
 }
 
+.progress-meta {
+  display: flex;
+  align-items: center;
+  gap: var(--spacing-sm, 0.5rem);
+  flex-shrink: 0;
+}
+
 .task-info {
   display: flex;
   align-items: center;
@@ -110,6 +153,24 @@ const activeTasks = computed(() => {
   flex-shrink: 0;
 }
 
+.logs-toggle {
+  border: 1px solid var(--border-primary, #2a2f45);
+  background: transparent;
+  color: var(--text-secondary, #9ca3af);
+  border-radius: 999px;
+  padding: 0.2rem 0.55rem;
+  font-size: 0.75rem;
+  line-height: 1.2;
+  cursor: pointer;
+  transition: background-color 0.15s ease, border-color 0.15s ease, color 0.15s ease;
+}
+
+.logs-toggle:hover {
+  background: var(--bg-card-hover, rgba(255, 255, 255, 0.04));
+  border-color: var(--border-hover, #3b4261);
+  color: var(--text-primary, #f3f4f6);
+}
+
 .task-message {
   font-size: 0.75rem;
   display: block;
@@ -118,6 +179,21 @@ const activeTasks = computed(() => {
   white-space: nowrap;
 }
 
+.task-logs {
+  margin: 0;
+  padding: 0.75rem;
+  border-radius: var(--radius-md, 0.5rem);
+  border: 1px solid var(--border-primary, #2a2f45);
+  background: var(--bg-card-hover, rgba(255, 255, 255, 0.03));
+  color: var(--text-secondary, #d1d5db);
+  font-size: 0.75rem;
+  line-height: 1.5;
+  white-space: pre-wrap;
+  word-break: break-word;
+  max-height: 14rem;
+  overflow: auto;
+}
+
 .text-success { color: #22c55e; }
 .text-danger  { color: #ef4444; }
 .text-muted   { color: var(--text-secondary, #9ca3af); }
diff --git a/frontend/src/components/layout/AppHeader.vue b/frontend/src/components/layout/AppHeader.vue
index 65d20a5..76748c0 100644
--- a/frontend/src/components/layout/AppHeader.vue
+++ b/frontend/src/components/layout/AppHeader.vue
@@ -7,22 +7,22 @@
       </div>
       <div class="header-actions">
         <slot name="actions">
+          <a
+            class="llama-swap-link"
+            href="http://localhost:2000/ui"
+            target="_blank"
+            rel="noopener noreferrer"
+            v-tooltip.bottom="'Open llama-swap UI'"
+          >
+            <span
+              class="status-light"
+              :class="llamaSwapHealthy ? 'status-light--online' : 'status-light--offline'"
+              aria-hidden="true"
+            />
+            <span class="llama-swap-label">llama-swap</span>
+            <i class="pi pi-external-link" aria-hidden="true" />
+          </a>
           <ThemeToggle />
-          <Button 
-            icon="pi pi-refresh" 
-            @click="$emit('refresh-status')"
-            :loading="statusLoading"
-            severity="secondary"
-            size="small"
-            v-tooltip.top="'Refresh System Status'"
-          />
-          <Button 
-            icon="pi pi-info-circle" 
-            @click="$emit('show-system-info')"
-            severity="secondary"
-            size="small"
-            v-tooltip.top="'Show System Information'"
-          />
         </slot>
       </div>
     </div>
@@ -30,20 +30,60 @@
 </template>
 
 <script setup>
-import Button from 'primevue/button'
+import { computed } from 'vue'
 import ThemeToggle from '@/components/ThemeToggle.vue'
 
-defineProps({
-  statusLoading: {
-    type: Boolean,
-    default: false
+const props = defineProps({
+  llamaSwapStatus: {
+    type: Object,
+    default: null
   }
 })
 
-defineEmits(['refresh-status', 'show-system-info'])
+const llamaSwapHealthy = computed(() => Boolean(props.llamaSwapStatus?.healthy))
 </script>
 
 <style scoped>
-/* Header styles are in global _base.css */
+.llama-swap-link {
+  display: inline-flex;
+  align-items: center;
+  gap: 0.45rem;
+  padding: 0.45rem 0.7rem;
+  border: 1px solid var(--border-primary);
+  border-radius: 999px;
+  color: var(--text-primary);
+  text-decoration: none;
+  background: var(--bg-surface);
+  transition: border-color 0.15s ease, transform 0.15s ease, background 0.15s ease;
+}
+
+.llama-swap-link:hover {
+  border-color: var(--accent-cyan);
+  background: var(--bg-card-hover, rgba(255, 255, 255, 0.04));
+  transform: translateY(-1px);
+}
+
+.status-light {
+  width: 0.55rem;
+  height: 0.55rem;
+  border-radius: 999px;
+  display: inline-block;
+  box-shadow: 0 0 0 0.2rem rgba(255, 255, 255, 0.04);
+}
+
+.status-light--online {
+  background: #22c55e;
+  box-shadow: 0 0 0.45rem rgba(34, 197, 94, 0.55);
+}
+
+.status-light--offline {
+  background: #ef4444;
+  box-shadow: 0 0 0.45rem rgba(239, 68, 68, 0.45);
+}
+
+.llama-swap-label {
+  font-size: 0.82rem;
+  font-weight: 600;
+}
 </style>
 
diff --git a/frontend/src/components/system/VersionTable.vue b/frontend/src/components/system/VersionTable.vue
index f6974d4..56c6ef2 100644
--- a/frontend/src/components/system/VersionTable.vue
+++ b/frontend/src/components/system/VersionTable.vue
@@ -14,7 +14,7 @@
         <div class="version-info">
           <code class="version-name">{{ v.version }}</code>
           <Tag v-if="v.is_active" value="Active" severity="success" />
-          <Tag :value="v.type || 'release'" severity="secondary" />
+          <Tag :value="v.type || 'source'" severity="secondary" />
           <small v-if="v.repository_source" class="repo-label">{{ v.repository_source }}</small>
           <small v-if="v.build_config?.cuda" class="cuda-badge">CUDA</small>
         </div>
@@ -34,7 +34,8 @@
             text
             severity="danger"
             size="small"
-            v-tooltip.top="'Delete version'"
+            :disabled="v.is_active"
+            v-tooltip.top="v.is_active ? 'Active versions cannot be deleted' : 'Delete version'"
             @click="$emit('delete', v.id ?? v.version)"
           />
         </div>
diff --git a/frontend/src/stores/engines.js b/frontend/src/stores/engines.js
index 0642899..5335b09 100644
--- a/frontend/src/stores/engines.js
+++ b/frontend/src/stores/engines.js
@@ -37,13 +37,25 @@ export const useEnginesStore = defineStore('engines', () => {
     return data
   }
 
-  async function fetchReleaseAssets(tagName) {
-    const { data } = await axios.get(`/api/llama-versions/releases/${encodeURIComponent(tagName)}/assets`)
+  // Load the build settings for `engine` from the llama-versions API.
+  async function fetchBuildSettings(engine) {
+    const query = { params: { engine } }
+    const response = await axios.get('/api/llama-versions/build-settings', query)
+    return response.data
+  }
+
+  async function saveBuildSettings(engine, settings) {
+    const { data } = await axios.put('/api/llama-versions/build-settings', settings, {
+      params: { engine },
+    })
     return data
   }
 
-  async function installRelease(params) {
-    const { data } = await axios.post('/api/llama-versions/install-release', params)
+  async function updateEngine(engine, params = {}) {
+    const { data } = await axios.post('/api/llama-versions/update', {
+      engine,
+      ...params,
+    })
     await fetchLlamaVersions()
     return data
   }
@@ -111,11 +123,6 @@ export const useEnginesStore = defineStore('engines', () => {
     await fetchLmdeployStatus()
   }
 
-  async function fetchLmdeployLogs(maxBytes = 8192) {
-    const { data } = await axios.get('/api/lmdeploy/logs', { params: { max_bytes: maxBytes } })
-    return data
-  }
-
   // --- GPU / System ---
 
   async function fetchGpuInfo() {
@@ -165,8 +172,9 @@ export const useEnginesStore = defineStore('engines', () => {
     checkLlamaCppUpdates,
     checkIkLlamaUpdates,
     checkLmdeployUpdates,
-    fetchReleaseAssets,
-    installRelease,
+    fetchBuildSettings,
+    saveBuildSettings,
+    updateEngine,
     buildSource,
     activateVersion,
     deleteVersion,
@@ -180,7 +188,6 @@ export const useEnginesStore = defineStore('engines', () => {
     installLmdeploy,
     installLmdeployFromSource,
     removeLmdeploy,
-    fetchLmdeployLogs,
 
     fetchGpuInfo,
     fetchSystemStatus,
diff --git a/frontend/src/stores/models.js b/frontend/src/stores/models.js
index c8ac178..2d4e369 100644
--- a/frontend/src/stores/models.js
+++ b/frontend/src/stores/models.js
@@ -5,6 +5,9 @@ import axios from 'axios'
 export const useModelStore = defineStore('models', () => {
   const models = ref([])        // array of groups: { huggingface_id, base_model_name, quantizations[] }
   const loading = ref(false)
+  const searchQuery = ref('')
+  const searchLastQuery = ref('')
+  const searchHasSearched = ref(false)
   const searchResults = ref([])
   const searchLoading = ref(false)
   const searchFormat = ref('gguf')
@@ -90,6 +93,9 @@ export const useModelStore = defineStore('models', () => {
     searchLoading.value = true
     try {
       const { data } = await axios.post('/api/models/search', { query, limit, model_format: modelFormat })
+      searchQuery.value = query
+      searchLastQuery.value = query
+      searchHasSearched.value = true
       searchResults.value = Array.isArray(data) ? data : []
       searchFormat.value = modelFormat
       return searchResults.value
@@ -102,6 +108,13 @@ export const useModelStore = defineStore('models', () => {
     }
   }
 
+  function clearSearchState() { // reset the search refs to their initial empty values (searchFormat is left as is)
+    searchQuery.value = ''
+    searchLastQuery.value = ''
+    searchHasSearched.value = false
+    searchResults.value = []
+  }
+
   // ── Download ──────────────────────────────────────────────
 
   async function downloadModel(huggingfaceId, filename, totalBytes = 0, modelFormat = 'gguf', pipelineTag = null) {
@@ -124,12 +137,14 @@ export const useModelStore = defineStore('models', () => {
     return data
   }
 
-  async function downloadGgufBundle(huggingfaceId, quantization, files, pipelineTag = null) {
+  async function downloadGgufBundle(huggingfaceId, quantization, files, pipelineTag = null, mmprojFilename = null, mmprojSize = 0) {
     const { data } = await axios.post('/api/models/gguf/download-bundle', {
       huggingface_id: huggingfaceId,
       quantization,
       files,
       pipeline_tag: pipelineTag,
+      mmproj_filename: mmprojFilename,
+      mmproj_size: mmprojSize,
     })
     return data
   }
@@ -163,6 +178,14 @@ export const useModelStore = defineStore('models', () => {
     return data
   }
 
+  // POST the chosen projector (mmproj) file for `modelId` and return the response body.
+  async function updateModelProjector(modelId, mmprojFilename = null, totalBytes = 0) {
+    const url = `/api/models/${encodeURIComponent(modelId)}/projector`
+    const body = { mmproj_filename: mmprojFilename, total_bytes: totalBytes }
+    const response = await axios.post(url, body)
+    return response.data
+  }
+
   // ── HuggingFace Token ─────────────────────────────────────
 
   async function fetchHuggingfaceTokenStatus() {
@@ -226,6 +249,9 @@ export const useModelStore = defineStore('models', () => {
   return {
     models,
     loading,
+    searchQuery,
+    searchLastQuery,
+    searchHasSearched,
     searchResults,
     searchLoading,
     searchFormat,
@@ -249,6 +275,7 @@ export const useModelStore = defineStore('models', () => {
     deleteModelGroup,
     deleteSafetensorsModel,
     searchModels,
+    clearSearchState,
     downloadModel,
     downloadSafetensorsBundle,
     downloadGgufBundle,
@@ -257,6 +284,7 @@ export const useModelStore = defineStore('models', () => {
     getModelConfig,
     updateModelConfig,
     getModelDetails,
+    updateModelProjector,
     fetchHuggingfaceTokenStatus,
     setHuggingfaceToken,
     clearHuggingfaceToken,
diff --git a/frontend/src/stores/progress.js b/frontend/src/stores/progress.js
index 7cfecda..a30d033 100644
--- a/frontend/src/stores/progress.js
+++ b/frontend/src/stores/progress.js
@@ -14,8 +14,6 @@ const SSE_EVENT_TYPES = [
   'model_status',
   'model_event',
   'unified_monitoring',
-  'lmdeploy_status',
-  'lmdeploy_runtime_log',
   'lmdeploy_install_status',
   'lmdeploy_install_log',
   'cuda_install_status',
@@ -26,9 +24,13 @@ const SSE_EVENT_TYPES = [
 
 export const useProgressStore = defineStore('progress', () => {
   const tasks = ref({})
+  const taskLogs = ref({})
   const eventSource = ref(null)
   const connected = ref(false)
   const subscribers = ref(new Map()) // eventType -> Set<callback>
+  const CUDA_TASK_ID = 'cuda_operation'
+  const LMDEPLOY_TASK_ID = 'lmdeploy_operation'
+  const MAX_LOG_LINES = 200
 
   const activeTasks = computed(() => {
     return Object.values(tasks.value).filter(t => t.status === 'running')
@@ -51,16 +53,214 @@ export const useProgressStore = defineStore('progress', () => {
     if (any) any.forEach(cb => { try { cb(eventType, data) } catch (_) {} })
   }
 
+  function upsertTask(taskId, updates) {
+    // Create or update the task stored under `taskId`. Fields in `updates`
+    // override stored ones, and `task_id` is set to `taskId` unless `updates`
+    // itself carries a `task_id`.
+    const previous = tasks.value[taskId] || {}
+    const merged = { ...previous, task_id: taskId, ...updates }
+    // Assign a new map object instead of mutating the current one in place.
+    const nextTasks = { ...tasks.value }
+    nextTasks[taskId] = merged
+    tasks.value = nextTasks
+  }
+
+  function appendTaskLogs(taskId, lines) {
+    // Append new, non-blank, trimmed lines to a task's log buffer, skipping lines
+    // already buffered and keeping only the newest MAX_LOG_LINES entries.
+    // Callers pass ids such as `payload?.task_id`, which may be undefined; ignore
+    // those rather than filing their lines under the key "undefined".
+    if (taskId == null || taskId === '') return
+    const existing = taskLogs.value[taskId] || []
+    const seen = new Set(existing)
+    const additions = []
+
+    for (const entry of Array.isArray(lines) ? lines : [lines]) {
+      if (typeof entry !== 'string') continue
+      for (const rawLine of entry.split(/\r?\n/)) {
+        const line = rawLine.trim()
+        if (!line || seen.has(line)) continue
+        seen.add(line)
+        additions.push(line)
+      }
+    }
+
+    // Nothing new to add: leave the ref untouched.
+    if (additions.length === 0) return
+    taskLogs.value = { ...taskLogs.value, [taskId]: [...existing, ...additions].slice(-MAX_LOG_LINES) }
+  }
+
+  function syncTaskLogsFromTask(task) {
+    // Seed a task's log buffer from the task object, but only while the buffer
+    // is still empty: first any `metadata.log_lines`, then the task message.
+    const taskId = task?.task_id
+    if (!taskId) return
+
+    // Emptiness is checked once, before either append below runs.
+    const alreadyHasLogs = (taskLogs.value[taskId] || []).length > 0
+    if (alreadyHasLogs) return
+
+    const seedLines = task.metadata?.log_lines
+    if (Array.isArray(seedLines) && seedLines.length > 0) appendTaskLogs(taskId, seedLines)
+    if (task.message) appendTaskLogs(taskId, task.message)
+  }
+
+  function normalizeCudaTask(eventType, payload) {
+    // Fold CUDA installer SSE events into the synthetic CUDA_TASK_ID task; ignore non-object payloads.
+    if (!payload || typeof payload !== 'object') return
+    const existing = tasks.value[CUDA_TASK_ID] || {}
+    const describe = (op) => (op === 'uninstall' ? 'Uninstall CUDA' : 'Install CUDA')
+
+    if (eventType === 'cuda_install_status') {
+      if (payload.status === 'completed' || payload.status === 'failed') {
+        // A terminal status is never an operation name: when the payload omits
+        // `operation`, reuse the one recorded at start rather than `payload.status`.
+        const finishedOp = payload.operation || existing.metadata?.operation || 'install'
+        upsertTask(CUDA_TASK_ID, {
+          type: 'install',
+          description: describe(finishedOp),
+          progress: payload.status === 'completed' ? 100 : (existing.progress ?? 0),
+          status: payload.status,
+          message: payload.message || existing.message || '',
+          metadata: {
+            ...(existing.metadata || {}),
+            target: 'cuda',
+            operation: finishedOp,
+            ended_at: payload.ended_at,
+          },
+        })
+        appendTaskLogs(CUDA_TASK_ID, payload.message)
+        return
+      }
+
+      // Any other status (re)starts the task with fresh metadata and zero progress.
+      const operation = payload.operation || payload.status || 'install'
+      const preparing = operation === 'uninstall' ? 'Preparing CUDA uninstall...' : 'Preparing CUDA install...'
+      upsertTask(CUDA_TASK_ID, {
+        type: 'install',
+        description: describe(operation),
+        progress: 0,
+        status: 'running',
+        message: payload.message || preparing,
+        metadata: { target: 'cuda', operation, started_at: payload.started_at },
+      })
+      appendTaskLogs(CUDA_TASK_ID, payload.message)
+      return
+    }
+
+    if (eventType === 'cuda_install_progress') {
+      upsertTask(CUDA_TASK_ID, {
+        type: 'install',
+        description: describe(existing.metadata?.operation || 'install'),
+        progress: Number(payload.progress ?? existing.progress ?? 0),
+        // A task already marked failed stays failed; otherwise it is running.
+        status: existing.status === 'failed' ? 'failed' : 'running',
+        message: payload.message || existing.message || '',
+        metadata: {
+          ...(existing.metadata || {}),
+          target: 'cuda',
+          stage: payload.stage,
+          timestamp: payload.timestamp,
+        },
+      })
+    }
+  }
+
+  function normalizeLmdeployTask(eventType, payload) {
+    // Fold LMDeploy installer SSE events into the synthetic LMDEPLOY_TASK_ID task; ignore non-object payloads.
+    if (!payload || typeof payload !== 'object') return
+    const existing = tasks.value[LMDEPLOY_TASK_ID]
+    const actionMap = {
+      install: 'Install LMDeploy',
+      install_source: 'Install LMDeploy from Source',
+      remove: 'Remove LMDeploy',
+    }
+    const describe = (op) => actionMap[op] || 'Install LMDeploy'
+
+    if (eventType === 'lmdeploy_install_status') {
+      if (payload.status === 'completed' || payload.status === 'failed') {
+        const previous = existing || {}
+        // A terminal status is never an operation name: when the payload omits
+        // `operation`, reuse the one recorded at start rather than `payload.status`.
+        const finishedOp = payload.operation || previous.metadata?.operation || 'install'
+        upsertTask(LMDEPLOY_TASK_ID, {
+          type: 'install',
+          description: describe(finishedOp),
+          progress: payload.status === 'completed' ? 100 : (previous.progress ?? 0),
+          status: payload.status,
+          message: payload.message || previous.message || '',
+          metadata: {
+            ...(previous.metadata || {}),
+            target: 'lmdeploy',
+            operation: finishedOp,
+            ended_at: payload.ended_at,
+          },
+        })
+        appendTaskLogs(LMDEPLOY_TASK_ID, payload.message)
+        return
+      }
+
+      // Any other status (re)starts the task at a nominal 10% with fresh metadata.
+      const operation = payload.operation || payload.status || 'install'
+      upsertTask(LMDEPLOY_TASK_ID, {
+        type: 'install',
+        description: describe(operation),
+        progress: 10,
+        status: 'running',
+        message: payload.message || 'Preparing LMDeploy operation...',
+        metadata: { target: 'lmdeploy', operation, started_at: payload.started_at, log_count: 0 },
+      })
+      appendTaskLogs(LMDEPLOY_TASK_ID, payload.message)
+      return
+    }
+
+    if (eventType === 'lmdeploy_install_log') {
+      if (!existing || existing.status !== 'running') return
+      const logCount = Number(existing.metadata?.log_count || 0) + 1
+      // Progress is estimated from the number of log lines seen (3% each), capped at 90%.
+      const progress = Math.min(90, Math.max(Number(existing.progress || 10), 10 + logCount * 3))
+      upsertTask(LMDEPLOY_TASK_ID, {
+        type: 'install',
+        description: existing.description || 'Install LMDeploy',
+        progress,
+        status: 'running',
+        message: payload.line || existing.message || '',
+        metadata: {
+          ...(existing.metadata || {}),
+          target: 'lmdeploy',
+          log_count: logCount,
+          timestamp: payload.timestamp,
+        },
+      })
+      appendTaskLogs(LMDEPLOY_TASK_ID, payload.line)
+    }
+  }
+
   function handleEvent(eventType, rawData) {
     let data = rawData
     try {
       if (typeof rawData === 'string') data = JSON.parse(rawData)
     } catch (_) { return }
+    const payload = data?.data != null ? data.data : data
+    if (eventType === 'cuda_install_status' || eventType === 'cuda_install_progress') {
+      normalizeCudaTask(eventType, payload)
+    }
+    if (eventType === 'cuda_install_log') {
+      appendTaskLogs(CUDA_TASK_ID, payload?.line)
+    }
+    if (eventType === 'lmdeploy_install_status' || eventType === 'lmdeploy_install_log') {
+      normalizeLmdeployTask(eventType, payload)
+    }
+    if (eventType === 'build_progress') {
+      appendTaskLogs(payload?.task_id, payload?.log_lines)
+    }
     if (eventType === 'task_created' || eventType === 'task_updated') {
       const task = data?.data ?? data
-      if (task?.task_id) tasks.value = { ...tasks.value, [task.task_id]: task }
+      if (task?.task_id) {
+        tasks.value = { ...tasks.value, [task.task_id]: task }
+        syncTaskLogsFromTask(task)
+      }
     }
-    const payload = data?.data != null ? data.data : data
     notifySubscribers(eventType, payload)
     if (payload?.type && payload.type !== eventType) notifySubscribers(payload.type, payload)
   }
@@ -104,6 +304,10 @@ export const useProgressStore = defineStore('progress', () => {
     return tasks.value[taskId] || null
   }
 
+  function getTaskLogs(taskId) { // entry stored in taskLogs for taskId, or [] when it is missing/falsy
+    return taskLogs.value[taskId] || []
+  }
+
   function subscribe(eventType, callback) {
     if (!subscribers.value.has(eventType)) subscribers.value.set(eventType, new Set())
     subscribers.value.get(eventType).add(callback)
@@ -123,12 +327,11 @@ export const useProgressStore = defineStore('progress', () => {
   const subscribeToDownloadComplete = (cb) => subscribe('download_complete', cb)
   const subscribeToUnifiedMonitoring = (cb) => subscribe('unified_monitoring', cb)
   const subscribeToModelEvents = (cb) => subscribe('model_event', cb)
-  const subscribeToLmdeployStatus = (cb) => subscribe('lmdeploy_status', cb)
   const subscribeToLmdeployInstallLog = (cb) => subscribe('lmdeploy_install_log', cb)
-  const subscribeToLmdeployRuntimeLog = (cb) => subscribe('lmdeploy_runtime_log', cb)
 
   return {
     tasks,
+    taskLogs,
     activeTasks,
     connected,
     connectionStatus,
@@ -136,6 +339,7 @@ export const useProgressStore = defineStore('progress', () => {
     connect,
     disconnect,
     getTask,
+    getTaskLogs,
     subscribe,
     subscribeToDownloadProgress,
     subscribeToBuildProgress,
@@ -144,8 +348,6 @@ export const useProgressStore = defineStore('progress', () => {
     subscribeToDownloadComplete,
     subscribeToUnifiedMonitoring,
     subscribeToModelEvents,
-    subscribeToLmdeployStatus,
-    subscribeToLmdeployInstallLog,
-    subscribeToLmdeployRuntimeLog
+    subscribeToLmdeployInstallLog
   }
 })
diff --git a/frontend/src/styles/_components.css b/frontend/src/styles/_components.css
index 2154168..e07017f 100644
--- a/frontend/src/styles/_components.css
+++ b/frontend/src/styles/_components.css
@@ -112,6 +112,11 @@
   margin-top: var(--spacing-xs);
 }
 
+/* Global PrimeVue Tag tweaks */
+.p-tag.p-component {
+  padding-inline: 0.75rem;
+}
+
 .model-tag {
   display: inline-flex;
   align-items: center;
@@ -534,7 +539,6 @@
 /* Form inputs */
 .p-inputtext,
 .p-textarea,
-.p-dropdown,
 .p-inputnumber {
   padding: var(--spacing-sm);
   border-radius: var(--radius-md);
@@ -546,6 +550,16 @@
   box-sizing: border-box;
 }
 
+.p-dropdown { /* no padding on the wrapper; the .p-dropdown-label rule below sets it */
+  border-radius: var(--radius-md);
+  border: 1px solid var(--border-primary);
+  background: var(--bg-surface);
+  color: var(--text-primary);
+  transition: all var(--transition-normal);
+  width: 100%;
+  box-sizing: border-box;
+}
+
 .p-inputtext::placeholder,
 .p-textarea::placeholder {
   color: var(--text-muted);
@@ -573,7 +587,16 @@
 }
 
 .p-dropdown .p-dropdown-label {
+  padding: calc(var(--spacing-sm) * 0.6) var(--spacing-sm);
   color: var(--text-primary);
+  display: flex;
+  align-items: center;
+  font-size: 0.875rem;
+}
+
+.p-dropdown .p-dropdown-trigger {
+  width: 2.4rem;
+  color: var(--text-secondary);
 }
 
 .p-dropdown-panel {
diff --git a/frontend/src/views/EnginesView.vue b/frontend/src/views/EnginesView.vue
index a7c78b0..363e291 100644
--- a/frontend/src/views/EnginesView.vue
+++ b/frontend/src/views/EnginesView.vue
@@ -45,148 +45,151 @@
                 <ProgressBar :value="diskPercent" :showValue="false" class="metric-bar" />
               </div>
             </div>
-            <div v-if="gpu" class="metric-card">
+            <div class="metric-card metric-card--actionable">
               <i class="pi pi-bolt metric-icon" />
               <div class="metric-data">
-                <div class="metric-label">GPU — {{ gpu.name }}</div>
+                <div class="metric-label">CUDA Toolkit</div>
                 <div class="metric-value">
-                  {{ formatBytesIEC(gpu.memory_used_mb * 1048576) }} /
-                  {{ formatBytesIEC(gpu.memory_total_mb * 1048576) }} VRAM
+                  <template v-if="cuda.installed">CUDA {{ cuda.version || '?' }}</template>
+                  <template v-else>Not Installed</template>
                 </div>
-                <ProgressBar :value="gpuPercent" :showValue="false" class="metric-bar" />
+                <div class="metric-subvalue">
+                  <template v-if="cuda.installed_versions?.length">
+                    {{ cuda.installed_versions.length }} version{{ cuda.installed_versions.length === 1 ? '' : 's' }} detected
+                  </template>
+                  <template v-else-if="cuda.cuda_path">
+                    {{ cuda.cuda_path }}
+                  </template>
+                  <template v-else>
+                    Build support and toolkit management
+                  </template>
+                </div>
+                <div class="metric-actions">
+                  <Button icon="pi pi-refresh" text severity="secondary" size="small"
+                    v-tooltip.top="'Reload CUDA status'"
+                    @click.stop="enginesStore.fetchCudaStatus()" />
+                  <Button label="Install" icon="pi pi-download" severity="success" outlined size="small"
+                    @click.stop="cudaInstallDialogVisible = true" />
+                </div>
+              </div>
+            </div>
+            <div v-for="(gpuItem, idx) in gpus" :key="gpuItem.index ?? gpuItem.uuid ?? gpuItem.name ?? idx" class="metric-card">
+              <i class="pi pi-bolt metric-icon" />
+              <div class="metric-data">
+                <div class="metric-label">GPU — {{ gpuItem.name }}</div>
+                <div class="metric-value">
+                  {{ formatBytesIEC(gpuItem.memory_used_mb * 1048576) }} /
+                  {{ formatBytesIEC(gpuItem.memory_total_mb * 1048576) }} VRAM
+                </div>
+                <ProgressBar :value="gpuPercent(gpuItem)" :showValue="false" class="metric-bar" />
               </div>
             </div>
           </div>
-        </div>
-      </Transition>
-    </section>
-
-    <!-- ── llama.cpp ──────────────────────────────────────── -->
-    <section class="ev-section">
-      <div class="ev-section-header">
-        <div class="ev-section-title">
-          <i class="pi pi-microchip" />
-          <h2>llama.cpp</h2>
-          <Tag v-if="activeLlamaCpp" :value="activeLlamaCpp.version" severity="success" />
-          <Tag v-else-if="enginesStore.llamaVersions.length" value="No Active" severity="warning" />
-        </div>
-        <div class="ev-section-actions">
-          <Button label="Updates" icon="pi pi-search" text severity="info" size="small"
-            :loading="checkingLlamaCpp" @click="checkLlamaCppUpdates" />
-          <Button icon="pi pi-refresh" text severity="secondary" size="small"
-            @click="enginesStore.fetchLlamaVersions()" />
-        </div>
-      </div>
-      <div class="ev-section-body">
-        <div v-if="llamaCppUpdateInfo?.update_available" class="update-banner">
-          <i class="pi pi-arrow-up-right" />
-          Update available: <strong>{{ llamaCppUpdateInfo.latest_version }}</strong>
-          <a :href="llamaCppUpdateInfo.release_url" target="_blank" class="update-link">View release</a>
-        </div>
-        <div v-else-if="llamaCppUpdateInfo" class="update-current">
-          <i class="pi pi-check" /> Up to date ({{ llamaCppUpdateInfo.current_version }})
-        </div>
-
-        <ProgressTracker :type="['build', 'install_release']" />
-
-        <div class="ev-actions">
-          <Button label="Install Release" icon="pi pi-download" severity="success" outlined size="small"
-            @click="openReleaseDialog('llama_cpp')" />
-          <Button label="Build from Source" icon="pi pi-code" severity="info" outlined size="small"
-            @click="openBuildDialog('llama_cpp')" />
-        </div>
-
-        <VersionTable
-          :versions="enginesStore.llamaVersions"
-          :activating="activating"
-          @activate="activateVersion"
-          @delete="confirmDeleteVersion"
-        />
-      </div>
-    </section>
-
-    <!-- ── ik_llama.cpp ───────────────────────────────────── -->
-    <section class="ev-section">
-      <div class="ev-section-header">
-        <div class="ev-section-title">
-          <i class="pi pi-microchip" />
-          <h2>ik_llama.cpp</h2>
-          <Tag v-if="activeIkLlama" :value="activeIkLlama.version" severity="success" />
-          <Tag v-else-if="enginesStore.ikLlamaVersions.length" value="No Active" severity="warning" />
-        </div>
-        <div class="ev-section-actions">
-          <Button label="Updates" icon="pi pi-search" text severity="info" size="small"
-            :loading="checkingIkLlama" @click="checkIkLlamaUpdates" />
-          <Button icon="pi pi-refresh" text severity="secondary" size="small"
-            @click="enginesStore.fetchLlamaVersions()" />
-        </div>
-      </div>
-      <div class="ev-section-body">
-        <div v-if="ikLlamaUpdateInfo?.update_available" class="update-banner">
-          <i class="pi pi-arrow-up-right" />
-          Update available: <strong>{{ ikLlamaUpdateInfo.latest_version }}</strong>
-          <a :href="ikLlamaUpdateInfo.release_url" target="_blank" class="update-link">View</a>
-        </div>
-        <div v-else-if="ikLlamaUpdateInfo" class="update-current">
-          <i class="pi pi-check" /> Up to date ({{ ikLlamaUpdateInfo.current_version }})
-        </div>
+          <div class="system-subpanel">
+            <ProgressTracker type="install" metadata-key="target" metadata-value="cuda" />
 
-        <ProgressTracker type="build" />
+            <div v-if="cuda.installed" class="status-detail">
+              <span class="detail-label">CUDA Path:</span>
+              <code>{{ cuda.cuda_path || 'unknown' }}</code>
+            </div>
 
-        <div class="ev-actions">
-          <Button label="Build from Source" icon="pi pi-code" severity="info" outlined size="small"
-            @click="openBuildDialog('ik_llama')" />
+            <div v-if="cuda.installed_versions?.length" class="ev-version-list">
+              <div v-for="v in cuda.installed_versions" :key="v.version" class="ev-version-row">
+                <code class="version-name">CUDA {{ v.version }}</code>
+                <Tag v-if="v.is_current" value="Active" severity="success" />
+                <Button icon="pi pi-trash" text severity="danger" size="small"
+                  @click="confirmUninstallCuda(v.version)" />
+              </div>
+            </div>
+          </div>
         </div>
-
-        <VersionTable
-          :versions="enginesStore.ikLlamaVersions"
-          :activating="activating"
-          @activate="activateVersion"
-          @delete="confirmDeleteVersion"
-        />
-      </div>
+      </Transition>
     </section>
 
-    <!-- ── CUDA Toolkit ───────────────────────────────────── -->
+    <!-- ── Engines Overview ───────────────────────────────── -->
     <section class="ev-section">
-      <div class="ev-section-header">
+      <div class="ev-section-header" @click="enginesExpanded = !enginesExpanded">
         <div class="ev-section-title">
-          <i class="pi pi-bolt" />
-          <h2>CUDA Toolkit</h2>
-          <Tag v-if="cuda.installed" :value="`CUDA ${cuda.version}`" severity="success" />
-          <Tag v-else value="Not Installed" severity="secondary" />
+          <i class="pi pi-server" />
+          <h2>Engines</h2>
         </div>
         <div class="ev-section-actions">
           <Button icon="pi pi-refresh" text severity="secondary" size="small"
-            @click="enginesStore.fetchCudaStatus()" />
+            @click.stop="refreshEnginesOverview" />
+          <i :class="['pi', enginesExpanded ? 'pi-chevron-up' : 'pi-chevron-down']" />
         </div>
       </div>
-      <div class="ev-section-body">
-        <ProgressTracker type="install" />
-
-        <div v-if="cuda.installed" class="status-detail">
-          <span class="detail-label">Path:</span>
-          <code>{{ cuda.cuda_path || 'unknown' }}</code>
-        </div>
-
-        <div v-if="cuda.installed_versions?.length" class="ev-version-list">
-          <div v-for="v in cuda.installed_versions" :key="v.version" class="ev-version-row">
-            <code class="version-name">CUDA {{ v.version }}</code>
-            <Tag v-if="v.is_current" value="Active" severity="success" />
-            <Button icon="pi pi-trash" text severity="danger" size="small"
-              @click="confirmUninstallCuda(v.version)" />
+      <Transition name="ev-collapse">
+        <div v-if="enginesExpanded" class="ev-section-body">
+          <div class="engine-grid">
+            <button type="button" class="engine-card" @click="openEngineModal('llama_cpp')">
+              <div class="engine-card-head">
+                <div class="engine-card-title">
+                  <span class="engine-mark engine-mark--llama" aria-hidden="true">L</span>
+                  <div>
+                    <div class="engine-card-name">llama.cpp</div>
+                    <div class="engine-card-meta">{{ enginesStore.llamaVersions.length }} version{{ enginesStore.llamaVersions.length === 1 ? '' : 's' }}</div>
+                  </div>
+                </div>
+                <Tag v-if="activeLlamaCpp" :value="activeLlamaCpp.version" severity="success" />
+                <Tag v-else value="No Active" severity="warning" />
+              </div>
+              <div class="engine-card-body">
+                <div v-if="llamaCppUpdateInfo?.update_available" class="engine-card-status engine-card-status--warning">
+                  Update available: {{ llamaCppUpdateInfo.latest_version }}
+                </div>
+                <div v-else class="engine-card-status">
+                  Manage builds, updates, activation, and versions
+                </div>
+              </div>
+            </button>
+
+            <button type="button" class="engine-card" @click="openEngineModal('ik_llama')">
+              <div class="engine-card-head">
+                <div class="engine-card-title">
+                  <span class="engine-mark engine-mark--ik" aria-hidden="true">IK</span>
+                  <div>
+                    <div class="engine-card-name">ik_llama.cpp</div>
+                    <div class="engine-card-meta">{{ enginesStore.ikLlamaVersions.length }} version{{ enginesStore.ikLlamaVersions.length === 1 ? '' : 's' }}</div>
+                  </div>
+                </div>
+                <Tag v-if="activeIkLlama" :value="activeIkLlama.version" severity="success" />
+                <Tag v-else value="No Active" severity="warning" />
+              </div>
+              <div class="engine-card-body">
+                <div v-if="ikLlamaUpdateInfo?.update_available" class="engine-card-status engine-card-status--warning">
+                  Update available: {{ ikLlamaUpdateInfo.latest_version }}
+                </div>
+                <div v-else class="engine-card-status">
+                  Manage builds, updates, activation, and versions
+                </div>
+              </div>
+            </button>
+
+            <button type="button" class="engine-card" @click="openEngineModal('lmdeploy')">
+              <div class="engine-card-head">
+                <div class="engine-card-title">
+                  <i class="pi pi-server engine-card-icon" />
+                  <div>
+                    <div class="engine-card-name">LMDeploy</div>
+                    <div class="engine-card-meta">{{ lm.installed ? 'Installed' : 'Not installed' }}</div>
+                  </div>
+                </div>
+                <Tag v-if="lm.installed" :value="`v${lm.version || '?'}`" severity="success" />
+                <Tag v-else value="Not Installed" severity="secondary" />
+              </div>
+              <div class="engine-card-body">
+                <div v-if="lmdeployUpdateInfo?.update_available" class="engine-card-status engine-card-status--warning">
+                  Update available: v{{ lmdeployUpdateInfo.latest_version }}
+                </div>
+                <div v-else class="engine-card-status">
+                  Manage installs, updates, and removal
+                </div>
+              </div>
+            </button>
           </div>
         </div>
-        <div v-else-if="cuda.installed" class="empty-state-mini">
-          <i class="pi pi-bolt" />
-          <span>No CUDA versions listed.</span>
-        </div>
-
-        <div class="ev-actions">
-          <Button label="Install CUDA" icon="pi pi-download" severity="success" outlined size="small"
-            @click="cudaInstallDialogVisible = true" />
-        </div>
-      </div>
+      </Transition>
     </section>
 
     <!-- ── CUDA Install Dialog ────────────────────────────── -->
@@ -206,114 +209,168 @@
       </template>
     </Dialog>
 
-    <!-- ── LMDeploy ───────────────────────────────────────── -->
-    <section class="ev-section">
-      <div class="ev-section-header">
-        <div class="ev-section-title">
-          <i class="pi pi-server" />
-          <h2>LMDeploy</h2>
-          <Tag v-if="lm.installed" :value="`v${lm.version || '?'}`" severity="success" />
-          <Tag v-else value="Not Installed" severity="secondary" />
-        </div>
-        <div class="ev-section-actions">
-          <Button label="Updates" icon="pi pi-search" text severity="info" size="small"
-            :loading="checkingLmdeploy" @click="checkLmdeployUpdates" />
-          <Button icon="pi pi-refresh" text severity="secondary" size="small"
-            @click="enginesStore.fetchLmdeployStatus()" />
+    <Dialog v-model:visible="engineDialogVisible"
+      :header="engineDialogTitle"
+      modal maximizable
+      :style="{ width: '960px' }">
+      <section v-if="selectedEngine === 'llama_cpp'" class="ev-section ev-section--modal">
+        <div class="ev-section-header">
+          <div class="ev-section-title">
+            <span class="engine-mark engine-mark--llama" aria-hidden="true">L</span>
+            <h2>llama.cpp</h2>
+            <Tag v-if="activeLlamaCpp" :value="activeLlamaCpp.version" severity="success" />
+            <Tag v-else-if="enginesStore.llamaVersions.length" value="No Active" severity="warning" />
+          </div>
+          <div class="ev-section-actions">
+            <Button icon="pi pi-sliders-h" text severity="info" size="small"
+              v-tooltip.top="'Build settings'"
+              @click="openBuildDialog('llama_cpp')" />
+            <Button icon="pi pi-arrow-up-right" text severity="info" size="small"
+              v-tooltip.top="'Check for updates'"
+              :loading="checkingLlamaCpp" @click="checkLlamaCppUpdates" />
+            <Button icon="pi pi-refresh" text severity="secondary" size="small"
+              v-tooltip.top="'Reload versions'"
+              @click="enginesStore.fetchLlamaVersions()" />
+          </div>
         </div>
-      </div>
-      <div class="ev-section-body">
-        <div v-if="lmdeployUpdateInfo?.update_available" class="update-banner">
-          <i class="pi pi-arrow-up-right" />
-          Update available: <strong>v{{ lmdeployUpdateInfo.latest_version }}</strong>
-          <a href="https://pypi.org/project/lmdeploy/" target="_blank" class="update-link">View on PyPI</a>
+        <div class="ev-section-body">
+          <div v-if="llamaCppUpdateInfo?.update_available" class="update-banner">
+            <i class="pi pi-arrow-up-right" />
+            Update available: <strong>{{ llamaCppUpdateInfo.latest_version }}</strong>
+            <a :href="llamaCppUpdateInfo.release_url" target="_blank" class="update-link">View release</a>
+            <Button icon="pi pi-arrow-circle-up" text severity="success" size="small"
+              v-tooltip.top="'Update using saved build settings'"
+              :loading="updatingEngine === 'llama_cpp'"
+              @click="doUpdateEngine('llama_cpp')" />
+          </div>
+          <div v-else-if="llamaCppUpdateInfo" class="update-current">
+            <i class="pi pi-check" /> Up to date ({{ llamaCppUpdateInfo.current_version }})
+          </div>
+
+          <ProgressTracker type="build" metadata-key="repository_source" metadata-value="llama.cpp" />
+
+          <VersionTable
+            :versions="enginesStore.llamaVersions"
+            :activating="activating"
+            @activate="activateVersion"
+            @delete="confirmDeleteVersion"
+          />
         </div>
-        <div v-else-if="lmdeployUpdateInfo" class="update-current">
-          <i class="pi pi-check" /> Up to date (v{{ lmdeployUpdateInfo.current_version || 'none' }})
+      </section>
+
+      <section v-else-if="selectedEngine === 'ik_llama'" class="ev-section ev-section--modal">
+        <div class="ev-section-header">
+          <div class="ev-section-title">
+            <span class="engine-mark engine-mark--ik" aria-hidden="true">IK</span>
+            <h2>ik_llama.cpp</h2>
+            <Tag v-if="activeIkLlama" :value="activeIkLlama.version" severity="success" />
+            <Tag v-else-if="enginesStore.ikLlamaVersions.length" value="No Active" severity="warning" />
+          </div>
+          <div class="ev-section-actions">
+            <Button icon="pi pi-sliders-h" text severity="info" size="small"
+              v-tooltip.top="'Build settings'"
+              @click="openBuildDialog('ik_llama')" />
+            <Button icon="pi pi-arrow-up-right" text severity="info" size="small"
+              v-tooltip.top="'Check for updates'"
+              :loading="checkingIkLlama" @click="checkIkLlamaUpdates" />
+            <Button icon="pi pi-refresh" text severity="secondary" size="small"
+              v-tooltip.top="'Reload versions'"
+              @click="enginesStore.fetchLlamaVersions()" />
+          </div>
         </div>
+        <div class="ev-section-body">
+          <div v-if="ikLlamaUpdateInfo?.update_available" class="update-banner">
+            <i class="pi pi-arrow-up-right" />
+            Update available: <strong>{{ ikLlamaUpdateInfo.latest_version }}</strong>
+            <a :href="ikLlamaUpdateInfo.release_url" target="_blank" class="update-link">View</a>
+            <Button icon="pi pi-arrow-circle-up" text severity="success" size="small"
+              v-tooltip.top="'Update using saved build settings'"
+              :loading="updatingEngine === 'ik_llama'"
+              @click="doUpdateEngine('ik_llama')" />
+          </div>
+          <div v-else-if="ikLlamaUpdateInfo" class="update-current">
+            <i class="pi pi-check" /> Up to date ({{ ikLlamaUpdateInfo.current_version }})
+          </div>
 
-        <ProgressTracker type="install" />
+          <ProgressTracker type="build" metadata-key="repository_source" metadata-value="ik_llama.cpp" />
 
-        <div v-if="lm.installed" class="status-detail">
-          <span class="detail-label">Install type:</span>
-          <Tag :value="lm.install_type || 'pip'" severity="info" />
-          <template v-if="lm.venv_path">
-            <span class="detail-label ml">Venv:</span>
-            <code>{{ lm.venv_path }}</code>
-          </template>
+          <VersionTable
+            :versions="enginesStore.ikLlamaVersions"
+            :activating="activating"
+            @activate="activateVersion"
+            @delete="confirmDeleteVersion"
+          />
         </div>
-        <div v-if="lm.source_repo" class="status-detail">
-          <span class="detail-label">Source:</span>
-          <code>{{ lm.source_repo }} ({{ lm.source_branch }})</code>
+      </section>
+
+      <section v-else-if="selectedEngine === 'lmdeploy'" class="ev-section ev-section--modal">
+        <div class="ev-section-header">
+          <div class="ev-section-title">
+            <i class="pi pi-server" />
+            <h2>LMDeploy</h2>
+            <Tag v-if="lm.installed" :value="`v${lm.version || '?'}`" severity="success" />
+            <Tag v-else value="Not Installed" severity="secondary" />
+          </div>
+          <div class="ev-section-actions">
+            <Button icon="pi pi-arrow-up-right" text severity="info" size="small"
+              v-tooltip.top="'Check for updates'"
+              :loading="checkingLmdeploy" @click="checkLmdeployUpdates" />
+            <Button icon="pi pi-refresh" text severity="secondary" size="small"
+              v-tooltip.top="'Reload LMDeploy status'"
+              @click="enginesStore.fetchLmdeployStatus()" />
+          </div>
         </div>
+        <div class="ev-section-body">
+          <div v-if="lmdeployUpdateInfo?.update_available" class="update-banner">
+            <i class="pi pi-arrow-up-right" />
+            Update available: <strong>v{{ lmdeployUpdateInfo.latest_version }}</strong>
+            <a href="https://pypi.org/project/lmdeploy/" target="_blank" class="update-link">View on PyPI</a>
+          </div>
+          <div v-else-if="lmdeployUpdateInfo" class="update-current">
+            <i class="pi pi-check" /> Up to date (v{{ lmdeployUpdateInfo.current_version || 'none' }})
+          </div>
 
-        <div class="ev-actions">
-          <Button label="Install from pip" icon="pi pi-download" severity="success" outlined size="small"
-            :disabled="lm.installed" @click="lmPipDialogVisible = true" />
-          <Button label="Install from Source" icon="pi pi-code" severity="info" outlined size="small"
-            :disabled="lm.installed" @click="lmSourceDialogVisible = true" />
-        </div>
+          <ProgressTracker type="install" metadata-key="target" metadata-value="lmdeploy" />
 
-        <div v-if="lm.installed" class="ev-actions" style="margin-top:1rem; border-top:1px solid var(--border-primary); padding-top:1rem">
-          <Button label="Remove LMDeploy" icon="pi pi-trash" severity="danger" outlined
-            :loading="lmdeployRemoving" @click="confirmRemoveLmdeploy" />
-        </div>
-      </div>
-    </section>
+          <div v-if="lm.installed" class="status-detail">
+            <span class="detail-label">Install type:</span>
+            <Tag :value="lm.install_type || 'pip'" severity="info" />
+            <template v-if="lm.venv_path">
+              <span class="detail-label ml">Venv:</span>
+              <code>{{ lm.venv_path }}</code>
+            </template>
+          </div>
+          <div v-if="lm.source_repo" class="status-detail">
+            <span class="detail-label">Source:</span>
+            <code>{{ lm.source_repo }} ({{ lm.source_branch }})</code>
+          </div>
 
-    <!-- ── Install Release Dialog ─────────────────────────── -->
-    <Dialog v-model:visible="releaseDialogVisible"
-      :header="`Install ${releaseTarget === 'ik_llama' ? 'ik_llama.cpp' : 'llama.cpp'} Release`"
-      modal :style="{ width: '520px' }">
-      <div class="dialog-body">
-        <div v-if="loadingReleases" class="dialog-loading">
-          <ProgressSpinner style="width:40px;height:40px" strokeWidth="4" />
-          <span>Fetching releases…</span>
-        </div>
-        <template v-else>
-          <div class="form-field">
-            <label>Release Tag</label>
-            <Dropdown v-model="selectedReleaseTag" :options="releaseTagOptions"
-              placeholder="Select release…" style="width:100%"
-              @change="loadReleaseAssets" />
+          <div class="ev-actions">
+            <Button label="Install from pip" icon="pi pi-download" severity="success" outlined size="small"
+              :disabled="lm.installed" @click="lmPipDialogVisible = true" />
+            <Button label="Install from Source" icon="pi pi-code" severity="info" outlined size="small"
+              :disabled="lm.installed" @click="lmSourceDialogVisible = true" />
           </div>
-          <div v-if="releaseAssets.length" class="form-field">
-            <label>Asset</label>
-            <div class="asset-list">
-              <div v-for="asset in releaseAssets" :key="asset.id"
-                class="asset-option" :class="{ selected: selectedAssetId === asset.id }"
-                @click="selectedAssetId = asset.id">
-                <RadioButton :value="asset.id" v-model="selectedAssetId" />
-                <span class="asset-name">{{ asset.name }}</span>
-                <span class="asset-size">{{ formatBytes(asset.size) }}</span>
-              </div>
-            </div>
+
+          <div v-if="lm.installed" class="ev-actions" style="margin-top:1rem; border-top:1px solid var(--border-primary); padding-top:1rem">
+            <Button label="Remove LMDeploy" icon="pi pi-trash" severity="danger" outlined
+              :loading="lmdeployRemoving" @click="confirmRemoveLmdeploy" />
           </div>
-          <Message v-if="!releaseTagOptions.length" severity="warn" :closable="false">
-            No compatible release assets found.
-          </Message>
-        </template>
-      </div>
-      <template #footer>
-        <Button label="Cancel" severity="secondary" outlined @click="releaseDialogVisible = false" />
-        <Button label="Install" icon="pi pi-download" severity="success"
-          :disabled="!selectedReleaseTag || loadingReleases || installingRelease"
-          :loading="installingRelease"
-          @click="doInstallRelease" />
-      </template>
+        </div>
+      </section>
     </Dialog>
 
-    <!-- ── Build from Source Dialog ──────────────────────── -->
+    <!-- ── Build Settings Dialog ─────────────────────────── -->
     <Dialog v-model:visible="buildDialogVisible"
-      :header="`Build ${buildTarget === 'ik_llama' ? 'ik_llama.cpp' : 'llama.cpp'} from Source`"
+      :header="`Build settings — ${buildTarget === 'ik_llama' ? 'ik_llama.cpp' : 'llama.cpp'}`"
       modal :style="{ width: '560px' }">
       <div class="dialog-body">
         <div class="form-field">
-          <label>Commit / Branch</label>
+          <label>Ref (tag / branch / commit)</label>
           <InputText v-model="buildForm.commitSha"
             :placeholder="buildTarget === 'ik_llama' ? 'main' : 'master'"
             style="width:100%" />
-          <small>Leave blank for default branch</small>
+          <small>Use a release tag, branch, or commit. Latest detected release is used by default when available.</small>
         </div>
         <div class="form-field">
           <label>Build Name Suffix <span class="optional">(optional)</span></label>
@@ -340,7 +397,10 @@
       </div>
       <template #footer>
         <Button label="Cancel" severity="secondary" outlined @click="buildDialogVisible = false" />
-        <Button label="Start Build" icon="pi pi-cog" severity="info"
+        <Button label="Save settings" icon="pi pi-save" severity="secondary"
+          :loading="savingBuildSettings"
+          @click="saveBuildSettingsOnly" />
+        <Button label="Build now" icon="pi pi-cog" severity="info"
           :loading="building" @click="doStartBuild" />
       </template>
     </Dialog>
@@ -382,13 +442,12 @@
       </template>
     </Dialog>
 
-    <ConfirmDialog />
-
   </div>
 </template>
 
 <script setup>
-import { ref, computed, onMounted } from 'vue'
+import { ref, computed, onMounted, onUnmounted } from 'vue'
+import axios from 'axios'
 import { useConfirm } from 'primevue/useconfirm'
 import { useToast } from 'primevue/usetoast'
 import Button from 'primevue/button'
@@ -399,27 +458,57 @@ import Dialog from 'primevue/dialog'
 import Dropdown from 'primevue/dropdown'
 import InputText from 'primevue/inputtext'
 import InputSwitch from 'primevue/inputswitch'
-import RadioButton from 'primevue/radiobutton'
-import ConfirmDialog from 'primevue/confirmdialog'
-import Message from 'primevue/message'
 import ProgressTracker from '@/components/common/ProgressTracker.vue'
 import VersionTable from '@/components/system/VersionTable.vue'
 import { useEnginesStore } from '@/stores/engines'
-import { formatBytes, formatBytesIEC } from '@/utils/formatting'
+import { useProgressStore } from '@/stores/progress'
+import { formatBytesIEC } from '@/utils/formatting'
 
 const enginesStore = useEnginesStore()
+const progressStore = useProgressStore()
 const confirm = useConfirm()
 const toast = useToast()
 
 // ── System metrics ─────────────────────────────────────────
 const systemExpanded = ref(true)
+const enginesExpanded = ref(true)
+const engineDialogVisible = ref(false)
+const selectedEngine = ref('llama_cpp')
+
+// Dialog header text for the selected engine; unrecognised keys read as llama.cpp.
+const engineDialogTitle = computed(() => {
+  const key = selectedEngine.value
+  return key === 'ik_llama' ? 'ik_llama.cpp' : key === 'lmdeploy' ? 'LMDeploy' : 'llama.cpp'
+})
+
+// Open the engine dialog for `engineKey` and start that engine's update check.
+// The check is not awaited, so the dialog becomes visible immediately.
+function openEngineModal(engineKey) {
+  selectedEngine.value = engineKey
+  engineDialogVisible.value = true
+  switch (engineKey) {
+    case 'llama_cpp': checkLlamaCppUpdates(); break
+    case 'ik_llama': checkIkLlamaUpdates(); break
+    case 'lmdeploy': checkLmdeployUpdates(); break
+  }
+}
+
+// Refresh version lists, LMDeploy status and every update check concurrently.
+// allSettled: a rejected request does not stop the others from completing.
+async function refreshEnginesOverview() {
+  const pending = [enginesStore.fetchLlamaVersions(), enginesStore.fetchLmdeployStatus()]
+  for (const runCheck of [checkLlamaCppUpdates, checkIkLlamaUpdates, checkLmdeployUpdates]) {
+    pending.push(runCheck())
+  }
+  await Promise.allSettled(pending)
+}
 
 const sys = computed(() => {
   const s = enginesStore.systemStatus
   return s?.system || s || {}
 })
 
-const gpu = computed(() => enginesStore.gpuInfo?.gpus?.[0] ?? null)
+const gpus = computed(() => enginesStore.gpuInfo?.gpus ?? [])
 
 const memPercent = computed(() => {
   const m = sys.value.memory
@@ -435,12 +524,11 @@ const diskPercent = computed(() => {
   return total > 0 ? Math.round((used / total) * 100) : 0
 })
 
-const gpuPercent = computed(() => {
-  const g = gpu.value
+function gpuPercent(g) {
   const used = g?.memory_used_mb ?? 0
   const total = g?.memory_total_mb ?? 0
   return total > 0 ? Math.round((used / total) * 100) : 0
-})
+}
 
 // ── Active versions ────────────────────────────────────────
 const activeLlamaCpp = computed(() => enginesStore.llamaVersions.find(v => v.is_active) ?? null)
@@ -462,6 +550,21 @@ async function activateVersion(versionId) {
 }
 
 function confirmDeleteVersion(versionId) {
+  const allVersions = [
+    ...(enginesStore.llamaVersions || []),
+    ...(enginesStore.ikLlamaVersions || []),
+  ]
+  const version = allVersions.find(v => (v.id ?? v.version) === versionId)
+  if (version?.is_active) {
+    toast.add({
+      severity: 'warn',
+      summary: 'Cannot delete active version',
+      detail: 'Activate another engine version before deleting this one.',
+      life: 3000,
+    })
+    return
+  }
+
   confirm.require({
     message: `Delete version "${versionId}"?`,
     header: 'Confirm Delete',
@@ -497,6 +600,7 @@ function normalizeLlamaUpdateInfo(raw, currentVersion, commitUrlPrefix) {
 
 const checkingLlamaCpp = ref(false)
 const llamaCppUpdateInfo = ref(null)
+const updatingEngine = ref(null)
 
 async function checkLlamaCppUpdates() {
   checkingLlamaCpp.value = true
@@ -504,14 +608,9 @@ async function checkLlamaCppUpdates() {
     const raw = await enginesStore.checkLlamaCppUpdates()
     llamaCppUpdateInfo.value = normalizeLlamaUpdateInfo(
       raw,
-      activeLlamaCpp.value?.version,
+      activeLlamaCpp.value?.source_ref || activeLlamaCpp.value?.source_commit || activeLlamaCpp.value?.version,
       'https://github.com/ggerganov/llama.cpp',
     )
-    if (llamaCppUpdateInfo.value?.available_tags?.length) {
-      releaseTagOptions.value = llamaCppUpdateInfo.value.available_tags
-    } else if (llamaCppUpdateInfo.value?.latest_version) {
-      releaseTagOptions.value = [llamaCppUpdateInfo.value.latest_version]
-    }
   } catch (e) {
     toast.add({ severity: 'warn', summary: 'Could not check updates', detail: e.message, life: 3000 })
   } finally {
@@ -528,7 +627,7 @@ async function checkIkLlamaUpdates() {
     const raw = await enginesStore.checkIkLlamaUpdates()
     ikLlamaUpdateInfo.value = normalizeLlamaUpdateInfo(
       raw,
-      activeIkLlama.value?.version,
+      activeIkLlama.value?.source_ref || activeIkLlama.value?.source_commit || activeIkLlama.value?.version,
       'https://github.com/ikawrakow/ik_llama.cpp',
     )
   } catch (e) {
@@ -538,69 +637,11 @@ async function checkIkLlamaUpdates() {
   }
 }
 
-// ── Release install dialog ─────────────────────────────────
-const releaseDialogVisible = ref(false)
-const releaseTarget = ref('llama_cpp')
-const loadingReleases = ref(false)
-const releaseTagOptions = ref([])
-const releaseAssets = ref([])
-const selectedReleaseTag = ref(null)
-const selectedAssetId = ref(null)
-const installingRelease = ref(false)
-
-async function openReleaseDialog(engineKey) {
-  releaseTarget.value = engineKey
-  releaseDialogVisible.value = true
-  releaseAssets.value = []
-  selectedAssetId.value = null
-  if (!releaseTagOptions.value.length) {
-    loadingReleases.value = true
-    try {
-      await checkLlamaCppUpdates()
-    } finally {
-      loadingReleases.value = false
-    }
-  }
-  if (releaseTagOptions.value.length) {
-    selectedReleaseTag.value = releaseTagOptions.value[0]
-    await loadReleaseAssets()
-  }
-}
-
-async function loadReleaseAssets() {
-  if (!selectedReleaseTag.value) return
-  loadingReleases.value = true
-  try {
-    const data = await enginesStore.fetchReleaseAssets(selectedReleaseTag.value)
-    releaseAssets.value = data?.assets || []
-    if (releaseAssets.value.length) selectedAssetId.value = releaseAssets.value[0].id
-  } catch {
-    releaseAssets.value = []
-  } finally {
-    loadingReleases.value = false
-  }
-}
-
-async function doInstallRelease() {
-  installingRelease.value = true
-  try {
-    await enginesStore.installRelease({
-      tag_name: selectedReleaseTag.value,
-      asset_id: selectedAssetId.value || undefined,
-    })
-    releaseDialogVisible.value = false
-    toast.add({ severity: 'success', summary: 'Install started', detail: 'Track progress below', life: 3000 })
-  } catch (e) {
-    toast.add({ severity: 'error', summary: 'Install failed', detail: e.message, life: 4000 })
-  } finally {
-    installingRelease.value = false
-  }
-}
-
 // ── Build from source dialog ───────────────────────────────
 const buildDialogVisible = ref(false)
 const buildTarget = ref('llama_cpp')
 const building = ref(false)
+const savingBuildSettings = ref(false)
 const buildForm = ref({
   commitSha: '',
   versionSuffix: '',
@@ -621,14 +662,61 @@ const buildOptions = [
   { key: 'cpu_all_variants', label: 'CPU All Variants',         desc: 'GGML_CPU_ALL_VARIANTS=on' },
 ]
 
-function openBuildDialog(engineKey) {
+// Baseline build flags: `native` on, every other toggle off, no CUDA architectures.
+// Builds a new object on each call, so callers may mutate the result.
+function _defaultBuildConfig() {
+  const config = {}
+  for (const flag of ['cuda', 'flash_attention', 'native', 'backend_dl', 'cpu_all_variants']) {
+    config[flag] = flag === 'native'
+  }
+  config.cuda_architectures = ''
+  return config
+}
+
+// Load the persisted build settings for `engineId`.
+// Uses the engines-store action when one exists, else GETs the REST endpoint itself.
+async function fetchEngineBuildSettings(engineId) {
+  if (typeof enginesStore.fetchBuildSettings !== 'function') {
+    const query = { params: { engine: engineId } }
+    return (await axios.get('/api/llama-versions/build-settings', query)).data
+  }
+  return await enginesStore.fetchBuildSettings(engineId)
+}
+
+// Persist `settings` as the build settings for `engineId`.
+// Uses the engines-store action when one exists, else PUTs to the REST endpoint itself.
+async function saveEngineBuildSettings(engineId, settings) {
+  if (typeof enginesStore.saveBuildSettings !== 'function') {
+    const query = { params: { engine: engineId } }
+    return (await axios.put('/api/llama-versions/build-settings', settings, query)).data
+  }
+  return await enginesStore.saveBuildSettings(engineId, settings)
+}
+
+// Request an update of `engineId`; only the engine id is sent with the request.
+// Uses the engines-store action when one exists, else POSTs to the REST endpoint itself.
+async function updateEngineWithSavedSettings(engineId) {
+  if (typeof enginesStore.updateEngine !== 'function') {
+    const body = { engine: engineId }
+    return (await axios.post('/api/llama-versions/update', body)).data
+  }
+  return await enginesStore.updateEngine(engineId)
+}
+
+async function openBuildDialog(engineKey) {
   buildTarget.value = engineKey
-  buildForm.value.commitSha = engineKey === 'ik_llama' ? 'main' : 'master'
-  buildForm.value.versionSuffix = ''
-  buildForm.value.buildConfig = {
-    cuda: false, flash_attention: false, native: true,
-    backend_dl: false, cpu_all_variants: false, cuda_architectures: '',
+  const engineId = engineKey === 'ik_llama' ? 'ik_llama' : 'llama_cpp'
+  const updateInfo = engineKey === 'ik_llama' ? ikLlamaUpdateInfo.value : llamaCppUpdateInfo.value
+  const baseConfig = _defaultBuildConfig()
+  try {
+    const saved = await fetchEngineBuildSettings(engineId)
+    Object.assign(baseConfig, saved || {})
+  } catch {
+    // Ignore, fall back to defaults
   }
+  buildForm.value.commitSha = updateInfo?.latest_version || (engineKey === 'ik_llama' ? 'main' : 'master')
+  buildForm.value.versionSuffix = ''
+  buildForm.value.buildConfig = baseConfig
   buildDialogVisible.value = true
 }
 
@@ -636,13 +724,17 @@ async function doStartBuild() {
   building.value = true
   try {
     const repoSource = buildTarget.value === 'ik_llama' ? 'ik_llama.cpp' : 'llama.cpp'
+    const engineId = buildTarget.value === 'ik_llama' ? 'ik_llama' : 'llama_cpp'
     const config = { ...buildForm.value.buildConfig }
     if (!config.cuda_architectures) delete config.cuda_architectures
+    // Persist settings before triggering a manual build
+    await saveEngineBuildSettings(engineId, config)
     await enginesStore.buildSource({
       commit_sha: buildForm.value.commitSha || (buildTarget.value === 'ik_llama' ? 'main' : 'master'),
       repository_source: repoSource,
       version_suffix: buildForm.value.versionSuffix || undefined,
       build_config: config,
+      auto_activate: false,
     })
     buildDialogVisible.value = false
     toast.add({ severity: 'success', summary: 'Build started', detail: 'Track progress below', life: 3000 })
@@ -653,6 +745,40 @@ async function doStartBuild() {
   }
 }
 
+// Save the dialog's build flags for the current target without starting a build.
+// Success closes the dialog; failure shows an error toast and leaves it open.
+async function saveBuildSettingsOnly() {
+  const targetId = buildTarget.value === 'ik_llama' ? 'ik_llama' : 'llama_cpp'
+  const payload = Object.assign({}, buildForm.value.buildConfig)
+  if (!payload.cuda_architectures) delete payload.cuda_architectures // omit when blank
+  savingBuildSettings.value = true
+  try {
+    await saveEngineBuildSettings(targetId, payload)
+    buildDialogVisible.value = false
+    toast.add({ severity: 'success', summary: 'Build settings saved', life: 2500 })
+  } catch (err) {
+    toast.add({ severity: 'error', summary: 'Save failed', detail: err.message, life: 4000 })
+  } finally { savingBuildSettings.value = false }
+}
+
+// Start an update of `engineKey` through updateEngineWithSavedSettings.
+// Refuses with a warning toast until an update check has reported a latest version.
+async function doUpdateEngine(engineKey) {
+  const isIk = engineKey === 'ik_llama'
+  const latest = (isIk ? ikLlamaUpdateInfo : llamaCppUpdateInfo).value?.latest_version
+  if (!latest) {
+    toast.add({ severity: 'warn', summary: 'No update available', detail: 'Check for updates first.', life: 3000 })
+    return
+  }
+  updatingEngine.value = engineKey // remember which engine is mid-request
+  try {
+    await updateEngineWithSavedSettings(isIk ? 'ik_llama' : 'llama_cpp')
+    toast.add({ severity: 'success', summary: 'Update started', detail: 'Build in progress, track below.', life: 3000 })
+  } catch (err) {
+    toast.add({ severity: 'error', summary: 'Update failed', detail: err.message, life: 4000 })
+  } finally { updatingEngine.value = null }
+}
+
 // ── CUDA ───────────────────────────────────────────────────
 const cuda = computed(() => enginesStore.cudaStatus || {})
 const cudaVersionOptions = ['12.9', '12.8', '12.7', '12.6', '12.5', '12.4', '12.3', '12.2', '12.1', '12.0', '11.9', '11.8']
@@ -772,7 +898,27 @@ function confirmRemoveLmdeploy() {
 }
 
 // ── Lifecycle ──────────────────────────────────────────────
-onMounted(() => enginesStore.fetchAll())
+let unsubscribeCudaStatus = null
+let unsubscribeLmdeployStatus = null
+
+// Both terminal install states trigger a status refresh.
+const isInstallFinished = (payload) => payload?.status === 'completed' || payload?.status === 'failed'
+
+onMounted(() => {
+  enginesStore.fetchAll()
+  unsubscribeCudaStatus = progressStore.subscribe('cuda_install_status', async (payload) => {
+    if (isInstallFinished(payload)) await enginesStore.fetchCudaStatus()
+  })
+  unsubscribeLmdeployStatus = progressStore.subscribe('lmdeploy_install_status', async (payload) => {
+    if (isInstallFinished(payload)) await enginesStore.fetchLmdeployStatus()
+  })
+})
+
+// Release the progress subscriptions when the component unmounts.
+onUnmounted(() => {
+  if (unsubscribeCudaStatus) unsubscribeCudaStatus()
+  if (unsubscribeLmdeployStatus) unsubscribeLmdeployStatus()
+})
 </script>
 
 <style scoped>
@@ -824,6 +970,30 @@ onMounted(() => enginesStore.fetchAll())
   margin: 0;
 }
 
+.engine-mark {
+  display: inline-flex;
+  align-items: center;
+  justify-content: center;
+  min-width: 1.8rem;
+  height: 1.8rem;
+  padding: 0 0.45rem;
+  border-radius: 999px;
+  font-size: 0.72rem;
+  font-weight: 700;
+  line-height: 1;
+  letter-spacing: 0.04em;
+  color: #fff;
+  box-shadow: inset 0 0 0 1px rgba(255, 255, 255, 0.1);
+}
+
+.engine-mark--llama {
+  background: linear-gradient(135deg, #0ea5e9, #2563eb);
+}
+
+.engine-mark--ik {
+  background: linear-gradient(135deg, #8b5cf6, #ec4899);
+}
+
 .ev-section-actions {
   display: flex;
   align-items: center;
@@ -851,13 +1021,105 @@ onMounted(() => enginesStore.fetchAll())
   border: 1px solid var(--border-primary);
 }
 
+.metric-card--actionable {
+  flex-direction: row;
+}
+
 .metric-icon { font-size: 1.5rem; flex-shrink: 0; line-height: 1; color: var(--accent-cyan); }
 .metric-data { flex: 1; min-width: 0; }
 .metric-label { font-size: 0.7rem; text-transform: uppercase; letter-spacing: 0.05em; color: var(--text-secondary); margin-bottom: 0.2rem; }
 .metric-value { font-size: 0.875rem; font-weight: 600; }
+.metric-subvalue {
+  margin-top: 0.25rem;
+  font-size: 0.8rem;
+  color: var(--text-secondary);
+  word-break: break-word;
+}
 .metric-bar { margin-top: 0.5rem; }
+.metric-actions {
+  display: flex;
+  gap: 0.4rem;
+  flex-wrap: wrap;
+  margin-top: 0.6rem;
+}
 /* No text inside the bar so low percentages don’t get clipped; value is shown above */
 
+.system-subpanel {
+  margin-top: 1rem;
+}
+
+/* ── Engines overview ───────────────────────────────────── */
+.engine-grid {
+  display: grid;
+  grid-template-columns: repeat(auto-fit, minmax(240px, 1fr));
+  gap: 0.75rem;
+}
+
+.engine-card {
+  appearance: none;
+  border: 1px solid var(--border-primary);
+  background: var(--bg-surface);
+  border-radius: var(--radius-md);
+  padding: 0.9rem;
+  text-align: left;
+  color: inherit;
+  cursor: pointer;
+  transition: border-color 0.15s ease, transform 0.15s ease, background 0.15s ease;
+}
+
+.engine-card:hover {
+  border-color: var(--accent-cyan);
+  background: color-mix(in srgb, var(--bg-surface) 88%, var(--accent-cyan) 12%);
+  transform: translateY(-1px);
+}
+
+.engine-card-head {
+  display: flex;
+  align-items: flex-start;
+  justify-content: space-between;
+  gap: 0.75rem;
+}
+
+.engine-card-title {
+  display: flex;
+  align-items: center;
+  gap: 0.65rem;
+  min-width: 0;
+}
+
+.engine-card-name {
+  font-size: 0.95rem;
+  font-weight: 600;
+}
+
+.engine-card-meta {
+  font-size: 0.78rem;
+  color: var(--text-secondary);
+  margin-top: 0.1rem;
+}
+
+.engine-card-icon {
+  font-size: 1.25rem;
+  color: var(--accent-cyan);
+  width: 1.8rem;
+  text-align: center;
+  flex-shrink: 0;
+}
+
+.engine-card-body {
+  margin-top: 0.8rem;
+}
+
+.engine-card-status {
+  font-size: 0.82rem;
+  color: var(--text-secondary);
+}
+
+.engine-card-status--warning {
+  color: var(--status-warning);
+  font-weight: 600;
+}
+
 /* ── Actions ─────────────────────────────────────────── */
 .ev-actions {
   display: flex;
@@ -1003,6 +1265,11 @@ code {
   color: var(--text-secondary);
 }
 
+.ev-section--modal {
+  border: 0;
+  background: transparent;
+}
+
 .form-field {
   display: flex;
   flex-direction: column;
diff --git a/frontend/src/views/ModelConfig.vue b/frontend/src/views/ModelConfig.vue
index cb5ea3f..f169f2c 100644
--- a/frontend/src/views/ModelConfig.vue
+++ b/frontend/src/views/ModelConfig.vue
@@ -39,8 +39,24 @@
             :class="{ selected: config.engine === eng.value }"
             @click="changeEngine(eng.value)"
           >
-            <i :class="['pi', eng.icon]" />
-            <span>{{ eng.label }}</span>
+            <div class="engine-option-label">
+              <span
+                v-if="eng.value === 'llama_cpp'"
+                class="engine-mark engine-mark--llama"
+                aria-hidden="true"
+              >L</span>
+              <span
+                v-else-if="eng.value === 'ik_llama'"
+                class="engine-mark engine-mark--ik"
+                aria-hidden="true"
+              >IK</span>
+              <i
+                v-else-if="eng.value === 'lmdeploy'"
+                class="pi pi-server engine-icon-lmdeploy"
+                aria-hidden="true"
+              />
+              <span class="engine-name">{{ eng.label }}</span>
+            </div>
           </div>
         </div>
       </div>
@@ -54,8 +70,51 @@
               {{ param.label }}
               <i class="pi pi-info-circle param-info" v-tooltip.top="param.description" />
             </label>
+            <!-- Context length / session length: slider + numeric input (soft max based on model metadata) -->
+            <template v-if="param.type === 'int' && (param.key === 'ctx_size' || param.key === 'session_len')">
+              <div class="param-slider-row">
+                <Slider
+                  v-model="config[param.key]"
+                  :min="512"
+                  :max="maxContextSuggestion || 131072"
+                  :step="256"
+                  class="param-slider"
+                />
+                <span v-if="maxContextSuggestion" class="param-hint">
+                  Suggested max: {{ maxContextSuggestion.toLocaleString() }} tokens
+                </span>
+              </div>
+              <InputNumber
+                :id="`param-${param.key}`"
+                v-model="config[param.key]"
+                :placeholder="String(param.default ?? '')"
+                class="param-input"
+              />
+            </template>
+            <!-- GPU layers: slider guided by detected layer count, but value not clamped -->
+            <template v-else-if="param.type === 'int' && param.key === 'n_gpu_layers'">
+              <div class="param-slider-row">
+                <Slider
+                  v-model="config[param.key]"
+                  :min="0"
+                  :max="layerCountSuggestion || 128"
+                  :step="1"
+                  class="param-slider"
+                />
+                <span v-if="layerCountSuggestion" class="param-hint">
+                  Detected layers: {{ layerCountSuggestion }}
+                </span>
+              </div>
+              <InputNumber
+                :id="`param-${param.key}`"
+                v-model="config[param.key]"
+                :placeholder="String(param.default ?? '')"
+                class="param-input"
+              />
+            </template>
+            <!-- Fallback: regular numeric / other inputs -->
             <InputNumber
-              v-if="param.type === 'int'"
+              v-else-if="param.type === 'int'"
               :id="`param-${param.key}`"
               v-model="config[param.key]"
               :placeholder="String(param.default ?? '')"
@@ -219,6 +278,7 @@ import InputSwitch from 'primevue/inputswitch'
 import Dropdown from 'primevue/dropdown'
 import Textarea from 'primevue/textarea'
 import ProgressSpinner from 'primevue/progressspinner'
+import Slider from 'primevue/slider'
 import { useModelStore } from '@/stores/models'
 
 const route = useRoute()
@@ -235,6 +295,7 @@ const savedConfig = ref({})          // for reset
 const paramRegistry = ref({ basic: [], advanced: [] })
 const selectedNewParam = ref(null)
 const activeAdvancedKeys = ref([])   // keys of advanced params currently in the form
+const modelLimits = ref(null)        // engine-agnostic: { max_context_length?, layer_count? } from /api/models/{id}/limits
 
 const allEngineOptions = [
   { value: 'llama_cpp', label: 'llama.cpp', icon: 'pi-microchip' },
@@ -261,6 +322,26 @@ const availableAdvancedParams = computed(() =>
   allAdvancedParams.value.filter(p => !activeAdvancedKeys.value.includes(p.key))
 )
 
+// Upper bound for the context/session-length slider, taken from model metadata only.
+// Deliberately does NOT fall back to the configured ctx_size/session_len: the slider
+// is v-modelled to that same value, so using it as :max would pin the thumb at 100%
+// and shrink the range every time the user drags it down.
+// Yields null when no usable limit is known (the template then uses 131072).
+const maxContextSuggestion = computed(() => {
+  if (!model.value) return null
+  const maxCtx = Number(modelLimits.value?.max_context_length)
+  // A missing or non-numeric limit becomes NaN or 0 here; both are rejected.
+  return Number.isFinite(maxCtx) && maxCtx > 0 ? maxCtx : null
+})
+
+// Layer count reported in the model limits, used as the GPU-layers slider maximum.
+// Yields null when the limits payload carries no positive layer_count.
+const layerCountSuggestion = computed(() => {
+  const layers = modelLimits.value?.layer_count
+  if (layers == null) return null
+  return Number(layers) > 0 ? Number(layers) : null
+})
+
 // ── Helpers ────────────────────────────────────────────────
 function findModelById(id) {
   for (const group of modelStore.models) {
@@ -333,11 +414,17 @@ async function loadAll() {
     if (found.format !== 'safetensors' && engine === 'lmdeploy') engine = 'llama_cpp'
     await fetchParamRegistry(engine)
 
-    const { data: cfg } = await axios.get(`/api/models/${route.params.id}/config`)
+    const [cfgResp, limitsResp] = await Promise.all([
+      axios.get(`/api/models/${route.params.id}/config`),
+      axios.get(`/api/models/${route.params.id}/limits`).catch(() => ({ data: null })),
+    ])
+
+    const cfg = cfgResp.data
     const merged = { engine, ...cfg }
     config.value = merged
     savedConfig.value = JSON.parse(JSON.stringify(merged))
     activeAdvancedKeys.value = detectActiveAdvancedKeys(merged)
+    modelLimits.value = limitsResp?.data ?? null
   } catch (e) {
     toast.add({ severity: 'error', summary: 'Failed to load config', detail: e.message, life: 4000 })
   } finally {
@@ -349,11 +436,11 @@ async function loadAll() {
 async function saveConfig() {
   saving.value = true
   try {
-    // Build clean config: only include non-null values for advanced params
+    // Build clean config: drop any keys that are effectively "unset"
     const toSave = { ...config.value }
-    // Remove advanced params with null/empty values (treat as "not set")
-    for (const key of activeAdvancedKeys.value) {
-      if (toSave[key] == null || toSave[key] === '') {
+    for (const [key, value] of Object.entries(toSave)) {
+      // Keep false/0, but drop null, empty string, or NaN
+      if (value == null || value === '' || (typeof value === 'number' && Number.isNaN(value))) {
         delete toSave[key]
       }
     }
@@ -478,9 +565,8 @@ onMounted(loadAll)
 }
 
 .engine-option {
-  display: flex;
+  display: inline-flex;
   align-items: center;
-  gap: 0.5rem;
   padding: 0.5rem 1rem;
   border-radius: var(--radius-md, 0.5rem);
   border: 1px solid var(--border-primary, #2a2f45);
@@ -502,6 +588,45 @@ onMounted(loadAll)
   font-weight: 600;
 }
 
+.engine-option-label {
+  display: inline-flex;
+  align-items: center;
+  gap: 0.5rem;
+}
+
+.engine-name {
+  font-size: 0.875rem;
+}
+
+.engine-mark {
+  display: inline-flex;
+  align-items: center;
+  justify-content: center;
+  min-width: 1.6rem;
+  height: 1.6rem;
+  padding: 0 0.4rem;
+  border-radius: 999px;
+  font-size: 0.7rem;
+  font-weight: 700;
+  line-height: 1;
+  letter-spacing: 0.04em;
+  color: #fff;
+  box-shadow: inset 0 0 0 1px rgba(255, 255, 255, 0.1);
+}
+
+.engine-mark--llama {
+  background: linear-gradient(135deg, #0ea5e9, #2563eb);
+}
+
+.engine-mark--ik {
+  background: linear-gradient(135deg, #8b5cf6, #ec4899);
+}
+
+.engine-icon-lmdeploy {
+  font-size: 1.1rem;
+  color: var(--accent-cyan, #22d3ee);
+}
+
 /* ── Params grid ──────────────────────────────────────── */
 .params-grid {
   display: grid;
@@ -564,4 +689,20 @@ onMounted(loadAll)
   justify-content: flex-end;
   padding-bottom: var(--spacing-lg, 1.5rem);
 }
+
+.param-slider-row {
+  display: flex;
+  flex-direction: column;
+  gap: 0.25rem;
+  margin-bottom: 0.25rem;
+}
+
+.param-slider {
+  width: 100%;
+}
+
+.param-hint {
+  font-size: 0.75rem;
+  color: var(--text-secondary, #9ca3af);
+}
 </style>
diff --git a/frontend/src/views/ModelLibrary.vue b/frontend/src/views/ModelLibrary.vue
index fe75ba8..bb03b38 100644
--- a/frontend/src/views/ModelLibrary.vue
+++ b/frontend/src/views/ModelLibrary.vue
@@ -34,40 +34,62 @@
     </div>
 
     <!-- Loading -->
-    <div v-if="modelStore.loading && !modelStore.models.length" class="loading-state">
+    <div
+      v-if="(modelStore.loading || modelStore.safetensorsLoading) && !modelStore.models.length && !modelStore.safetensorsModels.length"
+      class="loading-state"
+    >
       <ProgressSpinner style="width:40px;height:40px" />
       <span>Loading models…</span>
     </div>
 
     <!-- Empty state -->
-    <div v-else-if="!modelStore.loading && !modelStore.models.length" class="empty-state">
+    <div
+      v-else-if="!modelStore.loading && !modelStore.safetensorsLoading && !modelStore.models.length && !modelStore.safetensorsModels.length"
+      class="empty-state"
+    >
       <i class="pi pi-inbox" style="font-size:3rem;color:var(--text-secondary)" />
       <h3>No models downloaded yet</h3>
       <p>Search HuggingFace to find and download models.</p>
       <Button label="Search Models" icon="pi pi-search" @click="$router.push('/search')" />
     </div>
 
-    <!-- Model groups -->
+    <!-- Model groups (GGUF + Safetensors) -->
     <div v-else class="model-groups">
       <div
-        v-for="group in modelStore.models"
+        v-for="group in displayGroups"
         :key="group.huggingface_id"
         class="model-group"
       >
-        <!-- Group header -->
-        <div class="group-header" @click="toggleGroup(group.huggingface_id)">
+        <!-- Group header: GGUF (expandable) -->
+        <div
+          v-if="!isSafetensorsGroup(group)"
+          class="group-header"
+          @click="toggleGroup(group.huggingface_id)"
+        >
           <div class="group-title">
-            <i :class="['pi', 'group-chevron', expandedGroups.has(group.huggingface_id) ? 'pi-chevron-down' : 'pi-chevron-right']" />
-            <span class="group-name">{{ group.base_model_name || group.huggingface_id }}</span>
+            <i
+              :class="['pi', 'group-chevron', expandedGroups.has(group.huggingface_id) ? 'pi-chevron-down' : 'pi-chevron-right']"
+            />
+            <span class="group-name">{{ group.huggingface_id }}</span>
             <Tag
               v-if="group.quantizations?.some(q => q.is_active)"
               value="Running"
               severity="success"
               class="running-badge"
             />
+            <!-- Engine badge: configured engine wins; safetensors defaults to lmdeploy (ik_llama is a GGUF engine) — TODO confirm against backend default -->
+            <Tag
+              v-if="primaryQuant(group)"
+              :value="(primaryQuant(group).config && primaryQuant(group).config.engine) || (primaryQuant(group).format === 'safetensors' ? 'lmdeploy' : (primaryQuant(group).engine || 'llama_cpp'))"
+              severity="secondary" class="engine-tag"
+            />
+            <Tag
+              v-if="primaryQuant(group) && primaryQuant(group).format"
+              :value="primaryQuant(group).format"
+              severity="info"
+            />
           </div>
           <div class="group-meta">
-            <small>{{ group.huggingface_id }}</small>
             <Button
               icon="pi pi-trash"
               text
@@ -79,71 +101,100 @@
           </div>
         </div>
 
-        <!-- Quantizations list -->
-        <Transition name="group-collapse">
+        <!-- Group header: Safetensors (single-row, non-expandable) -->
+        <div
+          v-else
+          class="group-header safetensors-header"
+        >
+          <div class="group-title">
+            <span class="group-name">{{ group.huggingface_id }}</span>
+            <Tag
+              v-if="primaryQuant(group) && primaryQuant(group).is_active"
+              value="Running"
+              severity="success"
+              class="running-badge"
+            />
+            <!-- Engine badge: configured engine wins; safetensors defaults to lmdeploy (ik_llama is a GGUF engine) — TODO confirm against backend default -->
+            <Tag
+              v-if="primaryQuant(group)"
+              :value="(primaryQuant(group).config && primaryQuant(group).config.engine) || (primaryQuant(group).format === 'safetensors' ? 'lmdeploy' : (primaryQuant(group).engine || 'llama_cpp'))"
+              severity="secondary" class="engine-tag"
+            />
+            <Tag
+              v-if="primaryQuant(group) && primaryQuant(group).format"
+              :value="primaryQuant(group).format"
+              severity="info"
+            />
+          </div>
+          <div class="group-meta">
+            <span
+              v-if="primaryQuant(group) && primaryQuant(group).file_size"
+              class="file-size"
+            >
+              {{ formatBytes(primaryQuant(group).file_size) }}
+            </span>
+            <span
+              v-if="primaryQuant(group) && primaryQuant(group).downloaded_at"
+              class="downloaded-at"
+            >
+              Downloaded {{ formatDate(primaryQuant(group).downloaded_at) }}
+            </span>
+            <Button
+              v-if="primaryQuant(group) && !primaryQuant(group).is_active"
+              label="Start"
+              icon="pi pi-play"
+              size="small"
+              severity="success"
+              outlined
+              :loading="primaryQuant(group) && startingModels.has(primaryQuant(group).id)"
+              @click.stop="primaryQuant(group) && startModel(primaryQuant(group).id)"
+            />
+            <Button
+              v-else-if="primaryQuant(group)"
+              label="Stop"
+              icon="pi pi-stop"
+              size="small"
+              severity="warning"
+              outlined
+              :loading="primaryQuant(group) && stoppingModels.has(primaryQuant(group).id)"
+              @click.stop="primaryQuant(group) && stopModel(primaryQuant(group).id)"
+            />
+            <Button
+              v-if="primaryQuant(group)"
+              icon="pi pi-cog"
+              text
+              severity="secondary"
+              size="small"
+              v-tooltip.top="'Configure'"
+              @click.stop="configureModel(primaryQuant(group).id)"
+            />
+            <Button
+              icon="pi pi-trash"
+              text
+              severity="danger"
+              size="small"
+              v-tooltip.top="'Delete model'"
+              @click.stop="primaryQuant(group) ? confirmDeleteModel(primaryQuant(group).id) : confirmDeleteGroup(group.huggingface_id)"
+            />
+          </div>
+        </div>
+
+        <!-- Quantizations list (GGUF only) -->
+        <Transition v-if="!isSafetensorsGroup(group)" name="group-collapse">
           <div v-if="expandedGroups.has(group.huggingface_id)" class="quantizations">
-            <div
+            <ModelRow
               v-for="quant in group.quantizations"
               :key="quant.id"
-              class="quant-row"
-              :class="{ 'is-active': quant.is_active }"
-            >
-              <div class="quant-info">
-                <div class="quant-main">
-                  <code class="quant-name">{{ quant.quantization || quant.name }}</code>
-                  <Tag v-if="quant.is_active" value="Running" severity="success" />
-                  <Tag :value="quant.engine || 'llama_cpp'" severity="secondary" />
-                  <Tag v-if="quant.format" :value="quant.format" severity="info" />
-                </div>
-                <div class="quant-sub">
-                  <span v-if="quant.file_size" class="file-size">
-                    {{ formatBytes(quant.file_size) }}
-                  </span>
-                  <span v-if="quant.downloaded_at" class="downloaded-at">
-                    Downloaded {{ formatDate(quant.downloaded_at) }}
-                  </span>
-                </div>
-              </div>
-
-              <div class="quant-actions">
-                <Button
-                  v-if="!quant.is_active"
-                  label="Start"
-                  icon="pi pi-play"
-                  size="small"
-                  severity="success"
-                  outlined
-                  :loading="startingModels.has(quant.id)"
-                  @click="startModel(quant.id)"
-                />
-                <Button
-                  v-else
-                  label="Stop"
-                  icon="pi pi-stop"
-                  size="small"
-                  severity="warning"
-                  outlined
-                  :loading="stoppingModels.has(quant.id)"
-                  @click="stopModel(quant.id)"
-                />
-                <Button
-                  icon="pi pi-cog"
-                  text
-                  severity="secondary"
-                  size="small"
-                  v-tooltip.top="'Configure'"
-                  @click="configureModel(quant.id)"
-                />
-                <Button
-                  icon="pi pi-trash"
-                  text
-                  severity="danger"
-                  size="small"
-                  v-tooltip.top="'Delete'"
-                  @click="confirmDeleteModel(quant.id)"
-                />
-              </div>
-            </div>
+              :quant="quant"
+              :is-starting="startingModels.has(quant.id)"
+              :is-stopping="stoppingModels.has(quant.id)"
+              :format-bytes="formatBytes"
+              :format-date="formatDate"
+              @start="startModel"
+              @stop="stopModel"
+              @configure="configureModel"
+              @delete="confirmDeleteModel"
+            />
           </div>
         </Transition>
       </div>
@@ -170,7 +221,6 @@
       </template>
     </Dialog>
 
-    <ConfirmDialog />
   </div>
 </template>
 
@@ -185,6 +235,7 @@ import ProgressSpinner from 'primevue/progressspinner'
 import Dialog from 'primevue/dialog'
 import Password from 'primevue/password'
 import ConfirmDialog from 'primevue/confirmdialog'
+import ModelRow from '@/components/ModelRow.vue'
 import { useModelStore } from '@/stores/models'
 
 const router = useRouter()
@@ -202,11 +253,25 @@ const savingToken = ref(false)
 let pollTimer = null
 
 // ── Computed ───────────────────────────────────────────────
+// Backend /api/models already returns both GGUF and safetensors models
+// grouped appropriately, so we can display models directly from there.
+const displayGroups = computed(() => modelStore.models || [])
+
 const totalModels = computed(() =>
-  modelStore.models.reduce((acc, g) => acc + (g.quantizations?.length ?? 0), 0)
+  displayGroups.value.reduce((acc, g) => acc + (g.quantizations?.length ?? 0), 0)
 )
 
 // ── Group expand/collapse ──────────────────────────────────
+function isSafetensorsGroup(group) {
+  const quants = Array.isArray(group?.quantizations) ? group.quantizations : [] // missing/invalid -> empty
+  return quants.length > 0 && quants.every(entry => entry.format === 'safetensors')
+}
+
+function primaryQuant(group) {
+  const quants = Array.isArray(group?.quantizations) ? group.quantizations : [] // missing/invalid -> empty
+  return quants.length ? quants[0] : null
+}
+
 function toggleGroup(hfId) {
   if (expandedGroups.value.has(hfId)) {
     expandedGroups.value.delete(hfId)
@@ -216,7 +281,7 @@ function toggleGroup(hfId) {
 }
 
 function expandAllGroups() {
-  modelStore.models.forEach(g => expandedGroups.value.add(g.huggingface_id))
+  displayGroups.value.forEach(g => expandedGroups.value.add(g.huggingface_id))
 }
 
 // ── Model actions ──────────────────────────────────────────
@@ -331,11 +396,17 @@ function formatDate(iso) {
 
 // ── Lifecycle ──────────────────────────────────────────────
 onMounted(async () => {
-  await modelStore.fetchModels()
-  await modelStore.fetchHuggingfaceTokenStatus()
+  await Promise.all([
+    modelStore.fetchModels(),
+    modelStore.fetchSafetensorsModels(),
+    modelStore.fetchHuggingfaceTokenStatus(),
+  ])
   expandAllGroups()
   // Poll every 10 seconds for status updates
-  pollTimer = setInterval(() => modelStore.fetchModels(), 10000)
+  pollTimer = setInterval(() => {
+    modelStore.fetchModels()
+    modelStore.fetchSafetensorsModels()
+  }, 10000)
 })
 
 onUnmounted(() => {
@@ -477,14 +548,14 @@ onUnmounted(() => {
 .group-collapse-enter-to,
 .group-collapse-leave-from  { max-height: 1000px; opacity: 1; }
 
-.quantizations {
+:deep(.quantizations) {
   padding: 0.5rem;
   display: flex;
   flex-direction: column;
   gap: 0.375rem;
 }
 
-.quant-row {
+:deep(.quant-row) {
   display: flex;
   justify-content: space-between;
   align-items: center;
@@ -496,45 +567,52 @@ onUnmounted(() => {
   transition: border-color 0.15s;
 }
 
-.quant-row.is-active {
+:deep(.quant-row.is-active) {
   border-color: rgba(34, 197, 94, 0.4);
   background: rgba(34, 197, 94, 0.04);
 }
 
-.quant-info { flex: 1; min-width: 0; }
+:deep(.quant-info) { flex: 1; min-width: 0; }
 
-.quant-main {
+:deep(.quant-main) {
   display: flex;
   align-items: center;
   gap: 0.4rem;
   flex-wrap: wrap;
 }
 
-.quant-name {
+:deep(.quant-name) {
   font-weight: 600;
   font-size: 0.875rem;
   font-family: monospace;
 }
 
-.quant-sub {
+:deep(.quant-sub) {
   display: flex;
   gap: 0.75rem;
   margin-top: 0.2rem;
 }
 
-.file-size,
-.downloaded-at {
+:deep(.file-size),
+:deep(.downloaded-at) {
   font-size: 0.75rem;
   color: var(--text-secondary, #9ca3af);
 }
 
-.quant-actions {
+:deep(.quant-actions) {
   display: flex;
   gap: 0.25rem;
   flex-shrink: 0;
   align-items: center;
 }
 
+/* Emphasize engine tag with a distinct background */
+.engine-tag {
+  background-color: rgba(59, 130, 246, 0.15); /* soft blue */
+  border-color: rgba(59, 130, 246, 0.65);
+  color: #bfdbfe;
+}
+
 /* ── Token dialog ─────────────────────────────────────── */
 .token-form { display: flex; flex-direction: column; gap: 0.75rem; }
 .token-desc { font-size: 0.875rem; color: var(--text-secondary, #9ca3af); margin: 0; }
diff --git a/frontend/src/views/ModelSearch.vue b/frontend/src/views/ModelSearch.vue
index 15cde29..22fc98d 100644
--- a/frontend/src/views/ModelSearch.vue
+++ b/frontend/src/views/ModelSearch.vue
@@ -17,7 +17,7 @@
           text
           severity="secondary"
           class="clear-btn"
-          @click="query = ''; searchResults = []"
+          @click="clearSearchResults"
         />
       </div>
 
@@ -101,6 +101,12 @@
               <span v-if="result.likes != null" class="meta-item">
                 <i class="pi pi-heart" /> {{ formatNumber(result.likes) }}
               </span>
+              <span v-if="getResultArtifactCount(result)" class="meta-item">
+                <i class="pi pi-database" /> {{ getResultArtifactCount(result) }}
+              </span>
+              <span v-if="getResultSizeSummary(result)" class="meta-item">
+                <i class="pi pi-box" /> {{ getResultSizeSummary(result) }}
+              </span>
               <span v-if="result.license" class="meta-item license">
                 {{ result.license }}
               </span>
@@ -133,43 +139,82 @@
             <table v-else class="files-table">
               <thead>
                 <tr>
-                  <th>File</th>
+                  <th>{{ searchFormat === 'gguf' ? 'Item' : 'Model' }}</th>
                   <th>Size</th>
+                  <th v-if="searchFormat === 'gguf'">Shards</th>
+                  <th v-if="searchFormat === 'gguf'">Projector</th>
                   <th>Status</th>
                   <th></th>
                 </tr>
               </thead>
               <tbody>
-                <tr v-for="file in getFiles(result.modelId || result.id)" :key="file.filename">
+                <tr v-for="file in getFiles(result.modelId || result.id)" :key="file.key || file.filename">
                   <td class="file-name">
-                    <code>{{ file.filename }}</code>
-                    <Tag v-if="file.quantization" :value="file.quantization" severity="info" />
+                    <code>{{ formatResultItemLabel(file, result) }}</code>
+                    <span v-if="searchFormat === 'gguf' && file.kind === 'quant' && file.variantPrefix" class="file-subtext">
+                      {{ file.variantPrefix }} variant
+                    </span>
+                    <span v-else-if="file.subtext" class="file-subtext">
+                      {{ file.subtext }}
+                    </span>
                   </td>
                   <td class="file-size">{{ formatBytes(file.size) }}</td>
-                  <td class="file-status">
-                    <Tag v-if="file.downloaded" value="Downloaded" severity="success" />
-                    <span v-else class="not-downloaded">—</span>
+                  <td v-if="searchFormat === 'gguf'" class="file-count">
+                    {{ file.kind === 'quant' ? (file.files?.length || 0) : 1 }}
                   </td>
-                  <td class="file-action">
-                    <Button
-                      v-if="file.downloaded"
-                      label="Configure"
-                      icon="pi pi-cog"
-                      size="small"
-                      severity="secondary"
-                      text
-                      @click="configureDownloaded(result.modelId || result.id, file)"
+                  <td v-if="searchFormat === 'gguf'" class="projector-cell">
+                    <Dropdown
+                      v-if="file.kind === 'quant'"
+                      :model-value="getSelectedProjector(result.modelId || result.id, file)"
+                      :options="file.projectorOptions || [{ label: 'None', value: '' }]"
+                      optionLabel="label"
+                      optionValue="value"
+                      class="projector-select"
+                      :disabled="downloadingFiles.has(getDownloadKey(result.modelId || result.id, file))"
+                      @update:model-value="setSelectedProjector(result.modelId || result.id, file, $event)"
                     />
-                    <Button
-                      v-else
-                      label="Download"
-                      icon="pi pi-download"
-                      size="small"
-                      severity="success"
-                      outlined
-                      :loading="downloadingFiles.has(`${result.modelId || result.id}:${file.filename}`)"
-                      @click="downloadFile(result, file)"
+                  </td>
+                  <td class="file-status">
+                    <Tag
+                      v-if="isFileDownloading(result.modelId || result.id, file)"
+                      value="Downloading"
+                      severity="warning"
                     />
+                    <Tag v-else-if="file.downloaded" value="Downloaded" severity="success" />
+                    <Tag v-else value="Available" severity="warning" />
+                  </td>
+                  <td class="file-action">
+                    <div class="file-actions">
+                      <Button
+                        v-if="file.downloaded"
+                        label="Configure"
+                        icon="pi pi-cog"
+                        size="small"
+                        severity="secondary"
+                        text
+                        @click="configureDownloaded(result.modelId || result.id, file)"
+                      />
+                      <Button
+                        v-if="file.downloaded && searchFormat === 'gguf' && file.kind === 'quant' && hasProjectorSelectionChanged(result.modelId || result.id, file)"
+                        label="Apply projector"
+                        icon="pi pi-save"
+                        size="small"
+                        severity="success"
+                        outlined
+                        :loading="downloadingFiles.has(getDownloadKey(result.modelId || result.id, file))"
+                        @click="updateProjector(result, file)"
+                      />
+                      <Button
+                        v-if="!file.downloaded"
+                        label="Download"
+                        icon="pi pi-download"
+                        size="small"
+                        severity="success"
+                        outlined
+                        :loading="downloadingFiles.has(getDownloadKey(result.modelId || result.id, file))"
+                        @click="downloadFile(result, file)"
+                      />
+                    </div>
                   </td>
                 </tr>
               </tbody>
@@ -182,7 +227,8 @@
 </template>
 
 <script setup>
-import { ref, onMounted } from 'vue'
+import { ref, onMounted, onUnmounted } from 'vue'
+import { storeToRefs } from 'pinia'
 import { useRouter } from 'vue-router'
 import { useToast } from 'primevue/usetoast'
 import Button from 'primevue/button'
@@ -192,23 +238,28 @@ import Dropdown from 'primevue/dropdown'
 import ProgressSpinner from 'primevue/progressspinner'
 import ProgressTracker from '@/components/common/ProgressTracker.vue'
 import { useModelStore } from '@/stores/models'
+import { useProgressStore } from '@/stores/progress'
 import axios from 'axios'
 
 const router = useRouter()
 const toast = useToast()
 const modelStore = useModelStore()
+const progressStore = useProgressStore()
+const {
+  searchQuery: query,
+  searchLastQuery: lastQuery,
+  searchHasSearched: hasSearched,
+  searchResults,
+  searchLoading: searching,
+  searchFormat,
+} = storeToRefs(modelStore)
 
 // ── State ──────────────────────────────────────────────────
-const query = ref('')
-const lastQuery = ref('')
-const searchFormat = ref('gguf')
-const searching = ref(false)
-const hasSearched = ref(false)
-const searchResults = ref([])
 const expanded = ref(new Set())
 const loadingFiles = ref(new Set())
 const downloadingFiles = ref(new Set())
 const filesCache = ref({})   // modelId -> files[]
+const projectorSelections = ref({})
 
 const formatOptions = [
   { label: 'GGUF', value: 'gguf' },
@@ -218,21 +269,24 @@ const formatOptions = [
 // ── Search ─────────────────────────────────────────────────
 async function search() {
   if (!query.value.trim()) return
-  searching.value = true
-  hasSearched.value = true
-  lastQuery.value = query.value
   expanded.value = new Set()
   filesCache.value = {}
+  projectorSelections.value = {}
   try {
     searchResults.value = await modelStore.searchModels(query.value.trim(), 20, searchFormat.value)
   } catch (e) {
     toast.add({ severity: 'error', summary: 'Search failed', detail: e.message, life: 4000 })
     searchResults.value = []
-  } finally {
-    searching.value = false
   }
 }
 
+function clearSearchResults() {
+  modelStore.clearSearchState()
+  expanded.value = new Set()
+  filesCache.value = {}
+  projectorSelections.value = {}
+}
+
 // ── Expand row & load files ────────────────────────────────
 async function toggleExpand(modelId) {
   if (expanded.value.has(modelId)) {
@@ -256,50 +310,87 @@ async function loadFiles(modelId) {
 
     let files = []
     if (searchFormat.value === 'gguf') {
-      // Backend returns quantizations as a dict: { "Q4_K_M": { quantization, files: [{filename, size}], total_size, size_mb } }
-      const quantEntries = Object.values(result.quantizations || {})
-      // Flatten to individual files, keeping quant label
-      const allFiles = quantEntries.flatMap(entry =>
-        (entry.files || []).map(f => ({
+      const projectorOptions = getProjectorOptions(result.mmproj_files || [])
+      const quantEntries = Object.entries(result.quantizations || {}).map(([key, entry]) => ({
+        key,
+        kind: 'quant',
+        quantizationKey: key,
+        quantization: entry.quantization || '',
+        variantPrefix: entry.variant_prefix || '',
+        size: entry.total_size || 0,
+        projectorOptions,
+        files: (entry.files || []).map(f => ({
           filename: f.filename,
-          size: f.size || entry.total_size || 0,
-          quantization: entry.quantization || '',
-          variantPrefix: entry.variant_prefix || '',
-        }))
-      )
+          size: f.size || 0,
+        })),
+      }))
 
+      const allFiles = quantEntries.flatMap(entry => entry.files)
       if (allFiles.length) {
-        // Try to get accurate sizes from the API
         try {
           const filenames = allFiles.map(f => f.filename).join(',')
           const { data } = await axios.get(`/api/models/search/${encodeURIComponent(modelId)}/file-sizes`, {
             params: { filenames },
           })
           const sizes = data.sizes || {}
-          files = allFiles.map(f => {
-            const downloaded = isDownloaded(modelId, f.filename)
-            return {
+          files = quantEntries.map(entry => {
+            const resolvedFiles = entry.files.map(f => ({
               ...f,
               size: sizes[f.filename] ?? f.size,
+            }))
+            const downloaded = findDownloadedQuantization(modelId, entry, resolvedFiles)
+            return {
+              ...entry,
+              files: resolvedFiles,
+              size: resolvedFiles.reduce((sum, f) => sum + (f.size || 0), 0),
               downloaded,
               modelId: downloaded?.id,
             }
-          })
+          }).sort((a, b) => (a.size || 0) - (b.size || 0))
         } catch {
-          files = allFiles.map(f => {
-            const downloaded = isDownloaded(modelId, f.filename)
-            return { ...f, downloaded, modelId: downloaded?.id }
-          })
+          files = quantEntries.map(entry => {
+            const downloaded = findDownloadedQuantization(modelId, entry, entry.files)
+            return { ...entry, downloaded, modelId: downloaded?.id }
+          }).sort((a, b) => (a.size || 0) - (b.size || 0))
         }
       }
+      files.forEach((entry) => {
+        if (entry.kind !== 'quant') return
+        ensureProjectorSelection(modelId, entry, entry.downloaded?.mmproj_filename || '')
+      })
     } else {
-      // Safetensors: backend returns safetensors_files: [{ filename }]
       const stFiles = result.safetensors_files || []
-      files = stFiles.map(f => ({
-        filename: f.filename,
-        size: f.size || 0,
-        downloaded: false,
-      }))
+      let resolvedFiles = stFiles.map(file => ({ filename: file.filename, size: file.size || 0 }))
+      if (resolvedFiles.length) {
+        try {
+          const filenames = resolvedFiles.map(file => file.filename).join(',')
+          const { data } = await axios.get(`/api/models/search/${encodeURIComponent(modelId)}/file-sizes`, {
+            params: { filenames },
+          })
+          const sizes = data.sizes || {}
+          resolvedFiles = resolvedFiles.map(file => ({
+            ...file,
+            size: sizes[file.filename] ?? file.size,
+          }))
+        } catch {
+          // Keep the size hints returned by the search API.
+        }
+      }
+
+      const downloadedBundle = findDownloadedSafetensorsBundle(result.modelId || result.id)
+      const totalSize = resolvedFiles.reduce((sum, file) => sum + (file.size || 0), 0)
+      files = resolvedFiles.length
+        ? [{
+            key: 'safetensors-bundle',
+            kind: 'safetensors-bundle',
+            filename: result.modelId || result.id,
+            size: totalSize,
+            files: resolvedFiles,
+            downloaded: downloadedBundle,
+            modelId: downloadedBundle?.model_id,
+            subtext: `${resolvedFiles.length} file${resolvedFiles.length === 1 ? '' : 's'}`,
+          }]
+        : []
     }
 
     filesCache.value[modelId] = files
@@ -319,22 +410,43 @@ function getFiles(modelId) {
 // ── Download ───────────────────────────────────────────────
 async function downloadFile(result, file) {
   const modelId = result.modelId || result.id
-  const key = `${modelId}:${file.filename}`
+  const key = getDownloadKey(modelId, file)
   downloadingFiles.value.add(key)
   downloadingFiles.value = new Set(downloadingFiles.value)
   try {
-    await modelStore.downloadModel(
-      modelId,
-      file.filename,
-      file.size || 0,
-      searchFormat.value,
-      result.pipeline_tag || null
-    )
+    if (searchFormat.value === 'gguf' && file.kind === 'quant') {
+      const selectedProjector = getSelectedProjector(modelId, file)
+      const selectedProjectorOption = getSelectedProjectorOption(file, selectedProjector)
+      await modelStore.downloadGgufBundle(
+        modelId,
+        file.quantizationKey || file.quantization,
+        file.files || [],
+        result.pipeline_tag || null,
+        selectedProjector || null,
+        selectedProjectorOption?.size || 0,
+      )
+    } else if (searchFormat.value === 'safetensors') {
+      await modelStore.downloadSafetensorsBundle(
+        modelId,
+        file.files || []
+      )
+    } else {
+      await modelStore.downloadModel(
+        modelId,
+        file.filename,
+        file.size || 0,
+        searchFormat.value,
+        result.pipeline_tag || null
+      )
+    }
     toast.add({ severity: 'success', summary: 'Download started', detail: 'Track progress above', life: 3000 })
     // Refresh files to update downloaded status
     delete filesCache.value[modelId]
     await loadFiles(modelId)
     await modelStore.fetchModels()
+    if (searchFormat.value === 'safetensors') {
+      await modelStore.fetchSafetensorsModels()
+    }
   } catch (e) {
     toast.add({ severity: 'error', summary: 'Download failed', detail: e.message, life: 4000 })
   } finally {
@@ -343,17 +455,130 @@ async function downloadFile(result, file) {
   }
 }
 
-
-function configureDownloaded(modelId, file) {
+async function updateProjector(result, file) {
+  const repoId = result.modelId || result.id
+  const downloadKey = getDownloadKey(repoId, file)
   const model = file.modelId
     ? modelStore.allQuantizations.find(m => m.id === file.modelId)
-    : findDownloadedModel(modelId, file.filename)
+    : findDownloadedQuantization(repoId, file, file.files || [])
+  if (!model?.id) return
+
+  const selectedProjector = getSelectedProjector(repoId, file) || null
+  const selectedProjectorOption = getSelectedProjectorOption(file, selectedProjector)
+
+  downloadingFiles.value.add(downloadKey)
+  downloadingFiles.value = new Set(downloadingFiles.value)
+  try {
+    const response = await modelStore.updateModelProjector(
+      model.id,
+      selectedProjector,
+      selectedProjectorOption?.size || 0,
+    )
+    if (response?.applied) {
+      await refreshModelSearchState()
+      toast.add({ severity: 'success', summary: 'Projector updated', detail: response.message, life: 3000 })
+    } else {
+      toast.add({ severity: 'success', summary: 'Projector update started', detail: response?.message || 'Track progress above', life: 3000 })
+    }
+  } catch (e) {
+    toast.add({ severity: 'error', summary: 'Projector update failed', detail: e.message, life: 4000 })
+  } finally {
+    downloadingFiles.value.delete(downloadKey)
+    downloadingFiles.value = new Set(downloadingFiles.value)
+  }
+}
+
+
+function configureDownloaded(modelId, file) {
+  const model = searchFormat.value === 'safetensors'
+    ? findDownloadedSafetensorsBundle(modelId)
+    : file.modelId
+      ? modelStore.allQuantizations.find(m => m.id === file.modelId)
+      : file.kind === 'quant'
+        ? findDownloadedQuantization(modelId, file, file.files || [])
+        : findDownloadedModel(modelId, file.filename)
   if (model) {
-    router.push(`/models/${encodeURIComponent(model.id)}/config`)
+    router.push(`/models/${encodeURIComponent(model.id || model.model_id)}/config`)
   }
 }
 
 // ── Helpers ────────────────────────────────────────────────
+function getDownloadKey(modelId, file) {
+  return `${modelId}:${file.quantizationKey || file.filename}`
+}
+
+function isFileDownloading(modelId, file) {
+  return downloadingFiles.value.has(getDownloadKey(modelId, file))
+}
+
+function getProjectorSelectionKey(modelId, file) {
+  return `${modelId}:${file.quantizationKey || file.filename}:projector`
+}
+
+function parseProjectorPrecision(filename) {
+  // Returns the first of 'BF16' / 'F16' / 'F32' found in the name, else null.
+  // 'BF16' is probed first because any name containing it also contains 'F16'.
+  const normalized = (filename || '').toUpperCase()
+  const tag = ['BF16', 'F16', 'F32'].find(candidate => normalized.includes(candidate))
+  return tag || null
+}
+
+function getProjectorOptions(mmprojFiles = []) {
+  const byPrecision = new Map()
+  mmprojFiles.forEach((file) => {
+    const precision = parseProjectorPrecision(file.filename)
+    if (!precision || byPrecision.has(precision)) return
+    byPrecision.set(precision, {
+      label: precision,
+      value: file.filename,
+      size: file.size || 0,
+    })
+  })
+
+  return [
+    { label: 'None', value: '', size: 0 },
+    ...Array.from(byPrecision.values()).sort((a, b) => a.label.localeCompare(b.label)),
+  ]
+}
+
+function ensureProjectorSelection(modelId, file, value = '') {
+  const key = getProjectorSelectionKey(modelId, file)
+  // Seed only once so an existing selection for this row is never overwritten.
+  if (Object.prototype.hasOwnProperty.call(projectorSelections.value, key)) return
+  // A downloaded row mirrors its stored projector (even when that is none);
+  // only a row that is not downloaded yet falls back to the F16 default.
+  const initial = file.downloaded ? value : (value || getDefaultProjectorValue(file))
+  projectorSelections.value = { ...projectorSelections.value, [key]: initial }
+}
+
+function setSelectedProjector(modelId, file, value) {
+  projectorSelections.value = {
+    ...projectorSelections.value,
+    [getProjectorSelectionKey(modelId, file)]: value || '',
+  }
+}
+
+function getSelectedProjector(modelId, file) {
+  const key = getProjectorSelectionKey(modelId, file)
+  if (Object.prototype.hasOwnProperty.call(projectorSelections.value, key)) {
+    return projectorSelections.value[key]
+  }
+  return file.downloaded?.mmproj_filename || ''
+}
+
+function getSelectedProjectorOption(file, value) {
+  return (file.projectorOptions || []).find(option => option.value === (value || '')) || null
+}
+
+function getDefaultProjectorValue(file) {
+  const f16 = (file.projectorOptions || []).find(option => option.label === 'F16')
+  return f16?.value || ''
+}
+
+function hasProjectorSelectionChanged(modelId, file) {
+  return (getSelectedProjector(modelId, file) || '') !== (file.downloaded?.mmproj_filename || '')
+}
+
 function isDownloaded(hfId, filename) {
   return modelStore.allQuantizations.find(
     m => m.huggingface_id === hfId &&
@@ -361,6 +586,20 @@ function isDownloaded(hfId, filename) {
   )
 }
 
+function findDownloadedQuantization(hfId, entry, files = []) {
+  return modelStore.allQuantizations.find(m =>
+    m.huggingface_id === hfId &&
+    (
+      (entry.quantization && m.quantization === entry.quantization) ||
+      files.some(file => m.filename === file.filename)
+    )
+  )
+}
+
+function findDownloadedSafetensorsBundle(hfId) {
+  return modelStore.safetensorsModels.find(model => model.huggingface_id === hfId)
+}
+
 function findDownloadedModel(hfId, filename) {
   return modelStore.allQuantizations.find(
     m => m.huggingface_id === hfId &&
@@ -368,10 +607,12 @@ function findDownloadedModel(hfId, filename) {
   )
 }
 
-function extractQuantization(filename) {
-  if (!filename) return null
-  const match = filename.match(/[_-](Q\d[_A-Z0-9]*(?:_M|_S|_XS|_XL|_XXS)?|IQ\d_[A-Z]+|BF16|F16|F32)/i)
-  return match?.[1]?.toUpperCase() ?? null
+function formatResultItemLabel(entry, result) {
+  if (!entry) return ''
+  if (entry.kind === 'quant') {
+    return entry.quantizationKey || entry.quantization || 'Unknown'
+  }
+  return result.modelId || result.id
 }
 
 const INTERESTING_TAGS = new Set([
@@ -399,9 +640,99 @@ function formatNumber(n) {
   return String(n)
 }
 
+function getResultArtifactCount(result) {
+  if ((result.model_format || searchFormat.value) === 'gguf') {
+    const quantCount = Object.keys(result.quantizations || {}).length
+    return quantCount ? `${quantCount} quant${quantCount === 1 ? '' : 's'}` : ''
+  }
+  const fileCount = (result.safetensors_files || []).length
+  return fileCount ? `${fileCount} file${fileCount === 1 ? '' : 's'}` : ''
+}
+
+function getResultSizeSummary(result) {
+  if ((result.model_format || searchFormat.value) === 'gguf') {
+    const sizes = Object.values(result.quantizations || {})
+      .map(entry => entry?.total_size || 0)
+      .filter(size => size > 0)
+    if (!sizes.length) return ''
+    return `from ${formatBytes(Math.min(...sizes))}`
+  }
+
+  const totalSize = (result.safetensors_files || [])
+    .reduce((sum, file) => sum + (file.size || 0), 0)
+  return totalSize > 0 ? formatBytes(totalSize) : ''
+}
+
 // ── Lifecycle ──────────────────────────────────────────────
+function markDownloadedFromEvent(payload) {
+  const hfId = payload?.huggingface_id
+  const quantization = payload?.quantization
+  if (!hfId || !quantization) return
+
+  const cachedRows = filesCache.value[hfId]
+  if (!Array.isArray(cachedRows) || cachedRows.length === 0) return
+
+  const nextRows = cachedRows.map((row) => {
+    if (row.kind !== 'quant') return row
+
+    const matchesQuantization = row.quantizationKey === quantization || row.quantization === quantization
+    const matchesFilename = Array.isArray(payload?.filenames)
+      && payload.filenames.some(filename => (row.files || []).some(file => file.filename === filename))
+
+    if (!matchesQuantization && !matchesFilename) return row
+
+    const downloaded = {
+      ...(row.downloaded || {}),
+      id: payload?.model_id || row.modelId || row.downloaded?.id,
+      mmproj_filename: payload?.mmproj_filename || row.downloaded?.mmproj_filename || '',
+    }
+
+    const updatedRow = {
+      ...row,
+      downloaded,
+      modelId: downloaded.id,
+    }
+    // Keep the projector selector in sync with backend state for this quant.
+    const key = getProjectorSelectionKey(hfId, updatedRow)
+    projectorSelections.value = {
+      ...projectorSelections.value,
+      [key]: downloaded.mmproj_filename || '',
+    }
+    return updatedRow
+  })
+
+  filesCache.value = {
+    ...filesCache.value,
+    [hfId]: nextRows,
+  }
+}
+
+async function refreshModelSearchState() {
+  // Refresh both store lists concurrently; neither call uses the other's result here.
+  await Promise.all([modelStore.fetchModels(), modelStore.fetchSafetensorsModels()])
+  const expandedIds = Array.from(expanded.value)
+  filesCache.value = {} // drop cached rows; expanded repos are reloaded just below
+  await Promise.all(expandedIds.map(id => loadFiles(id)))
+}
+
+let unsubscribeDownloadComplete = null
+
 onMounted(async () => {
   if (!modelStore.models.length) await modelStore.fetchModels()
+  if (!modelStore.safetensorsModels.length) await modelStore.fetchSafetensorsModels()
+  unsubscribeDownloadComplete = progressStore.subscribeToDownloadComplete(async (payload) => {
+    const hfId = payload?.huggingface_id
+    if (!hfId) return
+    if (!searchResults.value.some(result => (result.modelId || result.id) === hfId)) return
+    if (payload?.model_format === 'gguf-bundle') {
+      markDownloadedFromEvent(payload)
+    }
+    await refreshModelSearchState()
+  })
+})
+
+onUnmounted(() => {
+  if (typeof unsubscribeDownloadComplete === 'function') unsubscribeDownloadComplete()
 })
 </script>
 
@@ -612,9 +943,12 @@ onMounted(async () => {
 
 .files-table tr:last-child td { border-bottom: none; }
 
-.file-name { display: flex; align-items: center; gap: 0.4rem; }
-.file-name code { font-size: 0.8rem; }
+.file-subtext { color: var(--text-secondary, #9ca3af); font-size: 0.75rem; }
 .file-size { color: var(--text-secondary, #9ca3af); white-space: nowrap; }
+.file-count { color: var(--text-secondary, #9ca3af); white-space: nowrap; }
+.projector-cell { min-width: 9rem; }
+.projector-select { min-width: 8rem; }
+.file-actions { display: flex; align-items: center; gap: 0.35rem; justify-content: flex-end; flex-wrap: wrap; }
 .not-downloaded { color: var(--text-secondary, #9ca3af); }
 
 .safetensors-download {
diff --git a/package-lock.json b/package-lock.json
index d1a1f49..6a572b2 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -14,13 +14,13 @@
         "primeicons": "^6.0.0",
         "primevue": "^3.45.0",
         "vue": "^3.4.0",
-        "vue-router": "^4.2.0",
-        "vue3-toastify": "^0.1.0"
+        "vue-router": "^4.2.0"
       },
       "devDependencies": {
         "@types/node": "^20.9.0",
         "@vitejs/plugin-vue": "^4.5.0",
         "concurrently": "^9.0.0",
+        "cross-env": "^10.1.0",
         "eslint": "^9.39.2",
         "eslint-plugin-vue": "^10.6.2",
         "prettier": "^3.7.4",
@@ -273,6 +273,13 @@
         "url": "https://github.com/sponsors/JounQin"
       }
     },
+    "node_modules/@epic-web/invariant": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/@epic-web/invariant/-/invariant-1.0.0.tgz",
+      "integrity": "sha512-lrTPqgvfFQtR/eY/qkIzp98OGdNJu0m5ji3q/nJI8v3SXkRKEnWiOxMmbvcSoAIzv/cGiuvRy57k4suKQSAdwA==",
+      "dev": true,
+      "license": "MIT"
+    },
     "node_modules/@esbuild/aix-ppc64": {
       "version": "0.21.5",
       "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.21.5.tgz",
@@ -1746,6 +1753,24 @@
         }
       }
     },
+    "node_modules/cross-env": {
+      "version": "10.1.0",
+      "resolved": "https://registry.npmjs.org/cross-env/-/cross-env-10.1.0.tgz",
+      "integrity": "sha512-GsYosgnACZTADcmEyJctkJIoqAhHjttw7RsFrVoJNXbsWWqaq6Ym+7kZjq6mS45O0jij6vtiReppKQEtqWy6Dw==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@epic-web/invariant": "^1.0.0",
+        "cross-spawn": "^7.0.6"
+      },
+      "bin": {
+        "cross-env": "dist/bin/cross-env.js",
+        "cross-env-shell": "dist/bin/cross-env-shell.js"
+      },
+      "engines": {
+        "node": ">=20"
+      }
+    },
     "node_modules/cross-spawn": {
       "version": "7.0.6",
       "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.6.tgz",
@@ -4101,28 +4126,6 @@
         "vue": "^3.5.0"
       }
     },
-    "node_modules/vue3-toastify": {
-      "version": "0.1.14",
-      "resolved": "https://registry.npmjs.org/vue3-toastify/-/vue3-toastify-0.1.14.tgz",
-      "integrity": "sha512-2wyzMhWq8IjTclL25tqKWknDFdFI1vPueMGZpHNlPWf6TBfxBycBANS+2n4W1xD7tHhX4G6HhCe31sle6OpwYQ==",
-      "license": "MIT",
-      "workspaces": [
-        "docs",
-        "playground"
-      ],
-      "engines": {
-        "node": ">=16",
-        "npm": ">=7"
-      },
-      "peerDependencies": {
-        "vue": ">=3.2.0"
-      },
-      "peerDependenciesMeta": {
-        "vue": {
-          "optional": true
-        }
-      }
-    },
     "node_modules/which": {
       "version": "2.0.2",
       "resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz",
diff --git a/package.json b/package.json
index 4d092de..27ee91a 100644
--- a/package.json
+++ b/package.json
@@ -5,9 +5,9 @@
   "scripts": {
     "dev": "cd frontend && vite",
     "dev:frontend": "cd frontend && vite",
-    "dev:backend": "WATCHFILES_FORCE_POLLING=true python -m uvicorn main:app --host 0.0.0.0 --port 8081 --app-dir backend --reload --reload-dir backend",
+    "dev:backend": "cross-env WATCHFILES_FORCE_POLLING=true python -m uvicorn backend.main:app --host 0.0.0.0 --port 8081 --reload --reload-dir backend",
     "dev:all": "concurrently -n backend,frontend -c blue,green \"npm run dev:backend\" \"npm run dev:frontend\"",
-    "kill-ports": "powershell.exe -Command \"Get-NetTCPConnection -LocalPort 5173,8080 -ErrorAction SilentlyContinue | ForEach-Object { Stop-Process -Id $_.OwningProcess -Force -ErrorAction SilentlyContinue }\"",
+    "kill-ports": "powershell.exe -Command \"Get-NetTCPConnection -LocalPort 5173,8081 -ErrorAction SilentlyContinue | ForEach-Object { Stop-Process -Id $_.OwningProcess -Force -ErrorAction SilentlyContinue }\"",
     "build": "cd frontend && vite build",
     "preview": "cd frontend && vite preview"
   },
@@ -21,9 +21,10 @@
     "vue-router": "^4.2.0"
   },
   "devDependencies": {
-    "concurrently": "^9.0.0",
     "@types/node": "^20.9.0",
     "@vitejs/plugin-vue": "^4.5.0",
+    "concurrently": "^9.0.0",
+    "cross-env": "^10.1.0",
     "eslint": "^9.39.2",
     "eslint-plugin-vue": "^10.6.2",
     "prettier": "^3.7.4",