From 7edaf6fa34fdffecff3d18c59fd5f7dc63a633b5 Mon Sep 17 00:00:00 2001 From: lapy Date: Mon, 9 Mar 2026 17:46:59 +0000 Subject: [PATCH] Finish refactoring --- Dockerfile | 8 +- README.md | 778 +++++----- backend/data_store.py | 72 + backend/gguf_introspection_config.json | 23 + backend/gguf_reader.py | 49 +- backend/huggingface.py | 65 +- backend/llama_manager.py | 18 +- backend/llama_swap_client.py | 20 + backend/llama_swap_config.py | 194 ++- backend/llama_swap_manager.py | 4 +- backend/lmdeploy_installer.py | 416 ----- backend/lmdeploy_manager.py | 1274 ++++++---------- backend/main.py | 39 +- backend/model_introspection.py | 555 +++++++ backend/param_registry.py | 2 + backend/routes/llama_version_manager.py | 159 -- backend/routes/llama_versions.py | 427 ++++-- backend/routes/lmdeploy.py | 82 - backend/routes/lmdeploy_versions.py | 65 + backend/routes/models.py | 1344 +++++------------ backend/routes/status.py | 25 +- backend/tests/test_lmdeploy_installer.py | 10 +- backend/tests/test_model_introspection.py | 52 + docker-compose.cuda.yml | 4 +- frontend/src/App.vue | 19 +- frontend/src/components/ModelRow.vue | 90 ++ frontend/src/components/ThemeToggle.vue | 29 +- .../src/components/common/ProgressTracker.vue | 82 +- frontend/src/components/layout/AppHeader.vue | 84 +- .../src/components/system/VersionTable.vue | 5 +- frontend/src/stores/engines.js | 31 +- frontend/src/stores/models.js | 30 +- frontend/src/stores/progress.js | 220 ++- frontend/src/styles/_components.css | 25 +- frontend/src/views/EnginesView.vue | 853 +++++++---- frontend/src/views/ModelConfig.vue | 161 +- frontend/src/views/ModelLibrary.vue | 252 ++-- frontend/src/views/ModelSearch.vue | 496 +++++- package-lock.json | 51 +- package.json | 7 +- 40 files changed, 4331 insertions(+), 3789 deletions(-) create mode 100644 backend/gguf_introspection_config.json delete mode 100644 backend/lmdeploy_installer.py create mode 100644 backend/model_introspection.py delete mode 100644 
backend/routes/llama_version_manager.py delete mode 100644 backend/routes/lmdeploy.py create mode 100644 backend/routes/lmdeploy_versions.py create mode 100644 backend/tests/test_model_introspection.py create mode 100644 frontend/src/components/ModelRow.vue diff --git a/Dockerfile b/Dockerfile index 0b8bd9b..dcc4173 100644 --- a/Dockerfile +++ b/Dockerfile @@ -81,8 +81,8 @@ ENV DEBIAN_FRONTEND=noninteractive \ CUDA_VISIBLE_DEVICES=all \ NVIDIA_VISIBLE_DEVICES=all \ NVIDIA_DRIVER_CAPABILITIES=compute,utility \ - HF_HOME=/app/data/temp/.cache/huggingface \ - HUGGINGFACE_HUB_CACHE=/app/data/temp/.cache/huggingface/hub \ + HF_HOME=/app/data/hf-cache \ + HUGGINGFACE_HUB_CACHE=/app/data/hf-cache/hub \ VENV_PATH=/opt/venv \ PYTHONPATH=/app \ PATH="/app/data/cuda/current/bin:${PATH}" \ @@ -133,7 +133,7 @@ RUN curl -fsSL "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERS && cmake --version # Install llama-swap binary -ARG LLAMA_SWAP_VERSION=179 +ARG LLAMA_SWAP_VERSION=197 RUN curl -fsSL "https://github.com/mostlygeek/llama-swap/releases/download/v${LLAMA_SWAP_VERSION}/llama-swap_${LLAMA_SWAP_VERSION}_linux_amd64.tar.gz" -o /tmp/llama-swap.tar.gz && \ tar -xzf /tmp/llama-swap.tar.gz -C /tmp && \ mv /tmp/llama-swap /usr/local/bin/llama-swap && \ @@ -168,7 +168,7 @@ RUN ln -sf /usr/bin/python3 /usr/bin/python # Create non-root user and data directory structure RUN useradd -m -s /bin/bash appuser && \ - mkdir -p /app/data/models /app/data/config /app/data/configs /app/data/logs /app/data/llama-cpp /app/data/temp/.cache/huggingface/hub && \ + mkdir -p /app/data/models /app/data/config /app/data/configs /app/data/logs /app/data/llama-cpp /app/data/hf-cache/hub && \ chown -R appuser:appuser /app && \ # Ensure entrypoint script is accessible to appuser chmod 755 /usr/local/bin/docker-entrypoint.sh diff --git a/README.md b/README.md index 89567a4..cb1ecef 100644 --- a/README.md +++ b/README.md @@ -1,525 +1,499 @@ -# llama.cpp Studio +## llama.cpp Studio -A 
professional AI model management platform for llama.cpp models and versions, designed for modern AI workflows with comprehensive GPU support (NVIDIA CUDA, AMD Vulkan/ROCm, Metal, OpenBLAS). +llama.cpp Studio is a web-based control plane for running and managing local LLMs on top of `llama.cpp`, `ik_llama.cpp`, and `LMDeploy` – all served through a single OpenAI-compatible endpoint powered by `llama-swap`. + +It is designed for **power users running models on a single machine or small server** (Docker or bare metal) with strong support for: + +- **CPU-only** inference (OpenBLAS) +- **NVIDIA CUDA GPUs** (via the NVIDIA Container Toolkit) + +There is **no built-in support for Vulkan/ROCm/Metal backends** and **no Smart Auto feature** – configuration is explicit and predictable. + +### Key capabilities + +- **HuggingFace search (GGUF + safetensors)**: Search the Hub, inspect metadata, and plan downloads by quantization or safetensors bundle. +- **Model library with multi-quantization support**: Manage multiple quantizations per base model in a grouped view with start/stop/delete actions. +- **Per-model runtime configuration**: Configure engine (llama.cpp / ik_llama / LMDeploy), context length, GPU layers, batch sizes, and advanced flags. +- **Unified multi-model serving**: Serve many GGUF quantizations at once via `llama-swap` on port `2000`. +- **System & progress monitoring**: Live system stats, GPU information, and unified progress for downloads, builds, CUDA/LMDeploy installs via SSE. + +--- + +## Core concepts & architecture + +llama.cpp Studio is a single application composed of a Vue 3 SPA frontend and a FastAPI backend. The backend persists configuration to YAML files under `/app/data` and orchestrates runtimes through `llama-swap`. 
+ +### High-level architecture + +```mermaid +flowchart LR + userClient[User_Client] --> browserUI["Web_UI_(Vue_3_SPA)"] + browserUI --> fastapiBackend["FastAPI_Backend"] + fastapiBackend --> dataStore["YAML_DataStore_(models_engines_settings)"] + fastapiBackend --> progressSSE["SSE_/api/events"] + fastapiBackend --> llamaSwap["llama-swap_Proxy_:2000"] + llamaSwap --> llamaCpp["llama.cpp_ik_llama_runtimes"] + llamaSwap --> lmdeploy["LMDeploy_TurboMind_(safetensors)"] +``` + +### Frontend (Vue 3 SPA) + +- `App.vue` provides the global shell: + - Header with llama-swap status and theme toggle + - Navigation between the main sections + - Central `<router-view>` for page content + - Global ConfirmDialog/Toast and SSE connection +- Main views: + - **Model Library** (`/models`) – installed models grouped by base model and quantization. + - **Model Search** (`/search`) – HuggingFace search & download (GGUF and safetensors). + - **Model Config** (`/models/:id/config`) – per-quantization configuration. + - **Engines & System** (`/engines`) – llama.cpp / ik_llama builds, CUDA and LMDeploy status, system & GPU info. +- State management: + - `useModelStore` – models, search, downloads, metadata, start/stop/config operations. + - `useEnginesStore` – engine versions, CUDA installer, system and GPU info. + - `useProgressStore` – EventSource connection to `/api/events`, normalized tasks, logs, and notifications. + +### Backend (FastAPI) + +- `backend/main.py`: + - Ensures the `/app/data` (or local `./data`) directory structure exists and is writable. + - Initializes the YAML-backed `DataStore` for models, engine versions, and settings. + - Loads `HUGGINGFACE_API_KEY` from the environment if present. + - Starts and manages the `llama-swap` proxy on port `2000` when a valid llama.cpp/ik_llama binary is active. + - Registers all known models with `llama-swap` at startup based on logical metadata (not hard-coded paths).
+ - Serves the built Vue app from `frontend/dist` and exposes a catch-all SPA route. +- Key route groups: + - `/api/models` – model library, HuggingFace search, GGUF/safetensors downloads, configuration, start/stop. + - `/api/llama-versions` – llama.cpp/ik_llama build settings, builds, version listing, activation, deletion, CUDA installer. + - `/api/lmdeploy` – LMDeploy install/remove. + - `/api/status` & `/api/gpu-info` – system and GPU metrics plus `llama-swap` proxy health. + - `/api/events` – Server-Sent Events stream for unified progress and notifications. + +### Runtimes and `llama-swap` + +- `llama.cpp` and `ik_llama.cpp` versions are: + - Built from source under `/app/data/llama-cpp/...` + - Recorded in the DataStore with metadata and active version selection + - Exposed to the frontend via `/api/llama-versions` +- `llama-swap`: + - Is downloaded and installed into the runtime image at build time. + - Runs a single proxy process on port `2000` and multiplexes multiple model backends. + - Reads its configuration from files generated by the backend based on stored models and the active engine. +- LMDeploy: + - Is installed into `/app/data/lmdeploy/venv` from PyPI or source on demand. + - Serves safetensors checkpoints using TurboMind behind `llama-swap`. 
+ +--- ## Features -### Model Management -- **Search & Download**: Search HuggingFace for GGUF models with comprehensive metadata and size information for each quantization -- **Multi-Quantization Support**: Download and manage multiple quantizations of the same model -- **Model Library**: Manage downloaded models with start/stop/delete functionality -- **Smart Configuration**: Auto-generate optimal llama.cpp parameters based on GPU capabilities -- **VRAM Estimation**: Real-time VRAM usage estimation with warnings for memory constraints -- **Metadata Extraction**: Rich model information including parameters, architecture, license, tags, and more -- **Safetensors Runner**: Configure and run safetensors checkpoints via LMDeploy TurboMind with an OpenAI-compatible endpoint on port 2001 - -### llama.cpp Version Management -- **Release Installation**: Download and install pre-built binaries from GitHub releases -- **Source Building**: Build from source with optional patches from GitHub PRs -- **Custom Build Configuration**: Customize GPU backends (CUDA, Vulkan, Metal, OpenBLAS), build type, and compiler flags -- **Update Checking**: Check for updates to both releases and source code -- **Version Management**: Install, update, and delete multiple llama.cpp versions -- **Build Validation**: Automatic validation of built binaries to ensure they work correctly - -### GPU Support -- **Multi-GPU Support**: Automatic detection and configuration for NVIDIA, AMD, and other GPUs -- **NVIDIA CUDA**: Full support for CUDA compute capabilities, flash attention, and multi-GPU -- **AMD GPU Support**: Vulkan and ROCm support for AMD GPUs -- **Apple Metal**: Support for Apple Silicon GPUs -- **OpenBLAS**: CPU acceleration with optimized BLAS routines -- **VRAM Monitoring**: Real-time GPU memory usage and temperature monitoring -- **NVLink Detection**: Automatic detection of NVLink connections and topology analysis - -### Multi-Model Serving -- **Concurrent Execution**: Run multiple 
models simultaneously via llama-swap proxy -- **OpenAI-Compatible API**: Standard API format for easy integration -- **Port 2000**: All models served through a single unified endpoint -- **Automatic Lifecycle Management**: Seamless starting/stopping of models - -### Web Interface -- **Modern UI**: Vue.js 3 with PrimeVue components -- **Real-time Updates**: SSE-based progress tracking and system monitoring -- **Responsive Design**: Works on desktop and mobile devices -- **System Status**: CPU, memory, disk, and GPU monitoring -- **LMDeploy Installer**: Dedicated UI to install/remove LMDeploy at runtime with live logs -- **Dark Mode**: Built-in theme support - -## Quick Start - -### Using Docker Compose - -1. Clone the repository: +### Model management + +- **Unified model library** + - Models are grouped by HuggingFace repo (e.g. `meta-llama/Meta-Llama-3-8B-Instruct`). + - Each group contains one or more quantizations (GGUF) and optional safetensors bundles. + - Per-quantization rows show size, download timestamp, runtime type, and running state. + +- **HuggingFace search (GGUF + safetensors)** + - Search by model name or keyword with a choice of: + - `gguf` – quantized GGUF files and bundles. + - `safetensors` – safetensors checkpoints. + - See metadata (file sizes, quantization names, tags) before you download. + +- **Downloads & bundles** + - GGUF: + - Download individual quantizations or full bundles. + - Optionally attach `.mmproj` projector files for multimodal models. + - Safetensors: + - Download full safetensors bundles. + - All downloads are tracked as long-running tasks via SSE and shown in the global progress panel. + +### Engine & version management + +- **llama.cpp and ik_llama.cpp** + - Multiple versions per engine are supported. + - Builds are always **from source**, configured using stored build settings (CUDA flags, flash attention, CPU variants, etc.). 
+ - Versions can be activated/deactivated; activation updates `llama-swap` configuration automatically. + - Old versions can be removed to reclaim disk space. + +- **CUDA toolkit management (NVIDIA only)** + - Optional in-container CUDA installer can install or remove the CUDA Toolkit (plus optional cuDNN/TensorRT) under `/app/data/cuda`. + - Progress and logs for installs/uninstalls are surfaced in the Engines/System view and via SSE events. + - Only **NVIDIA CUDA + CPU** are documented and supported; other GPU backends are not part of this project’s supported surface. + +- **LMDeploy integration** + - Install LMDeploy from **PyPI** or from **source** into a dedicated virtual environment under `/app/data/lmdeploy/venv`. + - The backend exposes endpoints to: + - Check for the latest LMDeploy version. + - Install/update/remove LMDeploy. + - Once installed, safetensors models can be launched via LMDeploy TurboMind and are exposed through the same `llama-swap` endpoint. + +### Multi-model serving + +- **Single OpenAI-compatible endpoint** + - All models are served via `llama-swap` on `http://<host>:2000`. + - The proxy implements standard OpenAI-style `/v1/chat/completions` and `/v1/models`. + +- **Concurrent GGUF quantizations** + - Multiple GGUF quantizations can be active at once behind `llama-swap`. + - The System Status view shows running models and basic health information. + +- **Safetensors via LMDeploy** + - One LMDeploy runtime is supported at a time for safetensors models. + - It is exposed alongside GGUF models through the same `llama-swap` API. + +### Monitoring & progress + +- **System & GPU status** + - `/api/status` reports CPU, memory, disk utilization, running model instances, and `llama-swap` proxy health. + - `/api/gpu-info` reports detected GPUs and their capabilities (focused on NVIDIA/CUDA). + +- **Unified progress tracking** + - `/api/events` streams: + - Download progress and completion events. + - llama.cpp/ik_llama source build progress.
+ - CUDA toolkit installation/uninstallation status and logs. + - LMDeploy installation status and logs. + - Notifications related to long-running tasks. + +--- + +## Quick start (Docker) + +The recommended way to run llama.cpp Studio is via Docker Compose. All examples assume you’ve cloned the repository. + +### 1. Clone the repo + ```bash git clone cd llama-cpp-studio ``` -2. Start the application: -```bash -# CPU-only mode -docker-compose -f docker-compose.cpu.yml up -d - -# GPU mode (NVIDIA CUDA) -docker-compose -f docker-compose.cuda.yml up -d +### 2. CPU-focused development (hot reload backend) -# Vulkan/AMD GPU mode -docker-compose -f docker-compose.vulkan.yml up -d +Use the CPU compose file (`docker-compose.cpu.yml`) during development. It mounts the backend source and enables reload: -# ROCm mode -docker-compose -f docker-compose.rocm.yml up -d +```bash +docker-compose -f docker-compose.cpu.yml up --build ``` -3. Access the web interface at `http://localhost:8080` - -### Published Container Images +This will: -Prebuilt images are pushed to GitHub Container Registry whenever the `publish-docker` workflow runs. +- Expose the web UI and API at `http://localhost:8080`. +- Expose the `llama-swap` proxy at `http://localhost:2000`. +- Mount `./data` to `/app/data` so models, configs, and logs persist between runs. -- `ghcr.io//llama-cpp-studio:latest` – standard image based on `ubuntu:22.04` with GPU tooling installed at runtime +### 3. GPU mode (NVIDIA CUDA) -Pull the image from GHCR: +For NVIDIA GPUs with the NVIDIA Container Toolkit installed on the host: ```bash -docker pull ghcr.io//llama-cpp-studio:latest +docker-compose -f docker-compose.cuda.yml up --build -d ``` -### Manual Docker Build +This will: + +- Build the image from the current source tree. +- Map: + - `8080:8080` – web UI + FastAPI backend + - `2000:2000` – `llama-swap` OpenAI-compatible endpoint +- Mount `./data` to `/app/data`. 
+- Reserve all GPUs for the container using the Compose `deploy.resources.reservations.devices` section. + +### 4. Manual Docker build and run + +You can also build and run the container without Compose: -1. Build the image: ```bash +# Build the image docker build -t llama-cpp-studio . -``` -2. Run the container: -```bash -# With GPU support +# GPU-capable run (NVIDIA) docker run -d \ --name llama-cpp-studio \ --gpus all \ -p 8080:8080 \ + -p 2000:2000 \ -v ./data:/app/data \ llama-cpp-studio -# CPU-only +# CPU-only run docker run -d \ - --name llama-cpp-studio \ + --name llama-cpp-studio-cpu \ -p 8080:8080 \ + -p 2000:2000 \ + -e CUDA_VISIBLE_DEVICES="" \ -v ./data:/app/data \ llama-cpp-studio ``` +### 5. Published images + +If you prefer pulling from a registry, use the GitHub Container Registry image published by this project (replace `<owner>` with the correct namespace): + +```bash +docker pull ghcr.io/<owner>/llama-cpp-studio:latest +``` + +Run it with the same ports and volume mapping as above. + +--- + ## Configuration -### Environment Variables -- `CUDA_VISIBLE_DEVICES`: GPU device selection (default: all, set to "" for CPU-only) -- `PORT`: Web server port (default: 8080) -- `HUGGINGFACE_API_KEY`: HuggingFace API token for model search and download (optional) -- `LMDEPLOY_BIN`: Override path to the `lmdeploy` CLI (default: `lmdeploy` on PATH) -- `LMDEPLOY_PORT`: Override the LMDeploy OpenAI port (default: 2001) +### Environment variables + +Common environment variables for the backend: + +- **`HUGGINGFACE_API_KEY`** – HuggingFace token used for model search and download. + - When set via environment variable, the UI treats it as read-only and shows only a masked preview. +- **`CUDA_VISIBLE_DEVICES`** – controls which GPUs are visible to the container: + - Default in Compose is `all`. + - Set to `""` (empty string) for CPU-only runs.
+- **`HF_HOME`** and **`HUGGINGFACE_HUB_CACHE`** – location for the HuggingFace cache: + - Default to `/app/data/hf-cache` and `/app/data/hf-cache/hub` so cache data is persisted in the volume. +- **`BACKEND_CORS_ORIGINS`**, **`BACKEND_CORS_ALLOW_CREDENTIALS`** – advanced CORS options for custom setups. +- **`RELOAD`** – when running the backend directly, controls uvicorn reload behavior (`true` in local dev, `false` in Docker). + +These can be set directly in `docker-compose.yml` or via an `.env` file referenced by Compose. + +### Data & volumes + +The image expects a writable data directory at `/app/data`, typically mapped from `./data` on the host: + +- **Models** – GGUF files and safetensors bundles. +- **Config** – YAML files for models, engines, and other settings. +- **Logs** – backend logs, build logs, installer logs. +- **llama.cpp builds** – source trees and build outputs. +- **CUDA toolkit** – if installed, under `/app/data/cuda`. +- **LMDeploy virtualenv** – under `/app/data/lmdeploy/venv`. + +Recommended Compose mapping: + +```yaml +volumes: + - ./data:/app/data +``` -### Volume Mounts -- `/app/data`: Persistent storage for models, configurations, and database +### HuggingFace token -### HuggingFace API Key +You can provide your HuggingFace token in multiple ways: -To enable model search and download functionality, you need to set your HuggingFace API key. 
You can do this in several ways: +- **Directly in Compose** (keep this private): -#### Option 1: Docker Compose Environment Variable -Uncomment and set the token in your `docker-compose.yml`: ```yaml environment: - - CUDA_VISIBLE_DEVICES=all - HUGGINGFACE_API_KEY=your_huggingface_token_here ``` -#### Option 2: .env File -Create a `.env` file in your project root: +- **`.env` file** (not committed to git): + ```bash HUGGINGFACE_API_KEY=your_huggingface_token_here ``` -Then uncomment the `env_file` section in `docker-compose.yml`: +Then in Compose: + ```yaml env_file: - .env ``` -#### Option 3: System Environment Variable -Set the environment variable before running Docker Compose: -```bash -export HUGGINGFACE_API_KEY=your_huggingface_token_here -docker-compose up -d -``` - -#### Getting Your HuggingFace Token -1. Go to [HuggingFace Settings](https://huggingface.co/settings/tokens) -2. Create a new token with "Read" permissions -3. Copy the token and use it in one of the methods above - -**Note**: When the API key is set via environment variable, it cannot be modified through the web UI for security reasons. +Once configured, the Model Search UI will use this token transparently. -### GPU Requirements -- **NVIDIA**: NVIDIA GPU with CUDA support, NVIDIA Container Toolkit installed -- **AMD**: AMD GPU with Vulkan/ROCm drivers -- **Apple**: Apple Silicon with Metal support -- **CPU**: OpenBLAS for CPU acceleration (included in Docker image) -- Minimum 8GB VRAM recommended for most models +--- -### LMDeploy Requirement +## Using the web UI -Safetensors execution relies on [LMDeploy](https://github.com/InternLM/lmdeploy), but the base image intentionally omits it to keep Docker builds lightweight (critical for GitHub Actions). Use the **LMDeploy** page in the UI to install or remove LMDeploy inside the running container—installs happen via `pip` at runtime and logs are streamed live. 
The installer creates a dedicated virtual environment under `/app/data/lmdeploy/venv`, so the package lives on the writable volume and can be removed by deleting that folder. If you are running outside the container, you can still `pip install lmdeploy` manually or point `LMDEPLOY_BIN` to a custom binary. The runtime uses `lmdeploy serve turbomind` to expose an OpenAI-compatible server on port `2001`. +### Model search & download -## Usage +- Open the **Model Search** view. +- Enter a HuggingFace repo name or search term, choose: + - `gguf` – to browse GGUF quantizations and bundles. + - `safetensors` – to browse safetensors bundles. +- Expand a result to: + - Inspect file sizes and quantization names. + - See optional projector (`.mmproj`) files for multimodal models. +- Click **Download** to start a download; progress will appear in the global progress panel. -### 1. Model Management +### Model library -#### Search Models -- Use the search bar to find GGUF models on HuggingFace -- Filter by tags, parameters, or model name -- View comprehensive metadata including downloads, likes, tags, and file sizes +- Open the **Model Library** view (`/models`). +- Each card groups all quantizations for a base model: + - GGUF quantizations (different sizes and quant schemes). + - Safetensors bundles (if present). +- Per-row actions: + - **Start** / **Stop** – launch or stop a model via `llama-swap`. + - **Configure** – open the per-quantization configuration screen. + - **Delete** – remove a specific quantization. +- Group-level actions let you delete entire model groups to reclaim disk space. 
-#### Download Models -- Click download on any quantization to start downloading -- Multiple quantizations of the same model are automatically grouped -- Progress tracking with real-time updates via SSE +### Per-model configuration -#### Configure Models -- Set llama.cpp parameters or use Smart Auto for optimal settings -- View VRAM estimation before starting -- Configure context size, batch sizes, temperature, and more +- From the library, click **Configure** on a quantization. +- Choose an engine: + - `llama.cpp` or `ik_llama.cpp` for GGUF. + - `LMDeploy` for safetensors. +- Adjust: + - Context length. + - GPU layers (`-ngl`-style behavior). + - Batch sizes and other llama.cpp/LMDeploy flags. +- Advanced options are rendered from a parameter registry maintained on the backend, allowing you to set engine-specific flags explicitly. -#### Run Models -- Start/stop models with one click -- Multiple models can run simultaneously -- View running instances and resource usage +### Engines and CUDA -### 2. llama.cpp Versions +- Open the **Engines & System** view (`/engines`) to: + - View and manage **llama.cpp** and **ik_llama.cpp** versions: + - Build from source using saved build settings (e.g. CUDA on/off). + - Activate a version (updates `llama-swap` configuration). + - Delete non-active versions to free disk. + - Manage **CUDA toolkit** in the container: + - Install or uninstall specific CUDA versions. + - See status and detailed logs. + - Manage **LMDeploy**: + - Install from PyPI or a git branch. + - Remove LMDeploy and its virtualenv. + - Tail installer logs for debugging. -#### Check Updates -- View available releases and source updates -- See commit history and release notes +All of these actions surface their progress and logs in the unified progress UI. 
-#### Install Release -- Download pre-built binaries from GitHub -- Automatic verification and installation +### System status & monitoring -#### Build from Source -- Compile from source with custom configuration -- Select GPU backends (CUDA, Vulkan, Metal, OpenBLAS) -- Configure build type (Release, Debug, RelWithDebInfo) -- Add custom CMake flags and compiler options -- Apply patches from GitHub PRs -- Automatic validation of built binaries +- The header shows a concise llama-swap status indicator (health and port). +- The System section displays: + - CPU, memory, and disk usage. + - Detected NVIDIA GPUs and key characteristics. + - Currently running models as reported by `llama-swap`. -#### Manage Versions -- Delete old versions to free up space -- View installation details and build configuration +--- -### 3. System Monitoring -- **Overview**: CPU, memory, disk, and GPU usage -- **GPU Details**: Individual GPU information and utilization -- **Running Instances**: Active model instances with resource usage -- **SSE**: Real-time updates for all metrics +## OpenAI-compatible API (high level) -## Multi-Model Serving +Once at least one model is running, you can call the `llama-swap` proxy directly. -llama-cpp-studio uses llama-swap to serve multiple models simultaneously on port 2000. - -### Starting Models - -Simply start any model from the Model Library. All models run on port 2000 simultaneously. - -### OpenAI-Compatible API +- **Base URL**: `http://<host>:2000` +- **Chat completions**: ```bash curl http://localhost:2000/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "llama-3-2-1b-instruct-iq2-xs", + "model": "your-model-name", "messages": [{"role": "user", "content": "Hello!"}] }' ``` -Model names are shown in System Status after starting a model.
- -### Features - -- Multiple models run concurrently -- No loading time - instant switching between models -- Standard OpenAI API format -- Automatic lifecycle management -- Single unified endpoint - -### Troubleshooting +- **Model listing**: `GET http://localhost:2000/v1/models` +- **Health**: `GET http://localhost:2000/health` -- Check available models: `http://localhost:2000/v1/models` -- Check proxy health: `http://localhost:2000/health` -- View logs: `docker logs llama-cpp-studio` +Model IDs are shown in the System Status view and in the Model Library when a model is running. -### LMDeploy TurboMind (Safetensors) +--- -- Run exactly one safetensors checkpoint at a time via LMDeploy -- Configure tensor/pipeline parallelism, context length, temperature, and other runtime flags from the Model Library -- Serves an OpenAI-compatible endpoint at `http://localhost:2001/v1/chat/completions` -- Install LMDeploy on demand from the LMDeploy page (or manually via `pip`) before starting safetensors runtimes -- Start/stop directly from the Safetensors panel; status is reported in System Status and the LMDeploy status chip +## Troubleshooting & logs -## Build Customization +### Common issues -### GPU Backends +- **GPU not detected** + - Ensure the NVIDIA Container Toolkit is installed and `nvidia-smi` works on the host. + - Use `--gpus all` (docker run) or the `deploy.resources.reservations.devices` stanza in Compose. + - Confirm `CUDA_VISIBLE_DEVICES` is not set to `""` when you intend to use the GPU. -Enable specific GPU backends during source builds: +- **Build failures (llama.cpp / ik_llama / CUDA)** + - Check that you have enough disk space (≥ 10 GB free is a good baseline). + - Verify CUDA and driver versions are compatible with the chosen build settings. + - Review build or installer logs (via the progress UI or log files in `/app/data/logs`). 
-- **CUDA**: NVIDIA GPU acceleration with cuBLAS -- **Vulkan**: AMD/Intel GPU acceleration with Vulkan compute -- **Metal**: Apple Silicon GPU acceleration -- **OpenBLAS**: CPU optimization with OpenBLAS routines +- **Memory errors / out-of-memory** + - Reduce context length and/or batch size for the model configuration. + - For GPU runs, lower GPU layers or choose a smaller quantization. -### Build Configuration +- **Model download failures** + - Verify HuggingFace connectivity and model visibility (public/private). + - Ensure `HUGGINGFACE_API_KEY` is correctly configured for private models. + - Check free space under `/app/data`. -Customize your build with: +- **llama-swap** + - Hit `http://localhost:2000/health` and `http://localhost:2000/v1/models` to check proxy state. -- **Build Type**: Release (optimal), Debug (development), RelWithDebInfo -- **Custom CMake Flags**: Additional CMake configuration -- **Compiler Flags**: CFLAGS and CXXFLAGS for optimization -- **Git Patches**: Apply patches from GitHub PRs - -### Example Build Configuration - -```json -{ - "commit_sha": "master", - "patches": [ - "https://github.com/ggerganov/llama.cpp/pull/1234.patch" - ], - "build_config": { - "build_type": "Release", - "enable_cuda": true, - "enable_vulkan": false, - "enable_metal": false, - "enable_openblas": true, - "custom_cmake_args": "-DGGML_CUDA_CUBLAS=ON", - "cflags": "-O3 -march=native", - "cxxflags": "-O3 -march=native" - } -} -``` +### Logs -## Smart Auto Configuration - -The Smart Auto feature automatically generates optimal llama.cpp parameters based on: - -- **GPU Capabilities**: VRAM, compute capability, multi-GPU support -- **NVLink Topology**: Automatic detection and optimization for NVLink clusters -- **Model Architecture**: Detected from model name (Llama, Mistral, etc.) 
-- **Available Resources**: CPU cores, memory, disk space -- **Performance Optimization**: Flash attention, tensor parallelism, batch sizing - -### NVLink Optimization Strategies - -The system automatically detects NVLink topology and applies appropriate strategies: - -- **Unified NVLink**: All GPUs connected via NVLink - uses aggressive tensor splitting and higher parallelism -- **Clustered NVLink**: Multiple NVLink clusters - optimizes for the largest cluster -- **Partial NVLink**: Some GPUs connected via NVLink - uses hybrid approach -- **PCIe Only**: No NVLink detected - uses conservative PCIe-based configuration - -### Supported Parameters -- Context size, batch sizes, GPU layers -- Temperature, top-k, top-p, repeat penalty -- CPU threads, parallel sequences -- RoPE scaling, YaRN factors -- Multi-GPU tensor splitting -- Custom arguments via YAML config - -## API Endpoints - -### Models -- `GET /api/models` - List all models -- `POST /api/models/search` - Search HuggingFace -- `POST /api/models/download` - Download model -- `GET /api/models/{id}/config` - Get model configuration -- `PUT /api/models/{id}/config` - Update configuration -- `POST /api/models/{id}/auto-config` - Generate smart configuration -- `POST /api/models/{id}/start` - Start model -- `POST /api/models/{id}/stop` - Stop model -- `DELETE /api/models/{id}` - Delete model -- `GET /api/models/safetensors/{model_id}/lmdeploy/config` - Get LMDeploy config for a safetensors download -- `PUT /api/models/safetensors/{model_id}/lmdeploy/config` - Update LMDeploy config -- `POST /api/models/safetensors/{model_id}/lmdeploy/start` - Start LMDeploy runtime -- `POST /api/models/safetensors/{model_id}/lmdeploy/stop` - Stop LMDeploy runtime -- `GET /api/models/safetensors/lmdeploy/status` - LMDeploy manager status - -### LMDeploy Installer -- `GET /api/lmdeploy/status` - Installer status (version, binary path, current operation) -- `POST /api/lmdeploy/install` - Install LMDeploy via pip at runtime -- `POST 
/api/lmdeploy/remove` - Remove LMDeploy from the runtime environment -- `GET /api/lmdeploy/logs` - Tail the LMDeploy installer log - -### llama.cpp Versions -- `GET /api/llama-versions` - List installed versions -- `GET /api/llama-versions/check-updates` - Check for updates -- `GET /api/llama-versions/build-capabilities` - Get build capabilities -- `POST /api/llama-versions/install-release` - Install release -- `POST /api/llama-versions/build-source` - Build from source -- `DELETE /api/llama-versions/{id}` - Delete version - -### System -- `GET /api/status` - System status -- `GET /api/gpu-info` - GPU information -- `GET /api/events` - Server-Sent Events for real-time updates - -## Database Migration - -If upgrading from an older version, you may need to migrate your database: +- **Container logs**: ```bash -# Run migration to support multi-quantization -python migrate_db.py +docker logs llama-cpp-studio ``` -## Troubleshooting - -### Common Issues - -1. **GPU Not Detected** - - Ensure NVIDIA Container Toolkit is installed (for NVIDIA) - - Check `nvidia-smi` output - - Verify `--gpus all` flag in docker run - - For AMD: Check Vulkan/ROCm drivers - -2. **Build Failures** - - Check CUDA version compatibility (for NVIDIA) - - Ensure sufficient disk space (at least 10GB free) - - Verify internet connectivity for downloads - - For Vulkan builds: Ensure `glslang-tools` is installed - - Check build logs for specific errors - -3. **Memory Issues** - - Use Smart Auto configuration - - Reduce context size or batch size - - Enable memory mapping - - Check available system RAM and VRAM - -4. **Model Download Failures** - - Check HuggingFace connectivity - - Verify model exists and is public - - Ensure sufficient disk space - - Set HUGGINGFACE_API_KEY if using private models - -5. 
**Validation Failed** - - Binary exists and is executable - - Binary runs `--version` successfully - - Output contains "llama" or "version:" string - -### Logs -- Application logs: `docker logs llama-cpp-studio` -- Model logs: Available in the web interface -- Build logs: Shown during source compilation -- SSE event stream: GET /api/events for real-time progress and status +- **Backend and task logs**: stored under `/app/data/logs` and surfaced via `/api/events`. +- **CUDA installer logs**: available via CUDA log endpoints and the Engines/System view. -## Development +--- -### Backend -- FastAPI with async support -- YAML-backed data store (models, engines, settings) -- SSE (GET /api/events) for real-time updates -- Background tasks for long operations -- Llama-swap integration for multi-model serving +## Development & testing -### Frontend -- Vue.js 3 with Composition API -- PrimeVue component library -- Pinia for state management -- Vite for build tooling -- Dark mode support +### Backend (FastAPI) -### Testing -- Backend tests: `pytest` (install deps first: `pip install -r requirements.txt pytest pytest-asyncio`) -- Run from repo root: `PYTHONPATH=. pytest backend/tests/ -v` -- Smoke tests in `backend/tests/test_app_smoke.py` verify the app starts and key API routes respond (`/api/status`, `/api/models/param-registry`, `/api/models/`, `/api/events`) -- LMDeploy installer and config validation tests in `backend/tests/test_lmdeploy_*.py` +- The backend code lives under `backend/`. +- To run the backend directly in development: -## Memory Estimation Model - -The studio’s capacity planning tooling is grounded in a three-component model for llama.cpp that provides a conservative upper bound on peak memory usage. - -- **Formula**: `M_total = M_weights + M_kv + M_compute` -- **Model weights (`M_weights`)**: Treat the GGUF file size as the ground truth. 
When `--no-mmap` is disabled (default), the file is memory-mapped so only referenced pages touch physical RAM, but the virtual footprint still equals the file size. -- **KV cache (`M_kv`)**: Uses the GQA-aware formula `n_ctx × N_layers × N_head_kv × (N_embd / N_head) × (p_a_k + p_a_v)`, where `p_a_*` are the bytes-per-value chosen via `--cache-type-k` / `--cache-type-v`. -- **Compute buffers (`M_compute`)**: Approximate as a fixed CUDA overhead (~550 MB) plus a scratch buffer that scales with micro-batch size (`n_ubatch × 0.5 MB` by default). - -### RAM vs VRAM Allocation +```bash +cd backend +pip install -r ../requirements.txt +uvicorn main:app --reload --port 8080 +``` -- `-ngl 0` (CPU-only): All components stay in RAM. -- `-ngl > 0` (hybrid/full GPU): Model weights split by layer between RAM and VRAM, while **both `M_kv` and `M_compute` move entirely to VRAM**—the “VRAM trap”. -- Full offload avoids PCIe contention; hybrid splits suffer a “performance cliff” because activations bounce between CPU and GPU. +### Frontend (Vue 3 + Vite) -### Optimization Strategy +- The frontend SPA lives under `frontend/`. -1. Attempt full offload first (best throughput). If weights + compute fit, deduce `n_ctx_max` from remaining VRAM budget. -2. When full offload fails, search decreasing `n_ngl` values that satisfy RAM limits while maximizing context length, accepting the hybrid performance penalty. -3. Iterate quantization choices to find the smallest model that still enables full offload on the target hardware profile. +```bash +cd frontend +npm install +npm run dev +``` -## Smart Auto Module Report +The dev server (typically on port `5173`) is configured to proxy API calls to the backend. -The Smart Auto subsystem applies the model above to recommend llama.cpp launch parameters. Priority 1 fixes are complete, eliminating prior memory underestimation bugs. +### Backend tests -- **Resolutions**: - - Corrected KV cache math to respect grouped-query attention head counts. 
- - Removed the dangerous 0.30 multiplier on cache size; estimates now use real memory. - - Ensured KV cache/compute buffers migrate to VRAM whenever GPU layers are in play. - - Modeled compute overhead as `550 MB + 0.5 MB × n_ubatch`. - - Improved GPU layer estimation using GGUF file size with a 20 % safety buffer. -- **Open improvements**: - - Reorder calculations so KV cache quantization feeds batch/context sizing directly. - - Replace remaining heuristics with joint optimization across `n_ctx`, `n_ngl`, and `n_ubatch`. +```bash +pip install -r requirements.txt pytest pytest-asyncio +PYTHONPATH=. pytest backend/tests -v +``` -### Recommended Validation +The test suite includes: -- Benchmark against known examples (e.g., 13B @ 2 048 tokens → ~1.6 GB KV cache, 7B @ 4 096 tokens → ~6 GB total). -- Stress-test large contexts, tight VRAM scenarios, MoE models, and hybrid modes. -- Expand automated regression coverage around the estimator and Smart Auto flows. +- Smoke tests to ensure the app boots and key routes (`/api/status`, `/api/models`, `/api/llama-versions`, `/api/events`) respond. +- Tests for LMDeploy management and configuration. +- Tests for CUDA installer flows and model introspection logic. -## Memory Estimation Test Results +--- -Empirical testing with `Llama-3.2-1B-Instruct.IQ1_M` demonstrates that the estimator acts as a safe upper bound. +## License -- **Setup**: `n_ctx ≈ 35 K`, batch 32, CPU-only run. -- **Estimated peak**: 4.99 GB (weights 394 MB, KV cache 4.34 GB, batch 12 MB, llama.cpp overhead 256 MB). -- **Observed deltas**: - - With mmap enabled: ~608 MB (11.9 % of estimate). Lower usage is expected because the KV cache grows as context fills and weights are paged on demand. - - With `--no-mmap`: ~1.16 GB (23 % of estimate). Weights load fully, but KV cache still expands progressively. -- **Takeaways**: - - Estimates intentionally err on the high side to prevent OOM once the context window reaches capacity. 
- - Divergence between virtual and physical usage stems from memory mapping and lazy KV cache allocation. - - Additional GPU-focused measurements and long session traces are encouraged to correlate VRAM predictions with reality. +This project is licensed under the MIT License – see the `LICENSE` file for details. -## License +--- -This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. +## Contributing & support -Copyright (c) 2024 llama.cpp Studio +### Contributing -## Contributing +- Fork the repository. +- Create a feature branch. +- Make your changes and add tests where appropriate. +- Open a pull request describing your changes and how you tested them. -1. Fork the repository -2. Create a feature branch -3. Make your changes -4. Add tests if applicable -5. Submit a pull request +### Support -## Support +- Open an issue on GitHub for bugs or feature requests. +- Review this README and the troubleshooting section before filing. -For issues and questions: -- Create an issue on GitHub -- Check the troubleshooting section -- Review the API documentation +### Acknowledgments -## Acknowledgments +- **llama.cpp** – core inference engine. +- **llama-swap** – multi-model serving proxy. +- **HuggingFace** – model hosting and search. +- **Vue.js** – frontend framework. +- **FastAPI** – backend framework. 
-- [llama.cpp](https://github.com/ggerganov/llama.cpp) - The core inference engine -- [llama-swap](https://github.com/mostlygeek/llama-swap) - Multi-model serving proxy -- [HuggingFace](https://huggingface.co) - Model hosting and search -- [Vue.js](https://vuejs.org) - Frontend framework -- [FastAPI](https://fastapi.tiangolo.com) - Backend framework diff --git a/backend/data_store.py b/backend/data_store.py index 36aebdf..e75b57b 100644 --- a/backend/data_store.py +++ b/backend/data_store.py @@ -1,6 +1,8 @@ """YAML-backed data store replacing SQLite.""" +import json import os +import re import threading from typing import Any, Dict, List, Optional @@ -31,6 +33,59 @@ def generate_proxy_name(huggingface_id: str, quantization: Optional[str] = None) return huggingface_slug +def _coerce_config(config_value: Optional[Any]) -> Dict[str, Any]: + if not config_value: + return {} + if isinstance(config_value, dict): + return config_value + if isinstance(config_value, str): + try: + return json.loads(config_value) + except json.JSONDecodeError: + return {} + return {} + + +def _model_value(model: Any, key: str, default: Any = None) -> Any: + if isinstance(model, dict): + return model.get(key, default) + return getattr(model, key, default) + + +def normalize_proxy_alias(alias: Optional[str]) -> str: + """Normalize a user-provided model alias into a safe exposed engine ID.""" + if alias is None: + return "" + + normalized = str(alias).strip().lower() + if not normalized: + return "" + + normalized = normalized.replace("/", "-").replace("\\", "-") + normalized = re.sub(r"\s+", "-", normalized) + normalized = re.sub(r"[^a-z0-9._-]", "-", normalized) + normalized = re.sub(r"-{2,}", "-", normalized) + normalized = normalized.strip("._-") + return normalized + + +def resolve_proxy_name(model: Any) -> str: + """Return the exposed runtime model ID for a stored model.""" + config = _coerce_config(_model_value(model, "config")) + alias = normalize_proxy_alias(config.get("model_alias")) 
+ if alias: + return alias + + existing = normalize_proxy_alias(_model_value(model, "proxy_name")) + if existing: + return existing + + return generate_proxy_name( + _model_value(model, "huggingface_id", ""), + _model_value(model, "quantization"), + ) + + class DataStore: """Thread-safe YAML-backed data store replacing SQLite.""" @@ -175,6 +230,23 @@ def delete_engine_version(self, engine: str, version: str) -> bool: self._save_yaml("engines.yaml", data) return True + def get_engine_build_settings(self, engine: str) -> Dict[str, Any]: + """Return persisted build settings for the given engine (or empty dict).""" + data = self._read_yaml("engines.yaml") + return data.get(engine, {}).get("build_settings", {}) or {} + + def update_engine_build_settings(self, engine: str, settings: Dict[str, Any]) -> Dict[str, Any]: + """Merge and persist build settings for the given engine. Returns the stored settings.""" + if not isinstance(settings, dict): + settings = {} + data = self._read_yaml("engines.yaml") + engine_data = data.setdefault(engine, {}) + existing = engine_data.get("build_settings") or {} + merged = {**existing, **settings} + engine_data["build_settings"] = merged + self._save_yaml("engines.yaml", data) + return merged + # --- LMDeploy --- def get_lmdeploy_status(self) -> dict: diff --git a/backend/gguf_introspection_config.json b/backend/gguf_introspection_config.json new file mode 100644 index 0000000..885f0cc --- /dev/null +++ b/backend/gguf_introspection_config.json @@ -0,0 +1,23 @@ +{ + "global": { + "context_length": { + "preferred_keys": [ + "general.context_length", + "general.model_max_length", + "general.max_position_embeddings" + ] + } + }, + "glm4": { + "match_arch": ["glm4", "glm4moe"], + "context_length": { + "preferred_keys": ["glm4.context_length", "glm4.model_max_length"], + "fallback_terms": ["context", "max_position_embeddings"] + }, + "layer_count": { + "preferred_keys": ["glm4.num_hidden_layers"], + "fallback_terms": ["layer", "block"] + } + } 
+} + diff --git a/backend/gguf_reader.py b/backend/gguf_reader.py index 2ad9dd1..df47723 100644 --- a/backend/gguf_reader.py +++ b/backend/gguf_reader.py @@ -2,13 +2,14 @@ GGUF file metadata reader for extracting model layer information """ -import struct import os +import struct import mmap from enum import IntEnum -from typing import Dict, Optional, Any, List, Tuple, BinaryIO +from typing import Any, BinaryIO, Dict, List, Optional, Tuple from backend.logging_config import get_logger +from backend.model_introspection import GgufIntrospector logger = get_logger(__name__) @@ -1247,22 +1248,34 @@ def get_model_layer_info(model_path: str) -> Optional[Dict[str, Any]]: logger.error(f"Model file is not GGUF format: {model_path}") return None - metadata = read_gguf_metadata(model_path) - if metadata: - return { - "layer_count": metadata["layer_count"], - "architecture": metadata["architecture"], - "context_length": metadata["context_length"], - "vocab_size": 0, # Not extracted from metadata - "embedding_length": metadata["embedding_length"], - "attention_head_count": metadata["attention_head_count"], - "attention_head_count_kv": metadata["attention_head_count_kv"], - "block_count": metadata["block_count"], - "is_moe": metadata["is_moe"], - "expert_count": metadata["expert_count"], - "experts_used_count": metadata["experts_used_count"], - } - return None + with GGUFReader(model_path) as reader: + metadata = reader.metadata + tensors = reader.tensors + + introspector = GgufIntrospector(metadata=metadata, tensors=tensors) + info = introspector.build_model_info() + + return { + "layer_count": int(info.layer_count) if info.layer_count else 0, + "architecture": metadata.get("general.architecture", ""), + "context_length": int(info.context_length) if info.context_length else 0, + "vocab_size": int(info.vocab_size) if info.vocab_size else 0, + "embedding_length": int(info.embedding_length) + if info.embedding_length + else 0, + "attention_head_count": 
int(info.attention_head_count) + if info.attention_head_count + else 0, + "attention_head_count_kv": int(info.attention_head_count_kv) + if info.attention_head_count_kv + else 0, + "block_count": int(info.block_count) if info.block_count else 0, + "is_moe": bool(info.is_moe), + "expert_count": int(info.expert_count) if info.expert_count else 0, + "experts_used_count": int(info.experts_used_count) + if info.experts_used_count + else 0, + } except Exception as e: logger.error( f"Failed to get model layer info from {model_path}: {e}", exc_info=True diff --git a/backend/huggingface.py b/backend/huggingface.py index 288ab57..077ffb4 100644 --- a/backend/huggingface.py +++ b/backend/huggingface.py @@ -863,9 +863,6 @@ async def _fetch_and_merge(repo_id: Optional[str]): metadata["tokenizer"] = tokenizer_json await _fetch_and_merge(huggingface_id) - if huggingface_id and huggingface_id.lower().endswith("-gguf"): - base_repo = huggingface_id[:-5] - await _fetch_and_merge(base_repo) try: layer_info = get_model_layer_info(file_path) or {} @@ -1210,6 +1207,20 @@ async def process_model(model): if result is not None: valid_results.append(result) + if model_format == "gguf": + def _gguf_sort_key(item: Dict[str, Any]): + quantizations = item.get("quantizations") or {} + size_candidates = [ + q.get("total_size") or 0 + for q in quantizations.values() + if isinstance(q, dict) + ] + positive_sizes = [size for size in size_candidates if size > 0] + min_size = min(positive_sizes) if positive_sizes else float("inf") + return (min_size, -(item.get("downloads") or 0), item.get("id") or "") + + valid_results.sort(key=_gguf_sort_key) + return valid_results[:limit] @@ -1219,20 +1230,18 @@ async def _process_single_model(model, model_format: str) -> Optional[Dict]: logger.info(f"Processing model: {model.id}") quantizations: Dict[str, Dict] = {} + mmproj_files: List[Dict[str, Any]] = [] safetensors_files: List[Dict] = [] repo_files: List[Dict[str, Any]] = [] if hasattr(model, "siblings") and 
model.siblings: if model_format == "gguf": - # Group GGUF files by logical quantization, handling multi-part shards - # Accept both plain `.gguf` and multi-part patterns like `.gguf.part1of2` - # Exclude mmproj (vision/multimodal projection) files – they are extensions, not standalone quants + # Group GGUF files by logical quantization, handling multi-part shards. gguf_siblings = [ s for s in model.siblings if isinstance(getattr(s, "rfilename", None), str) and re.search(r"\.gguf(\.|$)", s.rfilename) - and "mmproj" not in s.rfilename.lower() ] logger.info(f"Model {model.id}: {len(gguf_siblings)} GGUF files found") if not gguf_siblings: @@ -1240,6 +1249,14 @@ async def _process_single_model(model, model_format: str) -> Optional[Dict]: for sibling in gguf_siblings: filename = sibling.rfilename + if "mmproj" in filename.lower(): + mmproj_files.append( + { + "filename": filename, + "size": getattr(sibling, "size", 0) or 0, + } + ) + continue # Normalize filename by stripping shard suffix patterns like: # -00001-of-00002.gguf (TheBloke-style) # .gguf.part1of2 (Hugging Face-style multi-part) @@ -1298,25 +1315,9 @@ async def _process_single_model(model, model_format: str) -> Optional[Dict]: else 0.0 ) - # Siblings from list_models often have size=None; fetch accurate sizes from Hub - try: - all_filenames = [s.rfilename for s in gguf_siblings] - accurate_sizes = get_accurate_file_sizes(model.id, all_filenames) - if accurate_sizes: - for entry in quantizations.values(): - for f in entry["files"]: - f["size"] = accurate_sizes.get(f["filename"]) or f["size"] or 0 - entry["total_size"] = sum(f["size"] for f in entry["files"]) - entry["size_mb"] = ( - round(entry["total_size"] / (1024 * 1024), 2) - if entry["total_size"] - else 0.0 - ) - except Exception as size_err: - logger.debug(f"Could not fetch accurate sizes for {model.id}: {size_err}") - - # If no quantizations were detected after grouping, skip this model - if not quantizations: + # Search should stay to a single HF API 
call. Accurate file sizes are lazy-loaded on expand. + # If no downloadable GGUF entries were detected after grouping, skip this model. + if not quantizations and not mmproj_files: return None else: safetensors_files = [] @@ -1338,15 +1339,6 @@ async def _process_single_model(model, model_format: str) -> Optional[Dict]: ) if not safetensors_files: return None - # Fetch accurate sizes; list_models siblings often have size=None - try: - st_filenames = [f["filename"] for f in safetensors_files] - accurate_sizes = get_accurate_file_sizes(model.id, st_filenames) - if accurate_sizes: - for f in safetensors_files: - f["size"] = accurate_sizes.get(f["filename"]) or 0 - except Exception as size_err: - logger.debug(f"Could not fetch accurate sizes for {model.id}: {size_err}") else: return None @@ -1364,6 +1356,7 @@ async def _process_single_model(model, model_format: str) -> Optional[Dict]: "tags": model.tags or [], "model_format": model_format, "quantizations": quantizations if model_format == "gguf" else {}, + "mmproj_files": mmproj_files if model_format == "gguf" else [], "safetensors_files": ( safetensors_files if model_format == "safetensors" else [] ), @@ -1668,7 +1661,7 @@ async def get_model_details(model_id: str) -> Dict: config_path = hf_hub_download( repo_id=model_id, filename="config.json", - local_dir="data/temp", + local_dir="data/hf-cache", local_dir_use_symlinks=False, ) diff --git a/backend/llama_manager.py b/backend/llama_manager.py index cbcb928..55f5d2e 100644 --- a/backend/llama_manager.py +++ b/backend/llama_manager.py @@ -2273,13 +2273,19 @@ def set_flag(flag: str, value: bool): logger.error(f"Build failed: {e}") if progress_manager and task_id: try: - await progress_manager.send_build_progress( - task_id=task_id, - stage="error", - progress=0, - message=f"Build failed: {str(e)}", - log_lines=[f"Error: {str(e)}"], + existing_task = progress_manager.get_task(task_id) + existing_logs = ( + (existing_task or {}).get("metadata", {}).get("log_lines") or [] 
) + error_text = str(e) + if error_text not in existing_logs: + await progress_manager.send_build_progress( + task_id=task_id, + stage="error", + progress=0, + message=f"Build failed: {error_text}", + log_lines=[f"Error: {error_text}"], + ) except Exception as ws_error: logger.error(f"Failed to send error via SSE: {ws_error}") raise Exception(f"Failed to build from source {commit_sha}: {e}") diff --git a/backend/llama_swap_client.py b/backend/llama_swap_client.py index 454fe2a..3f3e5e3 100644 --- a/backend/llama_swap_client.py +++ b/backend/llama_swap_client.py @@ -149,3 +149,23 @@ async def get_model_info(self, model_id: str, upstream_path: str = "v1/models"): except Exception as e: logger.error(f"Failed to get model info for {model_id}: {e}") raise + + async def load_model(self, model_name: str, retries: int = 20, delay: float = 0.5): + """Trigger on-demand model loading via llama-swap's upstream route.""" + last_error = None + for _ in range(max(1, retries)): + try: + async with httpx.AsyncClient() as client: + response = await client.get( + f"{self.base_url}/upstream/{model_name}/v1/models", timeout=30 + ) + response.raise_for_status() + self._loading_models.discard(model_name) + return response.json() + except Exception as e: + last_error = e + self._loading_models.add(model_name) + await asyncio.sleep(delay) + self._loading_models.discard(model_name) + logger.error(f"Failed to load model {model_name}: {last_error}") + raise last_error diff --git a/backend/llama_swap_config.py b/backend/llama_swap_config.py index 3e071e7..6829449 100644 --- a/backend/llama_swap_config.py +++ b/backend/llama_swap_config.py @@ -510,44 +510,73 @@ def _model_attr(m: Any, key: str, default: Any = None) -> Any: return m.get(key, default) return getattr(m, key, default) + from backend.data_store import ( + generate_proxy_name as _generate_proxy_name, + normalize_proxy_alias as _normalize_proxy_alias, + resolve_proxy_name as _resolve_proxy_name, + ) + # Resolve LMDeploy binary and 
build proxy->model map for overlay (used for both all_models and running overlay) lmdeploy_bin = None all_models_by_proxy: Dict[str, Any] = {} + all_models_by_legacy_proxy: Dict[str, Any] = {} try: from backend.data_store import get_store as _get_store store = _get_store() - lmdeploy_status = store.get_lmdeploy_status() - if lmdeploy_status.get("installed") and lmdeploy_status.get("venv_path"): - venv = lmdeploy_status["venv_path"] - lmdeploy_bin = os.path.join(venv, "bin", "lmdeploy") - if not os.path.isabs(lmdeploy_bin): - lmdeploy_bin = os.path.join("/app", lmdeploy_bin) - if not os.path.exists(lmdeploy_bin): - lmdeploy_bin = None + # Prefer the active versioned LMDeploy engine, same pattern as llama_cpp. + active_lmdeploy = store.get_active_engine_version("lmdeploy") + venv = active_lmdeploy.get("venv_path") if active_lmdeploy else None + # Fallback to legacy single-status layout if no active version is found. + if not venv: + legacy_status = store.get_lmdeploy_status() + if legacy_status.get("installed"): + venv = legacy_status.get("venv_path") + if venv: + # Ensure the venv path still exists before resolving the binary. 
+ if not os.path.isabs(venv): + venv = os.path.join("/app", venv) + if os.path.isdir(venv): + candidate = os.path.join(venv, "bin", "lmdeploy") + if not os.path.isabs(candidate): + candidate = os.path.join("/app", candidate) + if os.path.exists(candidate) and os.access(candidate, os.X_OK): + lmdeploy_bin = candidate + else: + logger.debug( + f"LMDeploy binary not found or not executable at {candidate}; " + "LMDeploy engine entries will be skipped in llama-swap config" + ) + else: + logger.debug( + f"LMDeploy venv_path does not exist at {venv}; " + "LMDeploy engine entries will be skipped in llama-swap config" + ) except Exception as e: logger.debug(f"Could not resolve LMDeploy binary: {e}") # First, add all models from the data store (if provided) if all_models: - from backend.data_store import generate_proxy_name as _gen_proxy_name - for model in all_models: - proxy_model_name = _model_attr(model, "proxy_name") - if not proxy_model_name: - proxy_model_name = _gen_proxy_name( - _model_attr(model, "huggingface_id", ""), - _model_attr(model, "quantization"), - ) + proxy_model_name = _resolve_proxy_name(model) if not proxy_model_name: logger.warning( f"Model '{_model_attr(model, 'display_name') or _model_attr(model, 'name')}' does not have a proxy_name set, skipping" ) continue all_models_by_proxy[proxy_model_name] = model + legacy_proxy_name = _normalize_proxy_alias(_model_attr(model, "proxy_name")) + if legacy_proxy_name and legacy_proxy_name != proxy_model_name: + all_models_by_legacy_proxy[legacy_proxy_name] = model + generated_proxy_name = _generate_proxy_name( + _model_attr(model, "huggingface_id", ""), + _model_attr(model, "quantization"), + ) + if generated_proxy_name and generated_proxy_name != proxy_model_name: + all_models_by_legacy_proxy[generated_proxy_name] = model engine = _model_attr(model, "engine") - model_format = _model_attr(model, "format") or _model_attr(model, "model_format") or "gguf" - is_lmdeploy = engine == "lmdeploy" or model_format == 
"safetensors" + # LMDeploy-backed models are detected strictly by engine, not by format. + is_lmdeploy = engine == "lmdeploy" if is_lmdeploy and lmdeploy_bin: config = _coerce_model_config(_model_attr(model, "config")) try: @@ -558,30 +587,33 @@ def _model_attr(m: Any, key: str, default: Any = None) -> Any: continue hf_id = _model_attr(model, "huggingface_id") - filename = _model_attr(model, "filename") or ( - os.path.basename(_model_attr(model, "file_path") or "") or None - ) - - # Resolve model path: HF cache first, then legacy file_path + quantization = _model_attr(model, "quantization") + + # Prefer llama.cpp's native HF integration when we have a repo id and quant. + # This lets us use: --hf-repo USER/MODEL:QUANT, and llama.cpp will + # resolve/download the correct GGUF (including multi‑shard) on its own. + hf_repo_arg = None + if hf_id and quantization: + hf_repo_arg = f"{hf_id}:{str(quantization).lower()}" + + # For legacy/local models without huggingface_id+quantization, fall back + # to a stored file_path. New HF-backed models never rely on a specific + # filename or file path; llama.cpp pulls from Hugging Face via --hf-repo. model_path = None - if hf_id and filename: - from backend.huggingface import resolve_cached_model_path - model_path = resolve_cached_model_path(hf_id, filename) - - if not model_path: - # Legacy fallback: stored file_path (old-style records) + if not hf_repo_arg: legacy = _model_attr(model, "file_path") if legacy: model_path = legacy if os.path.isabs(legacy) else f"/app/{legacy}" - if not model_path: + # If we don't have either an HF repo+quant or a legacy path, skip.
+ if not hf_repo_arg and not model_path: logger.warning( - f"Model '{proxy_model_name}' path could not be resolved (hf_id={hf_id}, filename={filename}), skipping" + f"Model '{proxy_model_name}' path could not be resolved (hf_id={hf_id}), skipping" ) continue - # Ensure absolute path (HF cache returns absolute; legacy may not) - if not os.path.isabs(model_path): + # Ensure absolute path when we are in local-path mode. + if model_path and not os.path.isabs(model_path): model_path = f"/app/{model_path}" # Get the working directory and build directory for LD_LIBRARY_PATH @@ -610,20 +642,30 @@ def _model_attr(m: Any, key: str, default: Any = None) -> Any: f"Model {proxy_model_name}: jinja={config.get('jinja')} (type: {type(config.get('jinja'))})" ) - # Build llama.cpp command arguments - # Quote model path if it contains spaces or special characters - quoted_model_path = _quote_arg_if_needed(model_path) + # Build llama.cpp command arguments (excluding the base launcher). + # We keep the first 3 entries in cmd_args unused; only cmd_args[3:] + # (starting from "--port") are appended to the final command string. cmd_args = [ - llama_server_path, - "--model", - quoted_model_path, + None, + None, + None, "--port", "${PORT}", ] - # Vision: if model has mmproj (multimodal projector), add --mmproj so vision is available + + # If the user provided a model_alias in config, propagate it to llama.cpp + # via --alias so that /v1/models exposes this name. + alias_for_api = config.get("model_alias") + if isinstance(alias_for_api, str) and alias_for_api.strip(): + cmd_args.extend(["--alias", alias_for_api.strip()]) + + # Vision: if model has mmproj (multimodal projector) and we're using a + # local model path, add --mmproj so vision is available. When using + # --hf-repo, llama.cpp will auto-download mmproj if available. 
mmproj_filename = _model_attr(model, "mmproj_filename") - if mmproj_filename and hf_id: + if mmproj_filename and hf_id and not hf_repo_arg: from backend.huggingface import resolve_cached_model_path + mmproj_path = resolve_cached_model_path(hf_id, mmproj_filename) if mmproj_path and os.path.exists(mmproj_path): if not os.path.isabs(mmproj_path): @@ -820,49 +862,75 @@ def _model_attr(m: Any, key: str, default: Any = None) -> Any: except Exception as e: logger.debug(f"Could not get CUDA library path: {e}") - # Create the command with proper shell syntax for environment variables + # Create the command with proper shell syntax for environment variables. + # Prefer llama.cpp's HF integration when we have an HF repo id + quant; + # otherwise fall back to a direct local GGUF path. + if hf_repo_arg: + launcher = f"./{binary_name} --hf-repo {hf_repo_arg}" + else: + quoted_model_path = _quote_arg_if_needed(model_path) + launcher = f"./{binary_name} --model {quoted_model_path}" + cmd_with_env = ( - f"bash -c 'cd {working_dir} && LD_LIBRARY_PATH={library_path} ./{binary_name} --model {model_path} " + f"bash -c 'cd {working_dir} && LD_LIBRARY_PATH={library_path} {launcher} " + " ".join(cmd_args[3:]) + "'" - ) # Skip llama_server_path, --model, model_path, --port + ) config_data["models"][proxy_model_name] = {"cmd": cmd_with_env} # Then, add/update with running models (these take precedence for active models) for proxy_model_name, model_data in models.items(): - overlay_model = all_models_by_proxy.get(proxy_model_name) + overlay_model = all_models_by_proxy.get(proxy_model_name) or all_models_by_legacy_proxy.get(proxy_model_name) + resolved_proxy_model_name = ( + _resolve_proxy_name(overlay_model) + if overlay_model + else _normalize_proxy_alias(model_data.get("config", {}).get("model_alias")) or proxy_model_name + ) engine = _model_attr(overlay_model, "engine") if overlay_model else None - model_format = _model_attr(overlay_model, "format") or _model_attr(overlay_model, 
"model_format") if overlay_model else None - is_lmdeploy_overlay = (engine == "lmdeploy" or model_format == "safetensors") and lmdeploy_bin and overlay_model + # For overlay models, also rely solely on the engine flag to detect LMDeploy. + is_lmdeploy_overlay = engine == "lmdeploy" and lmdeploy_bin and overlay_model if is_lmdeploy_overlay: config = _coerce_model_config(model_data.get("config")) try: cmd_with_env = _build_lmdeploy_cmd(overlay_model, config, lmdeploy_bin, _model_attr) - config_data["models"][proxy_model_name] = {"cmd": cmd_with_env} + config_data["models"].pop(proxy_model_name, None) + config_data["models"][resolved_proxy_model_name] = {"cmd": cmd_with_env} except Exception as e: - logger.warning(f"Failed to build LMDeploy overlay cmd for {proxy_model_name}: {e}") + logger.warning(f"Failed to build LMDeploy overlay cmd for {resolved_proxy_model_name}: {e}") continue model_path = model_data["model_path"] llama_cpp_config = model_data["config"] - # Build llama.cpp command arguments (using full path to llama-server) - # Quote model path if it contains spaces or special characters + # Build llama.cpp command arguments (using full path to llama-server). + # For overlay models, also prefer HF repo + quant when available. + hf_id_overlay = _model_attr(overlay_model, "huggingface_id") if overlay_model else None + quantization_overlay = _model_attr(overlay_model, "quantization") if overlay_model else None + hf_repo_arg_overlay = None + if hf_id_overlay and quantization_overlay: + hf_repo_arg_overlay = f"{hf_id_overlay}:{str(quantization_overlay).lower()}" + + # Quote model path if it contains spaces or special characters (local-path mode). quoted_model_path = _quote_arg_if_needed(model_path) cmd_args = [ - llama_server_path, - "--model", - quoted_model_path, + None, + None, + None, "--port", "${PORT}", ] + # Propagate model_alias from the live llama_cpp_config if present so that + # llama.cpp exposes this name via /v1/models. 
+ alias_for_api_overlay = llama_cpp_config.get("model_alias") + if isinstance(alias_for_api_overlay, str) and alias_for_api_overlay.strip(): + cmd_args.extend(["--alias", alias_for_api_overlay.strip()]) # Vision: add --mmproj if model has mmproj_filename - if overlay_model: + if overlay_model and not hf_repo_arg_overlay: mmproj_fn = _model_attr(overlay_model, "mmproj_filename") - hf_id_overlay = _model_attr(overlay_model, "huggingface_id") if mmproj_fn and hf_id_overlay: from backend.huggingface import resolve_cached_model_path + mmproj_path = resolve_cached_model_path(hf_id_overlay, mmproj_fn) if mmproj_path and os.path.exists(mmproj_path): if not os.path.isabs(mmproj_path): @@ -1052,14 +1120,20 @@ def _model_attr(m: Any, key: str, default: Any = None) -> Any: # The shared libraries are in the same directory as the binary library_path = build_dir - # Create the command with proper shell syntax for environment variables + # Create the command with proper shell syntax for environment variables. 
+ if hf_repo_arg_overlay: + launcher = f"./{binary_name} --hf-repo {hf_repo_arg_overlay}" + else: + launcher = f"./{binary_name} --model {quoted_model_path}" + cmd_with_env = ( - f"bash -c 'cd {working_dir} && LD_LIBRARY_PATH={library_path} ./{binary_name} --model {model_path} " + f"bash -c 'cd {working_dir} && LD_LIBRARY_PATH={library_path} {launcher} " + " ".join(cmd_args[3:]) + "'" - ) # Skip llama_server_path, --model, model_path, --port + ) - config_data["models"][proxy_model_name] = {"cmd": cmd_with_env} + config_data["models"].pop(proxy_model_name, None) + config_data["models"][resolved_proxy_model_name] = {"cmd": cmd_with_env} # Add groups configuration to allow multiple models to run simultaneously # Note: This means models won't be unloaded when new ones start - user must manage memory diff --git a/backend/llama_swap_manager.py b/backend/llama_swap_manager.py index d08a0c1..7b75507 100644 --- a/backend/llama_swap_manager.py +++ b/backend/llama_swap_manager.py @@ -345,7 +345,6 @@ async def register_model(self, model: Any, config: Dict[str, Any]) -> str: """ Registers a model with llama-swap by storing its configuration. Returns the proxy_model_name used by llama-swap. - Note: This only stores the model info, config is written separately. model can be a dict or an object with proxy_name, file_path, display_name/name. """ proxy_name = model.get("proxy_name") if isinstance(model, dict) else getattr(model, "proxy_name", None) @@ -365,6 +364,9 @@ async def register_model(self, model: Any, config: Dict[str, Any]) -> str: "config": config, } + # Persist the updated model registry immediately so llama-swap can watch and reload it. 
+ await self._write_config() + logger.info( f"Model '{name}' registered as '{proxy_name}' with llama-swap" ) diff --git a/backend/lmdeploy_installer.py b/backend/lmdeploy_installer.py deleted file mode 100644 index 875b2f7..0000000 --- a/backend/lmdeploy_installer.py +++ /dev/null @@ -1,416 +0,0 @@ -import asyncio -import json -import os -import shutil -import subprocess -import sys -from asyncio.subprocess import PIPE, STDOUT -from datetime import datetime, timezone -from typing import Any, Awaitable, Dict, Optional - -from backend.logging_config import get_logger -from backend.progress_manager import get_progress_manager - - -def _utcnow() -> str: - return datetime.now(timezone.utc).isoformat().replace("+00:00", "Z") - - -logger = get_logger(__name__) - -_installer_instance: Optional["LMDeployInstaller"] = None - - -def get_lmdeploy_installer() -> "LMDeployInstaller": - global _installer_instance - if _installer_instance is None: - _installer_instance = LMDeployInstaller() - return _installer_instance - - -class LMDeployInstaller: - """Install or remove LMDeploy inside the runtime environment on demand.""" - - def __init__( - self, - *, - log_path: Optional[str] = None, - state_path: Optional[str] = None, - base_dir: Optional[str] = None, - ) -> None: - self._lock = asyncio.Lock() - self._operation: Optional[str] = None - self._operation_started_at: Optional[str] = None - self._current_task: Optional[asyncio.Task] = None - self._last_error: Optional[str] = None - data_root = os.path.abspath("data") - base_path = base_dir or os.path.join(data_root, "lmdeploy") - self._base_dir = os.path.abspath(base_path) - self._venv_path = os.path.join(self._base_dir, "venv") - log_path = log_path or os.path.join(data_root, "logs", "lmdeploy_install.log") - state_path = state_path or os.path.join( - data_root, "configs", "lmdeploy_installer.json" - ) - self._log_path = os.path.abspath(log_path) - self._state_path = os.path.abspath(state_path) - self._ensure_directories() - - def 
_ensure_directories(self) -> None: - os.makedirs(self._base_dir, exist_ok=True) - os.makedirs(os.path.dirname(self._log_path), exist_ok=True) - os.makedirs(os.path.dirname(self._state_path), exist_ok=True) - - def _venv_bin(self, executable: str) -> str: - if os.name == "nt": - exe = ( - executable - if executable.lower().endswith(".exe") - else f"{executable}.exe" - ) - return os.path.join(self._venv_path, "Scripts", exe) - return os.path.join(self._venv_path, "bin", executable) - - def _venv_python(self) -> str: - return self._venv_bin("python") - - def _ensure_venv(self) -> None: - python_path = self._venv_python() - if os.path.exists(python_path): - return - os.makedirs(self._base_dir, exist_ok=True) - try: - subprocess.run([sys.executable, "-m", "venv", self._venv_path], check=True) - except subprocess.CalledProcessError as exc: - raise RuntimeError( - f"Failed to create LMDeploy virtual environment: {exc}" - ) from exc - - def _load_state(self) -> Dict[str, Any]: - if not os.path.exists(self._state_path): - return {} - try: - with open(self._state_path, "r", encoding="utf-8") as handle: - data = json.load(handle) - return data if isinstance(data, dict) else {} - except Exception as exc: - logger.warning(f"Failed to load LMDeploy installer state: {exc}") - return {} - - def _save_state(self, state: Dict[str, Any]) -> None: - tmp_path = f"{self._state_path}.tmp" - with open(tmp_path, "w", encoding="utf-8") as handle: - json.dump(state, handle, indent=2) - os.replace(tmp_path, self._state_path) - - def _detect_installed_version(self) -> Optional[str]: - python_exe = self._venv_python() - if not os.path.exists(python_exe): - return None - script = ( - "import importlib, sys\n" - "try:\n" - " from importlib import metadata\n" - "except ImportError:\n" - " import importlib_metadata as metadata\n" - "try:\n" - " print(metadata.version('lmdeploy'))\n" - "except metadata.PackageNotFoundError:\n" - " sys.exit(1)\n" - ) - try: - output = subprocess.check_output( - 
[python_exe, "-c", script], text=True - ).strip() - return output or None - except subprocess.CalledProcessError: - return None - except Exception as exc: # pragma: no cover - logger.debug(f"Unable to determine LMDeploy version: {exc}") - return None - - def _resolve_binary_path(self) -> Optional[str]: - override = os.getenv("LMDEPLOY_BIN") - if override: - override_path = os.path.abspath(os.path.expanduser(override)) - if os.path.exists(override_path): - return override_path - resolved_override = shutil.which(override) - if resolved_override: - return resolved_override - - candidate = self._venv_bin("lmdeploy") - if os.path.exists(candidate) and os.access(candidate, os.X_OK): - return os.path.abspath(candidate) - - resolved = shutil.which("lmdeploy") - return resolved - - def _update_installed_state( - self, installed: bool, version: Optional[str] = None - ) -> None: - state = self._load_state() - if installed: - state["installed_at"] = _utcnow() - if version: - state["installed_version"] = version - state["venv_path"] = self._venv_path - else: - state["installed_version"] = None - state["installed_at"] = None - state["removed_at"] = _utcnow() - state["venv_path"] = self._venv_path - self._save_state(state) - - def _refresh_state_from_environment(self) -> None: - state = self._load_state() - version = self._detect_installed_version() - state["installed_version"] = version - if version is None: - state["removed_at"] = _utcnow() - state["venv_path"] = self._venv_path - self._save_state(state) - - async def _run_pip( - self, - args: list[str], - operation: str, - ensure_venv: bool = True, - cwd: Optional[str] = None, - ) -> int: - if ensure_venv: - self._ensure_venv() - python_exe = self._venv_python() - if not os.path.exists(python_exe): - raise RuntimeError( - "LMDeploy virtual environment is missing; cannot run pip." 
- ) - header = ( - f"[{_utcnow()}] Starting LMDeploy {operation} via pip {' '.join(args)}\n" - ) - with open(self._log_path, "w", encoding="utf-8") as log_file: - log_file.write(header) - process = await asyncio.create_subprocess_exec( - python_exe, - "-m", - "pip", - *args, - stdout=PIPE, - stderr=STDOUT, - cwd=cwd, - ) - - async def _stream_output() -> None: - if process.stdout is None: - return - with open(self._log_path, "a", encoding="utf-8", buffering=1) as log_file: - while True: - chunk = await process.stdout.readline() - if not chunk: - break - text = chunk.decode("utf-8", errors="replace") - log_file.write(text) - await self._broadcast_log_line(text.rstrip("\n")) - - await asyncio.gather(process.wait(), _stream_output()) - return process.returncode or 0 - - async def _broadcast_log_line(self, line: str) -> None: - try: - await get_progress_manager().broadcast( - { - "type": "lmdeploy_install_log", - "line": line, - "timestamp": _utcnow(), - } - ) - except Exception as exc: # pragma: no cover - logger.debug(f"Failed to broadcast LMDeploy log line: {exc}") - - async def _set_operation(self, operation: str) -> None: - self._operation = operation - self._operation_started_at = _utcnow() - self._last_error = None - await get_progress_manager().broadcast( - { - "type": "lmdeploy_install_status", - "status": operation, - "started_at": self._operation_started_at, - } - ) - - async def _finish_operation(self, success: bool, message: str = "") -> None: - payload = { - "type": "lmdeploy_install_status", - "status": "completed" if success else "failed", - "operation": self._operation, - "message": message, - "ended_at": _utcnow(), - } - await get_progress_manager().broadcast(payload) - self._operation = None - self._operation_started_at = None - - def _create_task(self, coro: Awaitable[Any]) -> None: - loop = asyncio.get_running_loop() - task = loop.create_task(coro) - self._current_task = task - - def _cleanup(fut: asyncio.Future) -> None: - try: - fut.result() - 
except Exception as exc: # pragma: no cover - surfaced via status - logger.error(f"LMDeploy installer task error: {exc}") - finally: - self._current_task = None - - task.add_done_callback(_cleanup) - - async def install( - self, version: Optional[str] = None, force_reinstall: bool = False - ) -> Dict[str, Any]: - async with self._lock: - if self._operation: - raise RuntimeError( - "Another LMDeploy installer operation is already running" - ) - await self._set_operation("install") - args = ["install", "--upgrade"] - if force_reinstall: - args.append("--force-reinstall") - package = "lmdeploy" - if version: - package = f"lmdeploy=={version}" - args.append(package) - - async def _runner(): - try: - code = await self._run_pip(args, "install") - if code != 0: - raise RuntimeError(f"pip exited with status {code}") - detected_version = self._detect_installed_version() - self._update_installed_state(True, detected_version) - await self._finish_operation(True, "LMDeploy installed") - except Exception as exc: - self._last_error = str(exc) - self._refresh_state_from_environment() - await self._finish_operation(False, str(exc)) - - self._create_task(_runner()) - return {"message": "LMDeploy installation started"} - - async def install_from_source( - self, - repo_url: str = "https://github.com/InternLM/lmdeploy.git", - branch: str = "main", - ) -> Dict[str, Any]: - """Install LMDeploy from a git repo and branch (for development).""" - async with self._lock: - if self._operation: - raise RuntimeError( - "Another LMDeploy installer operation is already running" - ) - await self._set_operation("install_source") - clone_dir = os.path.join(self._base_dir, "source") - async def _runner(): - try: - self._ensure_venv() - if os.path.exists(clone_dir): - shutil.rmtree(clone_dir) - os.makedirs(clone_dir, exist_ok=True) - proc = await asyncio.create_subprocess_exec( - "git", "clone", "--depth", "1", "--branch", branch, repo_url, clone_dir, - stdout=PIPE, stderr=STDOUT, - ) - await 
proc.wait() - if proc.returncode != 0: - raise RuntimeError(f"git clone failed with code {proc.returncode}") - code = await self._run_pip( - ["install", "-e", "."], - "install_source", - cwd=clone_dir, - ) - if code != 0: - raise RuntimeError(f"pip install -e . failed with code {code}") - detected = self._detect_installed_version() - self._update_installed_state(True, detected) - from backend.data_store import get_store - get_store().update_lmdeploy({ - "install_type": "source", - "source_repo": repo_url, - "source_branch": branch, - }) - await self._finish_operation(True, f"Installed from {branch}") - except Exception as exc: - self._last_error = str(exc) - self._refresh_state_from_environment() - await self._finish_operation(False, str(exc)) - self._create_task(_runner()) - return {"message": "LMDeploy install from source started", "repo": repo_url, "branch": branch} - - async def remove(self) -> Dict[str, Any]: - async with self._lock: - if self._operation: - raise RuntimeError( - "Another LMDeploy installer operation is already running" - ) - await self._set_operation("remove") - args = ["uninstall", "-y", "lmdeploy"] - - async def _runner(): - try: - python_exists = os.path.exists(self._venv_python()) - if python_exists: - code = await self._run_pip(args, "remove", ensure_venv=False) - if code != 0: - raise RuntimeError(f"pip exited with status {code}") - shutil.rmtree(self._venv_path, ignore_errors=True) - self._update_installed_state(False) - await self._finish_operation(True, "LMDeploy removed") - except Exception as exc: - self._last_error = str(exc) - self._refresh_state_from_environment() - await self._finish_operation(False, str(exc)) - - self._create_task(_runner()) - return {"message": "LMDeploy removal started"} - - def status(self) -> Dict[str, Any]: - version = self._detect_installed_version() - binary_path = self._resolve_binary_path() - installed = version is not None and binary_path is not None - state = self._load_state() - return { - 
"installed": installed, - "version": version, - "binary_path": binary_path, - "venv_path": state.get("venv_path") or self._venv_path, - "installed_at": state.get("installed_at"), - "removed_at": state.get("removed_at"), - "operation": self._operation, - "operation_started_at": self._operation_started_at, - "last_error": self._last_error, - "log_path": self._log_path, - } - - async def _broadcast_status(self) -> None: - """Broadcast current status via SSE.""" - try: - status_data = self.status() - get_progress_manager().emit("lmdeploy_status", {**status_data, "timestamp": _utcnow()}) - except Exception as exc: - logger.debug(f"Failed to broadcast LMDeploy status: {exc}") - - def is_operation_running(self) -> bool: - return self._operation is not None - - def read_log_tail(self, max_bytes: int = 8192) -> str: - if not os.path.exists(self._log_path): - return "" - with open(self._log_path, "rb") as log_file: - log_file.seek(0, os.SEEK_END) - size = log_file.tell() - log_file.seek(max(0, size - max_bytes)) - data = log_file.read().decode("utf-8", errors="replace") - if size > max_bytes: - data = data.split("\n", 1)[-1] - return data.strip() diff --git a/backend/lmdeploy_manager.py b/backend/lmdeploy_manager.py index 6328d71..c6aa4c9 100644 --- a/backend/lmdeploy_manager.py +++ b/backend/lmdeploy_manager.py @@ -1,841 +1,479 @@ import asyncio import json import os -import shlex import shutil -from datetime import datetime -from typing import Optional, Dict, Any, List - -import httpx -import psutil -from asyncio.subprocess import Process, STDOUT +import subprocess +import sys +from asyncio.subprocess import PIPE, STDOUT +from datetime import datetime, timezone +from typing import Any, Awaitable, Dict, Optional from backend.logging_config import get_logger -from backend.data_store import get_store -from backend.huggingface import DEFAULT_LMDEPLOY_CONTEXT, MAX_LMDEPLOY_CONTEXT from backend.progress_manager import get_progress_manager +from backend.data_store import 
get_store + + +def _utcnow() -> str: + return datetime.now(timezone.utc).isoformat().replace("+00:00", "Z") + logger = get_logger(__name__) -_lmdeploy_manager_instance: Optional["LMDeployManager"] = None +_manager_instance: Optional["LMDeployManager"] = None def get_lmdeploy_manager() -> "LMDeployManager": - """Return singleton LMDeploy manager.""" - global _lmdeploy_manager_instance - if _lmdeploy_manager_instance is None: - _lmdeploy_manager_instance = LMDeployManager() - return _lmdeploy_manager_instance + """Singleton accessor, mirroring the llama manager pattern.""" + global _manager_instance + if _manager_instance is None: + _manager_instance = LMDeployManager() + return _manager_instance class LMDeployManager: - """Manage LMDeploy TurboMind runtime lifecycle.""" - - def __init__( - self, - binary_path: Optional[str] = None, - host: str = "0.0.0.0", - port: int = 2001, - ): - self.binary_path = binary_path or os.getenv("LMDEPLOY_BIN", "lmdeploy") - self.host = host - self.port = int(os.getenv("LMDEPLOY_PORT", port)) - self._process: Optional[Process] = None - self._log_file = None - self._lock = asyncio.Lock() - self._current_instance: Optional[Dict[str, Any]] = None - self._started_at: Optional[str] = None - self._log_path = os.path.join("data", "logs", "lmdeploy.log") - self._health_timeout = 180 # seconds - self._last_health_status: Optional[Dict[str, Any]] = None - self._last_detected_external: Optional[Dict[str, Any]] = None - self._last_broadcast_log_position = 0 - - async def start( - self, model_entry: Dict[str, Any], config: Dict[str, Any] - ) -> Dict[str, Any]: - """Start LMDeploy serving the provided model. 
Only one model may run at once.""" - async with self._lock: - if self._process and self._process.returncode is None: - raise RuntimeError("LMDeploy runtime is already running") - - model_path = model_entry.get("file_path") - if not model_path or not os.path.exists(model_path): - raise FileNotFoundError(f"Model file not found at {model_path}") - model_dir = model_entry.get("model_dir") or os.path.dirname(model_path) - if not os.path.isdir(model_dir): - raise FileNotFoundError(f"Model directory not found at {model_dir}") - model_dir_abs = os.path.abspath(model_dir) - - # Derive a stable model name for LMDeploy's --model-name flag. - # Preference order: - # 1) Explicit model_name passed in model_entry - # 2) Base model / display name from model_entry - # 3) Hugging Face repo id - # 4) Directory name - model_name = ( - model_entry.get("model_name") - or model_entry.get("display_name") - or model_entry.get("huggingface_id") - or os.path.basename(model_dir_abs.rstrip(os.sep)) - ) - - # Inject model_name into config passed to LMDeploy so the command builder - # can add --model-name and we persist it in status/config reflection. 
- effective_config = dict(config or {}) - if model_name and not effective_config.get("model_name"): - effective_config["model_name"] = model_name - - binary = self._resolve_binary() - command = self._build_command(binary, model_dir_abs, effective_config) - env = os.environ.copy() - env.setdefault("LMDEPLOY_LOG_DIR", os.path.dirname(self._log_path)) - os.makedirs(os.path.dirname(self._log_path), exist_ok=True) - self._log_file = open(self._log_path, "ab", buffering=0) - - logger.info(f"Starting LMDeploy with command: {' '.join(command)}") - self._process = await asyncio.create_subprocess_exec( - *command, - stdout=self._log_file, - stderr=STDOUT, - cwd=model_dir_abs, - env=env, - ) - self._started_at = datetime.utcnow().isoformat() + "Z" - self._current_instance = { - "model_id": model_entry.get("model_id"), - "huggingface_id": model_entry.get("huggingface_id"), - "file_path": model_path, - "config": effective_config, - "pid": self._process.pid, - } - + """ + Manage LMDeploy installation into its own venv, similar in spirit to LlamaManager. + + Responsibilities: + - Create a dedicated venv under data/lmdeploy + - Install LMDeploy from PyPI (release) or from a git source checkout + - Track install status, version, binary path and venv path + - Emit progress events so the UI can show logs and status + """ + + def __init__( + self, + *, + log_path: Optional[str] = None, + state_path: Optional[str] = None, + base_dir: Optional[str] = None, + ) -> None: + self._lock = asyncio.Lock() + self._operation: Optional[str] = None + self._operation_started_at: Optional[str] = None + self._current_task: Optional[asyncio.Task] = None + self._last_error: Optional[str] = None + + data_root = os.path.abspath("data") + base_path = base_dir or os.path.join(data_root, "lmdeploy") + # Root directory under which versioned LMDeploy environments are created. + self._root_dir = os.path.abspath(base_path) + # Default venv path (used only as a fallback when no versioned install exists). 
+ self._base_dir = self._root_dir + self._venv_path = os.path.join(self._base_dir, "venv") + log_path = log_path or os.path.join(data_root, "logs", "lmdeploy_install.log") + state_path = state_path or os.path.join( + data_root, "configs", "lmdeploy_manager.json" + ) + self._log_path = os.path.abspath(log_path) + self._state_path = os.path.abspath(state_path) + self._ensure_directories() + + # --- Venv and filesystem helpers ------------------------------------------------- + + def _ensure_directories(self) -> None: + os.makedirs(self._base_dir, exist_ok=True) + os.makedirs(os.path.dirname(self._log_path), exist_ok=True) + os.makedirs(os.path.dirname(self._state_path), exist_ok=True) + + def _venv_bin(self, executable: str) -> str: + if os.name == "nt": + exe = executable if executable.lower().endswith(".exe") else f"{executable}.exe" + return os.path.join(self._venv_path, "Scripts", exe) + return os.path.join(self._venv_path, "bin", executable) + + def _venv_python(self) -> str: + return self._venv_bin("python") + + def _prepare_versioned_paths(self, label: str = "") -> str: + """ + Prepare a new versioned install directory under the LMDeploy root. + + Returns: + A version directory name component (e.g. '20250309-123456-pip'). 
+ """ + ts = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S") + suffix = f"-{label}" if label else "" + version_dir = f"{ts}{suffix}" + self._base_dir = os.path.join(self._root_dir, version_dir) + self._venv_path = os.path.join(self._base_dir, "venv") + self._ensure_directories() + return version_dir + + def _ensure_venv(self) -> None: + python_path = self._venv_python() + if os.path.exists(python_path): + return + os.makedirs(self._base_dir, exist_ok=True) + try: + subprocess.run([sys.executable, "-m", "venv", self._venv_path], check=True) + except subprocess.CalledProcessError as exc: + raise RuntimeError(f"Failed to create LMDeploy virtual environment: {exc}") from exc + + # --- State persistence ----------------------------------------------------------- + + def _load_state(self) -> Dict[str, Any]: + if not os.path.exists(self._state_path): + return {} + try: + with open(self._state_path, "r", encoding="utf-8") as handle: + data = json.load(handle) + return data if isinstance(data, dict) else {} + except Exception as exc: + logger.warning(f"Failed to load LMDeploy manager state: {exc}") + return {} + + def _save_state(self, state: Dict[str, Any]) -> None: + tmp_path = f"{self._state_path}.tmp" + with open(tmp_path, "w", encoding="utf-8") as handle: + json.dump(state, handle, indent=2) + os.replace(tmp_path, self._state_path) + + def _detect_installed_version(self) -> Optional[str]: + python_exe = self._venv_python() + if not os.path.exists(python_exe): + return None + script = ( + "import importlib, sys\n" + "try:\n" + " from importlib import metadata\n" + "except ImportError:\n" + " import importlib_metadata as metadata\n" + "try:\n" + " print(metadata.version('lmdeploy'))\n" + "except metadata.PackageNotFoundError:\n" + " sys.exit(1)\n" + ) + try: + output = subprocess.check_output([python_exe, "-c", script], text=True).strip() + return output or None + except subprocess.CalledProcessError: + return None + except Exception as exc: # pragma: no cover + 
logger.debug(f"Unable to determine LMDeploy version: {exc}") + return None + + def _resolve_binary_path(self) -> Optional[str]: + override = os.getenv("LMDEPLOY_BIN") + if override: + override_path = os.path.abspath(os.path.expanduser(override)) + if os.path.exists(override_path): + return override_path + resolved_override = shutil.which(override) + if resolved_override: + return resolved_override + + candidate = self._venv_bin("lmdeploy") + if os.path.exists(candidate) and os.access(candidate, os.X_OK): + return os.path.abspath(candidate) + + return shutil.which("lmdeploy") + + def _update_installed_state(self, installed: bool, version: Optional[str]) -> None: + state = self._load_state() + if installed: + state["installed_at"] = _utcnow() + state["installed_version"] = version + state["venv_path"] = self._venv_path + else: + state["installed_version"] = None + state["installed_at"] = None + state["removed_at"] = _utcnow() + state["venv_path"] = self._venv_path + self._save_state(state) + + def _refresh_state_from_environment(self) -> None: + state = self._load_state() + version = self._detect_installed_version() + state["installed_version"] = version + if version is None: + state["removed_at"] = _utcnow() + state["venv_path"] = self._venv_path + self._save_state(state) + + # --- PIP helpers and progress broadcasting -------------------------------------- + + async def _run_pip( + self, + args: list[str], + operation: str, + ensure_venv: bool = True, + cwd: Optional[str] = None, + ) -> int: + if ensure_venv: + self._ensure_venv() + python_exe = self._venv_python() + if not os.path.exists(python_exe): + raise RuntimeError("LMDeploy virtual environment is missing; cannot run pip.") + + header = f"[{_utcnow()}] Starting LMDeploy {operation} via pip {' '.join(args)}\n" + with open(self._log_path, "w", encoding="utf-8") as log_file: + log_file.write(header) + + process = await asyncio.create_subprocess_exec( + python_exe, + "-m", + "pip", + *args, + stdout=PIPE, + 
stderr=STDOUT, + cwd=cwd, + ) + + async def _stream_output() -> None: + if process.stdout is None: + return + with open(self._log_path, "a", encoding="utf-8", buffering=1) as log_file: + while True: + chunk = await process.stdout.readline() + if not chunk: + break + text = chunk.decode("utf-8", errors="replace") + log_file.write(text) + await self._broadcast_log_line(text.rstrip("\n")) + + await asyncio.gather(process.wait(), _stream_output()) + return process.returncode or 0 + + async def _broadcast_log_line(self, line: str) -> None: + try: + await get_progress_manager().broadcast( + {"type": "lmdeploy_install_log", "line": line, "timestamp": _utcnow()} + ) + except Exception as exc: # pragma: no cover + logger.debug(f"Failed to broadcast LMDeploy log line: {exc}") + + async def _set_operation(self, operation: str) -> None: + self._operation = operation + self._operation_started_at = _utcnow() + self._last_error = None + await get_progress_manager().broadcast( + { + "type": "lmdeploy_install_status", + "status": operation, + "started_at": self._operation_started_at, + } + ) + + async def _finish_operation(self, success: bool, message: str = "") -> None: + payload = { + "type": "lmdeploy_install_status", + "status": "completed" if success else "failed", + "operation": self._operation, + "message": message, + "ended_at": _utcnow(), + } + await get_progress_manager().broadcast(payload) + self._operation = None + self._operation_started_at = None + + def _create_task(self, coro: Awaitable[Any]) -> None: + loop = asyncio.get_running_loop() + task = loop.create_task(coro) + self._current_task = task + + def _cleanup(fut: asyncio.Future) -> None: + try: + fut.result() + except Exception as exc: # pragma: no cover + logger.error(f"LMDeploy manager task error: {exc}") + finally: + self._current_task = None + + task.add_done_callback(_cleanup) + + # --- Public interface ----------------------------------------------------------- + + async def install_release( + self, 
version: Optional[str] = None, force_reinstall: bool = False + ) -> Dict[str, Any]: + """Install LMDeploy from PyPI into its own venv.""" + async with self._lock: + if self._operation: + raise RuntimeError("Another LMDeploy operation is already running") + await self._set_operation("install") + # Create a fresh, versioned install directory for this LMDeploy release. + self._prepare_versioned_paths(label="pip") + args = ["install", "--upgrade"] + if force_reinstall: + args.append("--force-reinstall") + package = "lmdeploy" + if version: + package = f"lmdeploy=={version}" + args.append(package) + + async def _runner(): try: - await self._wait_for_ready() + code = await self._run_pip(args, "install") + if code != 0: + raise RuntimeError(f"pip exited with status {code}") + detected_version = self._detect_installed_version() + self._update_installed_state(True, detected_version) + # Persist engine metadata in engines.yaml (used by llama-swap config) + try: + store = get_store() + version_name = detected_version or f"pip-{_utcnow()}" + meta: Dict[str, Any] = { + "version": version_name, + "install_type": "pip", + "venv_path": self._venv_path, + "installed_at": _utcnow(), + } + # Register LMDeploy as a versioned engine, same pattern as llama_cpp. 
+ store.add_engine_version("lmdeploy", meta) + store.set_active_engine_version("lmdeploy", version_name) + except Exception as exc: + logger.debug(f"Failed to persist LMDeploy engine metadata: {exc}") + await self._finish_operation(True, "LMDeploy installed") except Exception as exc: - await self.stop(force=True) - raise exc - - return self.status() - - async def stop(self, force: bool = False) -> None: - """Stop LMDeploy process if running.""" - async with self._lock: - if not self._process: - return - if self._process.returncode is None: - try: - self._process.terminate() - await asyncio.wait_for(self._process.wait(), timeout=30) - except asyncio.TimeoutError: - logger.warning( - "LMDeploy did not terminate gracefully; killing process" - ) - self._process.kill() - await self._process.wait() - except ProcessLookupError: - logger.debug("LMDeploy process already stopped") - elif force: - try: - self._process.kill() - except ProcessLookupError: - pass - self._cleanup_process_state() - - async def restart( - self, model_entry: Dict[str, Any], config: Dict[str, Any] - ) -> Dict[str, Any]: - """Restart LMDeploy with a new model/config.""" - await self.stop() - return await self.start(model_entry, config) - - def status(self) -> Dict[str, Any]: - """Return status payload describing the running instance.""" - running = bool(self._process and self._process.returncode is None) - detection = None - if not running: - detection = self._detect_external_process() - if detection: - running = True - self._last_detected_external = detection - if not self._current_instance: - self._current_instance = detection.get("instance") - if not self._started_at: - self._started_at = detection.get("started_at") - else: - self._last_detected_external = None - else: - self._last_detected_external = None - - return { - "running": running, - "port": self.port, - "host": self.host, - "process_id": self._process.pid if running else None, - "started_at": self._started_at, - "current_instance": 
self._current_instance if running else None, - "health": self._last_health_status, - "binary_path": self._current_binary_path(), - "log_path": self._log_path, - "auto_detected": bool(detection), - "detection": detection, - } - - def _current_binary_path(self) -> Optional[str]: - try: - return self._resolve_binary() - except FileNotFoundError: - return None - - def _resolve_binary(self) -> str: + self._last_error = str(exc) + self._refresh_state_from_environment() + await self._finish_operation(False, str(exc)) + + self._create_task(_runner()) + return {"message": "LMDeploy installation started"} + + async def install_from_source( + self, + repo_url: str = "https://github.com/InternLM/lmdeploy.git", + branch: str = "main", + ) -> Dict[str, Any]: + """Install LMDeploy from a git repo and branch (for development).""" + async with self._lock: + if self._operation: + raise RuntimeError("Another LMDeploy operation is already running") + await self._set_operation("install_source") + # Create a fresh, versioned install directory for this LMDeploy source build. + self._prepare_versioned_paths(label="source") + clone_dir = os.path.join(self._base_dir, "source") + + async def _runner(): try: - from backend.lmdeploy_installer import get_lmdeploy_installer - - installer_binary = get_lmdeploy_installer().status().get("binary_path") - if installer_binary and os.path.exists(installer_binary): - return installer_binary + self._ensure_venv() + if os.path.exists(clone_dir): + shutil.rmtree(clone_dir) + os.makedirs(clone_dir, exist_ok=True) + proc = await asyncio.create_subprocess_exec( + "git", + "clone", + "--depth", + "1", + "--branch", + branch, + repo_url, + clone_dir, + stdout=PIPE, + stderr=STDOUT, + ) + await proc.wait() + if proc.returncode != 0: + raise RuntimeError(f"git clone failed with code {proc.returncode}") + code = await self._run_pip( + ["install", "-e", "."], "install_source", cwd=clone_dir + ) + if code != 0: + raise RuntimeError(f"pip install -e . 
failed with code {code}") + detected = self._detect_installed_version() + self._update_installed_state(True, detected) + try: + store = get_store() + base_version = detected or branch or "source" + version_name = f"{base_version}-{_utcnow()}" + meta: Dict[str, Any] = { + "version": version_name, + "install_type": "source", + "source_repo": repo_url, + "source_branch": branch, + "venv_path": self._venv_path, + "installed_at": _utcnow(), + } + store.add_engine_version("lmdeploy", meta) + store.set_active_engine_version("lmdeploy", version_name) + except Exception as exc: + logger.debug(f"Failed to persist LMDeploy engine metadata (source): {exc}") + await self._finish_operation(True, f"Installed from {branch}") except Exception as exc: - logger.debug( - f"Failed to resolve LMDeploy binary via installer status: {exc}" - ) - - resolved = shutil.which(self.binary_path) - if resolved: - return resolved - - candidate = os.path.expanduser(self.binary_path) - if os.path.isabs(candidate) and os.path.exists(candidate): - return candidate - raise FileNotFoundError( - "LMDeploy binary not found in PATH. Install LMDeploy from the LMDeploy page or set LMDEPLOY_BIN." 
- ) - - def _build_command( - self, binary: str, model_dir: str, config: Dict[str, Any] - ) -> list: - """Convert stored config into lmdeploy CLI arguments.""" - tensor_parallel = max(1, int(config.get("tensor_parallel") or 1)) - base_session_len = max( - 1024, - int( - config.get("session_len") - or config.get("context_length") - or DEFAULT_LMDEPLOY_CONTEXT - ), - ) - rope_scaling_mode = str(config.get("rope_scaling_mode") or "disabled").lower() - rope_scaling_factor = float(config.get("rope_scaling_factor") or 1.0) - scaling_enabled = ( - rope_scaling_mode not in {"", "none", "disabled"} - and rope_scaling_factor > 1.0 - ) - effective_session_len = base_session_len - if scaling_enabled: - scaled = int(base_session_len * rope_scaling_factor) - effective_session_len = max( - base_session_len, min(scaled, MAX_LMDEPLOY_CONTEXT) - ) - max_batch_size = max(1, int(config.get("max_batch_size") or 4)) - base_prefill = int( - config.get("max_prefill_token_num") - or config.get("max_batch_tokens") - or (base_session_len * 2) - ) - if scaling_enabled: - scaled_prefill = int(base_prefill * rope_scaling_factor) - max_prefill_token_num = scaled_prefill - else: - max_prefill_token_num = base_prefill - - command = [ - binary, - "serve", - "api_server", - model_dir, - "--backend", - "turbomind", - "--server-name", - self.host, - "--server-port", - str(self.port), - "--tp", - str(tensor_parallel), - "--session-len", - str(effective_session_len), - "--max-batch-size", - str(max_batch_size), - ] - - # Optional model identity for OpenAI-style /v1/models listing - model_name = config.get("model_name") - if model_name and str(model_name).strip(): - command.extend(["--model-name", str(model_name).strip()]) - - # Optional inference settings - dtype = config.get("dtype") - if dtype and str(dtype).strip(): - command.extend(["--dtype", str(dtype).strip()]) - if max_prefill_token_num: - command.extend(["--max-prefill-token-num", str(max_prefill_token_num)]) - cache_max_entry_count = 
config.get("cache_max_entry_count") - if cache_max_entry_count is not None: - command.extend(["--cache-max-entry-count", str(cache_max_entry_count)]) - cache_block_seq_len = config.get("cache_block_seq_len") - if cache_block_seq_len: - command.extend(["--cache-block-seq-len", str(cache_block_seq_len)]) - if config.get("enable_prefix_caching"): - command.append("--enable-prefix-caching") - quant_policy = config.get("quant_policy") - if quant_policy is not None: - command.extend(["--quant-policy", str(quant_policy)]) - model_format = config.get("model_format") - if model_format and str(model_format).strip(): - command.extend(["--model-format", str(model_format).strip()]) - hf_overrides = config.get("hf_overrides") - if isinstance(hf_overrides, dict) and hf_overrides: - - def _flatten(prefix: str, value: Any): - if isinstance(value, dict): - for key, nested in value.items(): - if not isinstance(key, str) or not key: - continue - new_prefix = f"{prefix}.{key}" if prefix else key - yield from _flatten(new_prefix, nested) - else: - yield prefix, value - - def _format_override_value(val: Any) -> str: - if isinstance(val, bool): - return "true" if val else "false" - if val is None: - return "null" - return str(val) - - for path, value in _flatten("", hf_overrides): - if not path: - continue - command.extend( - [f"--hf-overrides.{path}", _format_override_value(value)] - ) - elif isinstance(hf_overrides, str) and hf_overrides.strip(): - command.extend(["--hf-overrides", hf_overrides.strip()]) - # LMDeploy uses --disable-metrics (inverted logic) - # When enable_metrics=false, send --disable-metrics - # When enable_metrics=true (default), don't send anything (metrics enabled by default) - if not config.get("enable_metrics", True): - command.append("--disable-metrics") - if scaling_enabled: - command.extend(["--rope-scaling-factor", str(rope_scaling_factor)]) - num_tokens_per_iter = config.get("num_tokens_per_iter") - if num_tokens_per_iter: - 
command.extend(["--num-tokens-per-iter", str(num_tokens_per_iter)]) - max_prefill_iters = config.get("max_prefill_iters") - if max_prefill_iters: - command.extend(["--max-prefill-iters", str(max_prefill_iters)]) - communicator = config.get("communicator") - if communicator and str(communicator).strip(): - command.extend(["--communicator", str(communicator).strip()]) - - # Server configuration parameters - allow_origins = config.get("allow_origins") - if allow_origins: - if isinstance(allow_origins, list): - command.extend( - ["--allow-origins"] + [str(origin) for origin in allow_origins] - ) - elif isinstance(allow_origins, str): - command.extend(["--allow-origins", allow_origins]) - if config.get("allow_credentials"): - command.append("--allow-credentials") - allow_methods = config.get("allow_methods") - if allow_methods: - if isinstance(allow_methods, list): - command.extend( - ["--allow-methods"] + [str(method) for method in allow_methods] - ) - elif isinstance(allow_methods, str): - command.extend(["--allow-methods", allow_methods]) - allow_headers = config.get("allow_headers") - if allow_headers: - if isinstance(allow_headers, list): - command.extend( - ["--allow-headers"] + [str(header) for header in allow_headers] - ) - elif isinstance(allow_headers, str): - command.extend(["--allow-headers", allow_headers]) - proxy_url = config.get("proxy_url") - if proxy_url and str(proxy_url).strip(): - command.extend(["--proxy-url", str(proxy_url).strip()]) - max_concurrent_requests = config.get("max_concurrent_requests") - if max_concurrent_requests is not None: - command.extend( - ["--max-concurrent-requests", str(int(max_concurrent_requests))] - ) - log_level = config.get("log_level") - if log_level and str(log_level).strip(): - command.extend(["--log-level", str(log_level).strip()]) - api_keys = config.get("api_keys") - if api_keys: - if isinstance(api_keys, list): - command.extend(["--api-keys"] + [str(key) for key in api_keys]) - elif isinstance(api_keys, str): - 
command.extend(["--api-keys", api_keys]) - if config.get("ssl"): - command.append("--ssl") - max_log_len = config.get("max_log_len") - if max_log_len is not None: - command.extend(["--max-log-len", str(int(max_log_len))]) - if config.get("disable_fastapi_docs"): - command.append("--disable-fastapi-docs") - if config.get("allow_terminate_by_client"): - command.append("--allow-terminate-by-client") - if config.get("enable_abort_handling"): - command.append("--enable-abort-handling") - - # Model configuration parameters - chat_template = config.get("chat_template") - if chat_template and str(chat_template).strip(): - command.extend(["--chat-template", str(chat_template).strip()]) - tool_call_parser = config.get("tool_call_parser") - if tool_call_parser and str(tool_call_parser).strip(): - command.extend(["--tool-call-parser", str(tool_call_parser).strip()]) - reasoning_parser = config.get("reasoning_parser") - if reasoning_parser and str(reasoning_parser).strip(): - command.extend(["--reasoning-parser", str(reasoning_parser).strip()]) - revision = config.get("revision") - if revision and str(revision).strip(): - command.extend(["--revision", str(revision).strip()]) - download_dir = config.get("download_dir") - if download_dir and str(download_dir).strip(): - command.extend(["--download-dir", str(download_dir).strip()]) - adapters = config.get("adapters") - if adapters: - if isinstance(adapters, list): - command.extend(["--adapters"] + [str(adapter) for adapter in adapters]) - elif isinstance(adapters, str): - command.extend(["--adapters", adapters]) - device = config.get("device") - if device and str(device).strip(): - command.extend(["--device", str(device).strip()]) - if config.get("eager_mode"): - command.append("--eager-mode") - if config.get("disable_vision_encoder"): - command.append("--disable-vision-encoder") - logprobs_mode = config.get("logprobs_mode") - if logprobs_mode is not None: - command.extend(["--logprobs-mode", str(logprobs_mode)]) - - # DLLM 
parameters - dllm_block_length = config.get("dllm_block_length") - if dllm_block_length is not None: - command.extend(["--dllm-block-length", str(int(dllm_block_length))]) - dllm_unmasking_strategy = config.get("dllm_unmasking_strategy") - if dllm_unmasking_strategy and str(dllm_unmasking_strategy).strip(): - command.extend( - ["--dllm-unmasking-strategy", str(dllm_unmasking_strategy).strip()] - ) - dllm_denoising_steps = config.get("dllm_denoising_steps") - if dllm_denoising_steps is not None: - command.extend(["--dllm-denoising-steps", str(int(dllm_denoising_steps))]) - dllm_confidence_threshold = config.get("dllm_confidence_threshold") - if dllm_confidence_threshold is not None: - command.extend( - ["--dllm-confidence-threshold", str(float(dllm_confidence_threshold))] - ) - - # Distributed/Multi-node parameters - dp = config.get("dp") - if dp is not None: - command.extend(["--dp", str(int(dp))]) - ep = config.get("ep") - if ep is not None: - command.extend(["--ep", str(int(ep))]) - if config.get("enable_microbatch"): - command.append("--enable-microbatch") - if config.get("enable_eplb"): - command.append("--enable-eplb") - role = config.get("role") - if role and str(role).strip(): - command.extend(["--role", str(role).strip()]) - migration_backend = config.get("migration_backend") - if migration_backend and str(migration_backend).strip(): - command.extend(["--migration-backend", str(migration_backend).strip()]) - node_rank = config.get("node_rank") - if node_rank is not None: - command.extend(["--node-rank", str(int(node_rank))]) - nnodes = config.get("nnodes") - if nnodes is not None: - command.extend(["--nnodes", str(int(nnodes))]) - cp = config.get("cp") - if cp is not None: - command.extend(["--cp", str(int(cp))]) - if config.get("enable_return_routed_experts"): - command.append("--enable-return-routed-experts") - distributed_executor_backend = config.get("distributed_executor_backend") - if distributed_executor_backend and 
str(distributed_executor_backend).strip(): - command.extend( - [ - "--distributed-executor-backend", - str(distributed_executor_backend).strip(), - ] - ) - - # Vision parameters - vision_max_batch_size = config.get("vision_max_batch_size") - if vision_max_batch_size is not None: - command.extend(["--vision-max-batch-size", str(int(vision_max_batch_size))]) - - # Speculative decoding parameters - speculative_algorithm = config.get("speculative_algorithm") - if speculative_algorithm and str(speculative_algorithm).strip(): - command.extend( - ["--speculative-algorithm", str(speculative_algorithm).strip()] - ) - speculative_draft_model = config.get("speculative_draft_model") - if speculative_draft_model and str(speculative_draft_model).strip(): - command.extend( - ["--speculative-draft-model", str(speculative_draft_model).strip()] - ) - speculative_num_draft_tokens = config.get("speculative_num_draft_tokens") - if speculative_num_draft_tokens is not None: - command.extend( - [ - "--speculative-num-draft-tokens", - str(int(speculative_num_draft_tokens)), - ] - ) - - additional_args = config.get("additional_args") - if isinstance(additional_args, str) and additional_args.strip(): - command.extend(shlex.split(additional_args.strip())) - - return command - - async def _wait_for_ready(self) -> None: - """Poll LMDeploy server until healthy or timeout.""" - start_time = asyncio.get_event_loop().time() - url = f"http://{self.host}:{self.port}/v1/models" - async with httpx.AsyncClient(timeout=5.0) as client: - while True: - if self._process and self._process.returncode not in (None, 0): - self._raise_with_logs( - f"LMDeploy exited unexpectedly with code {self._process.returncode}" - ) - try: - response = await client.get(url) - if response.status_code == 200: - self._last_health_status = { - "status": "ready", - "checked_at": datetime.utcnow().isoformat() + "Z", - } - return - except Exception as exc: - logger.debug(f"LMDeploy health check pending: {exc}") - if 
asyncio.get_event_loop().time() - start_time > self._health_timeout: - self._raise_with_logs( - "Timed out waiting for LMDeploy server to become ready" - ) - await asyncio.sleep(2) - - def _cleanup_process_state(self) -> None: - if self._log_file: - try: - self._log_file.close() - except Exception: - pass - self._log_file = None - self._process = None - self._current_instance = None - self._started_at = None - self._last_health_status = { - "status": "stopped", - "checked_at": datetime.utcnow().isoformat() + "Z", - } - - def read_log_tail(self, max_bytes: int = 8192) -> str: - """Return the tail of the lmdeploy log file for debugging.""" + self._last_error = str(exc) + self._refresh_state_from_environment() + await self._finish_operation(False, str(exc)) + + self._create_task(_runner()) + return { + "message": "LMDeploy install from source started", + "repo": repo_url, + "branch": branch, + } + + async def remove(self) -> Dict[str, Any]: + """Remove LMDeploy from its venv and clean up state.""" + async with self._lock: + if self._operation: + raise RuntimeError("Another LMDeploy operation is already running") + await self._set_operation("remove") + args = ["uninstall", "-y", "lmdeploy"] + + async def _runner(): try: - with open(self._log_path, "rb") as log_file: - log_file.seek(0, os.SEEK_END) - file_size = log_file.tell() - seek_pos = max(0, file_size - max_bytes) - log_file.seek(seek_pos) - data = log_file.read().decode("utf-8", errors="replace") - if seek_pos > 0: - # Remove potential partial first line - data = data.split("\n", 1)[-1] - return data.strip() + from backend.data_store import get_store + + store = get_store() + active = store.get_active_engine_version("lmdeploy") + venv_path = active.get("venv_path") if active else self._venv_path + + python_exists = os.path.exists(self._venv_python()) + if python_exists: + code = await self._run_pip(args, "remove", ensure_venv=False) + if code != 0: + raise RuntimeError(f"pip exited with status {code}") + if 
venv_path: + shutil.rmtree(venv_path, ignore_errors=True) + if active and active.get("version"): + try: + store.delete_engine_version("lmdeploy", active["version"]) + except Exception as exc: # pragma: no cover + logger.debug(f"Failed to delete LMDeploy engine version metadata: {exc}") + self._update_installed_state(False, None) + await self._finish_operation(True, "LMDeploy removed") except Exception as exc: - logger.error(f"Failed to read LMDeploy log tail: {exc}") - return "" + self._last_error = str(exc) + self._refresh_state_from_environment() + await self._finish_operation(False, str(exc)) + + self._create_task(_runner()) + return {"message": "LMDeploy removal started"} + + # --- Introspection -------------------------------------------------------------- + + def status(self) -> Dict[str, Any]: + version = self._detect_installed_version() + binary_path = self._resolve_binary_path() + installed = version is not None and binary_path is not None + state = self._load_state() + return { + "installed": installed, + "version": version, + "binary_path": binary_path, + "venv_path": state.get("venv_path") or self._venv_path, + "installed_at": state.get("installed_at"), + "removed_at": state.get("removed_at"), + "operation": self._operation, + "operation_started_at": self._operation_started_at, + "last_error": self._last_error, + "log_path": self._log_path, + } + + def is_operation_running(self) -> bool: + return self._operation is not None + + def read_log_tail(self, max_bytes: int = 8192) -> str: + if not os.path.exists(self._log_path): + return "" + with open(self._log_path, "rb") as log_file: + log_file.seek(0, os.SEEK_END) + size = log_file.tell() + log_file.seek(max(0, size - max_bytes)) + data = log_file.read().decode("utf-8", errors="replace") + if size > max_bytes: + data = data.split("\n", 1)[-1] + return data.strip() - async def _broadcast_runtime_logs(self) -> None: - """Broadcast new runtime log lines via SSE.""" - try: - if not 
os.path.exists(self._log_path): - return - - # Read new content since last broadcast - current_size = os.path.getsize(self._log_path) - if current_size <= self._last_broadcast_log_position: - return # No new content - - # Read only new content - with open(self._log_path, "rb") as log_file: - log_file.seek(self._last_broadcast_log_position) - new_content = log_file.read().decode("utf-8", errors="replace") - self._last_broadcast_log_position = current_size - - if new_content: - # Split into lines and broadcast each non-empty line via SSE - lines = new_content.split('\n') - for line in lines: - if line.strip(): - get_progress_manager().emit("lmdeploy_runtime_log", {"line": line.strip(), "timestamp": datetime.utcnow().isoformat()}) - except Exception as exc: - logger.debug(f"Failed to broadcast LMDeploy runtime logs: {exc}") - - def _read_log_tail(self, max_bytes: int = 8192) -> str: - """Private alias for backward compatibility.""" - return self.read_log_tail(max_bytes) - - def _raise_with_logs(self, message: str) -> None: - """Raise a runtime error that includes the recent LMDeploy logs.""" - log_tail = self.read_log_tail() - if log_tail: - logger.error( - f"{message}\n--- LMDeploy log tail ---\n{log_tail}\n--- end ---" - ) - raise RuntimeError(f"{message}. 
See logs for details.\n{log_tail}") - raise RuntimeError(message) - - def _detect_external_process(self) -> Optional[Dict[str, Any]]: - """Scan system processes for an LMDeploy server launched outside the manager.""" - try: - for proc in psutil.process_iter(attrs=["pid", "cmdline", "create_time"]): - cmdline: List[str] = proc.info.get("cmdline") or [] - if not cmdline: - continue - lowered = " ".join(cmdline).lower() - if "lmdeploy" not in lowered: - continue - if "serve" not in lowered or "api_server" not in lowered: - continue - - try: - api_server_idx = cmdline.index("api_server") - except ValueError: - continue - model_dir = ( - cmdline[api_server_idx + 1] - if len(cmdline) > api_server_idx + 1 - else None - ) - detection = { - "pid": proc.info["pid"], - "cmdline": cmdline, - "model_dir": model_dir, - "detected_at": datetime.utcnow().isoformat() + "Z", - } - - config = self._config_from_cmdline(cmdline) - model_entry = ( - self._lookup_model_by_dir(model_dir) if model_dir else None - ) - if model_entry: - self._ensure_running_instance_record(model_entry.get("id"), config) - detection["instance"] = { - "model_id": model_entry.get("id"), - "huggingface_id": model_entry.get("huggingface_id"), - "file_path": model_entry.get("file_path"), - "config": config, - "pid": proc.info["pid"], - "auto_detected": True, - } - detection["model_id"] = model_entry.get("id") - detection["huggingface_id"] = model_entry.get("huggingface_id") - else: - detection["instance"] = { - "model_id": None, - "huggingface_id": None, - "file_path": model_dir, - "config": config, - "pid": proc.info["pid"], - "auto_detected": True, - } - - started_at = proc.info.get("create_time") - if started_at: - detection["started_at"] = ( - datetime.utcfromtimestamp(started_at).isoformat() + "Z" - ) - else: - detection["started_at"] = datetime.utcnow().isoformat() + "Z" - return detection - except Exception as exc: - logger.debug(f"LMDeploy external scan failed: {exc}") - return None - - def 
_config_from_cmdline(self, cmdline: List[str]) -> Dict[str, Any]: - """Reconstruct a minimal config dict from lmdeploy CLI arguments.""" - - def _extract(flag: str, cast, default=None): - if flag in cmdline: - idx = cmdline.index(flag) - if idx + 1 < len(cmdline): - try: - return cast(cmdline[idx + 1]) - except (ValueError, TypeError): - return default - return default - - def _extract_list(flag: str, default=None): - """Extract list of values for flags that accept multiple arguments.""" - if flag not in cmdline: - return default - idx = cmdline.index(flag) - result = [] - i = idx + 1 - while i < len(cmdline) and not cmdline[i].startswith("--"): - result.append(cmdline[i]) - i += 1 - return result if result else default - - session_len = _extract("--session-len", int, DEFAULT_LMDEPLOY_CONTEXT) - max_prefill = _extract("--max-prefill-token-num", int, session_len) - # Note: --max-context-token-num doesn't exist in LMDeploy, so derive from session_len - max_context = session_len - - rope_scaling_factor = _extract("--rope-scaling-factor", float, 1.0) - rope_scaling_mode = "disabled" - if rope_scaling_factor and rope_scaling_factor > 1.0: - rope_scaling_mode = "detected" - - hf_overrides: Dict[str, Any] = {} - - def _assign_nested(target: Dict[str, Any], path: List[str], value: Any) -> None: - current = target - for segment in path[:-1]: - current = current.setdefault(segment, {}) - current[path[-1]] = value - - def _coerce_override_value(raw: str) -> Any: - lowered = raw.lower() - if lowered in {"true", "false"}: - return lowered == "true" - if lowered == "null": - return None - try: - if "." 
in raw: - return float(raw) - return int(raw) - except ValueError: - return raw - - i = 0 - while i < len(cmdline): - token = cmdline[i] - if token.startswith("--hf-overrides."): - path_str = token[len("--hf-overrides.") :] - if path_str and i + 1 < len(cmdline): - value = _coerce_override_value(cmdline[i + 1]) - _assign_nested(hf_overrides, path_str.split("."), value) - i += 2 - continue - i += 1 - - config = { - "session_len": session_len, - "tensor_parallel": _extract("--tp", int, 1), - "max_batch_size": _extract("--max-batch-size", int, 4), - "max_prefill_token_num": max_prefill, - "max_context_token_num": max_context, - "dtype": _extract("--dtype", str, "auto"), - "cache_max_entry_count": _extract("--cache-max-entry-count", float, 0.8), - "cache_block_seq_len": _extract("--cache-block-seq-len", int, 64), - "enable_prefix_caching": "--enable-prefix-caching" in cmdline, - "quant_policy": _extract("--quant-policy", int, 0), - "model_format": _extract("--model-format", str, ""), - "hf_overrides": hf_overrides or _extract("--hf-overrides", str, ""), - # LMDeploy uses --disable-metrics, so enable_metrics=True when flag is NOT present - "enable_metrics": "--disable-metrics" not in cmdline, - "rope_scaling_factor": rope_scaling_factor, - "rope_scaling_mode": rope_scaling_mode, - "num_tokens_per_iter": _extract("--num-tokens-per-iter", int, 0), - "max_prefill_iters": _extract("--max-prefill-iters", int, 1), - "communicator": _extract("--communicator", str, "nccl"), - "model_name": _extract("--model-name", str, ""), - # Server configuration - "allow_origins": _extract_list("--allow-origins"), - "allow_credentials": "--allow-credentials" in cmdline, - "allow_methods": _extract_list("--allow-methods"), - "allow_headers": _extract_list("--allow-headers"), - "proxy_url": _extract("--proxy-url", str, ""), - "max_concurrent_requests": _extract("--max-concurrent-requests", int), - "log_level": _extract("--log-level", str, ""), - "api_keys": _extract_list("--api-keys"), - 
"ssl": "--ssl" in cmdline, - "max_log_len": _extract("--max-log-len", int), - "disable_fastapi_docs": "--disable-fastapi-docs" in cmdline, - "allow_terminate_by_client": "--allow-terminate-by-client" in cmdline, - "enable_abort_handling": "--enable-abort-handling" in cmdline, - # Model configuration - "chat_template": _extract("--chat-template", str, ""), - "tool_call_parser": _extract("--tool-call-parser", str, ""), - "reasoning_parser": _extract("--reasoning-parser", str, ""), - "revision": _extract("--revision", str, ""), - "download_dir": _extract("--download-dir", str, ""), - "adapters": _extract_list("--adapters"), - "device": _extract("--device", str, ""), - "eager_mode": "--eager-mode" in cmdline, - "disable_vision_encoder": "--disable-vision-encoder" in cmdline, - "logprobs_mode": _extract("--logprobs-mode", str), - # DLLM parameters - "dllm_block_length": _extract("--dllm-block-length", int), - "dllm_unmasking_strategy": _extract("--dllm-unmasking-strategy", str, ""), - "dllm_denoising_steps": _extract("--dllm-denoising-steps", int), - "dllm_confidence_threshold": _extract("--dllm-confidence-threshold", float), - # Distributed/Multi-node parameters - "dp": _extract("--dp", int), - "ep": _extract("--ep", int), - "enable_microbatch": "--enable-microbatch" in cmdline, - "enable_eplb": "--enable-eplb" in cmdline, - "role": _extract("--role", str, ""), - "migration_backend": _extract("--migration-backend", str, ""), - "node_rank": _extract("--node-rank", int), - "nnodes": _extract("--nnodes", int), - "cp": _extract("--cp", int), - "enable_return_routed_experts": "--enable-return-routed-experts" in cmdline, - "distributed_executor_backend": _extract( - "--distributed-executor-backend", str, "" - ), - # Vision parameters - "vision_max_batch_size": _extract("--vision-max-batch-size", int), - # Speculative decoding parameters - "speculative_algorithm": _extract("--speculative-algorithm", str, ""), - "speculative_draft_model": _extract("--speculative-draft-model", 
str, ""), - "speculative_num_draft_tokens": _extract( - "--speculative-num-draft-tokens", int - ), - "additional_args": "", - } - - return config - - def _lookup_model_by_dir(self, model_dir: Optional[str]) -> Optional[Dict[str, Any]]: - if not model_dir: - return None - store = get_store() - for candidate in store.list_models(): - if (candidate.get("format") or candidate.get("model_format")) != "safetensors": - continue - fp = candidate.get("file_path") - if fp and os.path.dirname(fp) == model_dir: - return candidate - return None - - def _ensure_running_instance_record( - self, model_id: Optional[Any], config: Dict[str, Any] - ) -> None: - # No-op: running state is not persisted to DB (Phase 1 YAML store) - pass diff --git a/backend/main.py b/backend/main.py index c692ee4..d182e60 100644 --- a/backend/main.py +++ b/backend/main.py @@ -15,13 +15,10 @@ llama_versions, status, gpu_info, - llama_version_manager, - lmdeploy, + lmdeploy_versions, ) from backend.huggingface import set_huggingface_token from backend.logging_config import setup_logging, get_logger -from backend.lmdeploy_installer import get_lmdeploy_installer -from backend.lmdeploy_manager import get_lmdeploy_manager # Set up logging setup_logging(level="INFO") @@ -133,32 +130,9 @@ async def register_all_models_with_llama_swap(): logger.warning("llama-server not found, skipping model registration") return - from backend.routes.models import _get_model_file_path - from backend.data_store import generate_proxy_name - - for model in model_list: - file_path = _get_model_file_path(model) - if not file_path or not os.path.exists(file_path): - logger.debug(f"Model '{model.get('id')}' not found in HF cache, skipping") - continue - try: - proxy_name = generate_proxy_name( - model.get("huggingface_id", ""), - model.get("quantization"), - ) - config = (model.get("config") or {}).copy() - config.setdefault("host", "0.0.0.0") - config.setdefault("ctx_size", 2048) - config.setdefault("batch_size", 512) - 
config.setdefault("threads", 4) - model_with_proxy = dict(model, proxy_name=proxy_name) - await llama_swap_manager.register_model(model_with_proxy, config) - logger.info( - f"Registered model '{model.get('display_name', model.get('id'))}' as '{proxy_name}' with llama-swap" - ) - except Exception as e: - logger.error(f"Failed to register model '{model.get('id')}' with llama-swap: {e}") - + # Legacy auto-registration based on local file paths has been removed. + # llama-swap configuration is now generated purely from logical models + # (Hugging Face repo + quantization) via generate_llama_swap_config. await llama_swap_manager.regenerate_config_with_active_version() @@ -266,12 +240,9 @@ async def lifespan(app: FastAPI): app.include_router( llama_versions.router, prefix="/api/llama-versions", tags=["llama-versions"] ) -app.include_router( - llama_version_manager.router, prefix="/api", tags=["llama-version-manager"] -) app.include_router(status.router, prefix="/api", tags=["status"]) app.include_router(gpu_info.router, prefix="/api", tags=["gpu"]) -app.include_router(lmdeploy.router, prefix="/api", tags=["lmdeploy"]) +app.include_router(lmdeploy_versions.router, prefix="/api", tags=["lmdeploy"]) # SSE endpoint for progress tracking from backend.progress_manager import get_progress_manager diff --git a/backend/model_introspection.py b/backend/model_introspection.py new file mode 100644 index 0000000..ec365b4 --- /dev/null +++ b/backend/model_introspection.py @@ -0,0 +1,555 @@ +from __future__ import annotations + +import json +import os +from dataclasses import dataclass +from typing import Any, Dict, Iterable, List, Optional, Tuple + +from backend.logging_config import get_logger + +logger = get_logger(__name__) + + +@dataclass +class ModelInfo: + """Normalized, high-level view of a GGUF model.""" + + architecture: str + layer_count: int + block_count: int + context_length: int + parameter_count_display: Optional[str] + vocab_size: Optional[int] + embedding_length: 
Optional[int] + attention_head_count: Optional[int] + attention_head_count_kv: Optional[int] + is_moe: bool + expert_count: Optional[int] + experts_used_count: Optional[int] + raw_metadata: Dict[str, Any] + + +@dataclass +class TensorInfo: + """Lightweight description of a tensor from GGUF metadata.""" + + name: str + shape: Tuple[int, ...] + type_id: int + offset: int + + +def _parse_numeric_with_suffix(value: Any) -> Optional[int]: + """ + Parse human-readable numeric strings like '7B', '1.7M', or plain integers. + + Returns an integer number of parameters / units, or None if parsing fails. + """ + if isinstance(value, (int, float)): + return int(value) if value > 0 else None + + if not isinstance(value, str): + return None + + text = value.strip() + if not text: + return None + + # Normalize underscores and commas + text = text.replace("_", "").replace(",", "") + last = text[-1].upper() + + multiplier = 1 + number_part = text + if last in ("K", "M", "B"): + number_part = text[:-1] + if last == "K": + multiplier = int(1e3) + elif last == "M": + multiplier = int(1e6) + else: + multiplier = int(1e9) + + try: + num = float(number_part) + if num <= 0: + return None + return int(num * multiplier) + except (ValueError, TypeError): + return None + + +def _format_human_readable(value: Optional[int]) -> Optional[str]: + """Format an integer as K/M/B string for display, or return None.""" + if value is None: + return None + if value >= 1_000_000_000: + base = value / 1_000_000_000 + return f"{int(base)}B" if base.is_integer() else f"{base:.1f}B" + if value >= 1_000_000: + base = value / 1_000_000 + return f"{int(base)}M" if base.is_integer() else f"{base:.1f}M" + if value >= 1_000: + base = value / 1_000 + return f"{int(base)}K" if base.is_integer() else f"{base:.1f}K" + return str(value) + + +def _find_numeric_candidates( + metadata: Dict[str, Any], + include_terms: Iterable[str], + exclude_terms: Iterable[str] | None = None, + max_value: Optional[int] = None, +) -> 
List[Tuple[str, int]]: + """Return (key, value) pairs whose key and numeric value match the filters.""" + exclude_terms = tuple(exclude_terms or ()) + include_terms = tuple(include_terms) + + candidates: List[Tuple[str, int]] = [] + for key, value in metadata.items(): + key_lower = key.lower() + if not all(term in key_lower for term in include_terms): + continue + if any(term in key_lower for term in exclude_terms): + continue + + parsed = _parse_numeric_with_suffix(value) + if parsed is None: + continue + if max_value is not None and parsed > max_value: + continue + candidates.append((key, parsed)) + + return candidates + + +_INTROSPECTION_CONFIG: Optional[Dict[str, Any]] = None + + +def _load_introspection_config() -> Dict[str, Any]: + """ + Load optional JSON config for architecture-specific GGUF introspection rules. + + The file is expected at ``backend/gguf_introspection_config.json``. Any + errors while loading are logged and result in an empty config. + """ + global _INTROSPECTION_CONFIG + if _INTROSPECTION_CONFIG is not None: + return _INTROSPECTION_CONFIG + + cfg_path = os.path.join(os.path.dirname(__file__), "gguf_introspection_config.json") + if not os.path.exists(cfg_path): + _INTROSPECTION_CONFIG = {} + return _INTROSPECTION_CONFIG + + try: + with open(cfg_path, "r", encoding="utf-8") as f: + data = json.load(f) + if isinstance(data, dict): + _INTROSPECTION_CONFIG = data + else: + logger.warning( + "gguf_introspection_config.json must contain a JSON object; got %s", + type(data), + ) + _INTROSPECTION_CONFIG = {} + except Exception as exc: + logger.warning("Failed to load gguf_introspection_config.json: %s", exc) + _INTROSPECTION_CONFIG = {} + + return _INTROSPECTION_CONFIG + + +class GgufIntrospector: + """ + Data-driven GGUF model introspector. + + Consumes raw GGUF metadata and tensor descriptors and produces a normalized + ModelInfo structure using generic key-pattern matching and simple heuristics. 
+ """ + + # Sanity limits to defend against corrupted or adversarial metadata + MAX_CONTEXT = 1_000_000_000 + MAX_LAYERS = 4096 + MAX_HEADS = 8192 + + def __init__( + self, + metadata: Dict[str, Any], + tensors: Dict[str, Dict[str, Any]] | None = None, + ): + self.metadata = metadata or {} + self.tensors = tensors or {} + self.architecture = str( + self.metadata.get("general.architecture", "") or "" + ).lower() + self._config = _load_introspection_config() + + # Public orchestration ------------------------------------------------- + + def build_model_info(self) -> ModelInfo: + context_length = self._extract_context_length() + block_count, layer_count = self._extract_layer_and_block_counts() + param_count_int, param_display = self._extract_parameter_count() + ( + attention_head_count, + attention_head_count_kv, + ) = self._extract_attention_heads() + is_moe, expert_count, experts_used_count = self._extract_moe_info() + embedding_length = self._extract_embedding_length() + vocab_size = self._extract_vocab_size() + + return ModelInfo( + architecture=self.architecture, + layer_count=layer_count, + block_count=block_count, + context_length=context_length, + parameter_count_display=param_display, + vocab_size=vocab_size, + embedding_length=embedding_length, + attention_head_count=attention_head_count, + attention_head_count_kv=attention_head_count_kv, + is_moe=is_moe, + expert_count=expert_count, + experts_used_count=experts_used_count, + raw_metadata=self.metadata, + ) + + # Property extractors -------------------------------------------------- + + def _get_property_configs(self, prop: str) -> List[Dict[str, Any]]: + """ + Return a list of config sections relevant for the given property. + + Order of precedence: + 1. Global section + 2. Architecture-specific sections whose ``match_arch`` entries are + contained in the lowercased architecture string. 
+ """ + cfg = self._config or {} + results: List[Dict[str, Any]] = [] + + global_cfg = cfg.get("global") + if isinstance(global_cfg, dict): + prop_cfg = global_cfg.get(prop) + if isinstance(prop_cfg, dict): + results.append(prop_cfg) + + for name, section in cfg.items(): + if name == "global" or not isinstance(section, dict): + continue + match_arch = section.get("match_arch") or [] + if not isinstance(match_arch, list): + continue + if not any( + isinstance(token, str) and token.lower() in self.architecture + for token in match_arch + ): + continue + prop_cfg = section.get(prop) + if isinstance(prop_cfg, dict): + results.append(prop_cfg) + + return results + + def _extract_context_length(self) -> int: + candidates: List[int] = [] + + # 1) Config-driven preferred keys + for cfg in self._get_property_configs("context_length"): + preferred = cfg.get("preferred_keys") or [] + for key in preferred: + if key in self.metadata: + parsed = _parse_numeric_with_suffix(self.metadata[key]) + if parsed is None or parsed <= 0 or parsed > self.MAX_CONTEXT: + continue + candidates.append(parsed) + + if candidates: + break + + fallback_terms = cfg.get("fallback_terms") or [] + if fallback_terms: + for _, value in _find_numeric_candidates( + self.metadata, + include_terms=tuple(fallback_terms), + exclude_terms=("generation", "prefill"), + max_value=self.MAX_CONTEXT, + ): + candidates.append(value) + + if candidates: + break + + # 2) Generic terms for context length (if config did not resolve it) + if not candidates: + terms_sets = [ + ("context",), + ("model_max_length",), + ("max_position_embeddings",), + ("max_seq_len",), + ("max_sequence_length",), + ] + + for terms in terms_sets: + for _, value in _find_numeric_candidates( + self.metadata, + include_terms=terms, + exclude_terms=("generation", "prefill"), + max_value=self.MAX_CONTEXT, + ): + candidates.append(value) + + if not candidates: + # As a last resort, look for any key that mentions both "max" and "length" + for _, value 
in _find_numeric_candidates( + self.metadata, + include_terms=("max", "length"), + max_value=self.MAX_CONTEXT, + ): + candidates.append(value) + + if not candidates: + return 0 + + best = max(candidates) + if len(set(candidates)) > 1: + logger.debug( + "Multiple context length candidates detected %s, using max=%s", + candidates, + best, + ) + return best + + def _extract_layer_and_block_counts(self) -> Tuple[int, int]: + numeric_candidates: List[int] = [] + + # 1) Config-driven preferred keys + for cfg in self._get_property_configs("layer_count"): + preferred = cfg.get("preferred_keys") or [] + for key in preferred: + if key in self.metadata: + parsed = _parse_numeric_with_suffix(self.metadata[key]) + if parsed is None or parsed <= 0 or parsed > self.MAX_LAYERS: + continue + numeric_candidates.append(parsed) + + if numeric_candidates: + break + + fallback_terms = cfg.get("fallback_terms") or [] + if fallback_terms: + for _, value in _find_numeric_candidates( + self.metadata, + include_terms=tuple(fallback_terms), + max_value=self.MAX_LAYERS, + ): + numeric_candidates.append(value) + + if numeric_candidates: + break + + # 2) Generic key-based candidates + if not numeric_candidates: + key_terms = [ + ("block_count",), + ("layer_count",), + ("n_layer",), + ("num_layers",), + ("num_hidden_layers",), + ] + + for terms in key_terms: + for _, value in _find_numeric_candidates( + self.metadata, + include_terms=terms, + max_value=self.MAX_LAYERS, + ): + numeric_candidates.append(value) + + block_count = layer_count = 0 + if numeric_candidates: + layer_count = max(numeric_candidates) + block_count = layer_count + if len(set(numeric_candidates)) > 1: + logger.debug( + "Multiple layer/block candidates detected %s, using max=%s", + numeric_candidates, + layer_count, + ) + else: + # Tensor-based heuristic: count distinct block indices if names contain ".block." 
+ block_indices = self._infer_blocks_from_tensors() + if block_indices: + block_count = len(block_indices) + layer_count = block_count + 1 # usually add output head + else: + # Fallback default for unknown models + layer_count = 32 + block_count = 32 + logger.debug( + "No explicit layer/block metadata found; using default=%s", layer_count + ) + + return block_count, layer_count + + def _infer_blocks_from_tensors(self) -> List[int]: + indices: set[int] = set() + for name in self.tensors.keys(): + lower = name.lower() + # Common patterns: layers.N., layer.N., blk.N., block.N. + for marker in ("layers.", "layer.", "blk.", "block."): + if marker in lower: + try: + after = lower.split(marker, 1)[1] + num_str = "" + for ch in after: + if ch.isdigit(): + num_str += ch + else: + break + if num_str: + indices.add(int(num_str)) + except Exception: + continue + return sorted(indices) + + def _extract_parameter_count(self) -> Tuple[Optional[int], Optional[str]]: + # Look for any key mentioning parameters + raw_candidates: List[int] = [] + for key, value in self.metadata.items(): + key_lower = key.lower() + if "param" not in key_lower: + continue + parsed = _parse_numeric_with_suffix(value) + if parsed is not None and parsed > 0: + raw_candidates.append(parsed) + + if not raw_candidates: + return None, None + + best = max(raw_candidates) + if len(set(raw_candidates)) > 1: + logger.debug( + "Multiple parameter count candidates detected %s, using max=%s", + raw_candidates, + best, + ) + + return best, _format_human_readable(best) + + def _extract_attention_heads(self) -> Tuple[Optional[int], Optional[int]]: + # Attention heads + att_candidates: List[int] = [] + for _, value in _find_numeric_candidates( + self.metadata, + include_terms=("attention", "head"), + max_value=self.MAX_HEADS, + ): + att_candidates.append(value) + + head_count = max(att_candidates) if att_candidates else None + + # KV heads (GQA) + kv_candidates: List[int] = [] + for _, value in _find_numeric_candidates( 
+ self.metadata, + include_terms=("attention", "head", "kv"), + max_value=self.MAX_HEADS, + ): + kv_candidates.append(value) + + head_count_kv = max(kv_candidates) if kv_candidates else None + + return head_count, head_count_kv + + def _extract_moe_info(self) -> Tuple[bool, Optional[int], Optional[int]]: + architecture = str(self.metadata.get("general.architecture", "") or "").lower() + is_moe = "moe" in architecture or "experts" in architecture + + expert_candidates: List[int] = [] + experts_used_candidates: List[int] = [] + + for key, value in self.metadata.items(): + key_lower = key.lower() + if "expert" not in key_lower and "experts" not in key_lower: + continue + + parsed = _parse_numeric_with_suffix(value) + if parsed is None or parsed <= 0: + continue + + if any(term in key_lower for term in ("per_tok", "used", "active")): + experts_used_candidates.append(parsed) + else: + expert_candidates.append(parsed) + + expert_count = max(expert_candidates) if expert_candidates else None + experts_used_count = ( + max(experts_used_candidates) if experts_used_candidates else None + ) + + if expert_count: + is_moe = True + + # Default active experts if only total experts is known + if is_moe and experts_used_count is None and expert_count: + if expert_count >= 64: + experts_used_count = 8 + elif expert_count >= 32: + experts_used_count = 4 + else: + experts_used_count = 2 + + return is_moe, expert_count, experts_used_count + + def _extract_embedding_length(self) -> Optional[int]: + # First try explicit metadata + candidates: List[int] = [] + for _, value in _find_numeric_candidates( + self.metadata, + include_terms=("embedding",), + ): + candidates.append(value) + + if candidates: + return max(candidates) + + # Fallback: use tensor shapes for token embeddings + best: Optional[int] = None + for name, info in self.tensors.items(): + lower = name.lower() + if not any(term in lower for term in ("token_emb", "embed_tokens", "tok_embeddings", "tok_embed")): + continue + shape 
= info.get("shape") or [] + if len(shape) >= 2: + dim = int(shape[-1]) + if best is None or dim > best: + best = dim + return best + + def _extract_vocab_size(self) -> Optional[int]: + # Prefer scalar vocab size keys + candidates: List[int] = [] + for _, value in _find_numeric_candidates( + self.metadata, + include_terms=("vocab_size",), + ): + candidates.append(value) + + if candidates: + return max(candidates) + + # Fallback: derive from embedding matrix first dimension + best: Optional[int] = None + for name, info in self.tensors.items(): + lower = name.lower() + if not any(term in lower for term in ("token_emb", "embed_tokens", "tok_embeddings", "tok_embed")): + continue + shape = info.get("shape") or [] + if len(shape) >= 2: + size = int(shape[0]) + if best is None or size > best: + best = size + return best + diff --git a/backend/param_registry.py b/backend/param_registry.py index 9b8e5bb..b8c5cbd 100644 --- a/backend/param_registry.py +++ b/backend/param_registry.py @@ -12,6 +12,7 @@ # Basic params shown by default (most common for chat/embedding) # Host and port are not included: they are managed by llama-swap (--port ${PORT}, host default 0.0.0.0) LLAMA_CPP_BASIC: List[ParamDef] = [ + {"key": "model_alias", "label": "Model alias", "type": "string", "default": "", "description": "Expose this model under a custom runtime ID instead of the default Hugging Face-derived name"}, {"key": "ctx_size", "label": "Context size", "type": "int", "default": 2048, "min": 512, "max": 1_000_000, "description": "Maximum context length in tokens"}, {"key": "n_gpu_layers", "label": "GPU layers", "type": "int", "default": -1, "min": -1, "max": 1000, "description": "Number of layers to offload to GPU (-1 = all)"}, {"key": "batch_size", "label": "Batch size", "type": "int", "default": 512, "min": 1, "max": 2048, "description": "Batch size for prompt processing"}, @@ -71,6 +72,7 @@ # LMDeploy (safetensors / TurboMind) LMDEPLOY_BASIC: List[ParamDef] = [ + {"key": "model_alias", 
"label": "Model alias", "type": "string", "default": "", "description": "Expose this model under a custom runtime ID instead of the default Hugging Face-derived name"}, {"key": "session_len", "label": "Session length", "type": "int", "default": 2048, "min": 512, "max": 1_000_000, "description": "Maximum session length"}, {"key": "max_batch_size", "label": "Max batch size", "type": "int", "default": 128, "min": 1, "max": 1024, "description": "Maximum batch size"}, {"key": "tensor_parallel", "label": "Tensor parallel", "type": "int", "default": 1, "min": 1, "max": 8, "description": "Tensor parallelism degree"}, diff --git a/backend/routes/llama_version_manager.py b/backend/routes/llama_version_manager.py deleted file mode 100644 index 6b9ee9a..0000000 --- a/backend/routes/llama_version_manager.py +++ /dev/null @@ -1,159 +0,0 @@ -from fastapi import APIRouter, HTTPException -import os -import shutil -import stat -import time - -from backend.data_store import get_store -from backend.logging_config import get_logger - -logger = get_logger(__name__) -router = APIRouter() - - -def _remove_readonly(func, path, exc): - try: - os.chmod(path, stat.S_IWRITE) - func(path) - except Exception as e: - logger.warning(f"Could not remove {path}: {e}") - - -def _robust_rmtree(path: str, max_retries: int = 3) -> None: - if not os.path.exists(path): - return - for attempt in range(max_retries): - try: - shutil.rmtree(path, onerror=_remove_readonly) - logger.info(f"Successfully deleted directory: {path}") - return - except (PermissionError, OSError) as e: - if attempt < max_retries - 1: - time.sleep(0.5) - else: - logger.error(f"Failed to delete {path} after {max_retries} attempts: {e}") - raise - - -def _resolve_binary_path(binary_path: str) -> str: - if not binary_path: - return "" - if os.path.isabs(binary_path): - return binary_path - return os.path.join("/app", binary_path) - - -@router.get("/llama-versions") -async def list_llama_versions(): - """List all installed llama-cpp 
versions (llama_cpp engine).""" - store = get_store() - versions = store.get_engine_versions("llama_cpp") - result = [] - for i, v in enumerate(versions): - binary_path = _resolve_binary_path(v.get("binary_path")) - result.append({ - "id": i, - "version": v.get("version"), - "install_type": v.get("type", "source"), - "source_commit": v.get("source_commit"), - "is_active": store.get_active_engine_version("llama_cpp") and store.get_active_engine_version("llama_cpp").get("version") == v.get("version"), - "installed_at": v.get("installed_at"), - "binary_path": v.get("binary_path"), - "exists": os.path.exists(binary_path) if binary_path else False, - }) - return {"versions": result} - - -@router.post("/llama-versions/{version_id}/activate") -async def activate_llama_version(version_id: str): - """Activate a specific llama-cpp version (version_id can be index, version string, or "llama_cpp:version").""" - store = get_store() - versions = store.get_engine_versions("llama_cpp") - # Frontend may send id from list endpoint: "llama_cpp:version_str" - lookup_id = version_id - if ":" in str(version_id): - parts = str(version_id).split(":", 1) - if parts[0] == "llama_cpp": - lookup_id = parts[1] - version_entry = None - try: - idx = int(lookup_id) - if 0 <= idx < len(versions): - version_entry = versions[idx] - except ValueError: - pass - if not version_entry: - version_entry = next((v for v in versions if str(v.get("version")) == str(lookup_id)), None) - if not version_entry: - raise HTTPException(status_code=404, detail="Version not found") - binary_path = _resolve_binary_path(version_entry.get("binary_path")) - if not os.path.exists(binary_path): - raise HTTPException(status_code=400, detail="Binary file does not exist") - version_str = str(version_entry.get("version")) - store.set_active_engine_version("llama_cpp", version_str) - try: - from backend.llama_swap_manager import get_llama_swap_manager - llama_swap_manager = get_llama_swap_manager() - await 
llama_swap_manager._ensure_correct_binary_path() - await llama_swap_manager.regenerate_config_with_active_version() - try: - await llama_swap_manager.start_proxy() - except Exception as e: - logger.warning(f"Failed to start llama-swap after version activation: {e}") - except Exception as e: - logger.error(f"Failed to regenerate llama-swap config: {e}") - logger.info(f"Activated llama-cpp version: {version_str}") - return {"message": f"Activated llama-cpp version {version_str}"} - - -@router.delete("/llama-versions/{version_id}") -async def delete_llama_version(version_id: str): - """Delete a llama-cpp version (version_id can be index or version string).""" - store = get_store() - versions = store.get_engine_versions("llama_cpp") - version_entry = None - try: - idx = int(version_id) - if 0 <= idx < len(versions): - version_entry = versions[idx] - except ValueError: - pass - if not version_entry: - version_entry = next((v for v in versions if str(v.get("version")) == str(version_id)), None) - if not version_entry: - raise HTTPException(status_code=404, detail="Version not found") - version_str = str(version_entry.get("version")) - active = store.get_active_engine_version("llama_cpp") - if active and str(active.get("version")) == version_str: - raise HTTPException(status_code=400, detail="Cannot delete active version") - binary_path = _resolve_binary_path(version_entry.get("binary_path")) - version_dir = os.path.dirname(os.path.dirname(binary_path)) if binary_path else None - if version_dir and os.path.exists(version_dir): - try: - _robust_rmtree(version_dir) - except Exception as e: - logger.error(f"Failed to delete directory {version_dir}: {e}") - raise HTTPException(status_code=500, detail=f"Failed to delete directory: {e}") - store.delete_engine_version("llama_cpp", version_str) - logger.info(f"Deleted llama-cpp version: {version_str}") - return {"message": f"Deleted llama-cpp version {version_str}"} - - -@router.get("/llama-versions/active") -async def 
get_active_llama_version(): - """Get the currently active llama-cpp version.""" - store = get_store() - active_version = store.get_active_engine_version("llama_cpp") - if not active_version: - return {"active_version": None} - binary_path = _resolve_binary_path(active_version.get("binary_path")) - return { - "active_version": { - "id": 0, - "version": active_version.get("version"), - "install_type": active_version.get("type"), - "source_commit": active_version.get("source_commit"), - "binary_path": active_version.get("binary_path"), - "exists": os.path.exists(binary_path) if binary_path else False, - } - } diff --git a/backend/routes/llama_versions.py b/backend/routes/llama_versions.py index 4941c5c..deb69ac 100644 --- a/backend/routes/llama_versions.py +++ b/backend/routes/llama_versions.py @@ -7,6 +7,7 @@ import requests import time import platform +import re import shutil import stat from datetime import datetime @@ -84,6 +85,8 @@ async def list_llama_versions(): "install_type": v.get("type", "source"), "binary_path": v.get("binary_path"), "source_commit": v.get("source_commit"), + "source_ref": v.get("source_ref"), + "source_ref_type": v.get("source_ref_type"), "patches": [], # No longer storing patches in YAML "installed_at": v.get("installed_at"), "is_active": v.get("version") == active_version, @@ -93,28 +96,178 @@ async def list_llama_versions(): return result +def _default_build_settings() -> dict: + """Default build-settings payload for engines when nothing is saved yet.""" + return { + "cuda": False, + "flash_attention": False, + "native": True, + "backend_dl": False, + "cpu_all_variants": False, + "cuda_architectures": "", + } + + +def _coerce_build_settings(settings: Optional[dict]) -> dict: + base = _default_build_settings() + if not isinstance(settings, dict): + return base + + def _bool(v): + if isinstance(v, bool): + return v + if isinstance(v, str): + return v.strip().lower() in ("1", "true", "yes", "on") + return bool(v) + + return { + "cuda": 
_bool(settings.get("cuda", base["cuda"])), + "flash_attention": _bool(settings.get("flash_attention", base["flash_attention"])), + "native": _bool(settings.get("native", base["native"])), + "backend_dl": _bool(settings.get("backend_dl", base["backend_dl"])), + "cpu_all_variants": _bool(settings.get("cpu_all_variants", base["cpu_all_variants"])), + "cuda_architectures": str(settings.get("cuda_architectures") or ""), + } + + +def _build_config_from_settings(settings: Optional[dict]) -> BuildConfig: + normalized = _coerce_build_settings(settings) + return BuildConfig( + enable_cuda=normalized["cuda"], + enable_flash_attention=normalized["flash_attention"], + enable_native=normalized["native"], + enable_backend_dl=normalized["backend_dl"], + enable_cpu_all_variants=normalized["cpu_all_variants"], + cuda_architectures=normalized["cuda_architectures"], + ) + + +def _source_ref_slug(source_ref: str) -> str: + value = str(source_ref or "").strip().lower() + value = re.sub(r"[^a-z0-9._-]+", "-", value) + value = re.sub(r"-{2,}", "-", value).strip("-._") + return value[:32] or "source" + + +def _resolve_engine_build_target(engine: str) -> tuple[str, str]: + if engine == "ik_llama": + repository_source = "ik_llama.cpp" + elif engine == "llama_cpp": + repository_source = "llama.cpp" + else: + raise HTTPException(status_code=400, detail="engine must be 'llama_cpp' or 'ik_llama'") + + repository_url = llama_manager.REPOSITORY_SOURCES.get(repository_source) + if not repository_url: + raise HTTPException(status_code=400, detail=f"Unknown repository source: {repository_source}") + return repository_source, repository_url + + +def _fetch_latest_release(repository_source: str) -> Optional[dict]: + if repository_source == "ik_llama.cpp": + releases_url = "https://api.github.com/repos/ikawrakow/ik_llama.cpp/releases?per_page=10" + else: + releases_url = "https://api.github.com/repos/ggerganov/llama.cpp/releases?per_page=10" + + response = requests.get(releases_url, 
allow_redirects=True) + if response.status_code == 404: + return None + response.raise_for_status() + + releases = response.json() + if isinstance(releases, dict): + # Defensive fallback in case GitHub changes shape or proxies return a single object. + return releases + if not isinstance(releases, list): + return None + + for release in releases: + if isinstance(release, dict) and not release.get("draft"): + return release + return None + + +@router.get("/build-settings") +async def get_build_settings(engine: str = "llama_cpp"): + """Get persisted build settings for an engine ('llama_cpp' or 'ik_llama').""" + if engine not in ("llama_cpp", "ik_llama"): + raise HTTPException(status_code=400, detail="engine must be 'llama_cpp' or 'ik_llama'") + store = get_store() + settings = store.get_engine_build_settings(engine) or {} + # Always return a full shape so the frontend can rely on defaults. + base = _default_build_settings() + base.update({k: v for k, v in settings.items() if k in base}) + return base + + +@router.put("/build-settings") +async def update_build_settings(engine: str = "llama_cpp", settings: dict = Body(...)): + """Persist build settings for an engine ('llama_cpp' or 'ik_llama').""" + if engine not in ("llama_cpp", "ik_llama"): + raise HTTPException(status_code=400, detail="engine must be 'llama_cpp' or 'ik_llama'") + if not isinstance(settings, dict): + raise HTTPException(status_code=400, detail="settings must be an object") + store = get_store() + # Only persist known build keys; ignore extras. 
+ allowed = _default_build_settings().keys() + filtered = {k: v for k, v in settings.items() if k in allowed} + stored = store.update_engine_build_settings(engine, filtered) + base = _default_build_settings() + base.update({k: v for k, v in stored.items() if k in base}) + return base + + +@router.post("/update") +async def update_engine(request: dict): + """Build the latest source release for an engine using persisted build settings, then auto-activate it.""" + engine = (request or {}).get("engine", "llama_cpp") + version_suffix = (request or {}).get("version_suffix") + repository_source, repository_url = _resolve_engine_build_target(engine) + store = get_store() + settings = store.get_engine_build_settings(engine) or {} + build_config = _build_config_from_settings(settings) + + try: + latest_release = _fetch_latest_release(repository_source) + if not latest_release or not latest_release.get("tag_name"): + raise HTTPException(status_code=404, detail="No release found for this engine") + source_ref = latest_release["tag_name"] + except requests.exceptions.HTTPError as e: + if e.response.status_code == 403: + raise HTTPException(status_code=429, detail="GitHub API rate limit exceeded. Please try again later.") + if e.response.status_code == 404: + raise HTTPException(status_code=404, detail="GitHub repository or release not found") + raise HTTPException(status_code=500, detail=f"GitHub API error: {str(e)}") + except requests.exceptions.RequestException as e: + raise HTTPException(status_code=500, detail=f"Network error: {str(e)}") + + return _schedule_source_build( + source_ref=source_ref, + patches=[], + build_config=build_config, + repository_source=repository_source, + repository_url=repository_url, + version_suffix=version_suffix, + auto_activate=True, + source_ref_type="release", + ) + + @router.get("/check-updates") async def check_updates(source: str | None = None): - """Check for llama.cpp or ik_llama.cpp updates (releases and/or source). 
+ """Check for llama.cpp or ik_llama.cpp source releases and latest commit. source: None or 'llama_cpp' for ggerganov/llama.cpp; 'ik_llama' for ikawrakow/ik_llama.cpp. """ try: is_ik = source == "ik_llama" if is_ik: - commits_url = ( - "https://api.github.com/repos/ikawrakow/ik_llama.cpp/commits?per_page=1" - ) - latest_release = None + repository_source = "ik_llama.cpp" + commits_url = "https://api.github.com/repos/ikawrakow/ik_llama.cpp/commits?per_page=1" else: - # ai-dock/llama.cpp-cuda: pre-built releases with CUDA support - releases_url = "https://api.github.com/repos/ai-dock/llama.cpp-cuda/releases" - commits_url = ( - "https://api.github.com/repos/ggerganov/llama.cpp/commits?per_page=1" - ) - releases_response = requests.get(releases_url, allow_redirects=True) - releases_response.raise_for_status() - releases = releases_response.json() - latest_release = releases[0] if releases else None + repository_source = "llama.cpp" + commits_url = "https://api.github.com/repos/ggerganov/llama.cpp/commits?per_page=1" + + latest_release = _fetch_latest_release(repository_source) commits_response = requests.get(commits_url, allow_redirects=True) commits_response.raise_for_status() @@ -125,8 +278,8 @@ async def check_updates(source: str | None = None): "latest_release": ( { "tag_name": latest_release["tag_name"], - "published_at": latest_release["published_at"], - "html_url": latest_release["html_url"], + "published_at": latest_release.get("published_at"), + "html_url": latest_release.get("html_url"), } if latest_release else None @@ -159,26 +312,10 @@ async def check_updates(source: str | None = None): @router.get("/releases/{tag_name}/assets") async def get_release_assets(tag_name: str): - """List compatible release artifacts for a given tag.""" - try: - assets = llama_manager.get_release_assets(tag_name) - return assets - except requests.exceptions.HTTPError as e: - if e.response.status_code == 403: - raise HTTPException( - status_code=429, - detail="GitHub API rate 
limit exceeded. Please try again later.", - ) - elif e.response.status_code == 404: - raise HTTPException(status_code=404, detail=f"Release {tag_name} not found") - else: - raise HTTPException(status_code=500, detail=f"GitHub API error: {str(e)}") - except requests.exceptions.RequestException as e: - raise HTTPException(status_code=500, detail=f"Network error: {str(e)}") - except Exception as e: - raise HTTPException( - status_code=500, detail=f"Failed to fetch release assets: {str(e)}" - ) + raise HTTPException( + status_code=410, + detail="Prebuilt llama.cpp release installation has been removed. Build from source instead.", + ) @router.get("/build-capabilities") @@ -215,75 +352,10 @@ async def get_build_capabilities_endpoint(): @router.post("/install-release") async def install_release(request: dict): - """Install llama.cpp from ai-dock/llama.cpp-cuda release (CUDA builds).""" - try: - tag_name = request.get("tag_name") - if not tag_name: - raise HTTPException(status_code=400, detail="tag_name is required") - - raw_asset_id = request.get("asset_id") - asset_id = None - if raw_asset_id is not None: - try: - asset_id = int(raw_asset_id) - except (TypeError, ValueError): - raise HTTPException( - status_code=400, detail="asset_id must be an integer" - ) - - try: - preview = llama_manager.get_release_install_preview(tag_name, asset_id) - except requests.exceptions.HTTPError as e: - if e.response.status_code == 403: - raise HTTPException( - status_code=429, - detail="GitHub API rate limit exceeded. 
Please try again later.", - ) - elif e.response.status_code == 404: - raise HTTPException( - status_code=404, detail=f"Release {tag_name} not found" - ) - else: - raise HTTPException( - status_code=500, detail=f"GitHub API error: {str(e)}" - ) - except requests.exceptions.RequestException as e: - raise HTTPException(status_code=500, detail=f"Network error: {str(e)}") - except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) - - version_name = preview.get("version_name") - - store = get_store() - existing_versions = store.get_engine_versions("llama_cpp") - existing = next( - (v for v in existing_versions if v.get("version") in (version_name, tag_name)), - None, - ) - if existing: - detail = "400: Version already installed" - if version_name: - detail = f"{detail} ({version_name})" - raise HTTPException(status_code=400, detail=detail) - - # Generate task ID for tracking - task_id = f"install_release_{tag_name}_{int(time.time())}" - - # Start installation in background (asyncio.create_task so it runs regardless of middleware) - pm = get_progress_manager() - pm.create_task("install_release", f"Install {tag_name}", {"tag_name": tag_name}, task_id=task_id) - asyncio.create_task(install_release_task(tag_name, pm, task_id, asset_id)) - - return { - "message": f"Installing release {tag_name}", - "task_id": task_id, - "status": "started", - "progress": 0, - "asset_id": asset_id, - "version_name": version_name, - } - except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) + raise HTTPException( + status_code=410, + detail="Prebuilt llama.cpp release installation has been removed. 
Build from source instead.", + ) async def install_release_task( @@ -361,29 +433,17 @@ async def build_source(request: dict): build_config_dict = request.get("build_config") repository_source = request.get("repository_source", "llama.cpp") version_suffix = request.get("version_suffix") + auto_activate = bool(request.get("auto_activate")) + source_ref_type = request.get("source_ref_type", "ref") if not commit_sha: raise HTTPException(status_code=400, detail="commit_sha is required") - commit_short = commit_sha[:8] - if version_suffix: - version_name = f"source-{commit_short}-{version_suffix}" + if repository_source == "ik_llama.cpp": + _, repository_url = _resolve_engine_build_target("ik_llama") + elif repository_source == "llama.cpp": + _, repository_url = _resolve_engine_build_target("llama_cpp") else: - timestamp = int(time.time()) - version_name = f"source-{commit_short}-{timestamp}" - - store = get_store() - engine = "ik_llama" if repository_source == "ik_llama.cpp" else "llama_cpp" - existing_versions = store.get_engine_versions(engine) - existing = next((v for v in existing_versions if v.get("version") == version_name), None) - if existing: - raise HTTPException( - status_code=400, detail=f"Version '{version_name}' already installed" - ) - - # Get repository URL from source name - repository_url = llama_manager.REPOSITORY_SOURCES.get(repository_source) - if not repository_url: raise HTTPException( status_code=400, detail=f"Unknown repository source: {repository_source}", @@ -414,33 +474,16 @@ def _bool(v): logger.warning("BuildConfig from request failed (%s), using defaults", e) build_config = BuildConfig() - # Generate task ID for tracking - task_id = f"build_{version_name}_{int(time.time())}" - - # Start build in background (asyncio.create_task so it runs regardless of middleware) - pm = get_progress_manager() - pm.create_task("build", f"Build {repository_source} {commit_sha[:8]}", {"version_name": version_name}, task_id=task_id) - asyncio.create_task( - 
build_source_task( - commit_sha, - patches, - build_config or BuildConfig(), - version_name, - repository_source, - repository_url, - pm, - task_id, - ) + return _schedule_source_build( + source_ref=commit_sha, + patches=patches, + build_config=build_config or BuildConfig(), + repository_source=repository_source, + repository_url=repository_url, + version_suffix=version_suffix, + auto_activate=auto_activate, + source_ref_type=source_ref_type, ) - - return { - "message": f"Building from source {commit_sha[:8]}", - "task_id": task_id, - "status": "started", - "progress": 0, - "version_name": version_name, - "repository_source": repository_source, - } except Exception as e: raise HTTPException(status_code=500, detail=str(e)) @@ -454,6 +497,8 @@ async def build_source_task( repository_url: str, progress_manager=None, task_id: str = None, + auto_activate: bool = False, + source_ref_type: str = "ref", ): """Background task to build from source with SSE progress""" logger.info( @@ -485,12 +530,23 @@ async def build_source_task( "type": "patched" if patches else "source", "binary_path": binary_path, "source_commit": commit_sha, + "source_ref": commit_sha, + "source_ref_type": source_ref_type, "build_config": build_config_dict, "repository_source": repository_source, "installed_at": datetime.utcnow().isoformat() + "Z", } store.add_engine_version(engine, version_data) + if auto_activate: + try: + # Reuse the existing activation flow (includes llama-swap handling). 
+ await _do_activate_version(f"{engine}:{version_name}") + except HTTPException as e: + logger.error("Auto-activation failed for %s:%s: %s", engine, version_name, e.detail) + except Exception as e: + logger.exception("Auto-activation failed for %s:%s: %s", engine, version_name, e) + from backend.llama_swap_manager import get_llama_swap_manager active_version = store.get_active_engine_version(engine) if active_version and active_version.get("binary_path") and os.path.exists(active_version.get("binary_path", "")): @@ -521,14 +577,6 @@ async def build_source_task( message=f"Failed to build llama.cpp from source: {str(e)}", type="error", ) - if task_id: - await progress_manager.send_build_progress( - task_id=task_id, - stage="error", - progress=0, - message=f"Build task failed: {str(e)}", - log_lines=[f"Task error: {str(e)}", f"Error type: {type(e).__name__}"], - ) except Exception as ws_error: logger.error(f"Failed to send build failure notification: {ws_error}") @@ -611,6 +659,71 @@ def _find_version_entry(store, version_id: str): return version_entry, engine +def _schedule_source_build( + source_ref: str, + patches: List[str], + build_config: BuildConfig, + repository_source: str, + repository_url: str, + version_suffix: Optional[str] = None, + auto_activate: bool = False, + source_ref_type: str = "ref", +): + store = get_store() + engine = "ik_llama" if repository_source == "ik_llama.cpp" else "llama_cpp" + ref_slug = _source_ref_slug(source_ref) + if version_suffix: + version_name = f"source-{ref_slug}-{version_suffix}" + else: + timestamp = int(time.time()) + version_name = f"source-{ref_slug}-{timestamp}" + + existing_versions = store.get_engine_versions(engine) + existing = next((v for v in existing_versions if v.get("version") == version_name), None) + if existing: + raise HTTPException(status_code=400, detail=f"Version '{version_name}' already installed") + + task_id = f"build_{version_name}_{int(time.time())}" + pm = get_progress_manager() + pm.create_task( 
+ "build", + f"Build {repository_source} {ref_slug}", + { + "version_name": version_name, + "engine": engine, + "repository_source": repository_source, + "auto_activate": auto_activate, + "source_ref": source_ref, + "source_ref_type": source_ref_type, + }, + task_id=task_id, + ) + asyncio.create_task( + build_source_task( + source_ref, + patches, + build_config or BuildConfig(), + version_name, + repository_source, + repository_url, + pm, + task_id, + auto_activate=auto_activate, + source_ref_type=source_ref_type, + ) + ) + return { + "message": f"Building from source {ref_slug}", + "task_id": task_id, + "status": "started", + "progress": 0, + "version_name": version_name, + "repository_source": repository_source, + "source_ref": source_ref, + "source_ref_type": source_ref_type, + } + + @router.post("/versions/activate") async def activate_version_body(payload: dict = Body(...)): """Activate a version; body: { \"version_id\": \"llama_cpp:version\" or \"version\" }.""" diff --git a/backend/routes/lmdeploy.py b/backend/routes/lmdeploy.py deleted file mode 100644 index d24e5cf..0000000 --- a/backend/routes/lmdeploy.py +++ /dev/null @@ -1,82 +0,0 @@ -from typing import Dict, Optional - -import httpx -from fastapi import APIRouter, HTTPException - -from backend.lmdeploy_installer import get_lmdeploy_installer -from backend.lmdeploy_manager import get_lmdeploy_manager - -router = APIRouter() - - -@router.get("/lmdeploy/check-updates") -async def lmdeploy_check_updates() -> Dict: - """Check PyPI for latest LMDeploy version.""" - try: - async with httpx.AsyncClient() as client: - r = await client.get("https://pypi.org/pypi/lmdeploy/json", timeout=10.0) - r.raise_for_status() - data = r.json() - info = data.get("info", {}) - return { - "latest_version": info.get("version"), - "releases": list(data.get("releases", {}).keys()), - } - except Exception as exc: - raise HTTPException(status_code=500, detail=f"Failed to check PyPI: {exc}") - - -@router.get("/lmdeploy/status") 
-async def lmdeploy_installer_status() -> Dict: - installer = get_lmdeploy_installer() - return installer.status() - - -@router.post("/lmdeploy/install") -async def lmdeploy_install(request: Optional[Dict[str, str]] = None) -> Dict: - installer = get_lmdeploy_installer() - payload = request or {} - version = payload.get("version") - force_reinstall = bool(payload.get("force_reinstall")) - try: - return await installer.install(version=version, force_reinstall=force_reinstall) - except RuntimeError as exc: - raise HTTPException(status_code=409, detail=str(exc)) - - -@router.post("/lmdeploy/install-source") -async def lmdeploy_install_source(request: Optional[Dict[str, str]] = None) -> Dict: - """Install LMDeploy from a git repo and branch (for development).""" - installer = get_lmdeploy_installer() - payload = request or {} - repo_url = payload.get("repo_url", "https://github.com/InternLM/lmdeploy.git") - branch = payload.get("branch", "main") - try: - return await installer.install_from_source(repo_url=repo_url, branch=branch) - except RuntimeError as exc: - raise HTTPException(status_code=409, detail=str(exc)) - - -@router.post("/lmdeploy/remove") -async def lmdeploy_remove() -> Dict: - installer = get_lmdeploy_installer() - try: - return await installer.remove() - except RuntimeError as exc: - raise HTTPException(status_code=409, detail=str(exc)) - - -@router.get("/lmdeploy/logs") -async def lmdeploy_logs(max_bytes: int = 8192) -> Dict[str, str]: - """Get LMDeploy installer logs.""" - installer = get_lmdeploy_installer() - max_bytes = max(1024, min(max_bytes, 1024 * 1024)) - return {"log": installer.read_log_tail(max_bytes)} - - -@router.get("/lmdeploy/runtime-logs") -async def lmdeploy_runtime_logs(max_bytes: int = 8192) -> Dict[str, str]: - """Get LMDeploy runtime logs (from running server instances).""" - manager = get_lmdeploy_manager() - max_bytes = max(1024, min(max_bytes, 1024 * 1024)) - return {"log": manager.read_log_tail(max_bytes)} diff --git 
a/backend/routes/lmdeploy_versions.py b/backend/routes/lmdeploy_versions.py new file mode 100644 index 0000000..944ec15 --- /dev/null +++ b/backend/routes/lmdeploy_versions.py @@ -0,0 +1,65 @@ +from typing import Dict, Optional + +import httpx +from fastapi import APIRouter, HTTPException + +from backend.lmdeploy_manager import get_lmdeploy_manager + +router = APIRouter() + + +@router.get("/lmdeploy/check-updates") +async def lmdeploy_check_updates() -> Dict: + """Check PyPI for latest LMDeploy version.""" + try: + async with httpx.AsyncClient() as client: + r = await client.get("https://pypi.org/pypi/lmdeploy/json", timeout=10.0) + r.raise_for_status() + data = r.json() + info = data.get("info", {}) + return { + "latest_version": info.get("version"), + "releases": list(data.get("releases", {}).keys()), + } + except Exception as exc: + raise HTTPException(status_code=500, detail=f"Failed to check PyPI: {exc}") + + +@router.get("/lmdeploy/status") +async def lmdeploy_installer_status() -> Dict: + manager = get_lmdeploy_manager() + return manager.status() + + +@router.post("/lmdeploy/install") +async def lmdeploy_install(request: Optional[Dict[str, str]] = None) -> Dict: + manager = get_lmdeploy_manager() + payload = request or {} + version = payload.get("version") + force_reinstall = bool(payload.get("force_reinstall")) + try: + return await manager.install_release(version=version, force_reinstall=force_reinstall) + except RuntimeError as exc: + raise HTTPException(status_code=409, detail=str(exc)) + + +@router.post("/lmdeploy/install-source") +async def lmdeploy_install_source(request: Optional[Dict[str, str]] = None) -> Dict: + """Install LMDeploy from a git repo and branch (for development).""" + manager = get_lmdeploy_manager() + payload = request or {} + repo_url = payload.get("repo_url", "https://github.com/InternLM/lmdeploy.git") + branch = payload.get("branch", "main") + try: + return await manager.install_from_source(repo_url=repo_url, branch=branch) + 
except RuntimeError as exc: + raise HTTPException(status_code=409, detail=str(exc)) + + +@router.post("/lmdeploy/remove") +async def lmdeploy_remove() -> Dict: + manager = get_lmdeploy_manager() + try: + return await manager.remove() + except RuntimeError as exc: + raise HTTPException(status_code=409, detail=str(exc)) diff --git a/backend/routes/models.py b/backend/routes/models.py index 5859db4..cf739a2 100644 --- a/backend/routes/models.py +++ b/backend/routes/models.py @@ -8,7 +8,7 @@ import re from datetime import datetime -from backend.data_store import get_store, generate_proxy_name +from backend.data_store import get_store, generate_proxy_name, resolve_proxy_name from backend.progress_manager import get_progress_manager from backend.huggingface import ( search_models, @@ -23,11 +23,8 @@ list_safetensors_downloads, delete_safetensors_download, record_safetensors_download, - get_default_lmdeploy_config, - update_lmdeploy_config, list_grouped_safetensors_downloads, create_gguf_manifest_entry, - get_gguf_manifest_entry, get_safetensors_manifest_entries, save_safetensors_manifest_entries, DEFAULT_LMDEPLOY_CONTEXT, @@ -35,7 +32,7 @@ MAX_ROPE_SCALING_FACTOR, get_model_disk_size, get_accurate_file_sizes, - get_mmproj_f16_filename, + resolve_cached_model_path, ) from backend.gpu_detector import get_gpu_info from backend.gguf_reader import get_model_layer_info @@ -43,8 +40,6 @@ logger = get_logger(__name__) from backend.llama_swap_config import get_supported_flags -from backend.lmdeploy_manager import get_lmdeploy_manager -from backend.lmdeploy_installer import get_lmdeploy_installer import psutil router = APIRouter() @@ -68,6 +63,22 @@ "minilm", ] + +def _is_mmproj_filename(filename: Optional[str]) -> bool: + name = (filename or "").strip().lower() + return bool(name) and "mmproj" in name and name.endswith(".gguf") + + +async def _regenerate_llama_swap_config(reason: str): + try: + from backend.llama_swap_manager import get_llama_swap_manager + + llama_swap_manager = 
get_llama_swap_manager() + await llama_swap_manager.regenerate_config_with_active_version() + logger.info("Regenerated llama-swap config after %s", reason) + except Exception as exc: + logger.warning("Failed to regenerate llama-swap config after %s: %s", reason, exc) + # Lightweight cache for GPU info to avoid repeated NVML calls during rapid estimate requests _gpu_info_cache: Dict[str, Any] = {"data": None, "timestamp": 0.0} GPU_INFO_CACHE_TTL = 2.0 # seconds @@ -115,7 +126,9 @@ def _get_actual_file_size(file_path: Optional[str]) -> Optional[int]: """Return actual file size in bytes from disk, or None if not available.""" if not file_path: return None - path = _normalize_model_path(file_path) + # For new HF-backed models we do not store paths; this helper is only used for + # legacy/local models that still reference concrete filesystem locations. + path = file_path.replace("\\", "/") if not path or not os.path.exists(path): return None try: @@ -125,54 +138,6 @@ def _get_actual_file_size(file_path: Optional[str]) -> Optional[int]: return None -def _get_model_filename(model: dict) -> Optional[str]: - """Return the filename for a model record. - - Prefers the dedicated ``filename`` field (new records). Falls back to - deriving it from the legacy ``file_path`` field (old records). - """ - fname = model.get("filename") - if fname: - return fname - return _extract_filename(model.get("file_path")) or None - - -def _get_model_file_path(model: dict) -> Optional[str]: - """Return the actual filesystem path for a model file. - - Resolution order: - 1. HF cache via huggingface_id + filename (new records). - 2. Stored file_path (legacy records that still reference custom storage). 
- """ - from backend.huggingface import resolve_cached_model_path - - hf_id = model.get("huggingface_id") - filename = _get_model_filename(model) - - if hf_id and filename: - cached = resolve_cached_model_path(hf_id, filename) - if cached: - return cached - - return _normalize_model_path(model.get("file_path")) or None - - -def _normalize_model_path(file_path: Optional[str]) -> Optional[str]: - if not file_path: - return None - normalized = file_path.replace("\\", "/") - normalized = os.path.normpath(normalized) - return normalized - - -def _extract_filename(file_path: Optional[str]) -> str: - if not file_path: - return "" - normalized = file_path.replace("\\", "/") - parts = normalized.split("/") - return parts[-1] if parts else normalized - - def normalize_architecture(raw_architecture: str) -> str: """Normalize GGUF architecture string (stub after smart_auto removal).""" if not raw_architecture or not isinstance(raw_architecture, str): @@ -233,25 +198,6 @@ def _assign_numeric(src_key: str, dest_keys): return defaults -def _apply_hf_defaults_to_model(model: dict, metadata: Dict[str, Any], store) -> None: - if not metadata: - return - defaults = _derive_hf_defaults(metadata) - if not defaults: - return - config = _coerce_model_config(model.get("config")) - changed = False - for key, value in defaults.items(): - if value is None: - continue - existing = config.get(key) - if existing in (None, "", 0): - config[key] = value - changed = True - if changed: - store.update_model(model["id"], {"config": config}) - - def _coerce_model_config(config_value: Optional[Any]) -> Dict[str, Any]: """Return a dict regardless of whether config is stored as dict or JSON string.""" if not config_value: @@ -272,7 +218,11 @@ def _refresh_model_metadata_from_file(model: dict, store) -> Dict[str, Any]: Re-read GGUF metadata from disk and update the model record. Returns metadata details for downstream consumers. 
""" - normalized_path = _get_model_file_path(model) + # Only supported for legacy/local models that still carry a concrete file_path. + file_path = model.get("file_path") + if not file_path: + raise FileNotFoundError("Model file not found on disk") + normalized_path = file_path.replace("\\", "/") if not normalized_path or not os.path.exists(normalized_path): raise FileNotFoundError("Model file not found on disk") @@ -458,18 +408,21 @@ async def _save_safetensors_download( if not model_record: from datetime import timezone as _tz + # Safetensors-backed models are treated as a single logical entity per + # Hugging Face repo. Derive base name and type from the repo id, not the + # shard filename. + repo_name = huggingface_id.split("/")[-1] if isinstance(huggingface_id, str) else "" + base_model_name = repo_name or extract_base_model_name(filename) + model_type = extract_model_type(huggingface_id or repo_name or filename) model_record = { "id": model_id, "huggingface_id": huggingface_id, - "filename": filename, - "display_name": filename.replace(".safetensors", ""), - "base_model_name": extract_base_model_name(filename), + "display_name": base_model_name, + "base_model_name": base_model_name, "file_size": file_size, - "quantization": os.path.splitext(filename)[0], - "model_type": extract_model_type(filename), + "model_type": model_type, "downloaded_at": datetime.now(_tz.utc).isoformat(), "format": "safetensors", - "model_format": "safetensors", "pipeline_tag": detected_pipeline, "config": {"embedding": True} if is_embedding_like else {}, } @@ -498,7 +451,6 @@ async def _save_safetensors_download( store.update_model(model_id, updates) model_record = store.get_model(model_id) or model_record - lmdeploy_config = get_default_lmdeploy_config(max_context) record_safetensors_download( huggingface_id=huggingface_id, filename=filename, @@ -506,7 +458,6 @@ async def _save_safetensors_download( file_size=file_size, metadata=safetensors_metadata, tensor_summary=tensor_summary, - 
lmdeploy_config=lmdeploy_config, model_id=model_record.get("id"), ) logger.info(f"Safetensors download recorded for {huggingface_id}/{filename} (model_id={model_record.get('id')})") @@ -518,12 +469,9 @@ def _get_safetensors_model(store, model_id: str) -> dict: model_format = (model.get("model_format") or model.get("format") or "gguf").lower() if model_format != "safetensors": raise HTTPException(status_code=400, detail="Model is not a safetensors download") - resolved_path = _get_model_file_path(model) - if not resolved_path or not os.path.exists(resolved_path): - raise HTTPException(status_code=400, detail="Model file not found on disk") - model = dict(model) - model["file_path"] = resolved_path - return model + # Safetensors models are treated as repo-level entities; concrete file paths + # are tracked in the safetensors manifest, not on the model record itself. + return dict(model) def _load_manifest_entry_for_model(model: dict) -> Dict[str, Any]: @@ -658,472 +606,6 @@ def _sanitize(obj: Any) -> Any: status_code=400, detail="hf_overrides must be an object or JSON string" ) - -def _validate_lmdeploy_config( - new_config: Optional[Dict[str, Any]], manifest_entry: Dict[str, Any] -) -> Dict[str, Any]: - """ - Merge and validate LMDeploy configuration. 
- """ - if new_config is not None and not isinstance(new_config, dict): - raise HTTPException(status_code=400, detail="Config payload must be an object") - - base_context_limit = _resolve_context_limit(manifest_entry) - stored_config = (manifest_entry.get("lmdeploy") or {}).get("config") - baseline = stored_config or get_default_lmdeploy_config(base_context_limit) - merged = dict(baseline) - if new_config: - merged.update(new_config) - - def _as_int(key: str, minimum: int = 1, maximum: Optional[int] = None) -> int: - value = merged.get(key, minimum) - try: - value = int(value) - except (TypeError, ValueError): - raise HTTPException(status_code=400, detail=f"{key} must be an integer") - if value < minimum: - value = minimum - if maximum is not None and value > maximum: - value = maximum - return value - - def _as_float(key: str, minimum: float, maximum: float) -> float: - value = merged.get(key, minimum) - try: - value = float(value) - except (TypeError, ValueError): - raise HTTPException(status_code=400, detail=f"{key} must be a number") - if value < minimum: - value = minimum - if value > maximum: - value = maximum - return value - - legacy_keys = { - "context_length": "session_len", - "max_batch_tokens": "max_prefill_token_num", - } - for legacy, target in legacy_keys.items(): - if legacy in merged and target not in merged: - merged[target] = merged[legacy] - - session_len = _as_int("session_len", minimum=1024, maximum=base_context_limit) - - raw_scaling_mode = str( - merged.get("rope_scaling_mode") or merged.get("rope_scaling_type") or "disabled" - ).lower() - if raw_scaling_mode in {"", "none", "disabled"}: - scaling_mode = "disabled" - else: - scaling_mode = raw_scaling_mode - - scaling_factor_value = merged.get("rope_scaling_factor", 1.0) - try: - scaling_factor = float(scaling_factor_value) - except (TypeError, ValueError): - raise HTTPException( - status_code=400, detail="rope_scaling_factor must be a number" - ) - if scaling_factor < 1.0: - scaling_factor 
= 1.0 - if scaling_factor > MAX_ROPE_SCALING_FACTOR: - scaling_factor = MAX_ROPE_SCALING_FACTOR - - if scaling_mode == "disabled" or scaling_factor <= 1.0: - scaling_mode = "disabled" - scaling_factor = 1.0 - else: - # Scaling only makes sense when we know the base context; otherwise reject it. - if not base_context_limit: - raise HTTPException( - status_code=400, - detail="RoPE scaling cannot be enabled without a known base context length", - ) - - # Check if model_max_length > max_position_embeddings (means rope scaling can achieve model_max_length) - metadata = manifest_entry.get("metadata") or {} - config_data = ( - metadata.get("config", {}) - if isinstance(metadata.get("config"), dict) - else {} - ) - model_max_length = _coerce_positive_int(metadata.get("model_max_length")) - max_position_embeddings = _coerce_positive_int( - config_data.get("max_position_embeddings") - ) - - if ( - model_max_length - and max_position_embeddings - and model_max_length > max_position_embeddings - ): - # Adapt base context to model_max_length / 4 for scaling - # This allows 4x scaling to reach model_max_length - adapted_base = int(model_max_length / 4) - if adapted_base >= 1024: - session_len = adapted_base - else: - # If adapted base is too small, use base context limit - session_len = base_context_limit - else: - # Use base context limit (max_position_embeddings is used for clamping, not for scaling decisions) - session_len = base_context_limit - - effective_session_len = session_len - if scaling_mode != "disabled": - effective_session_len = int(session_len * scaling_factor) - # Clamp to model_max_length if available, otherwise max_position_embeddings - metadata = manifest_entry.get("metadata") or {} - config_data = ( - metadata.get("config", {}) - if isinstance(metadata.get("config"), dict) - else {} - ) - model_max_length = _coerce_positive_int(metadata.get("model_max_length")) - max_position_embeddings = _coerce_positive_int( - config_data.get("max_position_embeddings") - ) 
- if model_max_length: - effective_session_len = min(effective_session_len, model_max_length) - elif max_position_embeddings: - effective_session_len = min(effective_session_len, max_position_embeddings) - # Also clamp to LMDeploy's maximum - effective_session_len = max( - session_len, min(effective_session_len, MAX_LMDEPLOY_CONTEXT) - ) - - merged["session_len"] = session_len - merged["effective_session_len"] = effective_session_len - merged["rope_scaling_mode"] = scaling_mode - merged["rope_scaling_factor"] = scaling_factor - - max_context_token_num = _as_int( - "max_context_token_num", - minimum=session_len, - maximum=base_context_limit, - ) - merged["max_context_token_num"] = max(max_context_token_num, session_len) - - max_prefill_token_num = _as_int( - "max_prefill_token_num", - minimum=1, - maximum=None, - ) - merged["max_prefill_token_num"] = max_prefill_token_num - - merged["tensor_parallel"] = _as_int("tensor_parallel", minimum=1) - merged["max_batch_size"] = _as_int("max_batch_size", minimum=1) - - merged["temperature"] = _as_float("temperature", 0.0, 2.0) - merged["top_p"] = _as_float("top_p", 0.0, 1.0) - merged["top_k"] = _as_int("top_k", minimum=1) - merged["kv_cache_percent"] = _as_float("kv_cache_percent", 0.0, 100.0) - - # Note: tensor_split is kept for backward compatibility but not sent to LMDeploy (--tp-split doesn't exist) - tensor_split = merged.get("tensor_split") or [] - if isinstance(tensor_split, str): - tensor_split = [ - part.strip() for part in tensor_split.split(",") if part.strip() - ] - if tensor_split: - cleaned_split = [] - for part in tensor_split: - try: - cleaned_split.append(float(part)) - except (TypeError, ValueError): - raise HTTPException( - status_code=400, detail="tensor_split values must be numbers" - ) - merged["tensor_split"] = cleaned_split - else: - merged["tensor_split"] = [] - - # Server configuration validation - def _as_list(key: str) -> list: - value = merged.get(key) - if value is None: - return [] - if 
isinstance(value, list): - return [str(v) for v in value] - if isinstance(value, str): - return [v.strip() for v in value.split(",") if v.strip()] - return [str(value)] - - merged["allow_origins"] = _as_list("allow_origins") - merged["allow_credentials"] = bool(merged.get("allow_credentials", False)) - merged["allow_methods"] = _as_list("allow_methods") - merged["allow_headers"] = _as_list("allow_headers") - merged["proxy_url"] = str(merged.get("proxy_url", "")).strip() - max_concurrent_requests = merged.get("max_concurrent_requests") - if max_concurrent_requests is not None: - merged["max_concurrent_requests"] = _as_int( - "max_concurrent_requests", minimum=1 - ) - log_level = merged.get("log_level") - if log_level is not None: - log_level = str(log_level).strip().upper() - valid_log_levels = { - "CRITICAL", - "FATAL", - "ERROR", - "WARN", - "WARNING", - "INFO", - "DEBUG", - "NOTSET", - } - if log_level and log_level not in valid_log_levels: - raise HTTPException( - status_code=400, - detail=f"log_level must be one of {sorted(valid_log_levels)}", - ) - merged["log_level"] = log_level if log_level else None - else: - merged["log_level"] = None - merged["api_keys"] = _as_list("api_keys") - merged["ssl"] = bool(merged.get("ssl", False)) - max_log_len = merged.get("max_log_len") - if max_log_len is not None: - merged["max_log_len"] = _as_int("max_log_len", minimum=1) - merged["disable_fastapi_docs"] = bool(merged.get("disable_fastapi_docs", False)) - merged["allow_terminate_by_client"] = bool( - merged.get("allow_terminate_by_client", False) - ) - merged["enable_abort_handling"] = bool(merged.get("enable_abort_handling", False)) - - # Model configuration validation - merged["chat_template"] = str(merged.get("chat_template", "")).strip() - merged["tool_call_parser"] = str(merged.get("tool_call_parser", "")).strip() - merged["reasoning_parser"] = str(merged.get("reasoning_parser", "")).strip() - merged["revision"] = str(merged.get("revision", "")).strip() - 
merged["download_dir"] = str(merged.get("download_dir", "")).strip() - merged["adapters"] = _as_list("adapters") - device = merged.get("device") - if device is not None: - device = str(device).strip().lower() - valid_devices = {"cuda", "ascend", "maca", "camb"} - if device and device not in valid_devices: - raise HTTPException( - status_code=400, detail=f"device must be one of {sorted(valid_devices)}" - ) - merged["device"] = device if device else None - else: - merged["device"] = None - merged["eager_mode"] = bool(merged.get("eager_mode", False)) - merged["disable_vision_encoder"] = bool(merged.get("disable_vision_encoder", False)) - logprobs_mode = merged.get("logprobs_mode") - if logprobs_mode is not None: - logprobs_mode = str(logprobs_mode).strip() - valid_logprobs_modes = {"None", "raw_logits", "raw_logprobs"} - if logprobs_mode and logprobs_mode not in valid_logprobs_modes: - raise HTTPException( - status_code=400, - detail=f"logprobs_mode must be one of {sorted(valid_logprobs_modes)}", - ) - merged["logprobs_mode"] = logprobs_mode if logprobs_mode else None - else: - merged["logprobs_mode"] = None - - # DLLM parameters validation - dllm_block_length = merged.get("dllm_block_length") - if dllm_block_length is not None: - merged["dllm_block_length"] = _as_int("dllm_block_length", minimum=1) - dllm_unmasking_strategy = merged.get("dllm_unmasking_strategy") - if dllm_unmasking_strategy is not None: - dllm_unmasking_strategy = str(dllm_unmasking_strategy).strip() - valid_dllm_strategies = { - "low_confidence_dynamic", - "low_confidence_static", - "sequential", - } - if ( - dllm_unmasking_strategy - and dllm_unmasking_strategy not in valid_dllm_strategies - ): - raise HTTPException( - status_code=400, - detail=f"dllm_unmasking_strategy must be one of {sorted(valid_dllm_strategies)}", - ) - merged["dllm_unmasking_strategy"] = ( - dllm_unmasking_strategy if dllm_unmasking_strategy else None - ) - else: - merged["dllm_unmasking_strategy"] = None - 
dllm_denoising_steps = merged.get("dllm_denoising_steps") - if dllm_denoising_steps is not None: - merged["dllm_denoising_steps"] = _as_int("dllm_denoising_steps", minimum=1) - dllm_confidence_threshold = merged.get("dllm_confidence_threshold") - if dllm_confidence_threshold is not None: - merged["dllm_confidence_threshold"] = _as_float( - "dllm_confidence_threshold", 0.0, 1.0 - ) - - # Distributed/Multi-node parameters validation - dp = merged.get("dp") - if dp is not None: - merged["dp"] = _as_int("dp", minimum=1) - ep = merged.get("ep") - if ep is not None: - merged["ep"] = _as_int("ep", minimum=1) - merged["enable_microbatch"] = bool(merged.get("enable_microbatch", False)) - merged["enable_eplb"] = bool(merged.get("enable_eplb", False)) - role = merged.get("role") - if role is not None: - role = str(role).strip() - valid_roles = {"Hybrid", "Prefill", "Decode"} - if role and role not in valid_roles: - raise HTTPException( - status_code=400, detail=f"role must be one of {sorted(valid_roles)}" - ) - merged["role"] = role if role else None - else: - merged["role"] = None - migration_backend = merged.get("migration_backend") - if migration_backend is not None: - migration_backend = str(migration_backend).strip() - valid_migration_backends = {"DLSlime", "Mooncake"} - if migration_backend and migration_backend not in valid_migration_backends: - raise HTTPException( - status_code=400, - detail=f"migration_backend must be one of {sorted(valid_migration_backends)}", - ) - merged["migration_backend"] = migration_backend if migration_backend else None - else: - merged["migration_backend"] = None - node_rank = merged.get("node_rank") - if node_rank is not None: - merged["node_rank"] = _as_int("node_rank", minimum=0) - nnodes = merged.get("nnodes") - if nnodes is not None: - merged["nnodes"] = _as_int("nnodes", minimum=1) - cp = merged.get("cp") - if cp is not None: - merged["cp"] = _as_int("cp", minimum=1) - merged["enable_return_routed_experts"] = bool( - 
merged.get("enable_return_routed_experts", False) - ) - distributed_executor_backend = merged.get("distributed_executor_backend") - if distributed_executor_backend is not None: - distributed_executor_backend = str(distributed_executor_backend).strip() - valid_executor_backends = {"uni", "mp", "ray"} - if ( - distributed_executor_backend - and distributed_executor_backend not in valid_executor_backends - ): - raise HTTPException( - status_code=400, - detail=f"distributed_executor_backend must be one of {sorted(valid_executor_backends)}", - ) - merged["distributed_executor_backend"] = ( - distributed_executor_backend if distributed_executor_backend else None - ) - else: - merged["distributed_executor_backend"] = None - - # Vision parameters validation - vision_max_batch_size = merged.get("vision_max_batch_size") - if vision_max_batch_size is not None: - merged["vision_max_batch_size"] = _as_int("vision_max_batch_size", minimum=1) - - # Speculative decoding parameters validation - speculative_algorithm = merged.get("speculative_algorithm") - if speculative_algorithm is not None: - speculative_algorithm = str(speculative_algorithm).strip() - valid_speculative_algorithms = {"eagle", "eagle3", "deepseek_mtp"} - if ( - speculative_algorithm - and speculative_algorithm not in valid_speculative_algorithms - ): - raise HTTPException( - status_code=400, - detail=f"speculative_algorithm must be one of {sorted(valid_speculative_algorithms)}", - ) - merged["speculative_algorithm"] = ( - speculative_algorithm if speculative_algorithm else None - ) - else: - merged["speculative_algorithm"] = None - speculative_draft_model = merged.get("speculative_draft_model") - if speculative_draft_model is not None: - speculative_draft_model = str(speculative_draft_model).strip() - merged["speculative_draft_model"] = ( - speculative_draft_model if speculative_draft_model else None - ) - else: - merged["speculative_draft_model"] = None - speculative_num_draft_tokens = 
merged.get("speculative_num_draft_tokens") - if speculative_num_draft_tokens is not None: - merged["speculative_num_draft_tokens"] = _as_int( - "speculative_num_draft_tokens", minimum=1 - ) - - # Boolean/style cleanups - merged["use_streaming"] = bool(merged.get("use_streaming", True)) - additional_args = merged.get("additional_args") - if additional_args is None: - merged["additional_args"] = "" - elif not isinstance(additional_args, str): - raise HTTPException(status_code=400, detail="additional_args must be a string") - - # Build hf_overrides from individual fields or use provided hf_overrides - hf_overrides_dict = _normalize_hf_overrides(merged.get("hf_overrides")) - - # If scaling is enabled and model_max_length > max_position_embeddings, - # automatically set original_max_position_embeddings in HF overrides - if scaling_mode != "disabled": - metadata = manifest_entry.get("metadata") or {} - config_data = ( - metadata.get("config", {}) - if isinstance(metadata.get("config"), dict) - else {} - ) - model_max_length = _coerce_positive_int(metadata.get("model_max_length")) - max_position_embeddings = _coerce_positive_int( - config_data.get("max_position_embeddings") - ) - - if ( - model_max_length - and max_position_embeddings - and model_max_length > max_position_embeddings - ): - # Set original_max_position_embeddings to adapted base (model_max_length / 4) - adapted_base = int(model_max_length / 4) - if adapted_base >= 1024: - hf_overrides_dict.setdefault("rope_scaling", {}) - hf_overrides_dict["rope_scaling"][ - "original_max_position_embeddings" - ] = adapted_base - # Also set rope_type if not already set and scaling mode is yarn - if ( - scaling_mode == "yarn" - and "rope_type" not in hf_overrides_dict["rope_scaling"] - ): - hf_overrides_dict["rope_scaling"]["rope_type"] = "yarn" - # Set factor if not already set - if "factor" not in hf_overrides_dict["rope_scaling"]: - hf_overrides_dict["rope_scaling"]["factor"] = scaling_factor - elif 
max_position_embeddings and max_position_embeddings >= 1024: - # Fallback: use max_position_embeddings directly - hf_overrides_dict.setdefault("rope_scaling", {}) - hf_overrides_dict["rope_scaling"][ - "original_max_position_embeddings" - ] = max_position_embeddings - # Also set rope_type if not already set and scaling mode is yarn - if ( - scaling_mode == "yarn" - and "rope_type" not in hf_overrides_dict["rope_scaling"] - ): - hf_overrides_dict["rope_scaling"]["rope_type"] = "yarn" - # Set factor if not already set - if "factor" not in hf_overrides_dict["rope_scaling"]: - hf_overrides_dict["rope_scaling"]["factor"] = scaling_factor - - merged["hf_overrides"] = hf_overrides_dict - - return merged - - class BundleProgressProxy: """Proxy progress manager that converts per-file progress into bundle-level updates.""" @@ -1228,12 +710,6 @@ async def get_cached_gpu_info() -> Dict[str, Any]: download_lock = asyncio.Lock() -class EstimationRequest(BaseModel): - model_id: str # YAML model id - config: dict - usage_mode: Optional[str] = "single_user" - - class SafetensorsBundleRequest(BaseModel): huggingface_id: str model_id: Optional[int] = None @@ -1254,7 +730,10 @@ async def list_models(): from backend.llama_swap_client import LlamaSwapClient store = get_store() - models = [m for m in store.list_models() if (m.get("format") or m.get("model_format") or "gguf") == "gguf"] + # Include all stored models (GGUF and safetensors). GGUF entries appear as + # individual quantizations; safetensors entries appear as a single logical + # quantization per repo with format "safetensors". 
+ models = list(store.list_models()) try: running_data = await LlamaSwapClient().get_running_models() running_list = running_data.get("running") or [] @@ -1266,7 +745,7 @@ async def list_models(): for model in models: hf_id = model.get("huggingface_id") or "" base_name = model.get("base_model_name") or (hf_id.split("/")[-1] if hf_id else model.get("display_name") or "unknown") - proxy_name = generate_proxy_name(hf_id, model.get("quantization")) + proxy_name = resolve_proxy_name(model) is_active = proxy_name in running_names is_embedding = _model_is_embedding(model) key = f"{hf_id}_{base_name}" @@ -1287,14 +766,21 @@ async def list_models(): if is_embedding and not grouped_models[key].get("is_embedding_model"): grouped_models[key]["is_embedding_model"] = True - # Resolve actual disk size: prefer HF cache, fall back to stored value - resolved_path = _get_model_file_path(model) - file_size = _get_actual_file_size(resolved_path) or model.get("file_size") or 0 + # Resolve actual disk size: + # - For HF-backed GGUF models (identified by huggingface_id + quantization), + # trust the aggregated file_size stored on the model record. + # - For legacy/local models, fall back to resolving a concrete file_path. + if (model.get("format") or model.get("model_format") or "gguf") == "gguf" and model.get("huggingface_id") and model.get("quantization"): + file_size = model.get("file_size") or 0 + else: + legacy_path = model.get("file_path") + file_size = _get_actual_file_size(legacy_path) or model.get("file_size") or 0 grouped_models[key]["quantizations"].append({ "id": model.get("id"), "name": model.get("display_name") or model.get("name"), - "filename": _get_model_filename(model), + # No filename persisted for GGUF models; a model is a single logical + # entity per (huggingface_id, quantization). 
"file_size": file_size, "quantization": model.get("quantization"), "format": model.get("format") or model.get("model_format") or "gguf", @@ -1302,6 +788,7 @@ async def list_models(): "downloaded_at": model.get("downloaded_at"), "is_active": is_active, "has_config": bool(model.get("config")), + "mmproj_filename": model.get("mmproj_filename"), "huggingface_id": hf_id, "base_model_name": base_name, "model_type": model.get("model_type"), @@ -1433,15 +920,9 @@ async def delete_safetensors_model(request: dict): if not target_model or (target_model.get("format") or target_model.get("model_format")) != "safetensors": raise HTTPException(status_code=404, detail="Safetensors model not found") - manager = get_lmdeploy_manager() - status = manager.status() - if status.get("running"): - current = status.get("current_instance") or {} - if str(current.get("model_id")) == str(model_id): - raise HTTPException( - status_code=400, - detail="Cannot delete a model currently served by LMDeploy", - ) + # LMDeploy runtime is now managed via llama-swap; safetensors models + # are served through the same generic start/stop flow, so we don't + # need to special-case LMDeploy here. from backend.huggingface import ( get_safetensors_manifest_entries, @@ -1471,16 +952,8 @@ async def reload_safetensors_from_disk(): from backend.huggingface import ( SAFETENSORS_DIR, record_safetensors_download, - get_default_lmdeploy_config, ) - manager = get_lmdeploy_manager() - if manager.status().get("running"): - raise HTTPException( - status_code=400, - detail="Cannot reload safetensors models while LMDeploy runtime is active. 
Please stop the runtime first.", - ) - store = get_store() safetensors_models = [ m for m in store.list_models() @@ -1587,46 +1060,6 @@ async def reload_safetensors_from_disk(): raise HTTPException(status_code=500, detail=str(e)) -@router.get("/safetensors/{model_id:path}/lmdeploy/config") -async def get_lmdeploy_config_endpoint(model_id: str): - """Return stored LMDeploy config and metadata for a safetensors model.""" - store = get_store() - model = _get_safetensors_model(store, model_id) - manifest_entry = _load_manifest_entry_for_model(model) - metadata = manifest_entry.get("metadata") or {} - tensor_summary = manifest_entry.get("tensor_summary") or {} - max_context = manifest_entry.get("max_context_length") or metadata.get( - "max_context_length" - ) - config = (manifest_entry.get("lmdeploy") or {}).get( - "config" - ) or get_default_lmdeploy_config(max_context) - manager_status = get_lmdeploy_manager().status() - installer_status = get_lmdeploy_installer().status() - return { - "config": config, - "metadata": metadata, - "tensor_summary": tensor_summary, - "max_context_length": max_context, - "manager": manager_status, - "installer": installer_status, - } - - -@router.put("/safetensors/{model_id:path}/lmdeploy/config") -async def update_lmdeploy_config_endpoint(model_id: str, request: Dict[str, Any]): - """Persist LMDeploy configuration changes for a safetensors model.""" - store = get_store() - model = _get_safetensors_model(store, model_id) - manifest_entry = _load_manifest_entry_for_model(model) - validated_config = _validate_lmdeploy_config(request, manifest_entry) - updated_entry = update_lmdeploy_config(model.get("huggingface_id"), validated_config) - return { - "config": updated_entry.get("lmdeploy", {}).get("config", validated_config), - "updated_at": updated_entry.get("lmdeploy", {}).get("updated_at"), - } - - @router.post("/safetensors/{model_id:path}/metadata/regenerate") async def regenerate_safetensors_metadata_endpoint(model_id: str): """Refresh 
safetensors metadata/manifest entries without redownloading files.""" @@ -1705,11 +1138,6 @@ async def regenerate_safetensors_metadata_endpoint(model_id: str): if max_context: manifest["max_context_length"] = max_context - manifest.setdefault("lmdeploy", {}) - manifest["lmdeploy"].setdefault( - "config", get_default_lmdeploy_config(manifest.get("max_context_length")) - ) - save_safetensors_manifest_entries(huggingface_id, manifest) return { "message": f"Metadata regenerated for {huggingface_id}", @@ -1718,153 +1146,6 @@ async def regenerate_safetensors_metadata_endpoint(model_id: str): } -@router.get("/safetensors/lmdeploy/status") -async def get_lmdeploy_status(): - """Return LMDeploy runtime status and running instance info.""" - installer = get_lmdeploy_installer() - installer_status = installer.status() - if not installer_status.get("installed"): - raise HTTPException( - status_code=400, - detail="LMDeploy is not installed. Install it from the LMDeploy page before starting a runtime.", - ) - if installer_status.get("operation"): - raise HTTPException( - status_code=409, - detail="An LMDeploy install/remove operation is still running. 
Try again once it finishes.", - ) - - manager = get_lmdeploy_manager() - manager_status = manager.status() - - # Use manager's in-memory current_instance (no DB) - instance_payload = None - if manager_status.get("running"): - current_instance = manager_status.get("current_instance") - if current_instance: - instance_payload = { - "model_id": current_instance.get("model_id"), - "started_at": current_instance.get("started_at"), - "config": current_instance.get("config") if isinstance(current_instance.get("config"), dict) else {}, - } - - return { - "manager": manager_status, - "installer": installer.status(), - "running_instance": instance_payload, - } - - -@router.post("/safetensors/{model_id:path}/lmdeploy/start") -async def start_lmdeploy_runtime( - model_id: str, - request: Optional[Dict[str, Any]] = None, -): - """Start LMDeploy runtime for a safetensors model.""" - store = get_store() - model = _get_safetensors_model(store, model_id) - manifest_entry = _load_manifest_entry_for_model(model) - requested_config = ( - (request or {}).get("config") if isinstance(request, dict) else None - ) - validated_config = _validate_lmdeploy_config(requested_config, manifest_entry) - - manager = get_lmdeploy_manager() - status = manager.status() - current_instance = status.get("current_instance") or {} - if status.get("running"): - if current_instance.get("model_id") == model.get("id"): - raise HTTPException( - status_code=400, detail="LMDeploy is already running for this model" - ) - raise HTTPException( - status_code=400, - detail="Another safetensors model is already running via LMDeploy", - ) - - update_lmdeploy_config(model.get("huggingface_id"), validated_config) - - try: - pm = get_progress_manager() - await pm.send_model_status_update( - model_id=model.get("id"), - status="starting", - details={ - "runtime": "lmdeploy", - "message": f"Starting LMDeploy for {model.get('display_name') or model.get('name')}", - }, - ) - except Exception: - pass - - try: - display_name = 
model.get("huggingface_id") or model.get("base_model_name") or model.get("display_name") or model.get("name") - resolved_file_path = _get_model_file_path(model) - model_dir = os.path.dirname(resolved_file_path or "") - runtime_status = await manager.start( - { - "model_id": model.get("id"), - "huggingface_id": model.get("huggingface_id"), - "file_path": resolved_file_path, - "model_dir": model_dir, - "model_name": display_name, - "display_name": display_name, - }, - validated_config, - ) - except Exception as exc: - try: - await get_progress_manager().send_model_status_update( - model_id=model.get("id"), - status="error", - details={"runtime": "lmdeploy", "message": str(exc)}, - ) - except Exception: - pass - raise HTTPException(status_code=500, detail=str(exc)) - - try: - await get_progress_manager().send_model_status_update( - model_id=model.get("id"), - status="running", - details={"runtime": "lmdeploy", "message": "LMDeploy is ready"}, - ) - except Exception: - pass - - return {"manager": runtime_status, "config": validated_config} - - -@router.post("/safetensors/{model_id:path}/lmdeploy/stop") -async def stop_lmdeploy_runtime(model_id: str): - """Stop the LMDeploy runtime if it is running.""" - manager = get_lmdeploy_manager() - status = manager.status() - if not status.get("running"): - raise HTTPException(status_code=404, detail="No LMDeploy runtime is active") - current_instance = status.get("current_instance") or {} - if str(current_instance.get("model_id")) != str(model_id): - raise HTTPException( - status_code=400, detail="A different model is currently running in LMDeploy" - ) - - try: - await manager.stop() - except Exception as exc: - raise HTTPException(status_code=500, detail=str(exc)) - - try: - await get_progress_manager().send_model_status_update( - model_id=model_id, - status="stopped", - details={"runtime": "lmdeploy", "message": "LMDeploy runtime stopped"}, - ) - except Exception: - pass - - return {"message": "LMDeploy runtime stopped"} - - 
@router.post("/download") async def download_huggingface_model( request: dict, background_tasks: BackgroundTasks @@ -1900,8 +1181,9 @@ async def download_huggingface_model( ) store = get_store() + is_mmproj_download = model_format == "gguf" and "mmproj" in filename.lower() # Check if this specific quantization already exists - if model_format == "gguf": + if model_format == "gguf" and not is_mmproj_download: quantization = _extract_quantization(filename) model_id = f"{huggingface_id.replace('/', '--')}--{quantization}" if store.get_model(model_id): @@ -1911,7 +1193,9 @@ async def download_huggingface_model( # Extract quantization for better task_id (use same function as search results) quantization = ( - _extract_quantization(filename) + os.path.splitext(os.path.basename(filename))[0] + if is_mmproj_download + else _extract_quantization(filename) if model_format == "gguf" else os.path.splitext(filename)[0] ) @@ -2023,6 +1307,7 @@ async def download_model_task( try: model_record = None metadata_result = None + is_mmproj_download = model_format == "gguf" and "mmproj" in filename.lower() if progress_manager and task_id: file_path, file_size = await download_model_with_progress( @@ -2039,7 +1324,7 @@ async def download_model_task( huggingface_id, filename, model_format ) - if model_format == "gguf": + if model_format == "gguf" and not is_mmproj_download: model_record, metadata_result = await _record_gguf_download_post_fetch( store, huggingface_id, @@ -2047,29 +1332,10 @@ async def download_model_task( file_path, file_size, pipeline_tag=pipeline_tag, + aggregate_size=True, ) - # If vision (mmproj) is available, download F16 projector so the model can run with vision - if model_record: - mmproj_filename = get_mmproj_f16_filename(huggingface_id) - if mmproj_filename: - try: - await download_model( - huggingface_id, mmproj_filename, "gguf" - ) - store.update_model( - model_record["id"], {"mmproj_filename": mmproj_filename} - ) - model_record = 
store.get_model(model_record["id"]) or model_record - if progress_manager and task_id: - await progress_manager.send_notification( - title="Vision extension", - message=f"Downloaded {mmproj_filename} for vision support", - type="info", - ) - except Exception as mmproj_err: - logger.warning( - f"Could not download vision projector {mmproj_filename} for {huggingface_id}: {mmproj_err}" - ) + elif model_format == "gguf": + logger.info("Downloaded standalone mmproj file for %s: %s", huggingface_id, filename) else: model_record = await _save_safetensors_download( store, @@ -2137,13 +1403,20 @@ async def _record_gguf_download_post_fetch( file_path: str, file_size: int, pipeline_tag: Optional[str] = None, + aggregate_size: bool = True, ) -> Tuple[dict, Optional[Dict[str, Any]]]: """ Shared helper to create GGUF model entries and manifest after a file has been downloaded. Returns (model_record dict, metadata_result). """ quantization = _extract_quantization(filename) - base_model_name = extract_base_model_name(filename) + # Derive the base model name from the Hugging Face repo id instead of any + # specific filename. For typical repos like "unsloth/Qwen3.5-0.8B-GGUF", + # this yields "Qwen3.5-0.8B". + repo_name = huggingface_id.split("/")[-1] if isinstance(huggingface_id, str) else "" + base_model_name = repo_name + if repo_name.endswith("-GGUF"): + base_model_name = repo_name[: -len("-GGUF")] detected_pipeline = pipeline_tag is_embedding_like = _looks_like_embedding_model( detected_pipeline, @@ -2160,18 +1433,20 @@ async def _record_gguf_download_post_fetch( if not model_record: from datetime import timezone as _tz + # New GGUF records do not persist any per-file name. The model is a single + # logical entity identified by (huggingface_id, quantization). 
model_record = { "id": model_id, "huggingface_id": huggingface_id, - "filename": filename, - "display_name": filename.replace(".gguf", ""), + "display_name": f"{base_model_name}-{quantization}", "base_model_name": base_model_name, - "file_size": file_size, + "file_size": file_size if aggregate_size else 0, "quantization": quantization, "model_type": extract_model_type(filename), "proxy_name": generate_proxy_name(huggingface_id, quantization), + # Persist only the canonical "format" field. "model_format" is still + # read for backward compatibility but no longer written for new records. "format": "gguf", - "model_format": "gguf", "downloaded_at": datetime.now(_tz.utc).isoformat(), "pipeline_tag": detected_pipeline, "config": {"embedding": True} if is_embedding_like else {}, @@ -2179,7 +1454,7 @@ async def _record_gguf_download_post_fetch( store.add_model(model_record) else: updates = {} - if file_size and file_size > 0: + if aggregate_size and file_size and file_size > 0: current_size = model_record.get("file_size") or 0 updates["file_size"] = current_size + file_size if not model_record.get("pipeline_tag") and detected_pipeline: @@ -2211,12 +1486,6 @@ async def _record_gguf_download_post_fetch( ) except Exception as manifest_exc: logger.warning(f"Failed to record GGUF manifest entry for {filename}: {manifest_exc}") - if manifest_entry: - metadata_for_defaults = manifest_entry.get("metadata") or {} - try: - _apply_hf_defaults_to_model(model_record, metadata_for_defaults, store) - except Exception as default_exc: - logger.warning(f"Failed to apply HF defaults for model {model_record.get('id')}: {default_exc}") return model_record, metadata_result @@ -2336,16 +1605,21 @@ async def download_gguf_bundle_task( task_id: str, total_bundle_bytes: int = 0, pipeline_tag: Optional[str] = None, + projector: Optional[Dict[str, Any]] = None, ): store = get_store() try: - total_files = len(files) + total_files = len(files) + (1 if projector and projector.get("filename") else 0) 
bytes_completed = 0 aggregate_total = total_bundle_bytes or sum( max(f.get("size") or 0, 0) for f in files ) aggregate_total = aggregate_total or None + # Track the total on-disk size of all GGUF shards for this quantization only + # (projector size is stored separately on the model record). + bundle_model_bytes = 0 + for index, file_info in enumerate(files): filename = file_info["filename"] size_hint = max(file_info.get("size") or 0, 0) @@ -2372,6 +1646,9 @@ async def download_gguf_bundle_task( ) try: + # For bundles, record manifest/metadata per shard but do not + # increment the model's stored file_size here. We will set the + # final aggregated size once at the end of the bundle download. await _record_gguf_download_post_fetch( store, huggingface_id, @@ -2379,11 +1656,61 @@ async def download_gguf_bundle_task( file_path, file_size, pipeline_tag=pipeline_tag, + aggregate_size=False, ) except Exception as exc: logger.error(f"Failed to record GGUF download for {filename}: {exc}") bytes_completed += file_size + bundle_model_bytes += file_size + + model_id = f"{huggingface_id.replace('/', '--')}--{quantization}" + model_record = store.get_model(model_id) + + projector_filename = (projector or {}).get("filename") + if projector_filename and model_record: + projector_size_hint = max(int((projector or {}).get("size") or 0), 0) + cached_projector = resolve_cached_model_path(huggingface_id, projector_filename) + if cached_projector and os.path.exists(cached_projector): + try: + bytes_completed += os.path.getsize(cached_projector) + except OSError: + bytes_completed += projector_size_hint + else: + proxy = BundleProgressProxy( + progress_manager, + task_id, + bytes_completed, + aggregate_total or 0, + len(files), + total_files, + projector_filename, + huggingface_id, + "gguf-bundle", + ) + _, projector_file_size = await download_model_with_progress( + huggingface_id, + projector_filename, + proxy, + task_id, + projector_size_hint, + "gguf", + huggingface_id, + ) + 
bytes_completed += projector_file_size + + store.update_model(model_id, {"mmproj_filename": projector_filename}) + + # Persist the aggregated GGUF shard size on the model record once, + # after all shards have been downloaded. + if model_record and bundle_model_bytes > 0: + try: + store.update_model(model_id, {"file_size": bundle_model_bytes}) + model_record = store.get_model(model_id) or model_record + except Exception as size_exc: + logger.warning( + f"Failed to update aggregated GGUF size for {model_id}: {size_exc}" + ) final_total = aggregate_total or bytes_completed await progress_manager.send_download_progress( @@ -2410,6 +1737,8 @@ async def download_gguf_bundle_task( "model_format": "gguf-bundle", "quantization": quantization, "filenames": [f["filename"] for f in files], + "mmproj_filename": projector_filename, + "model_id": model_id, "timestamp": datetime.utcnow().isoformat(), } ) @@ -2508,6 +1837,8 @@ async def download_gguf_bundle( quantization = request.get("quantization") files = request.get("files") or [] pipeline_tag = request.get("pipeline_tag") + projector_filename = (request.get("mmproj_filename") or "").strip() + projector_size = max(int(request.get("mmproj_size") or 0), 0) if not huggingface_id: raise HTTPException(status_code=400, detail="huggingface_id is required") @@ -2515,6 +1846,8 @@ async def download_gguf_bundle( raise HTTPException(status_code=400, detail="quantization is required") if not files: raise HTTPException(status_code=400, detail="Repository file list is required") + if projector_filename and not _is_mmproj_filename(projector_filename): + raise HTTPException(status_code=400, detail="Invalid projector filename") sanitized_files = [] declared_total = 0 @@ -2529,6 +1862,11 @@ async def download_gguf_bundle( if not sanitized_files: raise HTTPException(status_code=400, detail="No valid files to download") + projector_payload = None + if projector_filename: + declared_total += projector_size + projector_payload = {"filename": 
projector_filename, "size": projector_size} + task_id = f"download_gguf_bundle_{huggingface_id.replace('/', '_')}_{quantization}_{int(time.time() * 1000)}" async with download_lock: @@ -2560,6 +1898,7 @@ async def download_gguf_bundle( task_id, declared_total, pipeline_tag, + projector_payload, ) return { @@ -2573,6 +1912,150 @@ async def download_gguf_bundle( # Removed duplicate extract_quantization; use `_extract_quantization` from backend.huggingface +async def download_model_projector_task( + model_id: str, + mmproj_filename: str, + progress_manager, + task_id: str, + total_bytes: int = 0, +): + store = get_store() + try: + model = store.get_model(model_id) + if not model: + raise RuntimeError("Model no longer exists") + + huggingface_id = model.get("huggingface_id") + if not huggingface_id: + raise RuntimeError("Model is missing huggingface_id") + + cached_path = resolve_cached_model_path(huggingface_id, mmproj_filename) + if cached_path and os.path.exists(cached_path): + file_path = cached_path + try: + file_size = os.path.getsize(cached_path) + except OSError: + file_size = max(int(total_bytes or 0), 0) + else: + file_path, file_size = await download_model_with_progress( + huggingface_id, + mmproj_filename, + progress_manager, + task_id, + total_bytes, + "gguf", + huggingface_id, + ) + + store.update_model(model_id, {"mmproj_filename": mmproj_filename}) + await _regenerate_llama_swap_config(f"projector update for {model_id}") + + if progress_manager: + progress_manager.complete_task(task_id, f"Applied projector {mmproj_filename}") + await progress_manager.broadcast( + { + "type": "download_complete", + "huggingface_id": huggingface_id, + "model_format": "gguf-projector", + "model_id": model_id, + "filename": mmproj_filename, + "mmproj_filename": mmproj_filename, + "file_size": file_size, + "file_path": file_path, + "timestamp": datetime.utcnow().isoformat(), + } + ) + await progress_manager.send_notification( + title="Projector Ready", + message=f"Applied 
projector {mmproj_filename}", + type="success", + ) + except Exception as exc: + if progress_manager: + progress_manager.fail_task(task_id, str(exc)) + await progress_manager.send_notification( + title="Projector Update Failed", + message=str(exc), + type="error", + ) + finally: + if task_id: + async with download_lock: + active_downloads.pop(task_id, None) + + +@router.post("/{model_id:path}/projector") +async def update_model_projector( + model_id: str, + request: dict, + background_tasks: BackgroundTasks, +): + store = get_store() + model = _get_model_or_404(store, model_id) + if (model.get("format") or model.get("model_format")) != "gguf": + raise HTTPException(status_code=400, detail="Projectors are only supported for GGUF models") + + mmproj_filename = (request.get("mmproj_filename") or "").strip() or None + total_bytes = max(int(request.get("total_bytes") or 0), 0) + + if mmproj_filename and not _is_mmproj_filename(mmproj_filename): + raise HTTPException(status_code=400, detail="Invalid projector filename") + + current_projector = model.get("mmproj_filename") + if mmproj_filename == current_projector: + return {"message": "Projector already selected", "applied": True} + + if not mmproj_filename: + store.update_model(model_id, {"mmproj_filename": None}) + await _regenerate_llama_swap_config(f"projector cleared for {model_id}") + return {"message": "Projector cleared", "applied": True} + + huggingface_id = model.get("huggingface_id") + cached_path = resolve_cached_model_path(huggingface_id, mmproj_filename) + if cached_path and os.path.exists(cached_path): + store.update_model(model_id, {"mmproj_filename": mmproj_filename}) + await _regenerate_llama_swap_config(f"projector update for {model_id}") + return {"message": "Projector applied", "applied": True} + + task_id = f"download_projector_{model_id.replace('/', '_')}_{int(time.time() * 1000)}" + async with download_lock: + is_downloading = any( + d.get("model_id") == model_id + and d.get("filename") == 
mmproj_filename + and d.get("model_format") == "gguf-projector" + for d in active_downloads.values() + ) + if is_downloading: + raise HTTPException(status_code=409, detail="This projector is already being applied") + active_downloads[task_id] = { + "huggingface_id": huggingface_id, + "model_id": model_id, + "filename": mmproj_filename, + "model_format": "gguf-projector", + } + + pm = get_progress_manager() + pm.create_task( + "download", + f"Projector {mmproj_filename}", + {"huggingface_id": huggingface_id, "filename": mmproj_filename, "model_id": model_id}, + task_id=task_id, + ) + background_tasks.add_task( + download_model_projector_task, + model_id, + mmproj_filename, + pm, + task_id, + total_bytes, + ) + return { + "message": "Projector download started", + "task_id": task_id, + "applied": False, + } + + def extract_model_type(filename: str) -> str: """Extract model type from filename""" filename_lower = filename.lower() @@ -2584,6 +2067,10 @@ def extract_model_type(filename: str) -> str: return "codellama" elif "gemma" in filename_lower: return "gemma" + # Heuristic: treat any Qwen-family filename as "qwen" unless a more + # specific architecture is provided by GGUF metadata later. + elif "qwen" in filename_lower: + return "qwen" return "unknown" @@ -2612,6 +2099,40 @@ def extract_base_model_name(filename: str) -> str: return name if name else filename +@router.get("/{model_id:path}/limits") +async def get_model_limits(model_id: str): + """ + Return model limits in an engine-agnostic way. Always uses the Hugging Face + model card (config.json / model info). + - max_context_length: from model card (model_max_length / max_position_embeddings). + - layer_count: from model card config (num_hidden_layers / n_layer / num_layers). 
+ """ + store = get_store() + model = _get_model_or_404(store, model_id) + hf_id = model.get("huggingface_id") + if not hf_id: + return {"max_context_length": None, "layer_count": None} + + max_ctx = None + layer_count = None + try: + details = await get_model_details(hf_id) + config = details.get("config") or {} + max_ctx = details.get("model_max_length") or config.get("max_position_embeddings") + if isinstance(max_ctx, (int, float)) and max_ctx > 0: + max_ctx = int(max_ctx) + else: + max_ctx = None + for key in ("num_hidden_layers", "n_layer", "num_layers"): + val = config.get(key) + if isinstance(val, (int, float)) and val > 0: + layer_count = int(val) + break + except Exception: + pass + return {"max_context_length": max_ctx, "layer_count": layer_count} + + @router.get("/{model_id:path}/config") async def get_model_config(model_id: str): """Get model's llama.cpp configuration""" @@ -2640,43 +2161,6 @@ async def update_model_config(model_id: str, config: dict): return {"message": "Configuration updated"} -# DEPRECATED: remove with ModelConfig.vue rewrite -@router.post("/{model_id:path}/auto-config") -async def generate_auto_config(model_id: str): - """Stub: return current config (Smart Auto removed). 
Optionally apply defaults.""" - store = get_store() - model = _get_model_or_404(store, model_id) - config = (model.get("config") or {}).copy() - config.setdefault("ctx_size", 2048) - config.setdefault("batch_size", 512) - config.setdefault("threads", 4) - config.setdefault("n_gpu_layers", -1) - store.update_model(model_id, {"config": config}) - return config - - -# DEPRECATED: remove with ModelConfig.vue rewrite -@router.post("/{model_id:path}/smart-auto") -async def generate_smart_auto_config( - model_id: str, - preset: Optional[str] = None, - usage_mode: str = "single_user", - speed_quality: Optional[int] = None, - use_case: Optional[str] = None, - debug: Optional[bool] = False, -): - """Stub: apply defaults (Smart Auto removed).""" - store = get_store() - model = _get_model_or_404(store, model_id) - config = (model.get("config") or {}).copy() - config.setdefault("ctx_size", 2048) - config.setdefault("batch_size", 512) - config.setdefault("threads", 4) - config.setdefault("n_gpu_layers", -1) - store.update_model(model_id, {"config": config}) - return config - - @router.post("/{model_id:path}/start") async def start_model(model_id: str): """Start model via llama-swap""" @@ -2684,9 +2168,7 @@ async def start_model(model_id: str): store = get_store() model = _get_model_or_404(store, model_id) - proxy_model_name = model.get("proxy_name") or generate_proxy_name( - model.get("huggingface_id"), model.get("quantization") - ) + proxy_model_name = resolve_proxy_name(model) try: running_data = await LlamaSwapClient().get_running_models() @@ -2717,6 +2199,9 @@ async def start_model(model_id: str): await llama_swap_manager.regenerate_config_with_active_version() model_with_proxy = {**(model or {}), "proxy_name": proxy_model_name} await llama_swap_manager.register_model(model_with_proxy, config) + client = LlamaSwapClient() + client.mark_model_loading(proxy_model_name) + await client.load_model(proxy_model_name) except Exception as e: try: await 
get_progress_manager().send_model_status_update( @@ -2748,9 +2233,7 @@ async def stop_model(model_id: str): store = get_store() model = _get_model_or_404(store, model_id) - proxy_name = model.get("proxy_name") or generate_proxy_name( - model.get("huggingface_id"), model.get("quantization") - ) + proxy_name = resolve_proxy_name(model) try: running_data = await LlamaSwapClient().get_running_models() @@ -2862,7 +2345,7 @@ async def delete_model_group(request: DeleteGroupRequest): deleted_count = 0 for model in models: - proxy_name = model.get("proxy_name") or generate_proxy_name(model.get("huggingface_id"), model.get("quantization")) + proxy_name = resolve_proxy_name(model) if proxy_name in running_names: try: from backend.llama_swap_manager import get_llama_swap_manager @@ -2870,15 +2353,6 @@ async def delete_model_group(request: DeleteGroupRequest): except Exception as e: logger.warning(f"Failed to stop model {proxy_name}: {e}") - fname = _get_model_filename(model) - if model.get("huggingface_id") and fname: - from backend.huggingface import delete_cached_model_file - deleted_file = delete_cached_model_file(model.get("huggingface_id"), fname) - if not deleted_file: - legacy_path = _normalize_model_path(model.get("file_path")) - if legacy_path and os.path.exists(legacy_path): - os.remove(legacy_path) - store.delete_model(model.get("id")) deleted_count += 1 @@ -2892,7 +2366,7 @@ async def delete_model(model_id: str): store = get_store() model = _get_model_or_404(store, model_id) - proxy_name = model.get("proxy_name") or generate_proxy_name(model.get("huggingface_id"), model.get("quantization")) + proxy_name = resolve_proxy_name(model) try: running_data = await LlamaSwapClient().get_running_models() @@ -2907,142 +2381,10 @@ async def delete_model(model_id: str): except Exception as e: logger.warning(f"Failed to stop model {proxy_name}: {e}") - huggingface_id = model.get("huggingface_id") - filename = _get_model_filename(model) - - if huggingface_id and filename: - from 
backend.huggingface import delete_cached_model_file - deleted = delete_cached_model_file(huggingface_id, filename) - if not deleted: - # Fall back to direct removal for legacy records with file_path - legacy_path = _normalize_model_path(model.get("file_path")) - if legacy_path and os.path.exists(legacy_path): - os.remove(legacy_path) - logger.info(f"Removed legacy model file: {legacy_path}") - store.delete_model(model_id) return {"message": "Model quantization deleted"} -# DEPRECATED: remove with ModelConfig.vue rewrite -@router.get("/{model_id:path}/layer-info") -async def get_model_layer_info_endpoint(model_id: str): - """Get model layer information from GGUF metadata""" - store = get_store() - model = _get_model_or_404(store, model_id) - - layer_info = None - normalized_path = _get_model_file_path(model) - if normalized_path and os.path.exists(normalized_path): - try: - layer_info = get_model_layer_info(normalized_path) - except Exception as e: - logger.error(f"Failed to get layer info for model {model_id}: {e}") - if layer_info: - return { - "layer_count": layer_info["layer_count"], - "architecture": layer_info["architecture"], - "context_length": layer_info["context_length"], - "parameter_count": layer_info.get( - "parameter_count" - ), # Formatted as "32B", "36B", etc. - "vocab_size": layer_info["vocab_size"], - "embedding_length": layer_info["embedding_length"], - "attention_head_count": layer_info["attention_head_count"], - "attention_head_count_kv": layer_info["attention_head_count_kv"], - "block_count": layer_info["block_count"], - "is_moe": layer_info.get("is_moe", False), - "expert_count": layer_info.get("expert_count", 0), - "experts_used_count": layer_info.get("experts_used_count", 0), - } - # Fallback to default values if metadata unavailable - logger.warning( - f"Using default layer info fallback (32 layers) for model_id={model_id}; " - "GGUF metadata could not be read or did not provide layer information." 
- ) - return { - "layer_count": 32, - "architecture": "unknown", - "context_length": 0, - "vocab_size": 0, - "embedding_length": 0, - "attention_head_count": 0, - "attention_head_count_kv": 0, - "block_count": 0, - "is_moe": False, - "expert_count": 0, - "experts_used_count": 0, - } - - -# DEPRECATED: remove with ModelConfig.vue rewrite -@router.get("/{model_id:path}/recommendations") -async def get_model_recommendations_endpoint(model_id: str): - """Stub: recommendations removed with smart_auto. Returns empty defaults.""" - return {"gpu_layers": None, "context_size": None, "batch_size": None} - - -# DEPRECATED: remove with ModelConfig.vue rewrite -@router.get("/{model_id:path}/architecture-presets") -async def get_architecture_presets_endpoint(model_id: str): - """Stub: presets removed. Returns minimal structure.""" - return {"architecture": "unknown", "presets": {}, "available_presets": []} - - -# DEPRECATED: remove with ModelConfig.vue rewrite -@router.post("/vram-estimate") -async def estimate_vram_usage(request: EstimationRequest): - """Stub: simple VRAM estimate (smart_auto removed).""" - store = get_store() - _get_model_or_404(store, request.model_id) - cfg = request.config or {} - ngl = int(cfg.get("n_gpu_layers") or -1) - ctx = int(cfg.get("ctx_size") or 2048) - # Very rough: ~1GB base + per-layer and context - estimate_mb = 1024 + (abs(ngl) * 50 if ngl != -1 else 2000) + (ctx // 64) - return {"vram_estimate_mb": min(estimate_mb, 96 * 1024), "vram_estimate_gb": round(estimate_mb / 1024, 2)} - - -# DEPRECATED: remove with ModelConfig.vue rewrite -@router.post("/ram-estimate") -async def estimate_ram_usage(request: EstimationRequest): - """Stub: simple RAM estimate (smart_auto removed).""" - store = get_store() - _get_model_or_404(store, request.model_id) - cfg = request.config or {} - ctx = int(cfg.get("ctx_size") or 2048) - estimate_mb = 512 + (ctx // 32) - return {"ram_estimate_mb": estimate_mb, "ram_estimate_gb": round(estimate_mb / 1024, 2)} - - 
-@router.get("/{model_id:path}/hf-metadata") -async def get_model_hf_metadata(model_id: str): - store = get_store() - model = _get_model_or_404(store, model_id) - - metadata_entry = None - if (model.get("model_format") or model.get("format") or "gguf").lower() == "safetensors": - metadata_entry = _load_manifest_entry_for_model(model) - else: - filename = _get_model_filename(model) - if not filename: - raise HTTPException(status_code=400, detail="Model filename is not set") - metadata_entry = get_gguf_manifest_entry(model.get("huggingface_id"), filename) - - if not metadata_entry: - raise HTTPException(status_code=404, detail="Metadata not found for model") - - metadata = metadata_entry.get("metadata") or {} - defaults = _derive_hf_defaults(metadata) - - return { - "metadata": metadata, - "gguf_layer_info": metadata_entry.get("gguf_layer_info"), - "max_context_length": metadata_entry.get("max_context_length"), - "hf_defaults": defaults, - } - - @router.post("/{model_id:path}/regenerate-info") async def regenerate_model_info_endpoint(model_id: str): """ diff --git a/backend/routes/status.py b/backend/routes/status.py index c4211d0..62d7b36 100644 --- a/backend/routes/status.py +++ b/backend/routes/status.py @@ -3,13 +3,10 @@ import os from backend.llama_swap_client import LlamaSwapClient -from backend.lmdeploy_manager import get_lmdeploy_manager -from backend.lmdeploy_installer import get_lmdeploy_installer router = APIRouter() DEFAULT_PROXY_PORT = 2000 -LMDEPLOY_PORT = 2001 @router.get("/status") @@ -25,12 +22,15 @@ async def get_system_status(): else: running_list = running_data.get("running") or [] + proxy_health = await client.check_health() + active_instances = [] for i, item in enumerate(running_list): proxy_model_name = item.get("model", "") state = item.get("state", "") runtime_type = "lmdeploy" if state == "lmdeploy" else "llama_cpp" - port = LMDEPLOY_PORT if runtime_type == "lmdeploy" else DEFAULT_PROXY_PORT + # All traffic is served via the unified 
llama-swap proxy on DEFAULT_PROXY_PORT. + port = DEFAULT_PROXY_PORT active_instances.append( { "id": i, @@ -50,10 +50,6 @@ async def get_system_status(): except FileNotFoundError: disk = psutil.disk_usage("/") - lmdeploy_manager = get_lmdeploy_manager() - lmdeploy_status = lmdeploy_manager.status() - installer_status = get_lmdeploy_installer().status() - return { "system": { "cpu_percent": cpu_percent, @@ -72,15 +68,10 @@ async def get_system_status(): "running_instances": active_instances, "proxy_status": { "enabled": True, - "port": 2000, + "port": DEFAULT_PROXY_PORT, "endpoint": "http://localhost:2000/v1/chat/completions", - }, - "lmdeploy_status": { - "enabled": True, - "port": 2001, - "endpoint": "http://localhost:2001/v1/chat/completions", - "running": lmdeploy_status.get("running"), - "current_instance": lmdeploy_status.get("current_instance"), - "installer": installer_status, + "healthy": proxy_health.get("healthy", False), + "status_code": proxy_health.get("status_code"), + "loading_models": proxy_health.get("loading_models", []), }, } diff --git a/backend/tests/test_lmdeploy_installer.py b/backend/tests/test_lmdeploy_installer.py index 23bdcb8..9f1d6c8 100644 --- a/backend/tests/test_lmdeploy_installer.py +++ b/backend/tests/test_lmdeploy_installer.py @@ -2,12 +2,12 @@ import pytest -from backend.lmdeploy_installer import LMDeployInstaller +from backend.lmdeploy_manager import LMDeployManager @pytest.mark.asyncio async def test_install_prevents_parallel_operations(tmp_path: Path, monkeypatch): - installer = LMDeployInstaller( + installer = LMDeployManager( log_path=str(tmp_path / "lmdeploy.log"), state_path=str(tmp_path / "lmdeploy_state.json"), base_dir=str(tmp_path / "lmdeploy"), @@ -19,15 +19,15 @@ def prevent_task(coro): monkeypatch.setattr(installer, "_create_task", prevent_task) - result = await installer.install() + result = await installer.install_release() assert result["message"].startswith("LMDeploy installation started") with 
pytest.raises(RuntimeError): - await installer.install() + await installer.install_release() def test_status_reflects_detection(tmp_path: Path, monkeypatch): - installer = LMDeployInstaller( + installer = LMDeployManager( log_path=str(tmp_path / "lmdeploy.log"), state_path=str(tmp_path / "lmdeploy_state.json"), base_dir=str(tmp_path / "lmdeploy"), diff --git a/backend/tests/test_model_introspection.py b/backend/tests/test_model_introspection.py new file mode 100644 index 0000000..da0c81b --- /dev/null +++ b/backend/tests/test_model_introspection.py @@ -0,0 +1,52 @@ +from backend.model_introspection import GgufIntrospector + + +def test_context_length_prefers_largest_and_uses_config_global(): + # global config prefers general.context_length / model_max_length / max_position_embeddings + metadata = { + "general.context_length": 4096, + "general.model_max_length": 8192, + "qwen.context_length": 2048, + } + introspector = GgufIntrospector(metadata=metadata, tensors={}) + info = introspector.build_model_info() + assert info.context_length == 8192 + + +def test_parameter_count_parses_formatted_and_raw_values(): + metadata = { + "general.parameters": "7B", + "general.parameter_count": 6_000_000_000, + } + introspector = GgufIntrospector(metadata=metadata, tensors={}) + info = introspector.build_model_info() + # 7B should win over 6B + assert info.parameter_count_display in {"7B", "7.0B"} + + +def test_moe_detection_from_expert_keys(): + metadata = { + "general.architecture": "glm4moe", + "ffn.expert_count": 64, + "ffn.num_experts_per_tok": 8, + } + introspector = GgufIntrospector(metadata=metadata, tensors={}) + info = introspector.build_model_info() + assert info.is_moe is True + assert info.expert_count == 64 + assert info.experts_used_count == 8 + + +def test_vocab_and_embedding_from_tensors_when_metadata_missing(): + tensors = { + "tok_embeddings.weight": { + "shape": [32000, 4096], + "type": 0, + "offset": 0, + } + } + introspector = GgufIntrospector(metadata={}, 
tensors=tensors) + info = introspector.build_model_info() + assert info.vocab_size == 32000 + assert info.embedding_length == 4096 + diff --git a/docker-compose.cuda.yml b/docker-compose.cuda.yml index 57d5cc0..ae578a0 100644 --- a/docker-compose.cuda.yml +++ b/docker-compose.cuda.yml @@ -13,8 +13,8 @@ services: environment: - CUDA_VISIBLE_DEVICES=all - HF_HUB_ENABLE_HF_TRANSFER=1 - - HF_HOME=/app/data/temp/.cache/huggingface - - HUGGINGFACE_HUB_CACHE=/app/data/temp/.cache/huggingface/hub + - HF_HOME=/app/data/hf-cache + - HUGGINGFACE_HUB_CACHE=/app/data/hf-cache/hub - RELOAD=false # Uncomment and set your HuggingFace API key to enable model search and download # - HUGGINGFACE_API_KEY=your_huggingface_token_here diff --git a/frontend/src/App.vue b/frontend/src/App.vue index b4b0326..e78ed46 100644 --- a/frontend/src/App.vue +++ b/frontend/src/App.vue @@ -5,9 +5,7 @@
@@ -27,12 +25,10 @@ diff --git a/frontend/src/components/common/ProgressTracker.vue b/frontend/src/components/common/ProgressTracker.vue index 31fcf45..d8d1cf4 100644 --- a/frontend/src/components/common/ProgressTracker.vue +++ b/frontend/src/components/common/ProgressTracker.vue @@ -13,18 +13,29 @@ {{ task.description }}
- {{ Math.round(task.progress) }}% +
+ + {{ Math.round(task.progress) }}% +
{{ task.message }} +
{{ getTaskLogs(task).join('\n') }}
diff --git a/frontend/src/components/system/VersionTable.vue b/frontend/src/components/system/VersionTable.vue index f6974d4..56c6ef2 100644 --- a/frontend/src/components/system/VersionTable.vue +++ b/frontend/src/components/system/VersionTable.vue @@ -14,7 +14,7 @@
{{ v.version }} - + {{ v.repository_source }} CUDA
@@ -34,7 +34,8 @@ text severity="danger" size="small" - v-tooltip.top="'Delete version'" + :disabled="v.is_active" + v-tooltip.top="v.is_active ? 'Active versions cannot be deleted' : 'Delete version'" @click="$emit('delete', v.id ?? v.version)" /> diff --git a/frontend/src/stores/engines.js b/frontend/src/stores/engines.js index 0642899..5335b09 100644 --- a/frontend/src/stores/engines.js +++ b/frontend/src/stores/engines.js @@ -37,13 +37,25 @@ export const useEnginesStore = defineStore('engines', () => { return data } - async function fetchReleaseAssets(tagName) { - const { data } = await axios.get(`/api/llama-versions/releases/${encodeURIComponent(tagName)}/assets`) + async function fetchBuildSettings(engine) { + const { data } = await axios.get('/api/llama-versions/build-settings', { + params: { engine }, + }) + return data + } + + async function saveBuildSettings(engine, settings) { + const { data } = await axios.put('/api/llama-versions/build-settings', settings, { + params: { engine }, + }) return data } - async function installRelease(params) { - const { data } = await axios.post('/api/llama-versions/install-release', params) + async function updateEngine(engine, params = {}) { + const { data } = await axios.post('/api/llama-versions/update', { + engine, + ...params, + }) await fetchLlamaVersions() return data } @@ -111,11 +123,6 @@ export const useEnginesStore = defineStore('engines', () => { await fetchLmdeployStatus() } - async function fetchLmdeployLogs(maxBytes = 8192) { - const { data } = await axios.get('/api/lmdeploy/logs', { params: { max_bytes: maxBytes } }) - return data - } - // --- GPU / System --- async function fetchGpuInfo() { @@ -165,8 +172,9 @@ export const useEnginesStore = defineStore('engines', () => { checkLlamaCppUpdates, checkIkLlamaUpdates, checkLmdeployUpdates, - fetchReleaseAssets, - installRelease, + fetchBuildSettings, + saveBuildSettings, + updateEngine, buildSource, activateVersion, deleteVersion, @@ -180,7 +188,6 @@ export 
const useEnginesStore = defineStore('engines', () => { installLmdeploy, installLmdeployFromSource, removeLmdeploy, - fetchLmdeployLogs, fetchGpuInfo, fetchSystemStatus, diff --git a/frontend/src/stores/models.js b/frontend/src/stores/models.js index c8ac178..2d4e369 100644 --- a/frontend/src/stores/models.js +++ b/frontend/src/stores/models.js @@ -5,6 +5,9 @@ import axios from 'axios' export const useModelStore = defineStore('models', () => { const models = ref([]) // array of groups: { huggingface_id, base_model_name, quantizations[] } const loading = ref(false) + const searchQuery = ref('') + const searchLastQuery = ref('') + const searchHasSearched = ref(false) const searchResults = ref([]) const searchLoading = ref(false) const searchFormat = ref('gguf') @@ -90,6 +93,9 @@ export const useModelStore = defineStore('models', () => { searchLoading.value = true try { const { data } = await axios.post('/api/models/search', { query, limit, model_format: modelFormat }) + searchQuery.value = query + searchLastQuery.value = query + searchHasSearched.value = true searchResults.value = Array.isArray(data) ? 
data : [] searchFormat.value = modelFormat return searchResults.value @@ -102,6 +108,13 @@ export const useModelStore = defineStore('models', () => { } } + function clearSearchState() { + searchQuery.value = '' + searchLastQuery.value = '' + searchHasSearched.value = false + searchResults.value = [] + } + // ── Download ────────────────────────────────────────────── async function downloadModel(huggingfaceId, filename, totalBytes = 0, modelFormat = 'gguf', pipelineTag = null) { @@ -124,12 +137,14 @@ export const useModelStore = defineStore('models', () => { return data } - async function downloadGgufBundle(huggingfaceId, quantization, files, pipelineTag = null) { + async function downloadGgufBundle(huggingfaceId, quantization, files, pipelineTag = null, mmprojFilename = null, mmprojSize = 0) { const { data } = await axios.post('/api/models/gguf/download-bundle', { huggingface_id: huggingfaceId, quantization, files, pipeline_tag: pipelineTag, + mmproj_filename: mmprojFilename, + mmproj_size: mmprojSize, }) return data } @@ -163,6 +178,14 @@ export const useModelStore = defineStore('models', () => { return data } + async function updateModelProjector(modelId, mmprojFilename = null, totalBytes = 0) { + const { data } = await axios.post(`/api/models/${encodeURIComponent(modelId)}/projector`, { + mmproj_filename: mmprojFilename, + total_bytes: totalBytes, + }) + return data + } + // ── HuggingFace Token ───────────────────────────────────── async function fetchHuggingfaceTokenStatus() { @@ -226,6 +249,9 @@ export const useModelStore = defineStore('models', () => { return { models, loading, + searchQuery, + searchLastQuery, + searchHasSearched, searchResults, searchLoading, searchFormat, @@ -249,6 +275,7 @@ export const useModelStore = defineStore('models', () => { deleteModelGroup, deleteSafetensorsModel, searchModels, + clearSearchState, downloadModel, downloadSafetensorsBundle, downloadGgufBundle, @@ -257,6 +284,7 @@ export const useModelStore = defineStore('models', 
() => { getModelConfig, updateModelConfig, getModelDetails, + updateModelProjector, fetchHuggingfaceTokenStatus, setHuggingfaceToken, clearHuggingfaceToken, diff --git a/frontend/src/stores/progress.js b/frontend/src/stores/progress.js index 7cfecda..a30d033 100644 --- a/frontend/src/stores/progress.js +++ b/frontend/src/stores/progress.js @@ -14,8 +14,6 @@ const SSE_EVENT_TYPES = [ 'model_status', 'model_event', 'unified_monitoring', - 'lmdeploy_status', - 'lmdeploy_runtime_log', 'lmdeploy_install_status', 'lmdeploy_install_log', 'cuda_install_status', @@ -26,9 +24,13 @@ const SSE_EVENT_TYPES = [ export const useProgressStore = defineStore('progress', () => { const tasks = ref({}) + const taskLogs = ref({}) const eventSource = ref(null) const connected = ref(false) const subscribers = ref(new Map()) // eventType -> Set + const CUDA_TASK_ID = 'cuda_operation' + const LMDEPLOY_TASK_ID = 'lmdeploy_operation' + const MAX_LOG_LINES = 200 const activeTasks = computed(() => { return Object.values(tasks.value).filter(t => t.status === 'running') @@ -51,16 +53,214 @@ export const useProgressStore = defineStore('progress', () => { if (any) any.forEach(cb => { try { cb(eventType, data) } catch (_) {} }) } + function upsertTask(taskId, updates) { + const existing = tasks.value[taskId] || {} + tasks.value = { + ...tasks.value, + [taskId]: { + ...existing, + task_id: taskId, + ...updates, + }, + } + } + + function appendTaskLogs(taskId, lines) { + const entries = Array.isArray(lines) ? 
lines : [lines] + const existing = taskLogs.value[taskId] || [] + const next = [...existing] + const seen = new Set(existing) + + entries.forEach((entry) => { + if (typeof entry !== 'string') return + entry.split(/\r?\n/).forEach((rawLine) => { + const line = rawLine.trim() + if (!line) return + if (seen.has(line)) return + seen.add(line) + next.push(line) + }) + }) + + if (next.length === existing.length) return + + taskLogs.value = { + ...taskLogs.value, + [taskId]: next.slice(-MAX_LOG_LINES), + } + } + + function syncTaskLogsFromTask(task) { + if (!task?.task_id) return + + const existing = taskLogs.value[task.task_id] || [] + const metadataLines = Array.isArray(task.metadata?.log_lines) ? task.metadata.log_lines : [] + + if (existing.length === 0 && metadataLines.length > 0) { + appendTaskLogs(task.task_id, metadataLines) + } + + if (task.message && existing.length === 0) { + appendTaskLogs(task.task_id, task.message) + } + } + + function normalizeCudaTask(eventType, payload) { + if (!payload || typeof payload !== 'object') return + + if (eventType === 'cuda_install_status') { + const operation = payload.operation || payload.status || 'install' + const description = operation === 'uninstall' ? 'Uninstall CUDA' : 'Install CUDA' + + if (payload.status === 'completed' || payload.status === 'failed') { + const existing = tasks.value[CUDA_TASK_ID] || {} + upsertTask(CUDA_TASK_ID, { + type: 'install', + description, + progress: payload.status === 'completed' ? 100 : (existing.progress ?? 0), + status: payload.status, + message: payload.message || existing.message || '', + metadata: { + ...(existing.metadata || {}), + target: 'cuda', + operation, + ended_at: payload.ended_at, + }, + }) + appendTaskLogs(CUDA_TASK_ID, payload.message) + return + } + + upsertTask(CUDA_TASK_ID, { + type: 'install', + description, + progress: 0, + status: 'running', + message: payload.message || (operation === 'uninstall' ? 'Preparing CUDA uninstall...' 
: 'Preparing CUDA install...'), + metadata: { + target: 'cuda', + operation, + started_at: payload.started_at, + }, + }) + appendTaskLogs(CUDA_TASK_ID, payload.message) + return + } + + if (eventType === 'cuda_install_progress') { + const existing = tasks.value[CUDA_TASK_ID] || {} + const operation = existing.metadata?.operation || 'install' + upsertTask(CUDA_TASK_ID, { + type: 'install', + description: operation === 'uninstall' ? 'Uninstall CUDA' : 'Install CUDA', + progress: Number(payload.progress ?? existing.progress ?? 0), + status: existing.status === 'failed' ? 'failed' : 'running', + message: payload.message || existing.message || '', + metadata: { + ...(existing.metadata || {}), + target: 'cuda', + stage: payload.stage, + timestamp: payload.timestamp, + }, + }) + } + } + + function normalizeLmdeployTask(eventType, payload) { + if (!payload || typeof payload !== 'object') return + + if (eventType === 'lmdeploy_install_status') { + const operation = payload.operation || payload.status || 'install' + const actionMap = { + install: 'Install LMDeploy', + install_source: 'Install LMDeploy from Source', + remove: 'Remove LMDeploy', + } + const description = actionMap[operation] || 'Install LMDeploy' + + if (payload.status === 'completed' || payload.status === 'failed') { + const existing = tasks.value[LMDEPLOY_TASK_ID] || {} + upsertTask(LMDEPLOY_TASK_ID, { + type: 'install', + description, + progress: payload.status === 'completed' ? 100 : (existing.progress ?? 
0), + status: payload.status, + message: payload.message || existing.message || '', + metadata: { + ...(existing.metadata || {}), + target: 'lmdeploy', + operation, + ended_at: payload.ended_at, + }, + }) + appendTaskLogs(LMDEPLOY_TASK_ID, payload.message) + return + } + + upsertTask(LMDEPLOY_TASK_ID, { + type: 'install', + description, + progress: 10, + status: 'running', + message: payload.message || 'Preparing LMDeploy operation...', + metadata: { + target: 'lmdeploy', + operation, + started_at: payload.started_at, + log_count: 0, + }, + }) + appendTaskLogs(LMDEPLOY_TASK_ID, payload.message) + return + } + + if (eventType === 'lmdeploy_install_log') { + const existing = tasks.value[LMDEPLOY_TASK_ID] + if (!existing || existing.status !== 'running') return + const logCount = Number(existing.metadata?.log_count || 0) + 1 + const progress = Math.min(90, Math.max(Number(existing.progress || 10), 10 + logCount * 3)) + upsertTask(LMDEPLOY_TASK_ID, { + type: 'install', + description: existing.description || 'Install LMDeploy', + progress, + status: 'running', + message: payload.line || existing.message || '', + metadata: { + ...(existing.metadata || {}), + target: 'lmdeploy', + log_count: logCount, + timestamp: payload.timestamp, + }, + }) + appendTaskLogs(LMDEPLOY_TASK_ID, payload.line) + } + } + function handleEvent(eventType, rawData) { let data = rawData try { if (typeof rawData === 'string') data = JSON.parse(rawData) } catch (_) { return } + const payload = data?.data != null ? 
data.data : data + if (eventType === 'cuda_install_status' || eventType === 'cuda_install_progress') { + normalizeCudaTask(eventType, payload) + } + if (eventType === 'cuda_install_log') { + appendTaskLogs(CUDA_TASK_ID, payload?.line) + } + if (eventType === 'lmdeploy_install_status' || eventType === 'lmdeploy_install_log') { + normalizeLmdeployTask(eventType, payload) + } + if (eventType === 'build_progress') { + appendTaskLogs(payload?.task_id, payload?.log_lines) + } if (eventType === 'task_created' || eventType === 'task_updated') { const task = data?.data ?? data - if (task?.task_id) tasks.value = { ...tasks.value, [task.task_id]: task } + if (task?.task_id) { + tasks.value = { ...tasks.value, [task.task_id]: task } + syncTaskLogsFromTask(task) + } } - const payload = data?.data != null ? data.data : data notifySubscribers(eventType, payload) if (payload?.type && payload.type !== eventType) notifySubscribers(payload.type, payload) } @@ -104,6 +304,10 @@ export const useProgressStore = defineStore('progress', () => { return tasks.value[taskId] || null } + function getTaskLogs(taskId) { + return taskLogs.value[taskId] || [] + } + function subscribe(eventType, callback) { if (!subscribers.value.has(eventType)) subscribers.value.set(eventType, new Set()) subscribers.value.get(eventType).add(callback) @@ -123,12 +327,11 @@ export const useProgressStore = defineStore('progress', () => { const subscribeToDownloadComplete = (cb) => subscribe('download_complete', cb) const subscribeToUnifiedMonitoring = (cb) => subscribe('unified_monitoring', cb) const subscribeToModelEvents = (cb) => subscribe('model_event', cb) - const subscribeToLmdeployStatus = (cb) => subscribe('lmdeploy_status', cb) const subscribeToLmdeployInstallLog = (cb) => subscribe('lmdeploy_install_log', cb) - const subscribeToLmdeployRuntimeLog = (cb) => subscribe('lmdeploy_runtime_log', cb) return { tasks, + taskLogs, activeTasks, connected, connectionStatus, @@ -136,6 +339,7 @@ export const 
useProgressStore = defineStore('progress', () => { connect, disconnect, getTask, + getTaskLogs, subscribe, subscribeToDownloadProgress, subscribeToBuildProgress, @@ -144,8 +348,6 @@ export const useProgressStore = defineStore('progress', () => { subscribeToDownloadComplete, subscribeToUnifiedMonitoring, subscribeToModelEvents, - subscribeToLmdeployStatus, - subscribeToLmdeployInstallLog, - subscribeToLmdeployRuntimeLog + subscribeToLmdeployInstallLog } }) diff --git a/frontend/src/styles/_components.css b/frontend/src/styles/_components.css index 2154168..e07017f 100644 --- a/frontend/src/styles/_components.css +++ b/frontend/src/styles/_components.css @@ -112,6 +112,11 @@ margin-top: var(--spacing-xs); } +/* Global PrimeVue Tag tweaks */ +.p-tag.p-component { + padding-inline: 0.75rem; +} + .model-tag { display: inline-flex; align-items: center; @@ -534,7 +539,6 @@ /* Form inputs */ .p-inputtext, .p-textarea, -.p-dropdown, .p-inputnumber { padding: var(--spacing-sm); border-radius: var(--radius-md); @@ -546,6 +550,16 @@ box-sizing: border-box; } +.p-dropdown { + border-radius: var(--radius-md); + border: 1px solid var(--border-primary); + background: var(--bg-surface); + color: var(--text-primary); + transition: all var(--transition-normal); + width: 100%; + box-sizing: border-box; +} + .p-inputtext::placeholder, .p-textarea::placeholder { color: var(--text-muted); @@ -573,7 +587,16 @@ } .p-dropdown .p-dropdown-label { + padding: calc(var(--spacing-sm) * 0.6) var(--spacing-sm); color: var(--text-primary); + display: flex; + align-items: center; + font-size: 0.875rem; +} + +.p-dropdown .p-dropdown-trigger { + width: 2.4rem; + color: var(--text-secondary); } .p-dropdown-panel { diff --git a/frontend/src/views/EnginesView.vue b/frontend/src/views/EnginesView.vue index a7c78b0..363e291 100644 --- a/frontend/src/views/EnginesView.vue +++ b/frontend/src/views/EnginesView.vue @@ -45,148 +45,151 @@ -
+
-
GPU — {{ gpu.name }}
+
CUDA Toolkit
- {{ formatBytesIEC(gpu.memory_used_mb * 1048576) }} / - {{ formatBytesIEC(gpu.memory_total_mb * 1048576) }} VRAM + +
- +
+ + + +
+
+
+
+
+
+ +
+
GPU — {{ gpuItem.name }}
+
+ {{ formatBytesIEC(gpuItem.memory_used_mb * 1048576) }} / + {{ formatBytesIEC(gpuItem.memory_total_mb * 1048576) }} VRAM +
+
- - - - - -
-
-
- -

llama.cpp

- - -
-
-
-
-
-
- - Update available: {{ llamaCppUpdateInfo.latest_version }} - View release -
-
- Up to date ({{ llamaCppUpdateInfo.current_version }}) -
- - - -
-
- - -
-
- - -
-
-
- -

ik_llama.cpp

- - -
-
-
-
-
-
- - Update available: {{ ikLlamaUpdateInfo.latest_version }} - View -
-
- Up to date ({{ ikLlamaUpdateInfo.current_version }}) -
+
+ - +
+ CUDA Path: + {{ cuda.cuda_path || 'unknown' }} +
-
-
+
+
- - - +
- +
-
+
- -

CUDA Toolkit

- - + +

Engines

-
- - -
- Path: - {{ cuda.cuda_path || 'unknown' }} -
- -
-
- CUDA {{ v.version }} - - + + + +
-
- - No CUDA versions listed. -
- -
-
-
+
@@ -206,114 +209,168 @@ - -
-
-
- -

LMDeploy

- - -
-
-
- -
-
- - Update available: v{{ lmdeployUpdateInfo.latest_version }} - View on PyPI +
+
+ + Update available: {{ llamaCppUpdateInfo.latest_version }} + View release +
+
+ Up to date ({{ llamaCppUpdateInfo.current_version }}) +
+ + + +
-
- Up to date (v{{ lmdeployUpdateInfo.current_version || 'none' }}) +
+ +
+
+
+ +

ik_llama.cpp

+ + +
+
+
+
+
+ + Update available: {{ ikLlamaUpdateInfo.latest_version }} + View +
+
+ Up to date ({{ ikLlamaUpdateInfo.current_version }}) +
- + -
- Install type: - - +
-
- Source: - {{ lm.source_repo }} ({{ lm.source_branch }}) +
+ +
+
+
+ +

LMDeploy

+ + +
+
+
+
+
+ + Update available: v{{ lmdeployUpdateInfo.latest_version }} + View on PyPI +
+
+ Up to date (v{{ lmdeployUpdateInfo.current_version || 'none' }}) +
-
-
+ -
-
-
-
+
+ Install type: + + +
+
+ Source: + {{ lm.source_repo }} ({{ lm.source_branch }}) +
- - -
-
- - Fetching releases… -
- -
- + +
- +
- + - Leave blank for default branch + Use a release tag, branch, or commit. Latest detected release is used by default when available.
@@ -340,7 +397,10 @@
@@ -382,13 +442,12 @@ - - diff --git a/frontend/src/views/ModelLibrary.vue b/frontend/src/views/ModelLibrary.vue index fe75ba8..bb03b38 100644 --- a/frontend/src/views/ModelLibrary.vue +++ b/frontend/src/views/ModelLibrary.vue @@ -34,40 +34,62 @@ -
+
Loading models…
-
+

No models downloaded yet

Search HuggingFace to find and download models.

- +
- -
+ +
- - {{ group.base_model_name || group.huggingface_id }} + + {{ group.huggingface_id }} + +
- {{ group.huggingface_id }}
- - + +
+
+ {{ group.huggingface_id }} + + + +
+
+ + {{ formatBytes(primaryQuant(group).file_size) }} + + + Downloaded {{ formatDate(primaryQuant(group).downloaded_at) }} + +
+
+ + +
-
-
-
- {{ quant.quantization || quant.name }} - - - -
-
- - {{ formatBytes(quant.file_size) }} - - - Downloaded {{ formatDate(quant.downloaded_at) }} - -
-
- -
-
-
+ :quant="quant" + :is-starting="startingModels.has(quant.id)" + :is-stopping="stoppingModels.has(quant.id)" + :format-bytes="formatBytes" + :format-date="formatDate" + @start="startModel" + @stop="stopModel" + @configure="configureModel" + @delete="confirmDeleteModel" + />
@@ -170,7 +221,6 @@ -
@@ -185,6 +235,7 @@ import ProgressSpinner from 'primevue/progressspinner' import Dialog from 'primevue/dialog' import Password from 'primevue/password' import ConfirmDialog from 'primevue/confirmdialog' +import ModelRow from '@/components/ModelRow.vue' import { useModelStore } from '@/stores/models' const router = useRouter() @@ -202,11 +253,25 @@ const savingToken = ref(false) let pollTimer = null // ── Computed ─────────────────────────────────────────────── +// Backend /api/models already returns both GGUF and safetensors models +// grouped appropriately, so we can display models directly from there. +const displayGroups = computed(() => modelStore.models || []) + const totalModels = computed(() => - modelStore.models.reduce((acc, g) => acc + (g.quantizations?.length ?? 0), 0) + displayGroups.value.reduce((acc, g) => acc + (g.quantizations?.length ?? 0), 0) ) // ── Group expand/collapse ────────────────────────────────── +function isSafetensorsGroup(group) { + if (!group || !Array.isArray(group.quantizations) || !group.quantizations.length) return false + return group.quantizations.every(q => q.format === 'safetensors') +} + +function primaryQuant(group) { + if (!group || !Array.isArray(group.quantizations) || !group.quantizations.length) return null + return group.quantizations[0] +} + function toggleGroup(hfId) { if (expandedGroups.value.has(hfId)) { expandedGroups.value.delete(hfId) @@ -216,7 +281,7 @@ function toggleGroup(hfId) { } function expandAllGroups() { - modelStore.models.forEach(g => expandedGroups.value.add(g.huggingface_id)) + displayGroups.value.forEach(g => expandedGroups.value.add(g.huggingface_id)) } // ── Model actions ────────────────────────────────────────── @@ -331,11 +396,17 @@ function formatDate(iso) { // ── Lifecycle ────────────────────────────────────────────── onMounted(async () => { - await modelStore.fetchModels() - await modelStore.fetchHuggingfaceTokenStatus() + await Promise.all([ + modelStore.fetchModels(), + 
modelStore.fetchSafetensorsModels(), + modelStore.fetchHuggingfaceTokenStatus(), + ]) expandAllGroups() // Poll every 10 seconds for status updates - pollTimer = setInterval(() => modelStore.fetchModels(), 10000) + pollTimer = setInterval(() => { + modelStore.fetchModels() + modelStore.fetchSafetensorsModels() + }, 10000) }) onUnmounted(() => { @@ -477,14 +548,14 @@ onUnmounted(() => { .group-collapse-enter-to, .group-collapse-leave-from { max-height: 1000px; opacity: 1; } -.quantizations { +:deep(.quantizations) { padding: 0.5rem; display: flex; flex-direction: column; gap: 0.375rem; } -.quant-row { +:deep(.quant-row) { display: flex; justify-content: space-between; align-items: center; @@ -496,45 +567,52 @@ onUnmounted(() => { transition: border-color 0.15s; } -.quant-row.is-active { +:deep(.quant-row.is-active) { border-color: rgba(34, 197, 94, 0.4); background: rgba(34, 197, 94, 0.04); } -.quant-info { flex: 1; min-width: 0; } +:deep(.quant-info) { flex: 1; min-width: 0; } -.quant-main { +:deep(.quant-main) { display: flex; align-items: center; gap: 0.4rem; flex-wrap: wrap; } -.quant-name { +:deep(.quant-name) { font-weight: 600; font-size: 0.875rem; font-family: monospace; } -.quant-sub { +:deep(.quant-sub) { display: flex; gap: 0.75rem; margin-top: 0.2rem; } -.file-size, -.downloaded-at { +:deep(.file-size), +:deep(.downloaded-at) { font-size: 0.75rem; color: var(--text-secondary, #9ca3af); } -.quant-actions { +:deep(.quant-actions) { display: flex; gap: 0.25rem; flex-shrink: 0; align-items: center; } +/* Emphasize engine tag with a distinct background */ +.engine-tag { + background-color: rgba(59, 130, 246, 0.15); /* soft blue */ + border-color: rgba(59, 130, 246, 0.65); + color: #bfdbfe; +} + /* ── Token dialog ─────────────────────────────────────── */ .token-form { display: flex; flex-direction: column; gap: 0.75rem; } .token-desc { font-size: 0.875rem; color: var(--text-secondary, #9ca3af); margin: 0; } diff --git a/frontend/src/views/ModelSearch.vue 
b/frontend/src/views/ModelSearch.vue index 15cde29..22fc98d 100644 --- a/frontend/src/views/ModelSearch.vue +++ b/frontend/src/views/ModelSearch.vue @@ -17,7 +17,7 @@ text severity="secondary" class="clear-btn" - @click="query = ''; searchResults = []" + @click="clearSearchResults" />
@@ -101,6 +101,12 @@ {{ formatNumber(result.likes) }} + + {{ getResultArtifactCount(result) }} + + + {{ getResultSizeSummary(result) }} + {{ result.license }} @@ -133,43 +139,82 @@ - + + + - + - - + @@ -182,7 +227,8 @@ @@ -612,9 +943,12 @@ onMounted(async () => { .files-table tr:last-child td { border-bottom: none; } -.file-name { display: flex; align-items: center; gap: 0.4rem; } -.file-name code { font-size: 0.8rem; } +.file-subtext { color: var(--text-secondary, #9ca3af); font-size: 0.75rem; } .file-size { color: var(--text-secondary, #9ca3af); white-space: nowrap; } +.file-count { color: var(--text-secondary, #9ca3af); white-space: nowrap; } +.projector-cell { min-width: 9rem; } +.projector-select { min-width: 8rem; } +.file-actions { display: flex; align-items: center; gap: 0.35rem; justify-content: flex-end; flex-wrap: wrap; } .not-downloaded { color: var(--text-secondary, #9ca3af); } .safetensors-download { diff --git a/package-lock.json b/package-lock.json index d1a1f49..6a572b2 100644 --- a/package-lock.json +++ b/package-lock.json @@ -14,13 +14,13 @@ "primeicons": "^6.0.0", "primevue": "^3.45.0", "vue": "^3.4.0", - "vue-router": "^4.2.0", - "vue3-toastify": "^0.1.0" + "vue-router": "^4.2.0" }, "devDependencies": { "@types/node": "^20.9.0", "@vitejs/plugin-vue": "^4.5.0", "concurrently": "^9.0.0", + "cross-env": "^10.1.0", "eslint": "^9.39.2", "eslint-plugin-vue": "^10.6.2", "prettier": "^3.7.4", @@ -273,6 +273,13 @@ "url": "https://github.com/sponsors/JounQin" } }, + "node_modules/@epic-web/invariant": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/@epic-web/invariant/-/invariant-1.0.0.tgz", + "integrity": "sha512-lrTPqgvfFQtR/eY/qkIzp98OGdNJu0m5ji3q/nJI8v3SXkRKEnWiOxMmbvcSoAIzv/cGiuvRy57k4suKQSAdwA==", + "dev": true, + "license": "MIT" + }, "node_modules/@esbuild/aix-ppc64": { "version": "0.21.5", "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.21.5.tgz", @@ -1746,6 +1753,24 @@ } } }, + 
"node_modules/cross-env": { + "version": "10.1.0", + "resolved": "https://registry.npmjs.org/cross-env/-/cross-env-10.1.0.tgz", + "integrity": "sha512-GsYosgnACZTADcmEyJctkJIoqAhHjttw7RsFrVoJNXbsWWqaq6Ym+7kZjq6mS45O0jij6vtiReppKQEtqWy6Dw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@epic-web/invariant": "^1.0.0", + "cross-spawn": "^7.0.6" + }, + "bin": { + "cross-env": "dist/bin/cross-env.js", + "cross-env-shell": "dist/bin/cross-env-shell.js" + }, + "engines": { + "node": ">=20" + } + }, "node_modules/cross-spawn": { "version": "7.0.6", "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.6.tgz", @@ -4101,28 +4126,6 @@ "vue": "^3.5.0" } }, - "node_modules/vue3-toastify": { - "version": "0.1.14", - "resolved": "https://registry.npmjs.org/vue3-toastify/-/vue3-toastify-0.1.14.tgz", - "integrity": "sha512-2wyzMhWq8IjTclL25tqKWknDFdFI1vPueMGZpHNlPWf6TBfxBycBANS+2n4W1xD7tHhX4G6HhCe31sle6OpwYQ==", - "license": "MIT", - "workspaces": [ - "docs", - "playground" - ], - "engines": { - "node": ">=16", - "npm": ">=7" - }, - "peerDependencies": { - "vue": ">=3.2.0" - }, - "peerDependenciesMeta": { - "vue": { - "optional": true - } - } - }, "node_modules/which": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz", diff --git a/package.json b/package.json index 4d092de..27ee91a 100644 --- a/package.json +++ b/package.json @@ -5,9 +5,9 @@ "scripts": { "dev": "cd frontend && vite", "dev:frontend": "cd frontend && vite", - "dev:backend": "WATCHFILES_FORCE_POLLING=true python -m uvicorn main:app --host 0.0.0.0 --port 8081 --app-dir backend --reload --reload-dir backend", + "dev:backend": "cross-env WATCHFILES_FORCE_POLLING=true python -m uvicorn backend.main:app --host 0.0.0.0 --port 8081 --reload --reload-dir backend", "dev:all": "concurrently -n backend,frontend -c blue,green \"npm run dev:backend\" \"npm run dev:frontend\"", - "kill-ports": "powershell.exe -Command \"Get-NetTCPConnection -LocalPort 5173,8080 
-ErrorAction SilentlyContinue | ForEach-Object { Stop-Process -Id $_.OwningProcess -Force -ErrorAction SilentlyContinue }\"", + "kill-ports": "powershell.exe -Command \"Get-NetTCPConnection -LocalPort 5173,8081 -ErrorAction SilentlyContinue | ForEach-Object { Stop-Process -Id $_.OwningProcess -Force -ErrorAction SilentlyContinue }\"", "build": "cd frontend && vite build", "preview": "cd frontend && vite preview" }, @@ -21,9 +21,10 @@ "vue-router": "^4.2.0" }, "devDependencies": { - "concurrently": "^9.0.0", "@types/node": "^20.9.0", "@vitejs/plugin-vue": "^4.5.0", + "concurrently": "^9.0.0", + "cross-env": "^10.1.0", "eslint": "^9.39.2", "eslint-plugin-vue": "^10.6.2", "prettier": "^3.7.4",
File{{ searchFormat === 'gguf' ? 'Item' : 'Model' }} SizeShardsProjector Status
- {{ file.filename }} - + {{ formatResultItemLabel(file, result) }} + + {{ file.variantPrefix }} variant + + + {{ file.subtext }} + {{ formatBytes(file.size) }} - - + + {{ file.kind === 'quant' ? (file.files?.length || 0) : 1 }} - + + + + +
+