Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,8 @@ ENV DEBIAN_FRONTEND=noninteractive \
CUDA_VISIBLE_DEVICES=all \
NVIDIA_VISIBLE_DEVICES=all \
NVIDIA_DRIVER_CAPABILITIES=compute,utility \
HF_HOME=/app/data/temp/.cache/huggingface \
HUGGINGFACE_HUB_CACHE=/app/data/temp/.cache/huggingface/hub \
HF_HOME=/app/data/hf-cache \
HUGGINGFACE_HUB_CACHE=/app/data/hf-cache/hub \
VENV_PATH=/opt/venv \
PYTHONPATH=/app \
PATH="/app/data/cuda/current/bin:${PATH}" \
Expand Down Expand Up @@ -133,7 +133,7 @@ RUN curl -fsSL "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERS
&& cmake --version

# Install llama-swap binary
ARG LLAMA_SWAP_VERSION=179
ARG LLAMA_SWAP_VERSION=197
RUN curl -fsSL "https://github.com/mostlygeek/llama-swap/releases/download/v${LLAMA_SWAP_VERSION}/llama-swap_${LLAMA_SWAP_VERSION}_linux_amd64.tar.gz" -o /tmp/llama-swap.tar.gz && \
tar -xzf /tmp/llama-swap.tar.gz -C /tmp && \
mv /tmp/llama-swap /usr/local/bin/llama-swap && \
Expand Down Expand Up @@ -168,7 +168,7 @@ RUN ln -sf /usr/bin/python3 /usr/bin/python

# Create non-root user and data directory structure
RUN useradd -m -s /bin/bash appuser && \
mkdir -p /app/data/models /app/data/config /app/data/configs /app/data/logs /app/data/llama-cpp /app/data/temp/.cache/huggingface/hub && \
mkdir -p /app/data/models /app/data/config /app/data/configs /app/data/logs /app/data/llama-cpp /app/data/hf-cache/hub && \
chown -R appuser:appuser /app && \
# Ensure entrypoint script is accessible to appuser
chmod 755 /usr/local/bin/docker-entrypoint.sh
Expand Down
778 changes: 376 additions & 402 deletions README.md

Large diffs are not rendered by default.

72 changes: 72 additions & 0 deletions backend/data_store.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
"""YAML-backed data store replacing SQLite."""

import json
import os
import re
import threading
from typing import Any, Dict, List, Optional

Expand Down Expand Up @@ -31,6 +33,59 @@ def generate_proxy_name(huggingface_id: str, quantization: Optional[str] = None)
return huggingface_slug


def _coerce_config(config_value: Optional[Any]) -> Dict[str, Any]:
if not config_value:
return {}
if isinstance(config_value, dict):
return config_value
if isinstance(config_value, str):
try:
return json.loads(config_value)
except json.JSONDecodeError:
return {}
return {}


def _model_value(model: Any, key: str, default: Any = None) -> Any:
if isinstance(model, dict):
return model.get(key, default)
return getattr(model, key, default)


def normalize_proxy_alias(alias: Optional[str]) -> str:
    """Normalize a user-provided model alias into a safe exposed engine ID.

    Lowercases and trims the alias, converts path separators and whitespace
    to dashes, replaces anything outside ``[a-z0-9._-]`` with a dash,
    collapses dash runs, and strips leading/trailing ``.``/``_``/``-``.
    Returns "" for ``None`` or effectively-empty input.
    """
    if alias is None:
        return ""

    text = str(alias).strip().lower()
    if not text:
        return ""

    # Apply the cleanup passes in order; the final pass collapses any
    # dash runs produced by the earlier substitutions.
    for pattern in (r"[/\\]", r"\s+", r"[^a-z0-9._-]", r"-{2,}"):
        text = re.sub(pattern, "-", text)
    return text.strip("._-")


def resolve_proxy_name(model: Any) -> str:
    """Return the exposed runtime model ID for a stored model.

    Precedence: a user-set ``model_alias`` in the model's config, then the
    stored ``proxy_name``, then a name generated from the Hugging Face ID
    and quantization.
    """
    cfg = _coerce_config(_model_value(model, "config"))
    candidates = (
        normalize_proxy_alias(cfg.get("model_alias")),
        normalize_proxy_alias(_model_value(model, "proxy_name")),
    )
    for candidate in candidates:
        if candidate:
            return candidate

    return generate_proxy_name(
        _model_value(model, "huggingface_id", ""),
        _model_value(model, "quantization"),
    )


class DataStore:
"""Thread-safe YAML-backed data store replacing SQLite."""

Expand Down Expand Up @@ -175,6 +230,23 @@ def delete_engine_version(self, engine: str, version: str) -> bool:
self._save_yaml("engines.yaml", data)
return True

def get_engine_build_settings(self, engine: str) -> Dict[str, Any]:
    """Return persisted build settings for *engine*, or an empty dict."""
    engine_entry = self._read_yaml("engines.yaml").get(engine, {})
    # `or {}` guards against an explicit null stored under build_settings.
    return engine_entry.get("build_settings", {}) or {}

def update_engine_build_settings(self, engine: str, settings: Dict[str, Any]) -> Dict[str, Any]:
    """Merge *settings* into the engine's persisted build settings.

    Non-dict input is treated as an empty update. Returns the merged
    settings as stored.
    """
    incoming = settings if isinstance(settings, dict) else {}
    data = self._read_yaml("engines.yaml")
    engine_entry = data.setdefault(engine, {})
    # Incoming keys win over previously persisted ones.
    merged = {**(engine_entry.get("build_settings") or {}), **incoming}
    engine_entry["build_settings"] = merged
    self._save_yaml("engines.yaml", data)
    return merged

# --- LMDeploy ---

def get_lmdeploy_status(self) -> dict:
Expand Down
23 changes: 23 additions & 0 deletions backend/gguf_introspection_config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{
"global": {
"context_length": {
"preferred_keys": [
"general.context_length",
"general.model_max_length",
"general.max_position_embeddings"
]
}
},
"glm4": {
"match_arch": ["glm4", "glm4moe"],
"context_length": {
"preferred_keys": ["glm4.context_length", "glm4.model_max_length"],
"fallback_terms": ["context", "max_position_embeddings"]
},
"layer_count": {
"preferred_keys": ["glm4.num_hidden_layers"],
"fallback_terms": ["layer", "block"]
}
}
}

49 changes: 31 additions & 18 deletions backend/gguf_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,14 @@
GGUF file metadata reader for extracting model layer information
"""

import struct
import os
import struct
import mmap
from enum import IntEnum
from typing import Dict, Optional, Any, List, Tuple, BinaryIO
from typing import Any, BinaryIO, Dict, List, Optional, Tuple

from backend.logging_config import get_logger
from backend.model_introspection import GgufIntrospector

logger = get_logger(__name__)

Expand Down Expand Up @@ -1247,22 +1248,34 @@ def get_model_layer_info(model_path: str) -> Optional[Dict[str, Any]]:
logger.error(f"Model file is not GGUF format: {model_path}")
return None

metadata = read_gguf_metadata(model_path)
if metadata:
return {
"layer_count": metadata["layer_count"],
"architecture": metadata["architecture"],
"context_length": metadata["context_length"],
"vocab_size": 0, # Not extracted from metadata
"embedding_length": metadata["embedding_length"],
"attention_head_count": metadata["attention_head_count"],
"attention_head_count_kv": metadata["attention_head_count_kv"],
"block_count": metadata["block_count"],
"is_moe": metadata["is_moe"],
"expert_count": metadata["expert_count"],
"experts_used_count": metadata["experts_used_count"],
}
return None
with GGUFReader(model_path) as reader:
metadata = reader.metadata
tensors = reader.tensors

introspector = GgufIntrospector(metadata=metadata, tensors=tensors)
info = introspector.build_model_info()

return {
"layer_count": int(info.layer_count) if info.layer_count else 0,
"architecture": metadata.get("general.architecture", ""),
"context_length": int(info.context_length) if info.context_length else 0,
"vocab_size": int(info.vocab_size) if info.vocab_size else 0,
"embedding_length": int(info.embedding_length)
if info.embedding_length
else 0,
"attention_head_count": int(info.attention_head_count)
if info.attention_head_count
else 0,
"attention_head_count_kv": int(info.attention_head_count_kv)
if info.attention_head_count_kv
else 0,
"block_count": int(info.block_count) if info.block_count else 0,
"is_moe": bool(info.is_moe),
"expert_count": int(info.expert_count) if info.expert_count else 0,
"experts_used_count": int(info.experts_used_count)
if info.experts_used_count
else 0,
}
except Exception as e:
logger.error(
f"Failed to get model layer info from {model_path}: {e}", exc_info=True
Expand Down
65 changes: 29 additions & 36 deletions backend/huggingface.py
Original file line number Diff line number Diff line change
Expand Up @@ -863,9 +863,6 @@ async def _fetch_and_merge(repo_id: Optional[str]):
metadata["tokenizer"] = tokenizer_json

await _fetch_and_merge(huggingface_id)
if huggingface_id and huggingface_id.lower().endswith("-gguf"):
base_repo = huggingface_id[:-5]
await _fetch_and_merge(base_repo)

try:
layer_info = get_model_layer_info(file_path) or {}
Expand Down Expand Up @@ -1210,6 +1207,20 @@ async def process_model(model):
if result is not None:
valid_results.append(result)

if model_format == "gguf":
def _gguf_sort_key(item: Dict[str, Any]):
quantizations = item.get("quantizations") or {}
size_candidates = [
q.get("total_size") or 0
for q in quantizations.values()
if isinstance(q, dict)
]
positive_sizes = [size for size in size_candidates if size > 0]
min_size = min(positive_sizes) if positive_sizes else float("inf")
return (min_size, -(item.get("downloads") or 0), item.get("id") or "")

valid_results.sort(key=_gguf_sort_key)

return valid_results[:limit]


Expand All @@ -1219,27 +1230,33 @@ async def _process_single_model(model, model_format: str) -> Optional[Dict]:
logger.info(f"Processing model: {model.id}")

quantizations: Dict[str, Dict] = {}
mmproj_files: List[Dict[str, Any]] = []
safetensors_files: List[Dict] = []
repo_files: List[Dict[str, Any]] = []

if hasattr(model, "siblings") and model.siblings:
if model_format == "gguf":
# Group GGUF files by logical quantization, handling multi-part shards
# Accept both plain `.gguf` and multi-part patterns like `.gguf.part1of2`
# Exclude mmproj (vision/multimodal projection) files – they are extensions, not standalone quants
# Group GGUF files by logical quantization, handling multi-part shards.
gguf_siblings = [
s
for s in model.siblings
if isinstance(getattr(s, "rfilename", None), str)
and re.search(r"\.gguf(\.|$)", s.rfilename)
and "mmproj" not in s.rfilename.lower()
]
logger.info(f"Model {model.id}: {len(gguf_siblings)} GGUF files found")
if not gguf_siblings:
return None

for sibling in gguf_siblings:
filename = sibling.rfilename
if "mmproj" in filename.lower():
mmproj_files.append(
{
"filename": filename,
"size": getattr(sibling, "size", 0) or 0,
}
)
continue
# Normalize filename by stripping shard suffix patterns like:
# -00001-of-00002.gguf (TheBloke-style)
# .gguf.part1of2 (Hugging Face-style multi-part)
Expand Down Expand Up @@ -1298,25 +1315,9 @@ async def _process_single_model(model, model_format: str) -> Optional[Dict]:
else 0.0
)

# Siblings from list_models often have size=None; fetch accurate sizes from Hub
try:
all_filenames = [s.rfilename for s in gguf_siblings]
accurate_sizes = get_accurate_file_sizes(model.id, all_filenames)
if accurate_sizes:
for entry in quantizations.values():
for f in entry["files"]:
f["size"] = accurate_sizes.get(f["filename"]) or f["size"] or 0
entry["total_size"] = sum(f["size"] for f in entry["files"])
entry["size_mb"] = (
round(entry["total_size"] / (1024 * 1024), 2)
if entry["total_size"]
else 0.0
)
except Exception as size_err:
logger.debug(f"Could not fetch accurate sizes for {model.id}: {size_err}")

# If no quantizations were detected after grouping, skip this model
if not quantizations:
# Search should stay to a single HF API call. Accurate file sizes are lazy-loaded on expand.
# If no downloadable GGUF entries were detected after grouping, skip this model.
if not quantizations and not mmproj_files:
return None
else:
safetensors_files = []
Expand All @@ -1338,15 +1339,6 @@ async def _process_single_model(model, model_format: str) -> Optional[Dict]:
)
if not safetensors_files:
return None
# Fetch accurate sizes; list_models siblings often have size=None
try:
st_filenames = [f["filename"] for f in safetensors_files]
accurate_sizes = get_accurate_file_sizes(model.id, st_filenames)
if accurate_sizes:
for f in safetensors_files:
f["size"] = accurate_sizes.get(f["filename"]) or 0
except Exception as size_err:
logger.debug(f"Could not fetch accurate sizes for {model.id}: {size_err}")
else:
return None

Expand All @@ -1364,6 +1356,7 @@ async def _process_single_model(model, model_format: str) -> Optional[Dict]:
"tags": model.tags or [],
"model_format": model_format,
"quantizations": quantizations if model_format == "gguf" else {},
"mmproj_files": mmproj_files if model_format == "gguf" else [],
"safetensors_files": (
safetensors_files if model_format == "safetensors" else []
),
Expand Down Expand Up @@ -1668,7 +1661,7 @@ async def get_model_details(model_id: str) -> Dict:
config_path = hf_hub_download(
repo_id=model_id,
filename="config.json",
local_dir="data/temp",
local_dir="data/hf-cache",
local_dir_use_symlinks=False,
)

Expand Down
18 changes: 12 additions & 6 deletions backend/llama_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -2273,13 +2273,19 @@ def set_flag(flag: str, value: bool):
logger.error(f"Build failed: {e}")
if progress_manager and task_id:
try:
await progress_manager.send_build_progress(
task_id=task_id,
stage="error",
progress=0,
message=f"Build failed: {str(e)}",
log_lines=[f"Error: {str(e)}"],
existing_task = progress_manager.get_task(task_id)
existing_logs = (
(existing_task or {}).get("metadata", {}).get("log_lines") or []
)
error_text = str(e)
if error_text not in existing_logs:
await progress_manager.send_build_progress(
task_id=task_id,
stage="error",
progress=0,
message=f"Build failed: {error_text}",
log_lines=[f"Error: {error_text}"],
)
except Exception as ws_error:
logger.error(f"Failed to send error via SSE: {ws_error}")
raise Exception(f"Failed to build from source {commit_sha}: {e}")
Expand Down
20 changes: 20 additions & 0 deletions backend/llama_swap_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,3 +149,23 @@ async def get_model_info(self, model_id: str, upstream_path: str = "v1/models"):
except Exception as e:
logger.error(f"Failed to get model info for {model_id}: {e}")
raise

async def load_model(self, model_name: str, retries: int = 20, delay: float = 0.5):
    """Trigger on-demand model loading via llama-swap's upstream route.

    Hitting ``/upstream/<model>/v1/models`` causes llama-swap to load the
    model; this polls that endpoint until it responds successfully.

    Args:
        model_name: Exposed proxy name of the model to load.
        retries: Maximum number of attempts (at least one is always made).
        delay: Seconds to wait between attempts.

    Returns:
        Parsed JSON body from the upstream models endpoint.

    Raises:
        Exception: The last error encountered once all attempts fail.
    """
    attempts = max(1, retries)
    last_error = None
    for attempt in range(1, attempts + 1):
        try:
            async with httpx.AsyncClient() as client:
                response = await client.get(
                    f"{self.base_url}/upstream/{model_name}/v1/models",
                    timeout=30,
                )
                response.raise_for_status()
                self._loading_models.discard(model_name)
                return response.json()
        except Exception as e:
            last_error = e
            self._loading_models.add(model_name)
            # Fix: only sleep when another attempt follows — the original
            # slept once more after the final failure, delaying the raise.
            if attempt < attempts:
                await asyncio.sleep(delay)
    self._loading_models.discard(model_name)
    logger.error(f"Failed to load model {model_name}: {last_error}")
    raise last_error
Loading