Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,8 @@ ENV DEBIAN_FRONTEND=noninteractive \
CUDA_VISIBLE_DEVICES=all \
NVIDIA_VISIBLE_DEVICES=all \
NVIDIA_DRIVER_CAPABILITIES=compute,utility \
HF_HOME=/app/data/temp/.cache/huggingface \
HUGGINGFACE_HUB_CACHE=/app/data/temp/.cache/huggingface/hub \
HF_HOME=/app/data/hf-cache \
HUGGINGFACE_HUB_CACHE=/app/data/hf-cache/hub \
VENV_PATH=/opt/venv \
PYTHONPATH=/app \
PATH="/app/data/cuda/current/bin:${PATH}" \
Expand Down Expand Up @@ -133,7 +133,7 @@ RUN curl -fsSL "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERS
&& cmake --version

# Install llama-swap binary
ARG LLAMA_SWAP_VERSION=179
ARG LLAMA_SWAP_VERSION=197
RUN curl -fsSL "https://github.com/mostlygeek/llama-swap/releases/download/v${LLAMA_SWAP_VERSION}/llama-swap_${LLAMA_SWAP_VERSION}_linux_amd64.tar.gz" -o /tmp/llama-swap.tar.gz && \
tar -xzf /tmp/llama-swap.tar.gz -C /tmp && \
mv /tmp/llama-swap /usr/local/bin/llama-swap && \
Expand Down Expand Up @@ -168,7 +168,7 @@ RUN ln -sf /usr/bin/python3 /usr/bin/python

# Create non-root user and data directory structure
RUN useradd -m -s /bin/bash appuser && \
mkdir -p /app/data/models /app/data/config /app/data/configs /app/data/logs /app/data/llama-cpp /app/data/temp/.cache/huggingface/hub && \
mkdir -p /app/data/models /app/data/config /app/data/configs /app/data/logs /app/data/llama-cpp /app/data/hf-cache/hub && \
chown -R appuser:appuser /app && \
# Ensure entrypoint script is accessible to appuser
chmod 755 /usr/local/bin/docker-entrypoint.sh
Expand Down
778 changes: 376 additions & 402 deletions README.md

Large diffs are not rendered by default.

72 changes: 72 additions & 0 deletions backend/data_store.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
"""YAML-backed data store replacing SQLite."""

import json
import os
import re
import threading
from typing import Any, Dict, List, Optional

Expand Down Expand Up @@ -31,6 +33,59 @@ def generate_proxy_name(huggingface_id: str, quantization: Optional[str] = None)
return huggingface_slug


def _coerce_config(config_value: Optional[Any]) -> Dict[str, Any]:
if not config_value:
return {}
if isinstance(config_value, dict):
return config_value
if isinstance(config_value, str):
try:
return json.loads(config_value)
except json.JSONDecodeError:
return {}
return {}


def _model_value(model: Any, key: str, default: Any = None) -> Any:
if isinstance(model, dict):
return model.get(key, default)
return getattr(model, key, default)


def normalize_proxy_alias(alias: Optional[str]) -> str:
    """Normalize a user-provided model alias into a safe exposed engine ID.

    Lowercases and trims the alias, converts path separators and whitespace
    to dashes, replaces anything outside ``[a-z0-9._-]`` with a dash,
    collapses dash runs, and strips leading/trailing ``.``/``_``/``-``.
    Returns "" for ``None`` or effectively-empty input.
    """
    if alias is None:
        return ""

    text = str(alias).strip().lower()
    if not text:
        return ""

    # Apply the cleanup passes in order; the final pass collapses any
    # dash runs produced by the earlier substitutions.
    for pattern in (r"[/\\]", r"\s+", r"[^a-z0-9._-]", r"-{2,}"):
        text = re.sub(pattern, "-", text)
    return text.strip("._-")


def resolve_proxy_name(model: Any) -> str:
    """Return the exposed runtime model ID for a stored model.

    Precedence: a user-set ``model_alias`` in the model's config, then the
    stored ``proxy_name``, then a name generated from the Hugging Face ID
    and quantization.
    """
    cfg = _coerce_config(_model_value(model, "config"))
    candidates = (
        normalize_proxy_alias(cfg.get("model_alias")),
        normalize_proxy_alias(_model_value(model, "proxy_name")),
    )
    for candidate in candidates:
        if candidate:
            return candidate

    return generate_proxy_name(
        _model_value(model, "huggingface_id", ""),
        _model_value(model, "quantization"),
    )


class DataStore:
"""Thread-safe YAML-backed data store replacing SQLite."""

Expand Down Expand Up @@ -175,6 +230,23 @@ def delete_engine_version(self, engine: str, version: str) -> bool:
self._save_yaml("engines.yaml", data)
return True

def get_engine_build_settings(self, engine: str) -> Dict[str, Any]:
    """Return persisted build settings for *engine*, or an empty dict."""
    engine_entry = self._read_yaml("engines.yaml").get(engine, {})
    # `or {}` guards against an explicit null stored under build_settings.
    return engine_entry.get("build_settings", {}) or {}

def update_engine_build_settings(self, engine: str, settings: Dict[str, Any]) -> Dict[str, Any]:
    """Merge *settings* into the engine's persisted build settings.

    Non-dict input is treated as an empty update. Returns the merged
    settings as stored.
    """
    incoming = settings if isinstance(settings, dict) else {}
    data = self._read_yaml("engines.yaml")
    engine_entry = data.setdefault(engine, {})
    # Incoming keys win over previously persisted ones.
    merged = {**(engine_entry.get("build_settings") or {}), **incoming}
    engine_entry["build_settings"] = merged
    self._save_yaml("engines.yaml", data)
    return merged

# --- LMDeploy ---

def get_lmdeploy_status(self) -> dict:
Expand Down
23 changes: 23 additions & 0 deletions backend/gguf_introspection_config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{
"global": {
"context_length": {
"preferred_keys": [
"general.context_length",
"general.model_max_length",
"general.max_position_embeddings"
]
}
},
"glm4": {
"match_arch": ["glm4", "glm4moe"],
"context_length": {
"preferred_keys": ["glm4.context_length", "glm4.model_max_length"],
"fallback_terms": ["context", "max_position_embeddings"]
},
"layer_count": {
"preferred_keys": ["glm4.num_hidden_layers"],
"fallback_terms": ["layer", "block"]
}
}
}

49 changes: 31 additions & 18 deletions backend/gguf_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,14 @@
GGUF file metadata reader for extracting model layer information
"""

import struct
import os
import struct
import mmap
from enum import IntEnum
from typing import Dict, Optional, Any, List, Tuple, BinaryIO
from typing import Any, BinaryIO, Dict, List, Optional, Tuple

from backend.logging_config import get_logger
from backend.model_introspection import GgufIntrospector

logger = get_logger(__name__)

Expand Down Expand Up @@ -1247,22 +1248,34 @@ def get_model_layer_info(model_path: str) -> Optional[Dict[str, Any]]:
logger.error(f"Model file is not GGUF format: {model_path}")
return None

metadata = read_gguf_metadata(model_path)
if metadata:
return {
"layer_count": metadata["layer_count"],
"architecture": metadata["architecture"],
"context_length": metadata["context_length"],
"vocab_size": 0, # Not extracted from metadata
"embedding_length": metadata["embedding_length"],
"attention_head_count": metadata["attention_head_count"],
"attention_head_count_kv": metadata["attention_head_count_kv"],
"block_count": metadata["block_count"],
"is_moe": metadata["is_moe"],
"expert_count": metadata["expert_count"],
"experts_used_count": metadata["experts_used_count"],
}
return None
with GGUFReader(model_path) as reader:
metadata = reader.metadata
tensors = reader.tensors

introspector = GgufIntrospector(metadata=metadata, tensors=tensors)
info = introspector.build_model_info()

return {
"layer_count": int(info.layer_count) if info.layer_count else 0,
"architecture": metadata.get("general.architecture", ""),
"context_length": int(info.context_length) if info.context_length else 0,
"vocab_size": int(info.vocab_size) if info.vocab_size else 0,
"embedding_length": int(info.embedding_length)
if info.embedding_length
else 0,
"attention_head_count": int(info.attention_head_count)
if info.attention_head_count
else 0,
"attention_head_count_kv": int(info.attention_head_count_kv)
if info.attention_head_count_kv
else 0,
"block_count": int(info.block_count) if info.block_count else 0,
"is_moe": bool(info.is_moe),
"expert_count": int(info.expert_count) if info.expert_count else 0,
"experts_used_count": int(info.experts_used_count)
if info.experts_used_count
else 0,
}
except Exception as e:
logger.error(
f"Failed to get model layer info from {model_path}: {e}", exc_info=True
Expand Down
65 changes: 29 additions & 36 deletions backend/huggingface.py
Original file line number Diff line number Diff line change
Expand Up @@ -863,9 +863,6 @@ async def _fetch_and_merge(repo_id: Optional[str]):
metadata["tokenizer"] = tokenizer_json

await _fetch_and_merge(huggingface_id)
if huggingface_id and huggingface_id.lower().endswith("-gguf"):
base_repo = huggingface_id[:-5]
await _fetch_and_merge(base_repo)

try:
layer_info = get_model_layer_info(file_path) or {}
Expand Down Expand Up @@ -1210,6 +1207,20 @@ async def process_model(model):
if result is not None:
valid_results.append(result)

if model_format == "gguf":
def _gguf_sort_key(item: Dict[str, Any]):
quantizations = item.get("quantizations") or {}
size_candidates = [
q.get("total_size") or 0
for q in quantizations.values()
if isinstance(q, dict)
]
positive_sizes = [size for size in size_candidates if size > 0]
min_size = min(positive_sizes) if positive_sizes else float("inf")
return (min_size, -(item.get("downloads") or 0), item.get("id") or "")

valid_results.sort(key=_gguf_sort_key)

return valid_results[:limit]


Expand All @@ -1219,27 +1230,33 @@ async def _process_single_model(model, model_format: str) -> Optional[Dict]:
logger.info(f"Processing model: {model.id}")

quantizations: Dict[str, Dict] = {}
mmproj_files: List[Dict[str, Any]] = []
safetensors_files: List[Dict] = []
repo_files: List[Dict[str, Any]] = []

if hasattr(model, "siblings") and model.siblings:
if model_format == "gguf":
# Group GGUF files by logical quantization, handling multi-part shards
# Accept both plain `.gguf` and multi-part patterns like `.gguf.part1of2`
# Exclude mmproj (vision/multimodal projection) files – they are extensions, not standalone quants
# Group GGUF files by logical quantization, handling multi-part shards.
gguf_siblings = [
s
for s in model.siblings
if isinstance(getattr(s, "rfilename", None), str)
and re.search(r"\.gguf(\.|$)", s.rfilename)
and "mmproj" not in s.rfilename.lower()
]
logger.info(f"Model {model.id}: {len(gguf_siblings)} GGUF files found")
if not gguf_siblings:
return None

for sibling in gguf_siblings:
filename = sibling.rfilename
if "mmproj" in filename.lower():
mmproj_files.append(
{
"filename": filename,
"size": getattr(sibling, "size", 0) or 0,
}
)
continue
# Normalize filename by stripping shard suffix patterns like:
# -00001-of-00002.gguf (TheBloke-style)
# .gguf.part1of2 (Hugging Face-style multi-part)
Expand Down Expand Up @@ -1298,25 +1315,9 @@ async def _process_single_model(model, model_format: str) -> Optional[Dict]:
else 0.0
)

# Siblings from list_models often have size=None; fetch accurate sizes from Hub
try:
all_filenames = [s.rfilename for s in gguf_siblings]
accurate_sizes = get_accurate_file_sizes(model.id, all_filenames)
if accurate_sizes:
for entry in quantizations.values():
for f in entry["files"]:
f["size"] = accurate_sizes.get(f["filename"]) or f["size"] or 0
entry["total_size"] = sum(f["size"] for f in entry["files"])
entry["size_mb"] = (
round(entry["total_size"] / (1024 * 1024), 2)
if entry["total_size"]
else 0.0
)
except Exception as size_err:
logger.debug(f"Could not fetch accurate sizes for {model.id}: {size_err}")

# If no quantizations were detected after grouping, skip this model
if not quantizations:
# Search should stay to a single HF API call. Accurate file sizes are lazy-loaded on expand.
# If no downloadable GGUF entries were detected after grouping, skip this model.
if not quantizations and not mmproj_files:
return None
else:
safetensors_files = []
Expand All @@ -1338,15 +1339,6 @@ async def _process_single_model(model, model_format: str) -> Optional[Dict]:
)
if not safetensors_files:
return None
# Fetch accurate sizes; list_models siblings often have size=None
try:
st_filenames = [f["filename"] for f in safetensors_files]
accurate_sizes = get_accurate_file_sizes(model.id, st_filenames)
if accurate_sizes:
for f in safetensors_files:
f["size"] = accurate_sizes.get(f["filename"]) or 0
except Exception as size_err:
logger.debug(f"Could not fetch accurate sizes for {model.id}: {size_err}")
else:
return None

Expand All @@ -1364,6 +1356,7 @@ async def _process_single_model(model, model_format: str) -> Optional[Dict]:
"tags": model.tags or [],
"model_format": model_format,
"quantizations": quantizations if model_format == "gguf" else {},
"mmproj_files": mmproj_files if model_format == "gguf" else [],
"safetensors_files": (
safetensors_files if model_format == "safetensors" else []
),
Expand Down Expand Up @@ -1668,7 +1661,7 @@ async def get_model_details(model_id: str) -> Dict:
config_path = hf_hub_download(
repo_id=model_id,
filename="config.json",
local_dir="data/temp",
local_dir="data/hf-cache",
local_dir_use_symlinks=False,
)

Expand Down
18 changes: 12 additions & 6 deletions backend/llama_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -2273,13 +2273,19 @@ def set_flag(flag: str, value: bool):
logger.error(f"Build failed: {e}")
if progress_manager and task_id:
try:
await progress_manager.send_build_progress(
task_id=task_id,
stage="error",
progress=0,
message=f"Build failed: {str(e)}",
log_lines=[f"Error: {str(e)}"],
existing_task = progress_manager.get_task(task_id)
existing_logs = (
(existing_task or {}).get("metadata", {}).get("log_lines") or []
)
error_text = str(e)
if error_text not in existing_logs:
await progress_manager.send_build_progress(
task_id=task_id,
stage="error",
progress=0,
message=f"Build failed: {error_text}",
log_lines=[f"Error: {error_text}"],
)
except Exception as ws_error:
logger.error(f"Failed to send error via SSE: {ws_error}")
raise Exception(f"Failed to build from source {commit_sha}: {e}")
Expand Down
20 changes: 20 additions & 0 deletions backend/llama_swap_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,3 +149,23 @@ async def get_model_info(self, model_id: str, upstream_path: str = "v1/models"):
except Exception as e:
logger.error(f"Failed to get model info for {model_id}: {e}")
raise

async def load_model(self, model_name: str, retries: int = 20, delay: float = 0.5):
    """Trigger on-demand model loading via llama-swap's upstream route.

    Hitting ``/upstream/<model>/v1/models`` causes llama-swap to load the
    model; this polls that endpoint until it responds successfully.

    Args:
        model_name: Exposed proxy name of the model to load.
        retries: Maximum number of attempts (at least one is always made).
        delay: Seconds to wait between attempts.

    Returns:
        Parsed JSON body from the upstream models endpoint.

    Raises:
        Exception: The last error encountered once all attempts fail.
    """
    attempts = max(1, retries)
    last_error = None
    for attempt in range(1, attempts + 1):
        try:
            async with httpx.AsyncClient() as client:
                response = await client.get(
                    f"{self.base_url}/upstream/{model_name}/v1/models",
                    timeout=30,
                )
                response.raise_for_status()
                self._loading_models.discard(model_name)
                return response.json()
        except Exception as e:
            last_error = e
            self._loading_models.add(model_name)
            # Fix: only sleep when another attempt follows — the original
            # slept once more after the final failure, delaying the raise.
            if attempt < attempts:
                await asyncio.sleep(delay)
    self._loading_models.discard(model_name)
    logger.error(f"Failed to load model {model_name}: {last_error}")
    raise last_error
Loading