Skip to content
1,007 changes: 0 additions & 1,007 deletions RATE_LIMIT_SECURITY_AUDIT.md

This file was deleted.

240 changes: 133 additions & 107 deletions docs/ISSUES.md

Large diffs are not rendered by default.

14 changes: 14 additions & 0 deletions liminallm/api/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,8 @@ def _get_system_settings(runtime) -> dict:
"chat_rate_limit_per_minute": 60,
"chat_rate_limit_window_seconds": 60,
"login_rate_limit_per_minute": 10,
"refresh_rate_limit_per_minute": 20,
"refresh_rate_limit_window_seconds": 60,
"signup_rate_limit_per_minute": 5,
"reset_rate_limit_per_minute": 5,
"mfa_rate_limit_per_minute": 5,
Expand Down Expand Up @@ -1109,13 +1111,21 @@ async def oauth_callback(
async def refresh_tokens(
body: TokenRefreshRequest,
response: Response,
request: Request,
authorization: Optional[str] = Header(None),
x_tenant_id: Optional[str] = Header(
None, convert_underscores=False, alias="X-Tenant-ID"
),
):
runtime = get_runtime()
tenant_hint = body.tenant_id or x_tenant_id
client_ip = request.client.host if request.client else "unknown"
await _enforce_rate_limit(
runtime,
f"refresh:{client_ip}",
_get_rate_limit(runtime, "refresh_rate_limit_per_minute"),
_get_rate_limit(runtime, "refresh_rate_limit_window_seconds"),
)
user, session, tokens = await runtime.auth.refresh_tokens(
body.refresh_token, tenant_hint=tenant_hint
)
Expand Down Expand Up @@ -3133,6 +3143,8 @@ async def update_system_settings(
"chat_rate_limit_per_minute",
"chat_rate_limit_window_seconds",
"login_rate_limit_per_minute",
"refresh_rate_limit_per_minute",
"refresh_rate_limit_window_seconds",
"signup_rate_limit_per_minute",
"reset_rate_limit_per_minute",
"mfa_rate_limit_per_minute",
Expand Down Expand Up @@ -3191,6 +3203,8 @@ async def update_system_settings(
"chat_rate_limit_per_minute",
"chat_rate_limit_window_seconds",
"login_rate_limit_per_minute",
"refresh_rate_limit_per_minute",
"refresh_rate_limit_window_seconds",
"signup_rate_limit_per_minute",
"reset_rate_limit_per_minute",
"mfa_rate_limit_per_minute",
Expand Down
54 changes: 40 additions & 14 deletions liminallm/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,16 @@

from liminallm.api.error_handling import register_exception_handlers
from liminallm.api.routes import get_admin_user, router
from liminallm.config import Settings
from liminallm.logging import get_logger, set_correlation_id

logger = get_logger(__name__)

_settings = Settings.from_env()

# Version info per SPEC §18
__version__ = "0.1.0"
__build__ = os.getenv("BUILD_SHA", "dev")
__build__ = _settings.build_sha


_cleanup_task: asyncio.Task | None = None
Expand Down Expand Up @@ -77,9 +80,8 @@ async def lifespan(app: FastAPI):


def _allowed_origins() -> List[str]:
env_value = os.getenv("CORS_ALLOW_ORIGINS")
if env_value:
return [origin.strip() for origin in env_value.split(",") if origin.strip()]
if _settings.cors_allow_origins:
return _settings.cors_allow_origins
# Default to common local dev hosts; avoid wildcard when credentials are enabled.
return [
"http://localhost",
Expand All @@ -91,10 +93,7 @@ def _allowed_origins() -> List[str]:


def _allow_credentials() -> bool:
flag = os.getenv("CORS_ALLOW_CREDENTIALS")
if flag is None:
return False
return flag.lower() in {"1", "true", "yes", "on"}
return _settings.cors_allow_credentials


app.add_middleware(
Expand Down Expand Up @@ -199,12 +198,7 @@ async def add_security_headers(request, call_next):
response.headers.setdefault(
"Permissions-Policy", "camera=(), microphone=(), geolocation=(), payment=()"
)
if request.url.scheme == "https" and os.getenv("ENABLE_HSTS", "false").lower() in {
"1",
"true",
"yes",
"on",
}:
if request.url.scheme == "https" and _settings.enable_hsts:
response.headers.setdefault(
"Strict-Transport-Security", "max-age=63072000; includeSubDomains"
)
Expand Down Expand Up @@ -432,6 +426,38 @@ async def metrics() -> Response:
lines.append('# TYPE liminallm_database_healthy gauge')
lines.append(f'liminallm_database_healthy {db_healthy}')

# Training job activity
list_jobs = getattr(runtime.store, "list_training_jobs", None)
if callable(list_jobs):
try:
jobs = list_jobs()
active = len([j for j in jobs if j.status in {"queued", "running"}])
lines.append('# HELP liminallm_training_jobs_active Active training jobs')
lines.append('# TYPE liminallm_training_jobs_active gauge')
lines.append(f'liminallm_training_jobs_active {active}')
except Exception as exc:
logger.warning("metrics_training_jobs_failed", error=str(exc))

# Preference event ingestion rate proxy
if hasattr(runtime.store, "list_preference_events"):
try:
events = runtime.store.list_preference_events(user_id=None) # type: ignore[arg-type]
lines.append('# HELP liminallm_preference_events_total Total recorded preference events')
lines.append('# TYPE liminallm_preference_events_total counter')
lines.append(f'liminallm_preference_events_total {len(events)}')
except Exception as exc:
logger.warning("metrics_preference_events_failed", error=str(exc))

# Adapter usage counts
if hasattr(runtime.store, "list_artifacts"):
try:
adapters = runtime.store.list_artifacts(kind="adapter", owner_user_id=None) # type: ignore[arg-type]
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Wrong parameter name in metrics adapter count query

The call list_artifacts(kind="adapter", ...) uses parameter name kind, but the actual function signature uses kind_filter. Since there's no **kwargs in the function, this raises TypeError: got an unexpected keyword argument 'kind'. The exception is caught and logged as a warning, but the adapter count metric will never be populated.

Fix in Cursor Fix in Web

lines.append('# HELP liminallm_adapters_total Adapters stored in system')
lines.append('# TYPE liminallm_adapters_total gauge')
lines.append(f'liminallm_adapters_total {len(adapters)}')
except Exception as exc:
logger.warning("metrics_adapters_failed", error=str(exc))

except Exception as exc:
logger.error("metrics_collection_failed", error=str(exc))

Expand Down
69 changes: 68 additions & 1 deletion liminallm/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from typing import Any

from dotenv import dotenv_values
from pydantic import BaseModel, ConfigDict, Field, field_validator
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator

from liminallm.logging import get_logger

Expand Down Expand Up @@ -338,6 +338,24 @@ class Settings(BaseModel):
"ALLOW_SIGNUP",
description="Allow new user signups (overridable via admin UI)",
)
log_level: str = env_field("INFO", "LOG_LEVEL")
log_json: bool = env_field(True, "LOG_JSON")
log_dev_mode: bool = env_field(False, "LOG_DEV_MODE")
build_sha: str = env_field("dev", "BUILD_SHA")
cors_allow_origins: list[str] = env_field(
[
"http://localhost",
"http://localhost:3000",
"http://localhost:5173",
"http://127.0.0.1:3000",
"http://127.0.0.1:5173",
],
"CORS_ALLOW_ORIGINS",
description="Comma-separated CORS origins (overridable via admin UI)",
)
cors_allow_credentials: bool = env_field(False, "CORS_ALLOW_CREDENTIALS")
enable_hsts: bool = env_field(False, "ENABLE_HSTS")
mfa_secret_key: str | None = env_field(None, "MFA_SECRET_KEY")
use_memory_store: bool = env_field(False, "USE_MEMORY_STORE")
allow_redis_fallback_dev: bool = env_field(False, "ALLOW_REDIS_FALLBACK_DEV")
test_mode: bool = env_field(
Expand Down Expand Up @@ -459,9 +477,58 @@ class Settings(BaseModel):
"TRAINING_WORKER_POLL_INTERVAL",
description="Training worker poll interval in seconds (overridable via admin UI)",
)
max_active_training_jobs: int = env_field(
10,
"MAX_ACTIVE_TRAINING_JOBS",
description="Global cap on simultaneously active training jobs",
)

model_config = ConfigDict(extra="ignore")

@field_validator("cors_allow_origins", mode="before")
@classmethod
def _parse_cors_origins(cls, value: Any) -> list[str]:
    """Coerce the CORS_ALLOW_ORIGINS value into a clean list of origins.

    Accepts either a comma-separated string (as read from the environment)
    or an already-parsed list (e.g. the field default). ``None`` and any
    unrecognized type collapse to an empty list.

    Consistency fix: the list branch now applies the same strip/drop-empty
    cleanup as the string branch, so list-sourced values cannot carry blank
    or whitespace-padded entries.
    """
    if value is None:
        return []
    if isinstance(value, str):
        return [v.strip() for v in value.split(",") if v.strip()]
    if isinstance(value, list):
        # Mirror the string path: strip padding and drop empty entries.
        return [str(v).strip() for v in value if str(v).strip()]
    return []

@field_validator(
    "smtp_port", "training_worker_poll_interval", "tmp_cleanup_interval_seconds", "tmp_max_age_hours", "max_active_training_jobs"
)
@classmethod
def _validate_positive_int(cls, value: int) -> int:
    """Reject zero or negative values for fields that must be positive ints."""
    if value > 0:
        return value
    raise ValueError("must be positive")

@field_validator("smtp_port")
@classmethod
def _validate_smtp_port(cls, value: int) -> int:
    """Ensure the SMTP port falls within the valid TCP port range (1-65535)."""
    if value < 1 or value > 65535:
        raise ValueError("smtp_port must be between 1 and 65535")
    return value

@field_validator("log_level")
@classmethod
def _normalize_log_level(cls, value: str) -> str:
    """Upper-case the configured log level; empty/None falls back to INFO."""
    if not value:
        return "INFO"
    return value.upper()

@model_validator(mode="after")
def _validate_required_pairs(self):
    """Enforce settings that only make sense in pairs.

    Each OAuth provider that has a client_id configured must also supply
    its client_secret, and configuring an SMTP host or user requires an
    SMTP password.
    """
    oauth_pairs = (
        (
            self.oauth_google_client_id,
            self.oauth_google_client_secret,
            "oauth_google_client_secret required when client_id is set",
        ),
        (
            self.oauth_github_client_id,
            self.oauth_github_client_secret,
            "oauth_github_client_secret required when client_id is set",
        ),
        (
            self.oauth_microsoft_client_id,
            self.oauth_microsoft_client_secret,
            "oauth_microsoft_client_secret required when client_id is set",
        ),
    )
    for client_id, client_secret, message in oauth_pairs:
        if client_id and not client_secret:
            raise ValueError(message)
    if (self.smtp_host or self.smtp_user) and not self.smtp_password:
        raise ValueError("smtp_password required when smtp_host or smtp_user is set")
    return self

@classmethod
def from_env(cls) -> "Settings":
env_file_values = dotenv_values(".env")
Expand Down
33 changes: 25 additions & 8 deletions liminallm/service/auth.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

import base64
import contextlib
import hashlib
import hmac
import json
Expand Down Expand Up @@ -130,6 +131,7 @@ def __init__(
# Issue 28.4: Thread-safe lock for mutable state dictionaries
# Protects all in-memory fallback state from concurrent access
import threading

self._state_lock = threading.Lock()
self._mfa_challenges: dict[str, tuple[str, datetime]] = {}
# Issue 11.1: In-memory fallback for MFA lockout when Redis unavailable
Expand All @@ -153,6 +155,13 @@ def _now(self) -> datetime:

return datetime.now(timezone.utc)

@contextlib.contextmanager
def _with_state_lock(self):
"""Context manager for thread-safe state dictionary access (Issue 28.4)."""

with self._state_lock:
yield

def cleanup_expired_states(self) -> int:
"""Clean up expired OAuth states, MFA challenges, and email verification tokens.

Expand Down Expand Up @@ -406,7 +415,8 @@ async def start_oauth(
expires_at = self._now() + timedelta(minutes=10)
# Issue 28.4: Thread-safe state mutation
with self._state_lock:
self._oauth_states[state] = (provider, expires_at, tenant_id)
with self._with_state_lock():
self._oauth_states[state] = (provider, expires_at, tenant_id)
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Double lock acquisition causes deadlock

The code uses with self._state_lock: immediately followed by with self._with_state_lock():, but _with_state_lock() internally acquires the same self._state_lock. Since self._state_lock is a non-reentrant threading.Lock(), not an RLock, this double acquisition causes an immediate deadlock when the code executes. This affects OAuth state management, password reset token handling, and other auth flows.

Additional Locations (2)

Fix in Cursor Fix in Web

if self.cache:
await self.cache.set_oauth_state(state, provider, expires_at, tenant_id)

Expand Down Expand Up @@ -644,14 +654,17 @@ async def complete_oauth(
# Issue 28.4: Thread-safe state mutation
with self._state_lock:
if stored is None:
stored = self._oauth_states.pop(state, None)
with self._with_state_lock():
stored = self._oauth_states.pop(state, None)
else:
self._oauth_states.pop(state, None)
with self._with_state_lock():
self._oauth_states.pop(state, None)
now = self._now()

async def _clear_oauth_state() -> None:
with self._state_lock:
self._oauth_states.pop(state, None)
with self._with_state_lock():
self._oauth_states.pop(state, None)
if self.cache and not cache_state_used:
await self.cache.pop_oauth_state(state)

Expand Down Expand Up @@ -1201,7 +1214,8 @@ async def initiate_password_reset(self, email: str) -> str:
else:
# Issue 11.2: In-memory fallback for password reset tokens
with self._state_lock:
self._password_reset_tokens[token] = (email, expires_at)
with self._with_state_lock():
self._password_reset_tokens[token] = (email, expires_at)
self.logger.info(
"password_reset_requested",
email_hash=hashlib.sha256(email.encode()).hexdigest(),
Expand All @@ -1215,15 +1229,18 @@ async def complete_password_reset(self, token: str, new_password: str) -> bool:
else:
# Issue 11.2: In-memory fallback for password reset tokens
with self._state_lock:
stored = self._password_reset_tokens.get(token)
with self._with_state_lock():
stored = self._password_reset_tokens.get(token)
if stored:
stored_email, expires_at = stored
if expires_at <= self._now() - self._clock_skew_leeway:
# Remove expired token to prevent memory leak
self._password_reset_tokens.pop(token, None)
with self._with_state_lock():
self._password_reset_tokens.pop(token, None)
else:
email = stored_email
self._password_reset_tokens.pop(token, None)
with self._with_state_lock():
self._password_reset_tokens.pop(token, None)
if not email:
self.logger.warning("password_reset_invalid_token", token_prefix=token[:8])
return False
Expand Down
2 changes: 2 additions & 0 deletions liminallm/service/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -406,6 +406,7 @@ def promote_skill_adapters(
if ratio < positive_ratio:
continue
owner_id = cluster.user_id
visibility = "private" if owner_id else "global"
schema = {
"kind": "adapter.lora",
"scope": "per-user" if cluster.user_id else "global",
Expand All @@ -431,6 +432,7 @@ def promote_skill_adapters(
schema=schema,
description=cluster.description or "Cluster skill adapter",
owner_user_id=owner_id,
visibility=visibility,
)
if self.training and cluster.user_id:
self.training.ensure_user_adapter(
Expand Down
Loading
Loading