From 993faf6315ed4d77717d31d105c82cfe9d08d46f Mon Sep 17 00:00:00 2001 From: David Gil - Klaus Date: Wed, 4 Feb 2026 22:06:24 +0100 Subject: [PATCH] feat(llm): add environment variable overrides for model selection Add environment variables for runtime model configuration without code changes: - QMD_EMBED_MODEL - override embedding model - QMD_GENERATE_MODEL - override query expansion model - QMD_RERANK_MODEL - override reranker model - QMD_MODEL_CACHE_DIR - override model cache directory Priority: config object > environment variable > default Use case: Switch to faster reranker (jina-reranker-v1-tiny) for latency-critical applications like API timeouts, without changing defaults or rebuilding. Example: export QMD_RERANK_MODEL='hf:gpustack/jina-reranker-v1-tiny-en-GGUF/jina-reranker-v1-tiny-en-FP16.gguf' qmd query "your search" Benchmarks on Mac Mini M4 16GB: | Scenario | qwen3 (default) | jina-tiny | Speedup | |---------------------|-----------------|-----------|---------| | Cold start (8 docs) | ~15,000ms | ~80ms | 185x | | Warm cache (8 docs) | 400-440ms | 40-65ms | 6-10x | | Full pipeline cold | ~20s | ~7s | 3x | | Full pipeline warm | ~15s | ~5.7s | 2.6x | Quality: ~78% top-3 ranking agreement (same relevant docs, order differs slightly). Includes: - Unit tests for env var configuration - README documentation with model override examples No breaking changes. Default remains qwen3-reranker-0.6b. --- README.md | 20 +++++++++++++ src/llm.ts | 9 +++--- test/llm.test.ts | 77 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 102 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 42d5efd9..8f1d4cb9 100644 --- a/README.md +++ b/README.md @@ -490,6 +490,26 @@ llm_cache -- Cached LLM responses (query expansion, rerank scores) | Variable | Default | Description | |----------|---------|-------------| | `XDG_CACHE_HOME` | `~/.cache` | Cache directory location | +| `QMD_EMBED_MODEL` | `hf:ggml-org/embeddinggemma-300M-GGUF/...` | Override embedding model URI | +| `QMD_GENERATE_MODEL` | `hf:tobil/qmd-query-expansion-1.7B-gguf/...` | Override query expansion model URI | +| `QMD_RERANK_MODEL` | `hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/...` | Override reranker model URI | +| `QMD_MODEL_CACHE_DIR` | `~/.cache/qmd/models` | Override model cache directory | + +### Model Override Example + +For latency-critical applications (e.g., API timeouts), use a faster reranker: + +```sh +# Use Jina's tiny reranker (185x faster cold start, 6-10x faster warm) +export QMD_RERANK_MODEL="hf:gpustack/jina-reranker-v1-tiny-en-GGUF/jina-reranker-v1-tiny-en-FP16.gguf" +qmd query "your search" +``` + +**Priority order:** config object > environment variable > default + +**Trade-offs:** +- `jina-reranker-v1-tiny-en` (33M params, 67MB): ~80ms cold, ~50ms warm, 78% ranking agreement +- `qwen3-reranker-0.6b` (600M params, 639MB): ~15s cold, ~400ms warm, best quality ## How It Works diff --git a/src/llm.ts b/src/llm.ts index 46c62957..8f6f73f5 100644 --- a/src/llm.ts +++ b/src/llm.ts @@ -386,10 +386,11 @@ export class LlamaCpp implements LLM { constructor(config: LlamaCppConfig = {}) { - this.embedModelUri = config.embedModel || DEFAULT_EMBED_MODEL; - this.generateModelUri = config.generateModel || DEFAULT_GENERATE_MODEL; - this.rerankModelUri = config.rerankModel || DEFAULT_RERANK_MODEL; - this.modelCacheDir = config.modelCacheDir || MODEL_CACHE_DIR; + // Priority: config > env var > default + this.embedModelUri = config.embedModel || process.env.QMD_EMBED_MODEL || DEFAULT_EMBED_MODEL; + 
this.generateModelUri = config.generateModel || process.env.QMD_GENERATE_MODEL || DEFAULT_GENERATE_MODEL; + this.rerankModelUri = config.rerankModel || process.env.QMD_RERANK_MODEL || DEFAULT_RERANK_MODEL; + this.modelCacheDir = config.modelCacheDir || process.env.QMD_MODEL_CACHE_DIR || MODEL_CACHE_DIR; this.inactivityTimeoutMs = config.inactivityTimeoutMs ?? DEFAULT_INACTIVITY_TIMEOUT_MS; this.disposeModelsOnInactivity = config.disposeModelsOnInactivity ?? false; } diff --git a/test/llm.test.ts b/test/llm.test.ts index 662d11c3..cbbf8e76 100644 --- a/test/llm.test.ts +++ b/test/llm.test.ts @@ -33,6 +33,83 @@ describe("Default LlamaCpp Singleton", () => { }); }); +// ============================================================================= +// Environment Variable Configuration Tests (no model loading required) +// ============================================================================= + +describe("LlamaCpp Environment Variable Configuration", () => { + const originalEnv = { ...process.env }; + + afterAll(() => { + // Restore original environment + process.env = originalEnv; + }); + + test("uses default models when no config or env vars provided", () => { + // Clear any env vars + delete process.env.QMD_EMBED_MODEL; + delete process.env.QMD_GENERATE_MODEL; + delete process.env.QMD_RERANK_MODEL; + delete process.env.QMD_MODEL_CACHE_DIR; + + const llm = new LlamaCpp({}); + + // Access private properties for testing (TypeScript workaround) + expect((llm as any).embedModelUri).toContain("embeddinggemma"); + expect((llm as any).generateModelUri).toContain("qmd-query-expansion"); + expect((llm as any).rerankModelUri).toContain("qwen3-reranker"); + }); + + test("env vars override defaults when no config provided", () => { + process.env.QMD_EMBED_MODEL = "hf:test/embed-model.gguf"; + process.env.QMD_GENERATE_MODEL = "hf:test/generate-model.gguf"; + process.env.QMD_RERANK_MODEL = "hf:test/rerank-model.gguf"; + process.env.QMD_MODEL_CACHE_DIR = "/tmp/test-cache"; + + const llm = new LlamaCpp({}); + + expect((llm as any).embedModelUri).toBe("hf:test/embed-model.gguf"); + expect((llm as any).generateModelUri).toBe("hf:test/generate-model.gguf"); + expect((llm as any).rerankModelUri).toBe("hf:test/rerank-model.gguf"); + expect((llm as any).modelCacheDir).toBe("/tmp/test-cache"); + + // Cleanup + delete process.env.QMD_EMBED_MODEL; + delete process.env.QMD_GENERATE_MODEL; + delete process.env.QMD_RERANK_MODEL; + delete process.env.QMD_MODEL_CACHE_DIR; + }); + + test("config takes priority over env vars", () => { + process.env.QMD_RERANK_MODEL = "hf:env/rerank-model.gguf"; + + const llm = new LlamaCpp({ + rerankModel: "hf:config/rerank-model.gguf", + }); + + expect((llm as any).rerankModelUri).toBe("hf:config/rerank-model.gguf"); + + // Cleanup + delete process.env.QMD_RERANK_MODEL; + }); + + test("partial env var override works correctly", () => { + // Only set rerank model via env var + process.env.QMD_RERANK_MODEL = "hf:gpustack/jina-reranker-v1-tiny-en-GGUF/jina-reranker-v1-tiny-en-FP16.gguf"; + + const llm = new LlamaCpp({}); + + // Rerank should use env var + expect((llm as any).rerankModelUri).toBe("hf:gpustack/jina-reranker-v1-tiny-en-GGUF/jina-reranker-v1-tiny-en-FP16.gguf"); + // Others should use defaults + expect((llm as any).embedModelUri).toContain("embeddinggemma"); + expect((llm as any).generateModelUri).toContain("qmd-query-expansion"); + + // Cleanup + delete process.env.QMD_RERANK_MODEL; + }); +}); + // 
============================================================================= // Model Existence Tests // =============================================================================
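
---

Note for reviewers (not part of the patch): a minimal sketch of how the `config > env var > default` precedence added above behaves from calling code. It assumes `LlamaCpp` is exported from `src/llm.ts`; the import path and the `hf:example/...` model URI are illustrative placeholders, not values from this repo.

```ts
// Illustrative sketch only -- import path and example model URI are assumptions.
import { LlamaCpp } from "./llm";

// Env var set, no config: the env var overrides the built-in default.
process.env.QMD_RERANK_MODEL =
  "hf:gpustack/jina-reranker-v1-tiny-en-GGUF/jina-reranker-v1-tiny-en-FP16.gguf";
const viaEnv = new LlamaCpp({});
// viaEnv resolves its reranker to the jina tiny model.

// Explicit config: takes priority over the env var.
const viaConfig = new LlamaCpp({ rerankModel: "hf:example/custom-reranker.gguf" });
// viaConfig uses the config value regardless of QMD_RERANK_MODEL.
```

With neither a config value nor an env var set, both instances would fall back to the existing defaults (e.g., `DEFAULT_RERANK_MODEL`), matching the behavior covered by the new unit tests.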