diff --git a/README.md b/README.md
index 42d5efd..8f1d4cb 100644
--- a/README.md
+++ b/README.md
@@ -490,6 +490,26 @@ llm_cache -- Cached LLM responses (query expansion, rerank scores)
 | Variable | Default | Description |
 |----------|---------|-------------|
 | `XDG_CACHE_HOME` | `~/.cache` | Cache directory location |
+| `QMD_EMBED_MODEL` | `hf:ggml-org/embeddinggemma-300M-GGUF/...` | Override embedding model URI |
+| `QMD_GENERATE_MODEL` | `hf:tobil/qmd-query-expansion-1.7B-gguf/...` | Override query expansion model URI |
+| `QMD_RERANK_MODEL` | `hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/...` | Override reranker model URI |
+| `QMD_MODEL_CACHE_DIR` | `~/.cache/qmd/models` | Override model cache directory |
+
+### Model Override Example
+
+For latency-critical applications (e.g., API timeouts), use a faster reranker:
+
+```sh
+# Use Jina's tiny reranker (185x faster cold start, 6-10x faster warm)
+export QMD_RERANK_MODEL="hf:gpustack/jina-reranker-v1-tiny-en-GGUF/jina-reranker-v1-tiny-en-FP16.gguf"
+qmd query "your search"
+```
+
+**Priority order:** config object > environment variable > default
+
+**Trade-offs:**
+- `jina-reranker-v1-tiny-en` (33M params, 67MB): ~80ms cold, ~50ms warm, 78% ranking agreement
+- `qwen3-reranker-0.6b` (600M params, 639MB): ~15s cold, ~400ms warm, best quality
 
 ## How It Works
 
diff --git a/src/llm.ts b/src/llm.ts
index 46c6295..8f6f73f 100644
--- a/src/llm.ts
+++ b/src/llm.ts
@@ -386,10 +386,11 @@ export class LlamaCpp implements LLM {
 
   constructor(config: LlamaCppConfig = {}) {
-    this.embedModelUri = config.embedModel || DEFAULT_EMBED_MODEL;
-    this.generateModelUri = config.generateModel || DEFAULT_GENERATE_MODEL;
-    this.rerankModelUri = config.rerankModel || DEFAULT_RERANK_MODEL;
-    this.modelCacheDir = config.modelCacheDir || MODEL_CACHE_DIR;
+    // Priority: config > env var > default
+    this.embedModelUri = config.embedModel || process.env.QMD_EMBED_MODEL || DEFAULT_EMBED_MODEL;
+    this.generateModelUri = config.generateModel || process.env.QMD_GENERATE_MODEL || DEFAULT_GENERATE_MODEL;
+    this.rerankModelUri = config.rerankModel || process.env.QMD_RERANK_MODEL || DEFAULT_RERANK_MODEL;
+    this.modelCacheDir = config.modelCacheDir || process.env.QMD_MODEL_CACHE_DIR || MODEL_CACHE_DIR;
     this.inactivityTimeoutMs = config.inactivityTimeoutMs ?? DEFAULT_INACTIVITY_TIMEOUT_MS;
     this.disposeModelsOnInactivity = config.disposeModelsOnInactivity ?? false;
   }
 
diff --git a/test/llm.test.ts b/test/llm.test.ts
index 662d11c..cbbf8e7 100644
--- a/test/llm.test.ts
+++ b/test/llm.test.ts
@@ -33,6 +33,83 @@ describe("Default LlamaCpp Singleton", () => {
   });
 });
 
+// =============================================================================
+// Environment Variable Configuration Tests (no model loading required)
+// =============================================================================
+
+describe("LlamaCpp Environment Variable Configuration", () => {
+  const originalEnv = { ...process.env };
+
+  afterAll(() => {
+    // Restore original environment
+    process.env = originalEnv;
+  });
+
+  test("uses default models when no config or env vars provided", () => {
+    // Clear any env vars
+    delete process.env.QMD_EMBED_MODEL;
+    delete process.env.QMD_GENERATE_MODEL;
+    delete process.env.QMD_RERANK_MODEL;
+    delete process.env.QMD_MODEL_CACHE_DIR;
+
+    const llm = new LlamaCpp({});
+
+    // Access private properties for testing (TypeScript workaround)
+    expect((llm as any).embedModelUri).toContain("embeddinggemma");
+    expect((llm as any).generateModelUri).toContain("qmd-query-expansion");
+    expect((llm as any).rerankModelUri).toContain("qwen3-reranker");
+  });
+
+  test("env vars override defaults when no config provided", () => {
+    process.env.QMD_EMBED_MODEL = "hf:test/embed-model.gguf";
+    process.env.QMD_GENERATE_MODEL = "hf:test/generate-model.gguf";
+    process.env.QMD_RERANK_MODEL = "hf:test/rerank-model.gguf";
+    process.env.QMD_MODEL_CACHE_DIR = "/tmp/test-cache";
+
+    const llm = new LlamaCpp({});
+
+    expect((llm as any).embedModelUri).toBe("hf:test/embed-model.gguf");
+    expect((llm as any).generateModelUri).toBe("hf:test/generate-model.gguf");
+    expect((llm as any).rerankModelUri).toBe("hf:test/rerank-model.gguf");
+    expect((llm as any).modelCacheDir).toBe("/tmp/test-cache");
+
+    // Cleanup
+    delete process.env.QMD_EMBED_MODEL;
+    delete process.env.QMD_GENERATE_MODEL;
+    delete process.env.QMD_RERANK_MODEL;
+    delete process.env.QMD_MODEL_CACHE_DIR;
+  });
+
+  test("config takes priority over env vars", () => {
+    process.env.QMD_RERANK_MODEL = "hf:env/rerank-model.gguf";
+
+    const llm = new LlamaCpp({
+      rerankModel: "hf:config/rerank-model.gguf",
+    });
+
+    expect((llm as any).rerankModelUri).toBe("hf:config/rerank-model.gguf");
+
+    // Cleanup
+    delete process.env.QMD_RERANK_MODEL;
+  });
+
+  test("partial env var override works correctly", () => {
+    // Only set rerank model via env var
+    process.env.QMD_RERANK_MODEL = "hf:gpustack/jina-reranker-v1-tiny-en-GGUF/jina-reranker-v1-tiny-en-FP16.gguf";
+
+    const llm = new LlamaCpp({});
+
+    // Rerank should use env var
+    expect((llm as any).rerankModelUri).toBe("hf:gpustack/jina-reranker-v1-tiny-en-GGUF/jina-reranker-v1-tiny-en-FP16.gguf");
+    // Others should use defaults
+    expect((llm as any).embedModelUri).toContain("embeddinggemma");
+    expect((llm as any).generateModelUri).toContain("qmd-query-expansion");
+
+    // Cleanup
+    delete process.env.QMD_RERANK_MODEL;
+  });
+});
+
 // =============================================================================
 // Model Existence Tests
 // =============================================================================