Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -490,6 +490,26 @@ llm_cache -- Cached LLM responses (query expansion, rerank scores)
| Variable | Default | Description |
|----------|---------|-------------|
| `XDG_CACHE_HOME` | `~/.cache` | Cache directory location |
| `QMD_EMBED_MODEL` | `hf:ggml-org/embeddinggemma-300M-GGUF/...` | Override embedding model URI |
| `QMD_GENERATE_MODEL` | `hf:tobil/qmd-query-expansion-1.7B-gguf/...` | Override query expansion model URI |
| `QMD_RERANK_MODEL` | `hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/...` | Override reranker model URI |
| `QMD_MODEL_CACHE_DIR` | `~/.cache/qmd/models` | Override model cache directory |

### Model Override Example

For latency-critical applications (e.g., services operating under strict API timeouts), use a faster reranker:

```sh
# Use Jina's tiny reranker (185x faster cold start, 6-10x faster warm)
export QMD_RERANK_MODEL="hf:gpustack/jina-reranker-v1-tiny-en-GGUF/jina-reranker-v1-tiny-en-FP16.gguf"
qmd query "your search"
```

**Priority order:** config object > environment variable > default

**Trade-offs:**
- `jina-reranker-v1-tiny-en` (33M params, 67MB): ~80ms cold, ~50ms warm, 78% ranking agreement
- `qwen3-reranker-0.6b` (600M params, 639MB): ~15s cold, ~400ms warm, best quality

## How It Works

Expand Down
9 changes: 5 additions & 4 deletions src/llm.ts
Original file line number Diff line number Diff line change
Expand Up @@ -386,10 +386,11 @@ export class LlamaCpp implements LLM {


constructor(config: LlamaCppConfig = {}) {
  // Model URIs and the cache dir each resolve in priority order:
  //   explicit config value > QMD_* environment variable > built-in default.
  // `||` (not `??`) means an empty-string config/env value falls through
  // to the next candidate rather than being used verbatim.
  const env = process.env;
  const resolve = (
    fromConfig: string | undefined,
    fromEnv: string | undefined,
    fallback: string,
  ): string => fromConfig || fromEnv || fallback;

  this.embedModelUri = resolve(config.embedModel, env.QMD_EMBED_MODEL, DEFAULT_EMBED_MODEL);
  this.generateModelUri = resolve(config.generateModel, env.QMD_GENERATE_MODEL, DEFAULT_GENERATE_MODEL);
  this.rerankModelUri = resolve(config.rerankModel, env.QMD_RERANK_MODEL, DEFAULT_RERANK_MODEL);
  this.modelCacheDir = resolve(config.modelCacheDir, env.QMD_MODEL_CACHE_DIR, MODEL_CACHE_DIR);

  // Inactivity settings use `??` so explicit falsy values (0, false) are honored.
  this.inactivityTimeoutMs = config.inactivityTimeoutMs ?? DEFAULT_INACTIVITY_TIMEOUT_MS;
  this.disposeModelsOnInactivity = config.disposeModelsOnInactivity ?? false;
}
Expand Down
77 changes: 77 additions & 0 deletions test/llm.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,83 @@ describe("Default LlamaCpp Singleton", () => {
});
});

// =============================================================================
// Environment Variable Configuration Tests (no model loading required)
// =============================================================================

describe("LlamaCpp Environment Variable Configuration", () => {
  const originalEnv = { ...process.env };

  // Remove every QMD_* override so each test starts from a known-clean state.
  // Cleanup runs at the START of each test (not the end): a trailing inline
  // `delete` would be skipped when an expectation throws, leaking env state
  // into later tests.
  const clearQmdEnv = () => {
    delete process.env.QMD_EMBED_MODEL;
    delete process.env.QMD_GENERATE_MODEL;
    delete process.env.QMD_RERANK_MODEL;
    delete process.env.QMD_MODEL_CACHE_DIR;
  };

  afterAll(() => {
    // Restore whatever the process started with once the suite is done.
    process.env = originalEnv;
  });

  test("uses default models when no config or env vars provided", () => {
    clearQmdEnv();

    const llm = new LlamaCpp({});

    // Private fields — `as any` cast needed because the resolved URIs are
    // not part of the public API.
    expect((llm as any).embedModelUri).toContain("embeddinggemma");
    expect((llm as any).generateModelUri).toContain("qmd-query-expansion");
    expect((llm as any).rerankModelUri).toContain("qwen3-reranker");
  });

  test("env vars override defaults when no config provided", () => {
    clearQmdEnv();
    process.env.QMD_EMBED_MODEL = "hf:test/embed-model.gguf";
    process.env.QMD_GENERATE_MODEL = "hf:test/generate-model.gguf";
    process.env.QMD_RERANK_MODEL = "hf:test/rerank-model.gguf";
    process.env.QMD_MODEL_CACHE_DIR = "/tmp/test-cache";

    const llm = new LlamaCpp({});

    expect((llm as any).embedModelUri).toBe("hf:test/embed-model.gguf");
    expect((llm as any).generateModelUri).toBe("hf:test/generate-model.gguf");
    expect((llm as any).rerankModelUri).toBe("hf:test/rerank-model.gguf");
    expect((llm as any).modelCacheDir).toBe("/tmp/test-cache");
  });

  test("config takes priority over env vars", () => {
    clearQmdEnv();
    process.env.QMD_RERANK_MODEL = "hf:env/rerank-model.gguf";

    const llm = new LlamaCpp({
      rerankModel: "hf:config/rerank-model.gguf",
    });

    expect((llm as any).rerankModelUri).toBe("hf:config/rerank-model.gguf");
  });

  test("partial env var override works correctly", () => {
    clearQmdEnv();
    // Only the rerank model is overridden; the others must fall back to defaults.
    process.env.QMD_RERANK_MODEL =
      "hf:gpustack/jina-reranker-v1-tiny-en-GGUF/jina-reranker-v1-tiny-en-FP16.gguf";

    const llm = new LlamaCpp({});

    expect((llm as any).rerankModelUri).toBe(
      "hf:gpustack/jina-reranker-v1-tiny-en-GGUF/jina-reranker-v1-tiny-en-FP16.gguf",
    );
    expect((llm as any).embedModelUri).toContain("embeddinggemma");
    expect((llm as any).generateModelUri).toContain("qmd-query-expansion");
  });
});

// =============================================================================
// Model Existence Tests
// =============================================================================
Expand Down
Loading