diff --git a/apps/web/app/api/presets/route.ts b/apps/web/app/api/presets/route.ts index 264305f..ed81c8e 100644 --- a/apps/web/app/api/presets/route.ts +++ b/apps/web/app/api/presets/route.ts @@ -29,6 +29,7 @@ type WizardStateV1 = { modules: { extractors: string[]; connectors: string[]; + batteries?: string[]; }; defaults: { chunkSize: number; @@ -61,6 +62,7 @@ type PresetPayloadV1 = { modules: { extractors: string[]; connectors: string[]; + batteries: string[]; }; config: { defaults: { @@ -93,6 +95,9 @@ function isWizardStateV1(x: unknown): x is WizardStateV1 { const o = x as any; if (o.v !== 1) return false; if (!o.install || !o.modules || !o.defaults || !o.embedding || !o.storage) return false; + if (o.modules && "batteries" in o.modules && o.modules.batteries != null && !Array.isArray(o.modules.batteries)) { + return false; + } return true; } @@ -107,6 +112,9 @@ function normalizeWizardState(input: WizardStateV1): WizardStateV1 { const connectors = Array.isArray(input.modules.connectors) ? input.modules.connectors.map(String).filter(Boolean) : []; + const batteries = Array.isArray((input.modules as any).batteries) + ? (input.modules as any).batteries.map(String).filter(Boolean) + : []; const chunkSize = Number(input.defaults.chunkSize) || 200; const chunkOverlap = Number(input.defaults.chunkOverlap) || 40; @@ -140,7 +148,7 @@ function normalizeWizardState(input: WizardStateV1): WizardStateV1 { return { v: 1, install: { installDir, storeAdapter, aliasBase }, - modules: { extractors, connectors }, + modules: { extractors, connectors, batteries }, defaults: { chunkSize, chunkOverlap, topK }, embedding: { type: embeddingType, provider: embeddingProvider, model, timeoutMs }, storage: { storeChunkContent, storeDocumentContent }, @@ -160,6 +168,7 @@ function makePresetFromWizard(state: WizardStateV1): PresetPayloadV1 { modules: { extractors: state.modules.extractors, connectors: state.modules.connectors, + batteries: (state.modules.batteries ?? []).map(String).filter(Boolean), }, config: { defaults: { @@ -242,6 +251,11 @@ export async function POST(req: NextRequest) { .filter((c: any) => c.status === "available") .map((c: any) => String(c.id)) ); + const allowedBatteries = new Set( + (manifest.batteries ?? []) + .filter((b: any) => b.status === "available") + .map((b: any) => String(b.id)) + ); const unknownExtractors = state.modules.extractors.filter((x) => !allowedExtractors.has(x)); if (unknownExtractors.length > 0) { @@ -259,6 +273,15 @@ export async function POST(req: NextRequest) { ); } + const batteryIds = (state.modules.batteries ?? 
[]).map(String).filter(Boolean); + const unknownBatteries = batteryIds.filter((x) => !allowedBatteries.has(x)); + if (unknownBatteries.length > 0) { + return NextResponse.json( + { error: "Unknown or unavailable batteries", unknownBatteries }, + { status: 400 } + ); + } + const preset = makePresetFromWizard(state); const id = newPresetId(); const key = `unrag:preset:${id}`; diff --git a/apps/web/app/docs/[[...slug]]/page.tsx b/apps/web/app/docs/[[...slug]]/page.tsx index 676a2f4..89d9195 100644 --- a/apps/web/app/docs/[[...slug]]/page.tsx +++ b/apps/web/app/docs/[[...slug]]/page.tsx @@ -9,6 +9,7 @@ import { notFound } from 'next/navigation'; import { getMDXComponents } from '@/mdx-components'; import type { Metadata } from 'next'; import { createRelativeLink } from 'fumadocs-ui/mdx'; +import SystemBanner from '@/components/ui/system-banner'; export default async function Page(props: PageProps<'/docs/[[...slug]]'>) { const params = await props.params; @@ -16,9 +17,19 @@ export default async function Page(props: PageProps<'/docs/[[...slug]]'>) { if (!page) notFound(); const MDX = page.data.body; + const slug = params.slug ?? []; + const isExperimentalFeature = + slug[0] === 'eval' + || (slug[0] === 'batteries' && slug[1] === 'eval'); return ( + {page.data.title} {page.data.description} diff --git a/apps/web/app/install/install-wizard-client.tsx b/apps/web/app/install/install-wizard-client.tsx index 2d16269..ea6717d 100644 --- a/apps/web/app/install/install-wizard-client.tsx +++ b/apps/web/app/install/install-wizard-client.tsx @@ -984,6 +984,7 @@ function BatteryCard({ onToggle: () => void; }) { const isAvailable = status === 'available'; + const isExperimental = id === 'eval'; return ( {isAvailable ? 'available' : 'coming soon'} + {isExperimental ? ( + + experimental + + ) : null} {isAvailable && docsHref ? (
@@ -1197,6 +1203,16 @@ export default function InstallWizardClient() { })); }, [state.embedding.type, state.embedding.provider, state.embedding.model, selectedEmbeddingModelOption, setState]); + function pmExecBase(pm: 'bun' | 'npm' | 'pnpm' | 'yarn') { + return pm === 'bun' + ? 'bunx' + : pm === 'pnpm' + ? 'pnpm dlx' + : pm === 'yarn' + ? 'yarn dlx' + : 'npx'; + } + const commandPreview = useMemo(() => { if (presetId) { return `bunx unrag@latest init --yes --preset ${presetId}`; @@ -1216,14 +1232,7 @@ export default function InstallWizardClient() { const installCommand = useMemo(() => { if (!presetId) return null; - const base = - pkgManager === 'bun' - ? 'bunx' - : pkgManager === 'pnpm' - ? 'pnpm dlx' - : pkgManager === 'yarn' - ? 'yarn dlx' - : 'npx'; + const base = pmExecBase(pkgManager); return `${base} unrag@latest init --yes --preset ${presetId}`; }, [pkgManager, presetId]); diff --git a/apps/web/components/ui/system-banner.tsx b/apps/web/components/ui/system-banner.tsx new file mode 100644 index 0000000..afc2007 --- /dev/null +++ b/apps/web/components/ui/system-banner.tsx @@ -0,0 +1,50 @@ +interface SystemBannerProps { + text?: string; + color?: string; + size?: "xs" | "sm" | "md" | "lg"; + show?: boolean; +} + +const sizeClasses: Record, string> = { + xs: "text-[10px] px-1 py-0.5", + sm: "text-xs px-2 py-0.5", + md: "text-sm px-3 py-1", + lg: "text-base px-4 py-1.5" +}; + +export default function SystemBanner({ + text = "Development Mode", + color = "bg-orange-500", + size = "xs", + show = true +}: SystemBannerProps) { + if (!show) return null; + return ( +
+ + {text} + +
+ ); +} diff --git a/apps/web/content/docs/batteries/eval.mdx b/apps/web/content/docs/batteries/eval.mdx new file mode 100644 index 0000000..81e20b9 --- /dev/null +++ b/apps/web/content/docs/batteries/eval.mdx @@ -0,0 +1,96 @@ +--- +title: Eval Harness +description: Measure and improve retrieval quality with deterministic evaluation, metrics, and CI integration. +--- + + +Evaluation is currently **experimental**. It’s safe to use, but expect some CLI flags, report fields, and defaults to change as the harness matures. + + +The eval harness is a battery that adds retrieval evaluation capabilities to your Unrag installation. It gives you a structured way to define test datasets, run your retrieval pipeline against them, compute standard metrics (hit@k, recall@k, precision@k, MRR@k), and track quality changes over time. + +Unlike the reranker battery which adds a new method to your engine, the eval harness is primarily a development and CI tool. You use it to measure how well your retrieval works, catch regressions before they reach production, and make informed decisions when tuning chunking, embeddings, or adding reranking. + +## Installing the eval battery + +```bash +bunx unrag@latest add battery eval +``` + +This creates several files: + + + + + + + + + + + + + + + + + + + + + + +It also adds two npm scripts to your `package.json`: + +```json +{ + "scripts": { + "unrag:eval": "bun run scripts/unrag-eval.ts -- --dataset .unrag/eval/datasets/sample.json", + "unrag:eval:ci": "bun run scripts/unrag-eval.ts -- --dataset .unrag/eval/datasets/sample.json --ci" + } +} +``` + +## Running your first eval + +After installation, run the sample evaluation: + +```bash +bun run unrag:eval +``` + +The harness will ingest the sample documents, run the test queries, and write report files. You'll see output like: + +``` +[unrag:eval] Wrote report: .unrag/eval/runs/-sample/report.json +[unrag:eval] Wrote summary: .unrag/eval/runs/-sample/summary.md +[unrag:eval] Thresholds: pass +``` + +## Full documentation + +The eval harness is a substantial feature with its own documentation section covering everything from dataset design to CI integration: + + + + Why retrieval evaluation matters and how the harness works + + + Complete setup guide with your first evaluation + + + How to structure documents, queries, and ground truth + + + What each metric measures and how to interpret results + + + All configuration options and CLI flags + + + Automated quality gates and threshold checking + + + Baseline diffs and tracking changes over time + + diff --git a/apps/web/content/docs/batteries/index.mdx b/apps/web/content/docs/batteries/index.mdx index 89c91df..87374b3 100644 --- a/apps/web/content/docs/batteries/index.mdx +++ b/apps/web/content/docs/batteries/index.mdx @@ -12,6 +12,7 @@ The difference is scope. Extractors transform content types (PDFs into text). Co | Battery | Description | Status | |---------|-------------|--------| | [Reranker](/docs/batteries/reranker) | Second-stage reranking using Cohere or custom models | Available | +| [Eval Harness](/docs/batteries/eval) | Deterministic retrieval evaluation with metrics and CI integration | Experimental | ## Installing a battery @@ -29,7 +30,7 @@ After installation, you wire the battery into your engine configuration. Each ba The core Unrag engine handles the fundamental RAG operations: chunking text, generating embeddings, storing vectors, and running similarity search. These operations cover most use cases and keep the default installation small. 
-But production RAG systems often need more. Reranking can significantly improve precision by reordering initial retrieval results using a more expensive relevance model. Evaluation harnesses help you measure and improve retrieval quality. Hybrid search combines vector similarity with keyword matching. +But production RAG systems often need more. Reranking can significantly improve precision by reordering initial retrieval results using a more expensive relevance model. Hybrid search combines vector similarity with keyword matching. Rather than bundling these features into the core (adding complexity and dependencies everyone pays for), Unrag provides them as optional batteries. Install what you need, skip what you don't. The code is vendored into your project, so you can read it, understand it, and modify it if your requirements differ from the defaults. diff --git a/apps/web/content/docs/batteries/meta.json b/apps/web/content/docs/batteries/meta.json index 6063338..ff88828 100644 --- a/apps/web/content/docs/batteries/meta.json +++ b/apps/web/content/docs/batteries/meta.json @@ -2,5 +2,5 @@ "title": "Batteries", "badge": "New", "description": "Optional modules that add capabilities like reranking to your Unrag installation.", - "pages": ["index", "reranker"] + "pages": ["index", "reranker", "eval"] } diff --git a/apps/web/content/docs/batteries/reranker.mdx b/apps/web/content/docs/batteries/reranker.mdx index c002cdc..b6011dc 100644 --- a/apps/web/content/docs/batteries/reranker.mdx +++ b/apps/web/content/docs/batteries/reranker.mdx @@ -1,6 +1,5 @@ --- title: Reranker -new: true description: Improve retrieval precision with second-stage reranking using Cohere or custom models. --- diff --git a/apps/web/content/docs/eval/ci-integration.mdx b/apps/web/content/docs/eval/ci-integration.mdx new file mode 100644 index 0000000..7ded98a --- /dev/null +++ b/apps/web/content/docs/eval/ci-integration.mdx @@ -0,0 +1,292 @@ +--- +title: CI Integration +description: Add retrieval quality gates to your deployment pipeline with thresholds and automated checks. +--- + +Manual evaluation is useful for exploration and debugging, but the real value comes from automation. When every pull request that touches your RAG configuration runs through an eval suite, you catch regressions before they reach production. When your nightly CI tracks metrics over time, you can see gradual drift and address it proactively. + +This page covers how to set up evaluation in CI pipelines, configure thresholds that make sense for your use case, and integrate eval results with your review process. + +## The basics + +The eval harness is designed for CI from the start. The `--ci` flag changes behavior in two important ways: + +1. It enables threshold checking against your configured minimums +2. It uses exit codes that CI systems understand + +```bash +bun run scripts/unrag-eval.ts -- --dataset .unrag/eval/datasets/regression.json --ci +``` + +Exit codes: + +- **0**: All thresholds passed +- **1**: At least one threshold failed (retrieval quality regression) +- **2**: Eval failed to run (configuration error, database issue, etc.) + +This means you can use the eval script directly as a CI step. If it exits 0, continue. If it exits non-zero, fail the build. 
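If you invoke the eval step through your own wrapper script instead of calling it directly, the exit codes are all you need to branch on. Below is a minimal sketch, assuming a Bun or Node environment with `node:child_process`; the dataset path and flags simply mirror the command above, and the wrapper itself is illustrative rather than something the battery installs.

```ts
// Illustrative CI gate: shell out to the documented eval command and map the
// exit codes described above (0 = pass, 1 = threshold failure, 2 = run error).
import { spawnSync } from "node:child_process";

const result = spawnSync(
  "bun",
  ["run", "scripts/unrag-eval.ts", "--", "--dataset", ".unrag/eval/datasets/regression.json", "--ci"],
  { stdio: "inherit" }
);

const code = result.status ?? 2;
if (code === 0) console.log("Retrieval eval passed all thresholds.");
else if (code === 1) console.error("Retrieval quality regression: at least one threshold failed.");
else console.error("Eval did not run cleanly (configuration, database, or dataset error).");

process.exit(code);
```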
+ +## GitHub Actions example + +Here's a complete GitHub Actions workflow that runs eval on PRs touching RAG configuration: + +```yaml +name: Retrieval Eval + +on: + pull_request: + paths: + - 'lib/unrag/**' + - 'unrag.config.ts' + - '.unrag/**' + +jobs: + eval: + runs-on: ubuntu-latest + + services: + postgres: + image: pgvector/pgvector:pg16 + env: + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_DB: unrag_eval + ports: + - 5432:5432 + options: >- + --health-cmd pg_isready + --health-interval 10s + --health-timeout 5s + --health-retries 5 + + steps: + - uses: actions/checkout@v4 + + - uses: oven-sh/setup-bun@v2 + + - run: bun install + + - name: Setup database + run: bun run db:migrate + env: + DATABASE_URL: postgresql://postgres:postgres@localhost:5432/unrag_eval + + - name: Run eval + run: bun run unrag:eval:ci + env: + DATABASE_URL: postgresql://postgres:postgres@localhost:5432/unrag_eval + AI_GATEWAY_API_KEY: ${{ secrets.OPENAI_API_KEY }} + + - name: Upload eval report + uses: actions/upload-artifact@v4 + if: always() + with: + name: eval-report + path: .unrag/eval/runs/ +``` + +This workflow spins up a Postgres container with pgvector, runs migrations to create the Unrag tables, runs the eval suite, and uploads the report as an artifact. If thresholds fail, the workflow fails, blocking the PR. + +The `paths` filter ensures the workflow only runs when relevant files change. No point running eval on README updates. + +## Setting useful thresholds + +The default thresholds from installation are intentionally conservative: + +```json +{ + "thresholds": { "min": { "hitAtK": 0.8, "recallAtK": 0.7, "mrrAtK": 0.6 } } +} +``` + +These are starting points. Once you have baseline metrics, adjust thresholds to match your actual performance with some margin. If your current recall@10 is 0.87, setting the threshold at 0.70 won't catch regressions until things get really bad. + +A reasonable approach: + +1. Run eval against your current configuration to establish a baseline +2. Set thresholds 5-10% below your baseline (so normal variance doesn't cause failures) +3. Tighten thresholds as you improve your system + +If your baseline is: +- hit@10: 0.92 +- recall@10: 0.85 +- mrr@10: 0.78 + +Reasonable thresholds might be: +- hit@10: 0.85 +- recall@10: 0.78 +- mrr@10: 0.70 + +This gives you room for minor fluctuations while still catching significant regressions. + +## Baseline comparison in CI + +For even better regression detection, compare against a known-good baseline rather than just checking absolute thresholds. Store your baseline report in the repository and diff against it: + +```yaml +- name: Run eval with baseline comparison + run: | + bun run scripts/unrag-eval.ts -- \ + --dataset .unrag/eval/datasets/regression.json \ + --baseline .unrag/eval/baselines/main.json \ + --ci +``` + +The diff report shows which queries improved, which degraded, and by how much. You can configure CI to fail if any query's metrics drop below the baseline by more than a threshold. + +To update baselines, run eval on main after merging and commit the new baseline: + +```yaml +name: Update Eval Baseline + +on: + push: + branches: [main] + paths: + - 'lib/unrag/**' + - 'unrag.config.ts' + +jobs: + update-baseline: + runs-on: ubuntu-latest + # ... setup steps ... 
+ + - name: Run eval and save as baseline + run: | + bun run scripts/unrag-eval.ts -- \ + --dataset .unrag/eval/datasets/regression.json \ + --output-dir .unrag/eval/baselines + mv .unrag/eval/baselines/*/report.json .unrag/eval/baselines/main.json + + - name: Commit baseline + uses: stefanzweifel/git-auto-commit-action@v5 + with: + commit_message: "chore: update eval baseline" + file_pattern: .unrag/eval/baselines/main.json +``` + +This pattern keeps your baseline fresh—it always reflects the current state of main. PRs are compared against this baseline, so you know exactly how the PR changes retrieval quality relative to the current production state. + +## PR comments with eval results + +You can post eval results as PR comments using the markdown summary. Here's a snippet for GitHub Actions: + +```yaml +- name: Post eval summary to PR + if: github.event_name == 'pull_request' + uses: actions/github-script@v7 + with: + script: | + const fs = require('fs'); + const glob = require('glob'); + + // Find the most recent summary + const summaries = glob.sync('.unrag/eval/runs/*/summary.md'); + if (summaries.length === 0) return; + + const summaryPath = summaries[summaries.length - 1]; + const summary = fs.readFileSync(summaryPath, 'utf8'); + + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body: `## Retrieval Eval Results\n\n${summary}` + }); +``` + +This makes eval results visible directly in the PR, so reviewers can see the impact of the change without digging through CI logs. + +## Multiple datasets + +If you have several eval datasets (regression suite, edge cases, multilingual, etc.), run them all and aggregate results: + +```yaml +- name: Run all eval datasets + run: | + EXIT_CODE=0 + for dataset in regression edge-cases multilingual; do + bun run scripts/unrag-eval.ts -- \ + --dataset .unrag/eval/datasets/$dataset.json \ + --output-dir .unrag/eval/runs/$dataset \ + --ci || EXIT_CODE=1 + done + exit $EXIT_CODE +``` + +This runs all datasets and fails if any dataset fails. You could also implement weighted aggregation—maybe the regression dataset is the gatekeeper, while edge-cases is advisory only. + +## Scheduled evaluation + +Beyond PR checks, run eval periodically to catch drift: + +```yaml +name: Nightly Eval + +on: + schedule: + - cron: '0 4 * * *' # 4 AM UTC daily + +jobs: + eval: + runs-on: ubuntu-latest + # ... setup steps ... + + - name: Run eval against production + run: | + bun run scripts/unrag-eval.ts -- \ + --dataset .unrag/eval/datasets/regression.json \ + --no-ingest \ + --ci + env: + DATABASE_URL: ${{ secrets.PROD_DATABASE_URL }} +``` + +Nightly eval against production data catches issues that emerge from content changes rather than code changes. Maybe someone edited a document in ways that broke retrieval for certain queries. Maybe the embedding model behavior changed slightly. Scheduled eval catches these gradually-emerging problems. + +## Handling embedding API costs + +Eval runs incur embedding costs—you're embedding documents during ingestion and queries during retrieval. For a dataset with 50 documents and 100 queries, you might embed ~500 chunks plus 100 queries, costing a few cents with OpenAI's models. + +This adds up if you're running eval on every commit. Strategies to manage costs: + +**Use a cheaper embedding model for eval**: If your production model is `text-embedding-3-large`, consider `text-embedding-3-small` for eval. 
The relative comparisons (did metrics go up or down) are still valid even if absolute numbers differ. + +**Cache embeddings**: For datasets that don't change often, you can skip re-ingestion. Use `--no-ingest` after the initial setup, and only re-ingest when documents change. + +**Run less frequently**: Maybe eval on every PR is overkill. Run it on PRs that touch RAG configuration, and run it nightly for general monitoring. + +**Use smaller eval datasets for PR checks**: Keep a small, focused regression dataset for PRs (20-30 queries), and run the comprehensive suite less frequently. + +## Environment considerations + +Eval requires a working database and embedding credentials. In CI, you typically use: + +**Ephemeral databases**: Spin up Postgres in a container for each run. The pgvector/pgvector Docker image works out of the box. This gives you isolation—each CI run has a fresh database with no leftover state. + +**Secrets management**: Store API keys (OPENAI_API_KEY, COHERE_API_KEY, etc.) in your CI system's secrets storage. Never commit credentials. + +**Resource allocation**: Eval is I/O bound (database and API calls), not CPU bound. Standard CI runners are fine. + +If you're using a hosted database for eval (maybe a shared staging database), be careful about concurrency. Two CI runs using the same scopePrefix could interfere with each other. Either use run-specific prefixes or ensure only one eval runs at a time. + +## Interpreting CI failures + +When eval fails in CI, the exit code tells you whether it's a quality problem (exit 1) or an infrastructure problem (exit 2). + +**Exit code 1 (threshold failure)**: Your retrieval quality regressed. Look at the report to see which metrics failed and which queries degraded. This might be expected—if you intentionally changed chunking strategy, metrics might temporarily dip. In that case, update thresholds and baseline, then re-run. + +**Exit code 2 (execution error)**: Something went wrong running eval. Check logs for database connection errors, missing environment variables, or dataset validation failures. These are infrastructure issues, not quality issues. + +When quality fails, don't just bump thresholds to make CI pass. Investigate why metrics dropped. Maybe the change genuinely hurt retrieval and should be reconsidered. Maybe your dataset has a bad ground truth label that's causing a false positive. Understanding the failure is more important than fixing the red build. + +## Next steps + + + + Deep dive into baseline diffs and tracking changes + + + Build comprehensive datasets for thorough testing + + diff --git a/apps/web/content/docs/eval/comparing-runs.mdx b/apps/web/content/docs/eval/comparing-runs.mdx new file mode 100644 index 0000000..09910cf --- /dev/null +++ b/apps/web/content/docs/eval/comparing-runs.mdx @@ -0,0 +1,233 @@ +--- +title: Comparing Runs +description: Track changes over time with baseline diffs and understand what improved or degraded. +--- + +A single eval run tells you how your system performs right now. Comparing runs tells you what changed. Did that chunk size tweak help? Did the new embedding model improve recall? Did someone's innocent-looking commit break retrieval for a whole category of queries? Baseline comparison answers these questions with precision. + +## The diff system + +When you provide a `--baseline` flag to the eval script, the harness compares the current run against the baseline and produces a diff report. The diff shows three things: + +1. 
How aggregate metrics changed (better, worse, same) +2. Which queries improved (and by how much) +3. Which queries degraded (and by how much) + +```bash +bun run scripts/unrag-eval.ts -- \ + --dataset .unrag/eval/datasets/regression.json \ + --baseline .unrag/eval/runs/2025-01-05-regression/report.json +``` + +The output includes a comparison section: + +``` +Comparing against baseline: .unrag/eval/runs/2025-01-05-regression/report.json + +Aggregate changes: + hit@10: 0.833 → 0.917 (+0.083) ✓ + recall@10: 0.833 → 0.917 (+0.083) ✓ + mrr@10: 0.639 → 0.722 (+0.083) ✓ + +Query changes: + ↑ q_compromised_account: recall 0.00 → 1.00 (+1.00) + ↓ q_free_shipping: mrr 1.00 → 0.50 (-0.50) + = q_return_deadline: no change + = q_express_cost: no change + ... +``` + +The up arrows show improvements, down arrows show regressions, and equals signs show stable queries. This tells you not just that things changed, but exactly which queries changed and in what direction. + +## What to look for in diffs + +### Net improvements + +When you make a change that you expect to help, the diff confirms whether it did. Maybe you increased chunk overlap to improve context preservation, and you see several queries improve their recall. That's the signal you were looking for. + +But also look at what degraded. Changes that help some queries often hurt others. Maybe your chunking change improved recall for long-answer queries but hurt precision for short-answer queries. The diff surfaces these tradeoffs so you can make informed decisions. + +### Unexpected changes + +Sometimes the diff surprises you. You changed something unrelated—maybe refactored how the engine is constructed—and suddenly three queries have different results. These unexpected changes are the most valuable findings. They reveal assumptions you didn't know you had, or side effects you didn't anticipate. + +When you see unexpected regressions, investigate before dismissing them. Maybe the refactor accidentally changed the default topK. Maybe the database connection pool is behaving differently. Unexpected changes are opportunities to catch bugs. + +### Stability + +If you run the same configuration twice, the diff should show no changes (or minimal changes due to floating-point precision). If you're seeing random fluctuations between runs, something is non-deterministic. Maybe your chunking has random elements, or your embedding provider is returning slightly different vectors. Understanding your system's baseline stability is important for interpreting real changes. + +## The diff files + +When you compare against a baseline, the harness creates two additional files: + +**diff.json**: Machine-readable diff data. Useful for programmatic analysis, building dashboards, or integrating with other tools. + +```json +{ + "baseline": { + "path": ".unrag/eval/runs/2025-01-05-regression/report.json", + "timestamp": "2025-01-05T10:30:00Z" + }, + "aggregates": { + "hitAtK": { "before": 0.833, "after": 0.917, "delta": 0.083 }, + "recallAtK": { "before": 0.833, "after": 0.917, "delta": 0.083 }, + "mrrAtK": { "before": 0.639, "after": 0.722, "delta": 0.083 } + }, + "queries": { + "improved": [ + { + "id": "q_compromised_account", + "metrics": { + "recallAtK": { "before": 0, "after": 1, "delta": 1 } + } + } + ], + "degraded": [ + { + "id": "q_free_shipping", + "metrics": { + "mrrAtK": { "before": 1, "after": 0.5, "delta": -0.5 } + } + } + ], + "unchanged": ["q_return_deadline", "q_express_cost", ...] + } +} +``` + +**diff.md**: Human-readable markdown summary. 
Great for PR comments, Slack notifications, or quick review. + +```markdown +# Eval Diff: regression (2025-01-10 vs 2025-01-05) + +## Aggregate Changes + +| Metric | Before | After | Change | +|--------|--------|-------|--------| +| hit@10 | 0.833 | 0.917 | +0.083 ✓ | +| recall@10 | 0.833 | 0.917 | +0.083 ✓ | +| mrr@10 | 0.639 | 0.722 | +0.083 ✓ | + +## Improved Queries (1) + +**q_compromised_account**: recall 0.00 → 1.00 (+1.00) + +## Degraded Queries (1) + +**q_free_shipping**: mrr 1.00 → 0.50 (-0.50) +``` + +## Tracking over time + +Beyond comparing two runs, you might want to track metrics over many runs to see trends. The harness doesn't have built-in time-series tracking, but the JSON reports make it easy to build. + +A simple approach: store reports with timestamped names and periodically aggregate them into a summary: + +```bash +# Store dated reports +bun run scripts/unrag-eval.ts -- \ + --dataset ... \ + --output-dir .unrag/eval/history/$(date +%Y-%m-%d) +``` + +Then write a script that reads all reports and plots metrics over time. This reveals gradual drift that individual diffs might miss. Maybe recall is slowly declining at 1% per week—not enough to trigger alerts on any single comparison, but significant over a month. + +For more sophisticated tracking, export metrics to a time-series database or monitoring service. The JSON report format is designed to be easy to parse and ingest. + +## When to update baselines + +Baselines should reflect your current "known good" state. Update them when: + +**You merge an intentional improvement**: If a PR improves metrics and you're confident the improvement is real, update the baseline so future comparisons are against the new standard. + +**You accept a tradeoff**: If a change improves some queries at the cost of others, and you've decided the tradeoff is worth it, update the baseline to reflect the new expected behavior. + +**You've investigated and explained regressions**: Sometimes metrics drop for good reasons—maybe you removed content that was incorrectly indexed, so queries that matched it now correctly return nothing. Once you understand and accept the change, update the baseline. + +Don't update baselines when: + +**You don't understand why metrics changed**: Unexplained changes should be investigated, not hidden by updating the baseline. + +**You're just trying to make CI pass**: If CI is failing because of a regression, fix the regression rather than lowering expectations. + +**The change is temporary**: If you're in the middle of a multi-step refactor and expect things to recover, don't keep updating baselines. Wait until you're at a stable state. + +## Comparing different configurations + +Sometimes you want to compare two different configurations, not two points in time. Maybe you're evaluating whether to switch embedding models, and you want to see how they compare head-to-head. 
+ +Run each configuration and save reports to different directories: + +```bash +# Configuration A (current) +bun run scripts/unrag-eval.ts -- \ + --dataset .unrag/eval/datasets/regression.json \ + --output-dir ./comparison/config-a + +# Configuration B (candidate) +bun run scripts/unrag-eval.ts -- \ + --dataset .unrag/eval/datasets/regression.json \ + --output-dir ./comparison/config-b + +# Compare +bun run scripts/unrag-eval.ts -- \ + --dataset .unrag/eval/datasets/regression.json \ + --baseline ./comparison/config-a/report.json \ + --output-dir ./comparison/diff-a-vs-b +``` + +This gives you a side-by-side comparison of two configurations on the same dataset. You can see exactly where they differ and make an informed choice. + +For A/B comparisons, pay attention to the pattern of changes. If model B is better on some query types and worse on others, you have a nuanced decision to make. The diff helps you understand the tradeoff rather than just looking at aggregate numbers. + +## Debugging regressions + +When a query degrades, you want to understand why. The diff tells you what changed; debugging tells you why. + +Start by looking at what was retrieved in both runs. The per-query reports include the list of retrieved sourceIds: + +```json +{ + "id": "q_free_shipping", + "relevant": { "sourceIds": ["eval:support:doc:shipping"] }, + "retrieved": { "sourceIds": ["eval:support:doc:returns", "eval:support:doc:shipping"] } +} +``` + +Compare this to the baseline's retrieved list. Did a previously-retrieved relevant document drop off? Did a new irrelevant document push it out? Did the ordering change? + +Common causes of regressions: + +**Chunking changes**: Different chunk boundaries mean different embeddings. Content that used to be in one chunk might now span two, diluting the embedding signal. + +**Embedding model drift**: Some embedding APIs (especially hosted ones) might update models without notice. If you're seeing unexplained changes, check if your provider changed anything. + +**Content changes**: If you're evaluating against production content (not a fixed dataset), content updates can affect retrieval. A document might have been edited in ways that changed its embedding. + +**Database state**: Leftover documents from previous runs can pollute results. Make sure you're working with clean state, or that the scopePrefix isolation is working correctly. + +## Aggregate vs per-query analysis + +Aggregate metrics are useful for quick summaries, but they can hide important details. A recall@10 that stays flat might mask one query improving while another degrades. Always look at both levels. + +The diff report emphasizes per-query changes because that's where actionable insights live. Knowing that "recall improved 5%" is nice; knowing that "q_compromised_account went from 0% to 100% recall while q_free_shipping degraded" is actionable. + +When reviewing diffs, prioritize: + +1. Any degraded queries (regressions are urgent) +2. Queries with large improvements (verify they're real) +3. Queries that didn't change when you expected them to (might indicate test coverage gaps) + +## Next steps + +Now that you understand how to run evaluations, track changes, and integrate with CI, you're equipped to build a robust retrieval quality practice. The key is consistency: run eval regularly, investigate changes, and keep your ground truth accurate. 
+ + + + Build better datasets for more meaningful evaluation + + + Interpret what the numbers actually mean + + diff --git a/apps/web/content/docs/eval/datasets.mdx b/apps/web/content/docs/eval/datasets.mdx new file mode 100644 index 0000000..682f4f9 --- /dev/null +++ b/apps/web/content/docs/eval/datasets.mdx @@ -0,0 +1,320 @@ +--- +title: Dataset Format +description: How to structure evaluation datasets with documents, queries, and ground truth relevance labels. +--- + +A good evaluation dataset is the foundation of useful metrics. It doesn't need to be large—a few dozen queries with accurate relevance labels is more valuable than hundreds with noisy or incomplete ground truth. What matters is that for each query in your dataset, you know exactly which documents should be retrieved, and you're confident in that judgment. + +This page covers the dataset format in detail, explains each field and when to use it, and provides strategies for building and maintaining datasets that give you reliable signal. + +## Dataset structure + +An eval dataset is a JSON file with this structure: + +```json +{ + "version": "1", + "id": "my-dataset", + "description": "Optional description of what this dataset tests", + "defaults": { + "topK": 10, + "scopePrefix": "eval:mydata:", + "mode": "retrieve" + }, + "documents": [...], + "queries": [...] +} +``` + +The `version` field identifies the schema version. Currently only "1" is supported. This exists so future versions of the harness can add features without breaking existing datasets—if you see a higher version number, you'll know to update. + +The `id` is a stable identifier for this dataset. It appears in reports and is used when comparing runs across time. Pick something descriptive that won't change when you add queries. + +The `description` is optional but helpful when you have multiple datasets. It shows up in report headers and helps you remember what each dataset is testing. + +## The defaults section + +The `defaults` block sets configuration that applies to all queries in this dataset unless a query overrides it: + +```json +{ + "defaults": { + "topK": 10, + "scopePrefix": "eval:mydata:", + "mode": "retrieve", + "rerankTopK": 30 + } +} +``` + +**topK** controls how many results are retrieved and scored. This is the "k" in metrics like recall@k. A value of 10 is common—it balances between catching relevant documents and not over-retrieving. + +**scopePrefix** defines the namespace for this evaluation. Documents are expected to have sourceIds that start with this prefix, and retrieval is scoped to only consider documents within the prefix. This isolation prevents eval queries from accidentally matching production content and vice versa. The prefix should be unique to this dataset. + +**mode** determines whether to evaluate retrieval alone or retrieval plus reranking. Valid values are `"retrieve"` and `"retrieve+rerank"`. When using rerank mode, you need the reranker battery installed and configured. + +**rerankTopK** (optional) specifies how many candidates to retrieve before reranking. Only relevant in `"retrieve+rerank"` mode. Defaults to `topK * 3`. Retrieving more candidates gives the reranker more material to work with, but increases cost and latency. 
+ +## Defining documents + +The `documents` array contains the content that will be ingested and searched: + +```json +{ + "documents": [ + { + "sourceId": "eval:mydata:doc:refund-policy", + "content": "Our refund policy allows returns within 30 days...", + "metadata": { + "category": "support", + "lastUpdated": "2025-01-01" + } + }, + { + "sourceId": "eval:mydata:doc:shipping-guide", + "content": "Standard shipping takes 5-7 business days..." + } + ] +} +``` + +Each document requires a `sourceId` and **either** `content` **or** `loaderRef`. +The sourceId should start with your `scopePrefix` to maintain proper isolation. Metadata is optional but can be useful if you're testing metadata-filtered retrieval. + +You can omit the `documents` array entirely if you're evaluating against content that's already indexed. In that case, make sure your `scopePrefix` matches the content you want to search, and run the eval with `--no-ingest`. + +When documents are present and the harness ingests them, it first deletes any existing documents with the `scopePrefix`. This ensures a clean state—you're always evaluating against exactly the content defined in the dataset, not a mix of old and new documents from previous runs. + +## Defining queries + +The `queries` array is the heart of the dataset. Each query defines what to search for and what should be found: + +```json +{ + "queries": [ + { + "id": "q_return_window", + "query": "How long do I have to return an item?", + "relevant": { + "sourceIds": ["eval:mydata:doc:refund-policy"] + } + }, + { + "id": "q_shipping_time", + "query": "When will my order arrive?", + "relevant": { + "sourceIds": ["eval:mydata:doc:shipping-guide"] + }, + "topK": 5 + } + ] +} +``` + +**id** is a stable identifier for this query. It should be unique within the dataset and shouldn't change when you modify the query text. The harness uses this ID in reports and diffs. + +**query** is the actual search text that will be embedded and searched. + +**relevant.sourceIds** lists the document sourceIds that should be retrieved for this query. These are the ground truth labels. When the harness evaluates results, it checks whether the retrieved chunks came from documents in this list. + +**topK** (optional) overrides the dataset default for this specific query. Useful when some queries naturally have more or fewer relevant documents. + +### Multiple relevant documents + +Some queries legitimately have multiple relevant documents: + +```json +{ + "id": "q_account_security", + "query": "How do I keep my account secure?", + "relevant": { + "sourceIds": [ + "eval:mydata:doc:password-guide", + "eval:mydata:doc:2fa-setup", + "eval:mydata:doc:security-best-practices" + ] + } +} +``` + +The metrics handle this correctly. Recall@k measures what fraction of the relevant documents were retrieved, so if there are three relevant documents and two were found in the top 10, recall@10 is 0.67. Precision@k measures what fraction of retrieved items were relevant. MRR@k is based on the rank of the first relevant document found. + +### Queries with no relevant documents + +Sometimes you want to test that a query returns nothing—or at least that your known documents aren't relevant: + +```json +{ + "id": "q_unrelated", + "query": "What's the weather like today?", + "relevant": { + "sourceIds": [] + } +} +``` + +An empty sourceIds array means nothing should be retrieved. In this case, hit@k will be 0 (correct behavior—no hit was expected), and precision@k measures how many irrelevant items were returned. 
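As a worked example (with hypothetical ranks), take the `q_account_security` query above with its three relevant documents. If retrieval at topK = 10 surfaces `eval:mydata:doc:2fa-setup` at rank 2 and `eval:mydata:doc:password-guide` at rank 6 but misses `eval:mydata:doc:security-best-practices`, then hit@10 = 1 (at least one relevant document was found), recall@10 = 2/3 ≈ 0.67, and MRR@10 = 1/2 = 0.5 because the first relevant document appears at rank 2. If two of the ten retrieved results map to relevant documents, precision@10 = 0.2.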
+ +## Strategies for building datasets + +The hardest part of evaluation isn't running the harness; it's building a dataset with accurate ground truth. Here are approaches that work. + +### Start from failure cases + +Run your current retrieval on real user queries and manually review the results. When retrieval returns wrong or irrelevant content, you've found a valuable test case. Add the query to your dataset with the correct ground truth documents. Over time, your dataset accumulates the hard cases that actually matter for quality. + +This approach has a nice property: your dataset becomes a regression test suite. If retrieval fails for a query once, it's in the dataset forever, and you'll know immediately if it breaks again. + +### Use query logs + +If you have logs of what users search for, mine them for query patterns. Group similar queries (people ask "how to reset password" many different ways), pick representative examples, and label them. This grounds your dataset in real usage rather than hypothetical questions. + +Be selective. Not every query in your logs needs to be in your eval dataset. Focus on queries that are common, important, or historically problematic. + +### Create coverage-focused datasets + +Sometimes you want to test specific capabilities rather than real user behavior. Maybe you're adding content in a new language and want to verify multilingual retrieval works. Maybe you're testing how well your system handles long queries versus short ones. + +For these cases, create synthetic datasets designed to probe specific behaviors. They won't tell you about real-world performance, but they'll catch capability regressions. + +### Label incrementally + +You don't need to label everything at once. Start with a small dataset—20-30 queries—and measure. As you tune your system and discover failure modes, add more queries. A dataset that grows organically from real problems is often more useful than a large dataset labeled in a single labeling sprint. + +### Handle ambiguous relevance + +Sometimes a document is partially relevant. Maybe it mentions the topic but doesn't directly answer the query. The current dataset format is binary—a document is either relevant or not—so you have to make a call. + +When in doubt, be strict. Only mark documents as relevant if they should definitely be retrieved. It's better to have precision-focused ground truth than to include borderline cases that make your metrics noisy. + +If you find yourself constantly wrestling with partial relevance, consider whether your documents are too broad. Sometimes the right fix is to split documents into more focused pieces, not to relax your relevance labels. + +## Scope isolation explained + +The `scopePrefix` is worth understanding deeply because it affects both safety and accuracy. + +When the eval harness runs, it uses the scopePrefix in two ways. During ingestion, it first deletes all documents whose sourceId starts with the prefix, then ingests the new documents. This guarantees that you're evaluating against exactly the content defined in the dataset, with no stale documents from previous runs polluting the results. + +During retrieval, the harness scopes queries to the prefix. Only documents with sourceIds starting with the prefix are considered. This prevents eval queries from accidentally matching production content that you haven't labeled. + +The prefix should be unique per dataset and clearly indicate eval content. Using prefixes like `eval:dataset-name:` or `eval:v2:` works well. 
Don't use prefixes that overlap with your production content prefixes. + +If you're evaluating against production content (using `--no-ingest`), the scopePrefix should match your production content's prefix. In this case, there's no isolation—you're testing against real data. This is fine as long as you're aware of it and your ground truth sourceIds match actual production documents. + +### Safety guardrails + +The harness includes a safety check on scopePrefix. By default, it requires the prefix to start with `eval:`. This prevents accidental deletion of production data if someone misconfigures a dataset. + +If you need to evaluate against a prefix that doesn't start with `eval:`, you can override this with the `--allow-custom-prefix` flag. The harness will prompt for confirmation before proceeding with any deletions. + +## Dataset organization + +As your evaluation practice matures, you'll likely have multiple datasets for different purposes. A reasonable organization: + + + + + + + + + + + + + + +Each dataset can have its own scopePrefix to keep content isolated. You can run them independently or write scripts that run all datasets and aggregate results. + +## Working with loaderRef + +If you don't want to inline document text in your dataset file (common for connector-backed corpora), you can store a **string** reference in `loaderRef` and load the content in your project-local eval script. + +```json +{ + "sourceId": "eval:notion:doc:abc123", + "loaderRef": "notion:pageId:abc123" +} +``` + +When a document uses `loaderRef`, the harness requires your script to provide a `loadDocumentByRef(ref)` hook and pass it to `runEval({ loadDocumentByRef })`. +This keeps the dataset format simple and lets you decide how refs map to content (filesystem, connector API, etc.). + +## Complete example + +Here's a more complete dataset showing various features: + +```json +{ + "version": "1", + "id": "support-faq-eval", + "description": "Evaluation dataset for support FAQ retrieval quality", + "defaults": { + "topK": 10, + "scopePrefix": "eval:support:", + "mode": "retrieve" + }, + "documents": [ + { + "sourceId": "eval:support:doc:refund-policy", + "content": "Our refund policy allows returns within 30 days of purchase. Items must be unused and in original packaging. To initiate a return, contact support with your order number. Refunds are processed within 5-7 business days after we receive the item. Note that digital products and gift cards are non-refundable once redeemed.", + "metadata": { "category": "returns" } + }, + { + "sourceId": "eval:support:doc:shipping", + "content": "We offer several shipping options. Standard shipping (5-7 business days) is free on orders over $50. Express shipping (2-3 business days) costs $9.99. Overnight delivery is available in select areas for $24.99. International shipping varies by destination—see our shipping calculator for exact rates. All orders include tracking information sent via email.", + "metadata": { "category": "shipping" } + }, + { + "sourceId": "eval:support:doc:account-security", + "content": "To keep your account secure, we recommend enabling two-factor authentication in your account settings. Use a unique password at least 12 characters long. We'll never ask for your password via email. 
If you suspect unauthorized access, reset your password immediately and contact support.", + "metadata": { "category": "account" } + } + ], + "queries": [ + { + "id": "q_return_deadline", + "query": "How long do I have to return something?", + "relevant": { "sourceIds": ["eval:support:doc:refund-policy"] } + }, + { + "id": "q_free_shipping", + "query": "Is shipping free?", + "relevant": { "sourceIds": ["eval:support:doc:shipping"] } + }, + { + "id": "q_express_cost", + "query": "How much does express delivery cost?", + "relevant": { "sourceIds": ["eval:support:doc:shipping"] } + }, + { + "id": "q_2fa", + "query": "How do I enable two-factor authentication?", + "relevant": { "sourceIds": ["eval:support:doc:account-security"] } + }, + { + "id": "q_compromised_account", + "query": "I think someone hacked my account", + "relevant": { "sourceIds": ["eval:support:doc:account-security"] } + }, + { + "id": "q_digital_refund", + "query": "Can I get a refund for a digital download?", + "relevant": { "sourceIds": ["eval:support:doc:refund-policy"] } + } + ] +} +``` + +This dataset tests a support FAQ system with six queries across three documents. Each query has a clear expected answer, and the ground truth reflects actual relevance rather than keyword overlap. + +## Next steps + + + + What hit@k, recall@k, precision@k, and MRR@k actually measure + + + Configuration options and customizing eval behavior + + diff --git a/apps/web/content/docs/eval/getting-started.mdx b/apps/web/content/docs/eval/getting-started.mdx new file mode 100644 index 0000000..615d95b --- /dev/null +++ b/apps/web/content/docs/eval/getting-started.mdx @@ -0,0 +1,266 @@ +--- +title: Getting Started with Evaluation +description: Install the eval harness and run your first retrieval evaluation in under 15 minutes. +--- + +This guide walks you through setting up the evaluation harness, creating a simple dataset, and running your first eval. By the end, you'll have metrics showing how well your retrieval pipeline performs on a set of test queries, and you'll understand the workflow well enough to expand from there. + +## Prerequisites + +Before you start, make sure you have a working Unrag installation. You should be able to ingest content and retrieve it—if you haven't done that yet, work through the [Quickstart](/docs/getting-started/quickstart) first. The eval harness runs against your existing engine configuration, so everything needs to be wired up and working. + +You'll also need some content in your database, or be prepared to let the eval harness ingest test documents for you. The harness can work either way: it can evaluate against your existing indexed content, or it can ingest a curated set of documents specifically for evaluation. + +## Installing the eval battery + +Install the eval harness using the CLI: + +```bash +bunx unrag@latest add battery eval +``` + +This does several things at once. It copies the eval module into your project (at `lib/unrag/eval/` by default), generates a starter dataset and configuration, creates an eval script at `scripts/unrag-eval.ts`, and adds npm scripts to your `package.json` for running evaluations. 
+ +After installation, you'll see these new files: + + + + + + + + + + + + + + + + + + + + + + +The CLI also adds two scripts to your `package.json`: + +```json +{ + "scripts": { + "unrag:eval": "bun run scripts/unrag-eval.ts -- --dataset .unrag/eval/datasets/sample.json", + "unrag:eval:ci": "bun run scripts/unrag-eval.ts -- --dataset .unrag/eval/datasets/sample.json --ci" + } +} +``` + +## Understanding the sample dataset + +Open `.unrag/eval/datasets/sample.json` to see what a dataset looks like: + +```json +{ + "version": "1", + "id": "sample-eval", + "description": "Sample dataset demonstrating eval harness structure", + "defaults": { + "topK": 10, + "scopePrefix": "eval:sample:", + "mode": "retrieve" + }, + "documents": [ + { + "sourceId": "eval:sample:doc:refund-policy", + "content": "Our refund policy allows returns within 30 days of purchase. Items must be unused and in original packaging. Refunds are processed within 5-7 business days after we receive the returned item. Digital products are non-refundable once downloaded." + }, + { + "sourceId": "eval:sample:doc:shipping-info", + "content": "Standard shipping takes 5-7 business days within the continental US. Express shipping is available for an additional fee and arrives within 2-3 business days. International shipping times vary by destination. All orders include tracking information." + } + ], + "queries": [ + { + "id": "q_refund_window", + "query": "How long do I have to return an item?", + "relevant": { + "sourceIds": ["eval:sample:doc:refund-policy"] + } + }, + { + "id": "q_shipping_time", + "query": "When will my order arrive?", + "relevant": { + "sourceIds": ["eval:sample:doc:shipping-info"] + } + } + ] +} +``` + +The dataset has three main parts. The `defaults` section sets configuration that applies to all queries unless overridden. The `documents` section defines the content that should be searchable—each document has a `sourceId` (a stable identifier) and `content` (the actual text). The `queries` section lists test queries, each with an `id`, the `query` text, and `relevant.sourceIds` indicating which documents should be retrieved for that query. + +Notice the `scopePrefix` in defaults. This prefix (`eval:sample:`) serves two purposes: it namespaces the eval documents so they don't mix with your production content, and it limits retrieval to only consider documents within this namespace. When the harness ingests documents, it uses their `sourceId` as-is. When it retrieves, it scopes the search to the prefix. This isolation is important—you don't want eval queries accidentally matching production content. + +## Running your first evaluation + +Make sure your environment variables are set (DATABASE_URL and your embedding provider credentials), then run: + +```bash +bun run unrag:eval +``` + +The harness will: + +1. Load the dataset and validate its structure +2. Delete any existing documents with the `scopePrefix` (to ensure clean state) +3. Ingest the dataset's documents +4. Run each query through your retrieval pipeline +5. Score the results against ground truth +6. Write a report to `.unrag/eval/runs//` + +You'll see output like this: + +``` +Eval: sample-eval (2 queries) +Mode: retrieve +Scope: eval:sample: + +Ingesting 2 documents... + ✓ eval:sample:doc:refund-policy (4 chunks) + ✓ eval:sample:doc:shipping-info (3 chunks) + +Running queries... 
+ ✓ q_refund_window: hit@10=1, recall@10=1.00 + ✓ q_shipping_time: hit@10=1, recall@10=1.00 + +Aggregates: + hit@10: 1.000 (mean) + recall@10: 1.000 (mean) + precision@10: 0.143 (mean) + mrr@10: 1.000 (mean) + +Report: .unrag/eval/runs/2025-01-10T14-32-00-sample-eval/report.json +``` + +In this simple example, both queries successfully retrieved their relevant documents (hit@10 = 1.0, recall@10 = 1.0). The precision is lower because we're retrieving 10 chunks but only one document is relevant per query—that's expected and normal. + +## Reading the report + +Open the generated `report.json` to see the full results. The report contains everything you need to understand how the evaluation went: + +```json +{ + "dataset": { + "id": "sample-eval", + "description": "Sample dataset demonstrating eval harness structure" + }, + "config": { + "mode": "retrieve", + "topK": 10, + "scopePrefix": "eval:sample:" + }, + "queries": [ + { + "id": "q_refund_window", + "query": "How long do I have to return an item?", + "relevant": ["eval:sample:doc:refund-policy"], + "retrieved": ["eval:sample:doc:refund-policy", "eval:sample:doc:shipping-info"], + "metrics": { + "hitAtK": 1, + "recallAtK": 1, + "precisionAtK": 0.5, + "mrrAtK": 1 + }, + "durations": { + "embeddingMs": 145, + "retrievalMs": 23, + "totalMs": 168 + } + } + // ... more queries + ], + "aggregates": { + "hitAtK": { "mean": 1, "median": 1 }, + "recallAtK": { "mean": 1, "median": 1 }, + "precisionAtK": { "mean": 0.143, "median": 0.143 }, + "mrrAtK": { "mean": 1, "median": 1 } + }, + "timings": { + "p50TotalMs": 168, + "p95TotalMs": 189 + } +} +``` + +The per-query results show exactly what was retrieved versus what should have been retrieved. This is invaluable for debugging—when a query fails, you can see what documents came back instead of the expected ones. + +## Building a real dataset + +The sample dataset demonstrates the format, but two queries isn't enough to evaluate anything meaningful. To get useful metrics, you need a dataset that covers the queries your users actually ask and the content they expect to find. + +Start by looking at your real query logs, support tickets, or search analytics. What questions do people ask? For each question type, find the documents that should answer it. You don't need hundreds of queries to start—20-30 well-chosen queries with accurate relevance labels is enough to catch major regressions and guide tuning decisions. + +A practical approach is to start with failure cases. Run your current retrieval on real queries and manually check the results. When retrieval fails to surface the right content, add that query to your eval dataset with the correct ground truth. Over time, your dataset accumulates the hard cases—the ones that actually matter. + +See the [Datasets documentation](/docs/eval/datasets) for detailed guidance on structuring datasets, handling multiple relevant documents, and strategies for building ground truth incrementally. + +## Evaluating against existing content + +The sample workflow ingests documents from the dataset, but you might want to evaluate against content you've already indexed. Maybe you have a production corpus and want to test queries against it without maintaining duplicate content in your eval datasets. 
+ +To evaluate against existing content, create a dataset with only queries (no `documents` array) and set `scopePrefix` to match your existing content: + +```json +{ + "version": "1", + "id": "prod-queries", + "defaults": { + "topK": 10, + "scopePrefix": "docs:" + }, + "queries": [ + { + "id": "q_auth_setup", + "query": "How do I configure authentication?", + "relevant": { + "sourceIds": ["docs:guides:auth-setup", "docs:reference:auth-api"] + } + } + ] +} +``` + +Then run with `--no-ingest` to skip the ingestion phase: + +```bash +bun run scripts/unrag-eval.ts -- --dataset .unrag/eval/datasets/prod-queries.json --no-ingest +``` + +The harness will run queries against whatever content is already in your store, scoped to the prefix you specified. + +## Evaluating with reranking + +If you have the reranker battery installed, you can evaluate the full retrieve-then-rerank pipeline. Change the mode in your dataset or via CLI: + +```bash +bun run scripts/unrag-eval.ts -- --dataset .unrag/eval/datasets/sample.json --mode retrieve+rerank +``` + +In rerank mode, the harness retrieves more candidates than `topK` (typically 3x), applies your configured reranker, and then scores the reranked results. The report includes metrics both before and after reranking, so you can see exactly how much reranking improved (or didn't improve) your results. + +## Next steps + +You've run your first evaluation and seen the basic workflow. From here: + + + + Learn the full dataset schema and strategies for building ground truth + + + What each metric measures and how to interpret the numbers + + + Add evaluation gates to your deployment pipeline + + diff --git a/apps/web/content/docs/eval/index.mdx b/apps/web/content/docs/eval/index.mdx new file mode 100644 index 0000000..50ae0de --- /dev/null +++ b/apps/web/content/docs/eval/index.mdx @@ -0,0 +1,68 @@ +--- +title: Evaluation Overview +description: Why measuring retrieval quality matters and how Unrag's eval harness helps you do it systematically. +--- + + +Evaluation is currently **experimental**. The core workflow is stable, but expect the dataset schema and report format to evolve as we learn from real-world usage. + + +Building a RAG system is straightforward. Building one that retrieves the right content for a given query is harder than it looks. The initial implementation usually works well enough that you ship it, but over time you want to make changes—different chunking parameters, a new embedding model, maybe adding a reranker. Each change feels like it should improve things, but without measurement, you're guessing. + +This is where retrieval evaluation comes in. Instead of manual spot-checking or hoping the LLM produces better answers, you build a dataset of queries with known relevant documents and measure how well your retrieval pipeline surfaces them. When you change your chunking strategy from 200 to 400 words, you can see exactly how that affects recall. When you switch embedding providers, you know whether precision improved or degraded. + +## What the eval harness does + +Unrag's evaluation harness is an optional module you install like any other battery. It gives you a structured way to define test datasets, run retrieval (and optionally reranking) against them, and produce metrics that tell you how well your system is performing. The output is deterministic—run the same dataset against the same configuration and you get the same numbers—which means you can track changes over time and catch regressions before they reach production. 
+ +The harness focuses specifically on retrieval quality, not end-to-end answer quality. It doesn't call an LLM, doesn't grade generated answers, doesn't try to measure hallucination or citation accuracy. Those things matter, but they're harder to measure reliably and depend heavily on your prompt engineering and model choice. Retrieval is the foundation: if your system can't find the right content, no amount of prompt tuning will save you. + +By staying focused on retrieval, the harness can give you precise, reproducible metrics. You know exactly what "recall@10 of 0.85" means, and you can compare that number across runs, across configurations, across time. + +## When to use evaluation + +The obvious time to evaluate is before deploying a change. You've retuned your chunk overlap, and you want to know if it helped. You're considering a more expensive embedding model, and you want to see if the quality improvement justifies the cost. These are the cases where evaluation pays off immediately. + +But evaluation is also valuable for understanding your system's baseline behavior. What queries does your current configuration handle well? What queries does it struggle with? If you only evaluate when making changes, you're missing the chance to understand your system's strengths and weaknesses. A periodic eval run—even without changes—can surface queries that started failing after you ingested new content, or areas where your test dataset doesn't cover real user behavior. + +Finally, evaluation is essential for CI pipelines. When someone opens a pull request that touches your RAG configuration, you want automated checks that verify retrieval quality hasn't regressed. The eval harness produces JSON reports and exit codes that integrate naturally with CI systems. + +## The mental model + +The harness operates on a simple model: you have documents, you have queries, and for each query you know which documents are relevant. When you run an evaluation, the harness ingests your documents (if needed), runs each query through your retrieval pipeline, and compares the retrieved results against the known-relevant documents. The comparison produces metrics like precision, recall, and MRR that quantify retrieval quality. + +``` +Dataset (documents + queries with ground truth) + ↓ +Ingest documents into your store + ↓ +For each query: + ├─ Retrieve chunks (vector search) + ├─ Optionally rerank + └─ Score against ground truth + ↓ +Aggregate metrics + write report +``` + +The documents in your eval dataset don't have to be your real production content. In fact, it's often better to use a curated subset that you understand well enough to label. The goal is to have ground truth you're confident in, even if it's small. A dataset of 50 queries with accurate relevance labels is more valuable than 500 queries where you're guessing. + +## A note on ground truth + +The hardest part of evaluation isn't running the harness; it's building a dataset with accurate relevance labels. For each query, you need to know which documents should be retrieved. This requires human judgment, and human judgment is expensive and sometimes inconsistent. + +There's no magic solution here. The eval harness gives you the infrastructure to run evaluations and track metrics over time, but the quality of those metrics depends on the quality of your ground truth. Investing time in building a good dataset—even a small one—pays off more than running sophisticated metrics on noisy labels. 
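+In practice, a ground-truth entry is nothing more than a query paired with the `sourceId`s you consider relevant. Sketched here as a TypeScript object for illustration (the IDs are made up; in a dataset file this lives in JSON, as shown in the getting-started guide):
+
+```ts
+// A single labeled query: the harness checks whether retrieval for `query`
+// surfaces the documents listed under `relevant.sourceIds`.
+const labeledQuery = {
+  id: "q_cancel_subscription", // your own stable identifier (hypothetical)
+  query: "How do I cancel my subscription?",
+  relevant: { sourceIds: ["docs:billing:cancel-membership"] }, // hypothetical sourceId
+};
+```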
+ +The dataset documentation includes strategies for building ground truth incrementally, using retrieval failures to find gaps, and maintaining datasets as your content evolves. + +## Getting started + +Ready to set up evaluation? Start with the installation and first eval run: + + + + Install the eval battery and run your first evaluation + + + Structure your documents and queries for evaluation + + diff --git a/apps/web/content/docs/eval/meta.json b/apps/web/content/docs/eval/meta.json new file mode 100644 index 0000000..d4ba220 --- /dev/null +++ b/apps/web/content/docs/eval/meta.json @@ -0,0 +1,14 @@ +{ + "title": "Evaluation", + "badge": "Experimental", + "description": "Measure and improve your retrieval quality with a deterministic evaluation harness.", + "pages": [ + "index", + "getting-started", + "datasets", + "metrics", + "running-evals", + "ci-integration", + "comparing-runs" + ] +} diff --git a/apps/web/content/docs/eval/metrics.mdx b/apps/web/content/docs/eval/metrics.mdx new file mode 100644 index 0000000..a6f4924 --- /dev/null +++ b/apps/web/content/docs/eval/metrics.mdx @@ -0,0 +1,147 @@ +--- +title: Understanding Metrics +description: What each retrieval metric measures, when to use it, and how to interpret the numbers in context. +--- + +Metrics are only useful if you understand what they're measuring. A recall@10 of 0.85 sounds good, but whether it's actually good depends on your use case, your baseline, and what you're optimizing for. This page explains each metric the eval harness produces, when each one matters, and how to interpret them in the context of retrieval quality. + +## The four metrics + +The eval harness computes four standard retrieval metrics for each query: hit@k, recall@k, precision@k, and MRR@k. Each measures something different, and together they give you a well-rounded picture of retrieval quality. + +### Hit@k (hit rate) + +Hit@k asks the simplest question: did we find at least one relevant document in the top k results? + +``` +hit@k = 1 if any relevant document was retrieved in top k, else 0 +``` + +This is a binary metric per query—either the retrieval succeeded at finding something relevant, or it completely missed. When you average hit@k across all queries, you get the percentage of queries where retrieval found at least one relevant result. + +Hit@k is useful as a sanity check. If your hit@10 is below 0.90, you have a significant number of queries that return zero relevant content in the top 10. That's a problem regardless of what other metrics say—users asking those questions are getting completely unhelpful results. + +Because hit@k is binary, it doesn't distinguish between "found the one relevant document at position 1" and "found one of three relevant documents at position 10." For that nuance, you need recall and MRR. + +### Recall@k + +Recall@k measures what fraction of the relevant documents were actually retrieved: + +``` +recall@k = (# of relevant docs retrieved in top k) / (# of relevant docs total) +``` + +If a query has three relevant documents and retrieval found two of them in the top 10, recall@10 is 0.67. If it found all three, recall@10 is 1.0. If it found none, recall@10 is 0.0. + +Recall is crucial when you need comprehensive coverage. If you're building context for an LLM and there are multiple documents that together form a complete answer, you want high recall. Missing one of the relevant documents means the LLM is working with incomplete information. 
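+As a concrete sketch (illustrative only, not the harness's internal code), recall@k over retrieved and relevant `sourceId` lists looks like this:
+
+```ts
+// retrieved: sourceIds returned by retrieval, in rank order
+// relevant:  ground-truth sourceIds for the query
+function recallAtK(retrieved: string[], relevant: string[], k: number): number {
+  if (relevant.length === 0) return 0;
+  const topK = new Set(retrieved.slice(0, k));
+  const found = relevant.filter((id) => topK.has(id)).length;
+  return found / relevant.length;
+}
+
+// hit@k is the binary version of the same check: 1 if recallAtK(...) > 0, else 0.
+```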
+ +Note that recall doesn't care about precision—you could retrieve 100 documents, have terrible precision, but still achieve perfect recall if all the relevant ones are in there somewhere. + +### Precision@k + +Precision@k measures what fraction of retrieved results were actually relevant: + +``` +precision@k = (# of relevant docs in top k) / k +``` + +If you retrieve 10 results and 3 are relevant, precision@10 is 0.30. Precision tells you about the signal-to-noise ratio in your results. + +Precision matters when you're showing results directly to users or when you have limited context budget. If you can only fit 5 chunks in your LLM prompt, you want those 5 to be highly relevant. High precision means less noise for the user to wade through (or for the LLM to get confused by). + +In practice, precision@k is often low even when retrieval is working well. If a query has one relevant document and you retrieve 10 results, precision@10 can't exceed 0.10. This doesn't mean retrieval is bad—it means you're retrieving more results than there are relevant documents. That's often intentional, especially when you want to ensure high recall. + +### MRR@k (Mean Reciprocal Rank) + +MRR@k measures how early the first relevant document appears: + +``` +reciprocal_rank = 1 / (position of first relevant result) +MRR@k = average reciprocal_rank across all queries +``` + +If the first relevant document is at position 1, the reciprocal rank is 1.0. At position 2, it's 0.5. At position 10, it's 0.1. If no relevant document appears in the top k, the reciprocal rank is 0. + +MRR captures the intuition that finding a relevant document at position 1 is much better than finding it at position 10, even though both count as a "hit." A high MRR means your most relevant results tend to appear near the top of the list. + +MRR is particularly useful for applications where users expect the first result to be the best one. Search interfaces, autocomplete, and single-answer retrieval all benefit from high MRR. + +## Choosing what to optimize + +Different use cases prioritize different metrics. Understanding your use case helps you know which metrics matter most. + +**For search interfaces**, MRR and precision matter most. Users scan from the top, and every irrelevant result is friction. You want the first few results to be highly relevant, and you'd rather show fewer results than pad with marginal matches. + +**For LLM context building**, recall often matters most. You want to capture all relevant information, even if it means including some noise. The LLM can filter out irrelevant content, but it can't synthesize information that wasn't retrieved. + +**For RAG with limited context windows**, precision and recall both matter. You want comprehensive coverage (high recall) within a tight budget (need high precision to avoid wasting tokens on irrelevant content). This tension is why reranking helps—you retrieve broadly for recall, then rerank for precision. + +**For support chatbots**, hit rate is a good primary metric. If the system can't find anything relevant, the interaction fails completely. Getting partial coverage is better than missing entirely. + +## Interpreting aggregate metrics + +The eval report shows aggregate metrics across all queries, typically as mean and median values. How you interpret these depends on what you're comparing against. + +### Absolute interpretation + +Without context, these rough benchmarks can help orient you: + +For **hit@10**, anything below 0.90 suggests systemic problems. 
Most queries should find at least one relevant document. If 20% of queries miss entirely, you have chunking issues, embedding mismatches, or dataset problems.
+
+For **recall@10**, above 0.80 is solid for most applications. Above 0.90 is excellent. Below 0.70 means you're missing a significant fraction of relevant content.
+
+For **precision@10**, interpretation depends heavily on how many relevant documents exist per query. If most queries have 1-2 relevant documents, precision@10 can't exceed 0.10-0.20, so values in that range are typical and not concerning. If queries have 5+ relevant documents, you'd expect higher precision.
+
+For **MRR@10**, above 0.80 means relevant content usually appears in the top 2 positions. Above 0.90 means it's usually first. Below 0.60 means users often have to scan down the list to find what they need.
+
+### Relative interpretation
+
+Absolute numbers are less important than changes over time. If your recall@10 drops from 0.85 to 0.78 after a configuration change, that's an 8% regression regardless of whether 0.85 was "good" in absolute terms.
+
+This is why baseline comparison matters. The eval harness can diff two runs and show you exactly which queries improved, which degraded, and by how much. A small change in aggregate numbers might mask big swings in individual queries.
+
+## Metrics and reranking
+
+When you run in `retrieve+rerank` mode, the harness produces two sets of metrics: one for retrieval alone, and one after reranking. This lets you measure how much reranking helps.
+
+A typical pattern: retrieval has good recall but mediocre MRR, and reranking significantly improves MRR while maintaining recall. The reranker doesn't find new content—it reorders what was retrieved—so recall stays the same, but the most relevant items move to the top.
+
+If reranking doesn't improve your metrics, consider whether:
+
+1. Your retrieval is already very good (reranking has less room to help)
+2. Your queries are simple and embedding similarity is sufficient
+3. The reranker model isn't suited to your domain
+4. You're not retrieving enough candidates before reranking
+
+## Per-query analysis
+
+Aggregate metrics hide important details. A recall@10 of 0.85 could mean "every query has 85% recall" or "half the queries have perfect recall and half are terrible." The per-query breakdown in the report lets you find the problem queries.
+
+When investigating low-performing queries, look at what was retrieved versus what was expected. Common patterns include:
+
+**Wrong content type**: The query asks about feature X, but retrieval returns marketing content about X instead of documentation. This suggests your content needs better organization or your chunks are mixing different types of content.
+
+**Keyword mismatch**: The query uses different words than the relevant document. "How do I cancel my subscription?" needs to match a document about "membership" that never uses the word "subscription." This is an embedding model limitation—consider adding query expansion or synonyms to your content.
+
+**Overly broad chunks**: The relevant content is buried in a long chunk that's mostly about something else. The embedding represents the whole chunk, which dilutes the relevance signal. Try smaller chunks or different chunking boundaries.
+
+**Missing content**: The document that should be relevant doesn't exist in your corpus. No amount of tuning will fix retrieval for content that's not there. This is a dataset or content gap, not a retrieval problem.
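+To pull the failing queries out of a report quickly, a few lines of script are usually enough. This sketch assumes the report shape from the getting-started example (a top-level `queries` array with `metrics`, `retrieved`, and `relevant` per query); adjust the field names if your report version differs:
+
+```ts
+import { readFile } from "node:fs/promises";
+
+// Usage: bun run find-misses.ts <path-to-report.json>  (hypothetical helper script)
+const reportPath = process.argv[2]!;
+const report = JSON.parse(await readFile(reportPath, "utf8"));
+
+for (const q of report.queries ?? []) {
+  if (q.metrics?.hitAtK === 0) {
+    console.log(`${q.id}: ${q.query}`);
+    console.log("  expected :", q.relevant);
+    console.log("  retrieved:", q.retrieved);
+  }
+}
+```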
+ +## The metrics aren't everything + +Metrics tell you how well your system matches your ground truth labels. They don't tell you whether those labels are correct, whether your queries are representative, or whether high recall actually translates to better user experience. + +Treat metrics as a signal, not a goal. Improving recall from 0.80 to 0.85 is only valuable if it translates to better outcomes for your users. Sometimes the queries that matter most aren't well represented in your eval dataset. Sometimes a metric improvement comes from overfitting to your test set in ways that don't generalize. + +The value of evaluation is in systematic comparison—understanding whether changes help or hurt—not in achieving a particular number. Keep your ground truth accurate, keep your queries representative, and use the metrics to guide decisions rather than as ends in themselves. + +## Next steps + + + + All the configuration options for customizing eval behavior + + + Use metrics for automated quality gating + + diff --git a/apps/web/content/docs/eval/running-evals.mdx b/apps/web/content/docs/eval/running-evals.mdx new file mode 100644 index 0000000..057361a --- /dev/null +++ b/apps/web/content/docs/eval/running-evals.mdx @@ -0,0 +1,259 @@ +--- +title: Running Evals +description: Configuration options, CLI flags, and patterns for running evaluations in different scenarios. +--- + +Once you have a dataset and understand the metrics, you'll want to customize how evaluations run. Maybe you want to skip ingestion when testing against existing content. Maybe you want to compare retrieve-only against retrieve-plus-rerank. Maybe you're debugging a specific query and want verbose output. This page covers all the configuration options and common patterns for running evaluations. + +## The eval script + +When you install the eval battery, the CLI creates `scripts/unrag-eval.ts`. This is a thin wrapper around the eval runner that loads your dataset, configures the engine, and writes reports. You can modify it to suit your needs—it's vendored code, not a dependency. + +At a high level, the script: + +- Loads `.unrag/eval/config.json` (optional) +- Parses CLI flags (optional overrides) +- Calls `runEval({ engine, datasetPath, ... })` +- Writes `report.json`, `summary.md`, and (optionally) `diff.json`/`diff.md` + +You can extend this script to add custom behavior—loading datasets from different sources, sending metrics to an analytics service, or integrating with your monitoring stack. + +## Configuration precedence + +The eval harness accepts configuration from three sources, merged in this order (later sources override earlier ones): + +1. **Dataset defaults**: The `defaults` section in your dataset JSON +2. **Config file**: Settings in `.unrag/eval/config.json` +3. **CLI flags**: Command-line arguments passed to the eval script + +This means you can set sensible defaults in your dataset, override some settings in the config file for your environment, and further override with CLI flags for one-off runs. + +For example, your dataset might set `topK: 10` and `mode: "retrieve"`. Your config file might set thresholds for CI. And when debugging, you might run with `--mode retrieve+rerank` to try reranking. + +## CLI flags + +The eval script accepts these command-line flags: + +### Required flags + +**--dataset ``**: Path to the dataset JSON file. This is the only required flag. 
+ +```bash +bun run scripts/unrag-eval.ts -- --dataset .unrag/eval/datasets/my-eval.json +``` + +### Retrieval configuration + +**--mode ``**: Either `retrieve` or `retrieve+rerank`. Overrides the dataset's default mode. + +```bash +# Test retrieval only +bun run scripts/unrag-eval.ts -- --dataset ... --mode retrieve + +# Test retrieval + reranking +bun run scripts/unrag-eval.ts -- --dataset ... --mode retrieve+rerank +``` + +**--top-k ``**: Override the number of results to score (this is the "k" in metrics like recall@k). + +```bash +# Score top 5 instead of dataset default +bun run scripts/unrag-eval.ts -- --dataset ... --top-k 5 +``` + +**--rerank-top-k ``**: In rerank mode, how many candidates to retrieve before reranking. Defaults to `topK * 3`. + +### Ingestion control + +**--no-ingest**: Skip the ingestion phase entirely. Use this when evaluating against content that's already indexed. + +```bash +# Evaluate against existing production content +bun run scripts/unrag-eval.ts -- --dataset ... --no-ingest +``` + +**--allow-custom-prefix**: Allow scopePrefix values that don't start with `eval:`. This is dangerous because ingestion deletes by prefix. +Only use it when you understand the risk and pass `--yes`. + +### Output control + +**--output-dir ``**: Where to write the report files. Defaults to `.unrag/eval/runs/-/`. + +```bash +bun run scripts/unrag-eval.ts -- --dataset ... --output-dir ./eval-results/ +``` + +**--baseline ``**: Path to a previous `report.json` to compare against. Produces a diff report showing changes. + +```bash +bun run scripts/unrag-eval.ts -- --dataset ... --baseline .unrag/eval/runs/2025-01-09-sample/report.json +``` + +### CI mode + +**--ci**: Run in CI mode. This enables threshold checking and affects the exit code based on whether thresholds pass. + +```bash +bun run scripts/unrag-eval.ts -- --dataset ... --ci +``` + +In CI mode, the harness exits with: + +- Exit code 0: All thresholds passed +- Exit code 1: At least one threshold failed +- Exit code 2: Eval failed to run (dataset error, engine error, etc.) + +## The config file + +`.unrag/eval/config.json` stores settings that apply across datasets. The installer creates a default config: + +```json +{ + "thresholds": { "min": { "recallAtK": 0.75 } }, + "cleanup": "none", + "ingest": true +} +``` + +### Thresholds + +The `thresholds` section defines acceptable aggregate bounds. When running in CI mode, the harness checks *mean* metrics against these thresholds and fails if any are out of bounds. + +```json +{ + "thresholds": { "min": { "hitAtK": 0.9, "recallAtK": 0.8, "mrrAtK": 0.75 } } +} +``` + +Thresholds are compared against the mean aggregate value. If your mean recall@k is 0.78 and the threshold is 0.80, the check fails. + +Start with conservative thresholds (easy to pass) and tighten them as you improve your system. There's no point setting a threshold of 0.95 if your current baseline is 0.75—you'll just fail every CI run until you fix things. + +### Prefix safety + +By default, the harness refuses to run with a `scopePrefix` that doesn't start with `eval:`. You can override this by setting `"allowNonEvalPrefix": true` in `.unrag/eval/config.json` or passing `--allow-custom-prefix` (and `--yes`). + +## Common patterns + +### Evaluating a configuration change + +You're testing whether changing chunk size improves retrieval. 
Run baseline metrics first:
+
+```bash
+# Current configuration
+bun run scripts/unrag-eval.ts -- --dataset .unrag/eval/datasets/regression.json --output-dir ./baseline
+
+# Make your config change, then run again
+bun run scripts/unrag-eval.ts -- \
+  --dataset .unrag/eval/datasets/regression.json \
+  --baseline ./baseline/report.json \
+  --output-dir ./comparison
+```
+
+The diff report shows which queries improved, which degraded, and by how much.
+
+### Comparing retrieve vs retrieve+rerank
+
+Run the same dataset in both modes:
+
+```bash
+# Retrieve only
+bun run scripts/unrag-eval.ts -- --dataset ... --mode retrieve --output-dir ./retrieve-only
+
+# Retrieve + rerank
+bun run scripts/unrag-eval.ts -- --dataset ... --mode retrieve+rerank --output-dir ./with-rerank
+```
+
+Compare the reports to see if reranking helps. Look especially at MRR—reranking usually improves ranking more than recall.
+
+### Testing different topK values
+
+Maybe you want to understand the precision/recall tradeoff at different k values:
+
+```bash
+for k in 5 10 20; do
+  bun run scripts/unrag-eval.ts -- \
+    --dataset .unrag/eval/datasets/regression.json \
+    --top-k $k \
+    --output-dir ./topk-$k
+done
+```
+
+Then compare metrics across the three runs. You'll typically see recall increase and precision decrease as k grows.
+
+### Running multiple datasets
+
+If you have several datasets for different purposes, run them all:
+
+```bash
+for ds in regression multilingual edge-cases; do
+  bun run scripts/unrag-eval.ts -- \
+    --dataset .unrag/eval/datasets/$ds.json \
+    --output-dir ./runs/$ds
+done
+```
+
+You might wrap this in a script that aggregates results or fails if any dataset fails its thresholds.
+
+### Debugging a specific query
+
+When one query is consistently failing and you want to understand why, inspect the generated report files (they include retrieved `sourceIds` per query):
+
+```bash
+bun run scripts/unrag-eval.ts -- --dataset .unrag/eval/datasets/regression.json
+```
+
+Then open the latest `.unrag/eval/runs/*/report.json` and compare `results.queries[].retrieved.sourceIds` (and `reranked.sourceIds` if present) to `relevant.sourceIds`.
+
+For deeper debugging, you can modify the eval script to log intermediate state, or use your Unrag engine directly to test the query in isolation.
+
+## Understanding the output
+
+A typical eval run prints the output paths and whether thresholds passed:
+
+```
+[unrag:eval] Wrote report: .unrag/eval/runs/-/report.json
+[unrag:eval] Wrote summary: .unrag/eval/runs/-/summary.md
+[unrag:eval] Thresholds: pass
+```
+
+During the run, the harness also prints per-query results (as shown in the getting-started example), marking failed queries with ✗ and listing their individual metrics. A query with hit=0 means none of its relevant documents were retrieved in the top k; that's a retrieval failure to investigate.
+
+The aggregates tell you overall performance. An MRR around 0.6, for example, means the first relevant result typically lands around position 2, which is not bad but leaves room for improvement.
+
+## Report files
+
+Each eval run creates a directory with several files:
+
+**report.json**: The complete machine-readable report with all metrics, per-query results, and configuration.
+
+**summary.md**: A human-readable markdown summary suitable for PR comments or Slack notifications.
+
+**diff.json** (if baseline provided): Machine-readable comparison against the baseline.
+
+**diff.md** (if baseline provided): Human-readable diff summary.
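+The run directories are easy to process in bulk. As a rough sketch, here's a script that prints mean recall@k for every run, assuming the `aggregates` field names from the getting-started report example (which may evolve):
+
+```ts
+import { readdir, readFile } from "node:fs/promises";
+import path from "node:path";
+
+// Walk .unrag/eval/runs and print one aggregate per run directory.
+const runsDir = ".unrag/eval/runs";
+for (const entry of (await readdir(runsDir)).sort()) {
+  const reportPath = path.join(runsDir, entry, "report.json");
+  const raw = await readFile(reportPath, "utf8").catch(() => null);
+  if (!raw) continue; // not a run directory, or no report written
+  const report = JSON.parse(raw);
+  console.log(entry, report.aggregates?.recallAtK?.mean);
+}
+```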
+ +The JSON reports are designed for programmatic access—you can build dashboards, track metrics over time, or integrate with other tools. The markdown files are for humans to quickly understand what happened. + +## Error handling + +The eval harness tries to be helpful when things go wrong: + +**Dataset validation errors**: If your dataset JSON is malformed or missing required fields, the harness fails fast with a clear error message telling you what's wrong. + +**Engine errors**: If ingestion or retrieval fails (database connection issues, embedding API errors), the harness reports the error and exits with code 2. + +**Threshold failures**: In CI mode, threshold failures exit with code 1 and the report indicates which thresholds failed. The eval still completes and writes reports—you get the data even though CI fails. + +**Missing documents**: If your ground truth references sourceIds that weren't ingested or don't exist in the store, the harness will still run but those queries will show low metrics. Check that your ground truth sourceIds match the document sourceIds exactly. + +## Next steps + + + + Set up automated quality gates in your pipeline + + + Understand diffs and track changes over time + + diff --git a/apps/web/content/docs/meta.json b/apps/web/content/docs/meta.json index 6cf65b2..797147c 100644 --- a/apps/web/content/docs/meta.json +++ b/apps/web/content/docs/meta.json @@ -10,6 +10,7 @@ "extractors", "connectors", "batteries", + "eval", "frameworks", "guides", "examples", diff --git a/packages/unrag/cli/commands/add.ts b/packages/unrag/cli/commands/add.ts index 410424d..8cd7038 100644 --- a/packages/unrag/cli/commands/add.ts +++ b/packages/unrag/cli/commands/add.ts @@ -1,7 +1,8 @@ -import { outro } from "@clack/prompts"; +import { cancel, confirm, isCancel, outro, select, text } from "@clack/prompts"; +import { writeFile } from "node:fs/promises"; import path from "node:path"; import { fileURLToPath } from "node:url"; -import { findUp, tryFindProjectRoot } from "../lib/fs"; +import { ensureDir, exists, findUp, tryFindProjectRoot } from "../lib/fs"; import { readJsonFile, writeJsonFile } from "../lib/json"; import { readRegistryManifest } from "../lib/manifest"; import { copyBatteryFiles, copyConnectorFiles, copyExtractorFiles } from "../lib/registry"; @@ -34,6 +35,105 @@ const CONFIG_FILE = "unrag.json"; const __filename = fileURLToPath(import.meta.url); const __dirname = path.dirname(__filename); +type PackageJsonWithScripts = Awaited> & { + scripts?: Record; +}; + +const writeTextFile = async (absPath: string, content: string) => { + await ensureDir(path.dirname(absPath)); + await writeFile(absPath, content, "utf8"); +}; + +const shouldWriteFile = async ( + absPath: string, + projectRoot: string, + nonInteractive: boolean +): Promise => { + if (!(await exists(absPath))) return true; + if (nonInteractive) return false; + const answer = await confirm({ + message: `Overwrite ${path.relative(projectRoot, absPath)}?`, + initialValue: false, + }); + if (isCancel(answer)) { + cancel("Cancelled."); + return false; + } + return Boolean(answer); +}; + +const addPackageJsonScripts = async (args: { + projectRoot: string; + pkg: PackageJsonWithScripts; + scripts: Record; + nonInteractive: boolean; +}): Promise<{ added: string[]; kept: string[] }> => { + const existing = args.pkg.scripts ?? 
{}; + const desired = args.scripts; + const conflicting = Object.keys(desired).filter((k) => k in existing); + + const toAdd: Record = { ...desired }; + + if (conflicting.length > 0 && args.nonInteractive) { + // In non-interactive mode, keep existing scripts (non-destructive). + for (const k of conflicting) delete toAdd[k]; + } + + if (conflicting.length > 0 && !args.nonInteractive) { + for (const scriptName of conflicting) { + const action = await select({ + message: `Script "${scriptName}" already exists. What would you like to do?`, + options: [ + { value: "keep", label: "Keep existing", hint: existing[scriptName] }, + { value: "overwrite", label: "Overwrite", hint: desired[scriptName] }, + { value: "rename", label: "Add with different name", hint: `${scriptName}:new` }, + ], + initialValue: "keep", + }); + if (isCancel(action)) { + cancel("Cancelled."); + return { added: [], kept: Object.keys(desired) }; + } + + if (action === "keep") { + delete toAdd[scriptName]; + continue; + } + + if (action === "rename") { + const newName = await text({ + message: `New script name for ${scriptName}`, + initialValue: `${scriptName}:new`, + validate: (v) => { + const s = String(v).trim(); + if (!s) return "Script name is required"; + if (s in existing || s in toAdd) return "Script name already exists"; + return; + }, + }); + if (isCancel(newName)) { + cancel("Cancelled."); + return { added: [], kept: Object.keys(desired) }; + } + const nextName = String(newName).trim(); + const value = toAdd[scriptName]!; + delete toAdd[scriptName]; + toAdd[nextName] = value; + } + // For "overwrite", keep it in toAdd with the original name. + } + } + + const added = Object.keys(toAdd); + if (added.length > 0) { + args.pkg.scripts = { ...existing, ...toAdd }; + await writePackageJson(args.projectRoot, args.pkg); + } + + const kept = conflicting.filter((k) => !(k in toAdd)); + return { added, kept }; +}; + type ParsedAddArgs = { kind?: "connector" | "extractor" | "battery"; name?: string; @@ -134,7 +234,7 @@ export async function addCommand(args: string[]) { const nonInteractive = parsed.yes || !process.stdin.isTTY; - const pkg = await readPackageJson(root); + const pkg = (await readPackageJson(root)) as PackageJsonWithScripts; // Batteries if (kind === "battery") { @@ -169,6 +269,352 @@ export async function addCommand(args: string[]) { await writeJsonFile(configPath, { ...config, batteries }); + // Battery-specific scaffolding + if (battery === "eval") { + const datasetAbs = path.join(root, ".unrag/eval/datasets/sample.json"); + const configAbs = path.join(root, ".unrag/eval/config.json"); + const scriptAbs = path.join(root, "scripts/unrag-eval.ts"); + + const sampleDataset = { + version: "1", + id: "sample", + description: "Tiny dataset to validate retrieval changes.", + defaults: { + topK: 10, + scopePrefix: "eval:sample:", + mode: "retrieve", + thresholds: { min: { recallAtK: 0.75 } }, + }, + documents: [ + { + sourceId: "eval:sample:doc:refund-policy", + content: "Refunds are available within 30 days of purchase, provided you have a receipt.", + }, + { + sourceId: "eval:sample:doc:contact-support", + content: "Contact support by emailing support@example.com. 
Response times are typically under 24 hours.", + }, + ], + queries: [ + { + id: "q_refund_window", + query: "How long do I have to request a refund?", + relevant: { sourceIds: ["eval:sample:doc:refund-policy"] }, + }, + { + id: "q_contact_support", + query: "How do I contact support?", + relevant: { sourceIds: ["eval:sample:doc:contact-support"] }, + }, + ], + }; + + const evalConfig = { + thresholds: { min: { recallAtK: 0.75 } }, + cleanup: "none", + ingest: true, + }; + + const installImportBase = `../${config.installDir.replace(/\\/g, "/")}`; + const script = `/** + * Unrag eval runner entrypoint (generated). + * + * You own this file — customize it freely. + */ + +import path from "node:path"; +import { access, readFile } from "node:fs/promises"; + +import { createUnragEngine } from "../unrag.config"; +import { + runEval, + readEvalReportFromFile, + writeEvalReport, + writeEvalSummaryMd, + diffEvalReports, + writeEvalDiffJson, + writeEvalDiffMd, + type EvalMode, + type EvalThresholds, + type EvalCleanupPolicy, +} from "${installImportBase}/eval"; + +type CliArgs = { + dataset?: string; + baseline?: string; + outputDir?: string; + mode?: EvalMode; + topK?: number; + rerankTopK?: number; + scopePrefix?: string; + ingest?: boolean; + cleanup?: EvalCleanupPolicy; + thresholds?: Partial; + ci?: boolean; + allowAssets?: boolean; + allowNonEvalPrefix?: boolean; + yes?: boolean; + includeNdcg?: boolean; +}; + +async function fileExists(p: string): Promise { + try { + await access(p); + return true; + } catch { + return false; + } +} + +async function loadEnvFilesBestEffort(projectRoot: string) { + const nodeEnv = process.env.NODE_ENV ?? "development"; + const candidates = [ + ".env", + ".env.local", + \`.env.\${nodeEnv}\`, + \`.env.\${nodeEnv}.local\`, + ]; + for (const rel of candidates) { + const abs = path.join(projectRoot, rel); + if (!(await fileExists(abs))) continue; + const raw = await readFile(abs, "utf8").catch(() => ""); + for (const line of raw.split(/\\r?\\n/)) { + const s = line.trim(); + if (!s || s.startsWith("#")) continue; + const eq = s.indexOf("="); + if (eq < 0) continue; + const key = s.slice(0, eq).trim(); + const value = s.slice(eq + 1).trim().replace(/^"|"$/g, ""); + if (!key) continue; + if (process.env[key] === undefined) process.env[key] = value; + } + } +} + +function parseThresholdExpr(expr: string): Partial { + const s = String(expr ?? "").trim(); + const eq = s.indexOf("="); + if (eq < 0) throw new Error(\`Invalid --threshold: "\${s}" (expected key=value)\`); + const key = s.slice(0, eq).trim(); + const value = Number(s.slice(eq + 1).trim()); + if (!Number.isFinite(value)) throw new Error(\`Invalid --threshold value: "\${s}"\`); + + const out: Partial = {}; + if (key === "min.hitAtK") out.min = { hitAtK: value }; + else if (key === "min.recallAtK") out.min = { recallAtK: value }; + else if (key === "min.mrrAtK") out.min = { mrrAtK: value }; + else if (key === "max.p95TotalMs") out.max = { p95TotalMs: value }; + else throw new Error(\`Unknown threshold key: "\${key}"\`); + return out; +} + +function mergeThresholds(a: Partial, b: Partial): Partial { + return { + min: { ...(a.min ?? {}), ...(b.min ?? {}) }, + max: { ...(a.max ?? {}), ...(b.max ?? 
{}) }, + }; +} + +function parseArgs(argv: string[]): CliArgs { + const out: CliArgs = {}; + const thresholds: Partial[] = []; + + for (let i = 0; i < argv.length; i++) { + const a = argv[i]; + if (a === "--dataset") out.dataset = argv[++i]; + else if (a === "--baseline") out.baseline = argv[++i]; + else if (a === "--outputDir" || a === "--output-dir") out.outputDir = argv[++i]; + else if (a === "--mode") out.mode = argv[++i] as EvalMode; + else if (a === "--topK" || a === "--top-k") out.topK = Number(argv[++i]); + else if (a === "--rerankTopK" || a === "--rerank-top-k") out.rerankTopK = Number(argv[++i]); + else if (a === "--scopePrefix" || a === "--scope-prefix") out.scopePrefix = argv[++i]; + else if (a === "--no-ingest") out.ingest = false; + else if (a === "--cleanup") out.cleanup = argv[++i] as EvalCleanupPolicy; + else if (a === "--threshold") thresholds.push(parseThresholdExpr(argv[++i] ?? "")); + else if (a === "--ci") out.ci = true; + else if (a === "--allow-assets") out.allowAssets = true; + else if (a === "--allow-non-eval-prefix" || a === "--allow-custom-prefix") out.allowNonEvalPrefix = true; + else if (a === "--yes" || a === "-y") out.yes = true; + else if (a === "--include-ndcg") out.includeNdcg = true; + else if (a === "--help" || a === "-h") { + printHelp(); + process.exit(0); + } + } + + for (const t of thresholds) out.thresholds = mergeThresholds(out.thresholds ?? {}, t); + return out; +} + +function printHelp() { + console.log( + [ + "unrag-eval — retrieval eval harness", + "", + "Usage:", + " bun run scripts/unrag-eval.ts -- --dataset .unrag/eval/datasets/sample.json", + "", + "Options:", + " --dataset Dataset JSON path (required)", + " --baseline Baseline report for diffing", + " --output-dir Output dir (default: .unrag/eval/runs/-)", + " --mode retrieve|retrieve+rerank Override mode", + " --top-k Override topK", + " --rerank-top-k In rerank mode, retrieve N candidates before reranking (default: topK*3)", + " --scope-prefix Override scopePrefix", + " --no-ingest Skip dataset document ingest", + " --cleanup none|on-success|always Cleanup policy when ingesting", + " --threshold Repeatable thresholds (e.g. min.recallAtK=0.75)", + " --ci CI mode (non-interactive)", + " --yes, -y Allow dangerous operations when explicitly enabled", + " --allow-assets Allow documents[].assets ingestion (advanced)", + " --allow-custom-prefix Allow scopePrefix outside eval:* (dangerous)", + " --include-ndcg Compute nDCG@k (optional)", + ].join("\\n") + ); +} + +async function readConfigFile(projectRoot: string): Promise { + const abs = path.join(projectRoot, ".unrag/eval/config.json"); + if (!(await fileExists(abs))) return null; + const raw = await readFile(abs, "utf8"); + try { + return JSON.parse(raw); + } catch (e) { + const msg = e instanceof Error ? e.message : String(e); + throw new Error(\`Failed to parse .unrag/eval/config.json: \${msg}\`); + } +} + +function sanitizeMode(v: any): EvalMode | undefined { + if (v === "retrieve" || v === "retrieve+rerank") return v; + return undefined; +} + +function sanitizeCleanup(v: any): EvalCleanupPolicy | undefined { + if (v === "none" || v === "on-success" || v === "always") return v; + return undefined; +} + +async function main() { + const projectRoot = path.join(process.cwd()); + await loadEnvFilesBestEffort(projectRoot); + + const cli = parseArgs(process.argv.slice(2)); + const cfg = await readConfigFile(projectRoot); + + const datasetPath = cli.dataset ?? cfg?.dataset ?? 
".unrag/eval/datasets/sample.json"; + if (!datasetPath) throw new Error("--dataset is required"); + + const engine = createUnragEngine(); + + const thresholds: Partial = mergeThresholds(cfg?.thresholds ?? {}, cli.thresholds ?? {}); + + const result = await runEval({ + engine, + datasetPath, + mode: cli.mode ?? sanitizeMode(cfg?.mode), + topK: cli.topK ?? (typeof cfg?.topK === "number" ? cfg.topK : undefined), + rerankTopK: cli.rerankTopK ?? (typeof cfg?.rerankTopK === "number" ? cfg.rerankTopK : undefined), + scopePrefix: cli.scopePrefix ?? (typeof cfg?.scopePrefix === "string" ? cfg.scopePrefix : undefined), + ingest: cli.ingest ?? (typeof cfg?.ingest === "boolean" ? cfg.ingest : undefined), + cleanup: cli.cleanup ?? sanitizeCleanup(cfg?.cleanup) ?? "none", + includeNdcg: cli.includeNdcg ?? Boolean(cfg?.includeNdcg), + allowAssets: cli.allowAssets ?? Boolean(cfg?.allowAssets), + allowNonEvalPrefix: cli.allowNonEvalPrefix ?? Boolean(cfg?.allowNonEvalPrefix), + confirmedDangerousDelete: Boolean(cli.yes), + thresholds, + }); + + const ts = new Date().toISOString().replace(/[:.]/g, "-"); + const outputDir = + cli.outputDir ?? + cfg?.outputDir ?? + path.join(".unrag/eval/runs", \`\${ts}-\${result.report.dataset.id}\`); + + const reportPath = await writeEvalReport(outputDir, result.report); + const summaryPath = await writeEvalSummaryMd(outputDir, result.report); + + let diffPaths: { json: string; md: string } | null = null; + const baselinePath = cli.baseline ?? cfg?.baseline; + if (baselinePath) { + const baseline = await readEvalReportFromFile(baselinePath); + const diff = diffEvalReports({ baseline, candidate: result.report, baselinePath, candidatePath: reportPath }); + const diffJson = await writeEvalDiffJson(outputDir, diff); + const diffMd = await writeEvalDiffMd(outputDir, diff); + diffPaths = { json: diffJson, md: diffMd }; + } + + console.log( + [ + \`[unrag:eval] Wrote report: \${reportPath}\`, + \`[unrag:eval] Wrote summary: \${summaryPath}\`, + diffPaths ? \`[unrag:eval] Wrote diff: \${diffPaths.json} (+ \${diffPaths.md})\` : "", + result.thresholdFailures.length > 0 + ? \`[unrag:eval] Threshold failures:\\n- \${result.thresholdFailures.join("\\n- ")}\` + : \`[unrag:eval] Thresholds: pass\`, + ] + .filter(Boolean) + .join("\\n") + ); + + process.exitCode = result.exitCode; +} + +main().catch((err) => { + const msg = err instanceof Error ? err.stack ?? err.message : String(err); + console.error(\`[unrag:eval] Error: \${msg}\`); + process.exitCode = 2; +}); +`; + + if (await shouldWriteFile(datasetAbs, root, nonInteractive)) { + await writeTextFile(datasetAbs, JSON.stringify(sampleDataset, null, 2) + "\n"); + } + if (await shouldWriteFile(configAbs, root, nonInteractive)) { + await writeTextFile(configAbs, JSON.stringify(evalConfig, null, 2) + "\n"); + } + if (await shouldWriteFile(scriptAbs, root, nonInteractive)) { + await writeTextFile(scriptAbs, script); + } + + const scriptsToAdd: Record = { + "unrag:eval": `bun run scripts/unrag-eval.ts -- --dataset .unrag/eval/datasets/sample.json`, + "unrag:eval:ci": `bun run scripts/unrag-eval.ts -- --dataset .unrag/eval/datasets/sample.json --ci`, + }; + + const scriptsResult = await addPackageJsonScripts({ + projectRoot: root, + pkg, + scripts: scriptsToAdd, + nonInteractive, + }); + + outro( + [ + `Installed battery: ${battery}.`, + "", + `- Code: ${path.join(config.installDir, "eval")}`, + "", + `- Dataset: ${path.relative(root, datasetAbs)}`, + `- Script: ${path.relative(root, scriptAbs)}`, + "", + scriptsResult.added.length > 0 + ? 
`Added scripts: ${scriptsResult.added.join(", ")}` + : "Added scripts: none", + scriptsResult.kept.length > 0 ? `Kept existing scripts: ${scriptsResult.kept.join(", ")}` : "", + "", + "Next:", + " bun run unrag:eval", + " bun run unrag:eval:ci", + ] + .filter(Boolean) + .join("\n") + ); + + return; + } + // Generate wiring snippet based on the battery const wiringSnippet = battery === "reranker" ? [ diff --git a/packages/unrag/cli/commands/init.ts b/packages/unrag/cli/commands/init.ts index 3e674cc..7ada9d6 100644 --- a/packages/unrag/cli/commands/init.ts +++ b/packages/unrag/cli/commands/init.ts @@ -12,11 +12,18 @@ import { fileURLToPath } from "node:url"; import { copyExtractorFiles, copyConnectorFiles, + copyBatteryFiles, copyRegistryFiles, type RegistrySelection, } from "../lib/registry"; import { readJsonFile, writeJsonFile } from "../lib/json"; -import { findUp, normalizePosixPath, tryFindProjectRoot } from "../lib/fs"; +import { + ensureDir, + exists, + findUp, + normalizePosixPath, + tryFindProjectRoot, +} from "../lib/fs"; import { readRegistryManifest } from "../lib/manifest"; import { fetchPreset, type PresetPayloadV1 } from "../lib/preset"; import { @@ -30,11 +37,20 @@ import { readPackageJson, type ConnectorName, type ExtractorName, + type BatteryName, type EmbeddingProviderName, depsForEmbeddingProvider, + depsForBattery, writePackageJson, } from "../lib/packageJson"; import { patchTsconfigPaths } from "../lib/tsconfig"; +import { writeFile } from "node:fs/promises"; +import { + EVAL_CONFIG_DEFAULT, + EVAL_PACKAGE_JSON_SCRIPTS, + EVAL_SAMPLE_DATASET_V1, + renderEvalRunnerScript, +} from "../lib/evalBatteryScaffold"; type InitConfig = { installDir: string; @@ -44,6 +60,7 @@ type InitConfig = { version: number; connectors?: string[]; extractors?: string[]; + batteries?: string[]; }; const CONFIG_FILE = "unrag.json"; @@ -173,6 +190,11 @@ const toConnectors = (xs: string[] | undefined): ConnectorName[] => .map((s) => String(s).trim()) .filter(Boolean) as ConnectorName[]; +const toBatteries = (xs: string[] | undefined): BatteryName[] => + (Array.isArray(xs) ? xs : []) + .map((s) => String(s).trim()) + .filter(Boolean) as BatteryName[]; + export async function initCommand(args: string[]) { const root = await tryFindProjectRoot(process.cwd()); if (!root) { @@ -506,10 +528,47 @@ export async function initCommand(args: string[]) { Object.assign(connectorDevDeps, r.devDeps); } + const batteriesFromPreset = preset + ? Array.from(new Set(toBatteries(preset.modules?.batteries))).sort() + : []; + const availableBatteryIds = new Set( + (manifest.batteries ?? []) + .filter((b: any) => b.status === "available") + .map((b: any) => String(b.id)) as BatteryName[] + ); + if (preset) { + const unknown = batteriesFromPreset.filter((b) => !availableBatteryIds.has(b)); + if (unknown.length > 0) { + throw new Error(`Preset contains unknown/unavailable batteries: ${unknown.join(", ")}`); + } + } + + // Install battery modules (vendor code) before updating deps. 
+ if (batteriesFromPreset.length > 0) { + for (const battery of batteriesFromPreset) { + await copyBatteryFiles({ + projectRoot: root, + registryRoot, + installDir, + battery, + yes: nonInteractive, + overwrite: overwritePolicy, + }); + } + } + + const batteryDeps: Record = {}; + const batteryDevDeps: Record = {}; + for (const b of batteriesFromPreset) { + const r = depsForBattery(b); + Object.assign(batteryDeps, r.deps); + Object.assign(batteryDevDeps, r.devDeps); + } + const merged = mergeDeps( pkg, - { ...deps, ...embeddingDeps.deps, ...extractorDeps, ...connectorDeps }, - { ...devDeps, ...embeddingDeps.devDeps, ...extractorDevDeps, ...connectorDevDeps } + { ...deps, ...embeddingDeps.deps, ...extractorDeps, ...connectorDeps, ...batteryDeps }, + { ...devDeps, ...embeddingDeps.devDeps, ...extractorDevDeps, ...connectorDevDeps, ...batteryDevDeps } ); if (merged.changes.length > 0) { await writePackageJson(root, merged.pkg); @@ -533,9 +592,53 @@ export async function initCommand(args: string[]) { ...(richMediaEnabled ? selectedExtractors : []), ]) ).sort(), + batteries: Array.from( + new Set([...(existing?.batteries ?? []), ...batteriesFromPreset]) + ).sort(), }; await writeJsonFile(path.join(root, CONFIG_FILE), config); + // Battery-specific scaffolding (preset installs are non-interactive). + const writeTextFile = async (absPath: string, content: string) => { + await ensureDir(path.dirname(absPath)); + await writeFile(absPath, content, "utf8"); + }; + const writeIfMissing = async (absPath: string, content: string) => { + if (await exists(absPath)) return false; + await writeTextFile(absPath, content); + return true; + }; + + if (batteriesFromPreset.includes("eval")) { + const datasetAbs = path.join(root, ".unrag/eval/datasets/sample.json"); + const evalConfigAbs = path.join(root, ".unrag/eval/config.json"); + const scriptAbs = path.join(root, "scripts/unrag-eval.ts"); + + await writeIfMissing( + datasetAbs, + JSON.stringify(EVAL_SAMPLE_DATASET_V1, null, 2) + "\n" + ); + await writeIfMissing( + evalConfigAbs, + JSON.stringify(EVAL_CONFIG_DEFAULT, null, 2) + "\n" + ); + await writeIfMissing(scriptAbs, renderEvalRunnerScript({ installDir })); + + // Add package.json scripts, non-destructively. + const pkg2: any = await readPackageJson(root); + const existingScripts = (pkg2.scripts ?? {}) as Record; + const toAdd: Record = {}; + for (const [name, cmd] of Object.entries(EVAL_PACKAGE_JSON_SCRIPTS)) { + if (!(name in existingScripts)) { + toAdd[name] = cmd; + } + } + if (Object.keys(toAdd).length > 0) { + pkg2.scripts = { ...existingScripts, ...toAdd }; + await writePackageJson(root, pkg2); + } + } + const pm = await detectPackageManager(root); const installLine = merged.changes.length === 0 diff --git a/packages/unrag/cli/lib/evalBatteryScaffold.ts b/packages/unrag/cli/lib/evalBatteryScaffold.ts new file mode 100644 index 0000000..62a759c --- /dev/null +++ b/packages/unrag/cli/lib/evalBatteryScaffold.ts @@ -0,0 +1,308 @@ +export const EVAL_SAMPLE_DATASET_V1 = { + version: "1", + id: "sample", + description: "Tiny dataset to validate retrieval changes.", + defaults: { + topK: 10, + scopePrefix: "eval:sample:", + mode: "retrieve", + thresholds: { min: { recallAtK: 0.75 } }, + }, + documents: [ + { + sourceId: "eval:sample:doc:refund-policy", + content: "Refunds are available within 30 days of purchase, provided you have a receipt.", + }, + { + sourceId: "eval:sample:doc:contact-support", + content: "Contact support by emailing support@example.com. 
Response times are typically under 24 hours.", + }, + ], + queries: [ + { + id: "q_refund_window", + query: "How long do I have to request a refund?", + relevant: { sourceIds: ["eval:sample:doc:refund-policy"] }, + }, + { + id: "q_contact_support", + query: "How do I contact support?", + relevant: { sourceIds: ["eval:sample:doc:contact-support"] }, + }, + ], +} as const; + +export const EVAL_CONFIG_DEFAULT = { + thresholds: { min: { recallAtK: 0.75 } }, + cleanup: "none", + ingest: true, +} as const; + +export const EVAL_PACKAGE_JSON_SCRIPTS: Record = { + "unrag:eval": `bun run scripts/unrag-eval.ts -- --dataset .unrag/eval/datasets/sample.json`, + "unrag:eval:ci": `bun run scripts/unrag-eval.ts -- --dataset .unrag/eval/datasets/sample.json --ci`, +} as const; + +export function renderEvalRunnerScript(opts: { installDir: string }): string { + const installImportBase = `../${opts.installDir.replace(/\\/g, "/")}`; + + return `/** + * Unrag eval runner entrypoint (generated). + * + * You own this file — customize it freely. + */ + +import path from "node:path"; +import { access, readFile } from "node:fs/promises"; + +import { createUnragEngine } from "../unrag.config"; +import { + runEval, + readEvalReportFromFile, + writeEvalReport, + writeEvalSummaryMd, + diffEvalReports, + writeEvalDiffJson, + writeEvalDiffMd, + type EvalMode, + type EvalThresholds, + type EvalCleanupPolicy, +} from "${installImportBase}/eval"; + +type CliArgs = { + dataset?: string; + baseline?: string; + outputDir?: string; + mode?: EvalMode; + topK?: number; + rerankTopK?: number; + scopePrefix?: string; + ingest?: boolean; + cleanup?: EvalCleanupPolicy; + thresholds?: Partial; + ci?: boolean; + allowAssets?: boolean; + allowNonEvalPrefix?: boolean; + yes?: boolean; + includeNdcg?: boolean; +}; + +async function fileExists(p: string): Promise { + try { + await access(p); + return true; + } catch { + return false; + } +} + +async function loadEnvFilesBestEffort(projectRoot: string) { + const nodeEnv = process.env.NODE_ENV ?? "development"; + const candidates = [ + ".env", + ".env.local", + \`.env.\${nodeEnv}\`, + \`.env.\${nodeEnv}.local\`, + ]; + for (const rel of candidates) { + const abs = path.join(projectRoot, rel); + if (!(await fileExists(abs))) continue; + try { + const raw = await readFile(abs, "utf8"); + for (const line of raw.split(/\\r?\\n/)) { + const trimmed = line.trim(); + if (!trimmed || trimmed.startsWith("#")) continue; + const eq = trimmed.indexOf("="); + if (eq < 0) continue; + const key = trimmed.slice(0, eq).trim(); + const value = trimmed.slice(eq + 1).trim().replace(/^"|"$/g, ""); + if (!key) continue; + if (process.env[key] == null) process.env[key] = value; + } + } catch { + // ignore + } + } +} + +function parseThresholdExpr(expr: string): Partial { + // Accept both: + // - "min.recallAtK=0.75" + // - "recallAtK=0.75" (shorthand for min) + const [lhsRaw, rhsRaw] = String(expr ?? "").split("="); + const lhs = (lhsRaw ?? "").trim(); + const rhs = Number(String(rhsRaw ?? "").trim()); + if (!lhs || Number.isNaN(rhs)) return {}; + + const parts = lhs.split(".").map((p) => p.trim()).filter(Boolean); + const level = parts.length === 2 ? parts[0] : "min"; + const metric = parts.length === 2 ? 
parts[1] : parts[0]; + if (level !== "min") return {}; + + const allowed = new Set(["hitAtK", "precisionAtK", "recallAtK", "mrrAtK", "ndcgAtK"]); + if (!allowed.has(metric)) return {}; + return { min: { [metric]: rhs } } as any; +} + +function mergeThresholds( + a: Partial | undefined, + b: Partial | undefined +): Partial | undefined { + if (!a && !b) return undefined; + const out: any = { ...(a ?? {}) }; + if (b?.min) out.min = { ...(out.min ?? {}), ...(b.min as any) }; + return out; +} + +function parseArgs(argv: string[]): CliArgs { + const out: CliArgs = {}; + const thresholds: Partial[] = []; + + for (let i = 0; i < argv.length; i++) { + const a = argv[i]; + if (a === "--dataset") out.dataset = argv[++i]; + else if (a === "--baseline") out.baseline = argv[++i]; + else if (a === "--outputDir" || a === "--output-dir") out.outputDir = argv[++i]; + else if (a === "--mode") out.mode = argv[++i] as EvalMode; + else if (a === "--topK" || a === "--top-k") out.topK = Number(argv[++i]); + else if (a === "--rerankTopK" || a === "--rerank-top-k") out.rerankTopK = Number(argv[++i]); + else if (a === "--scopePrefix" || a === "--scope-prefix") out.scopePrefix = argv[++i]; + else if (a === "--no-ingest") out.ingest = false; + else if (a === "--cleanup") out.cleanup = argv[++i] as EvalCleanupPolicy; + else if (a === "--threshold") thresholds.push(parseThresholdExpr(argv[++i] ?? "")); + else if (a === "--ci") out.ci = true; + else if (a === "--allow-assets") out.allowAssets = true; + else if (a === "--allow-non-eval-prefix" || a === "--allow-custom-prefix") out.allowNonEvalPrefix = true; + else if (a === "--yes" || a === "-y") out.yes = true; + else if (a === "--include-ndcg") out.includeNdcg = true; + else if (a === "--help" || a === "-h") { + printHelp(); + process.exit(0); + } + } + + for (const t of thresholds) out.thresholds = mergeThresholds(out.thresholds ?? {}, t); + return out; +} + +function printHelp() { + console.log( + [ + "unrag-eval — retrieval eval harness", + "", + "Usage:", + " bun run scripts/unrag-eval.ts -- --dataset .unrag/eval/datasets/sample.json", + "", + "Options:", + " --dataset Dataset JSON path (required)", + " --baseline Baseline report for diffing", + " --output-dir Output dir (default: .unrag/eval/runs/-)", + " --mode retrieve|retrieve+rerank Override mode", + " --top-k Override topK", + " --rerank-top-k In rerank mode, retrieve N candidates before reranking (default: topK*3)", + " --scope-prefix Override scopePrefix", + " --no-ingest Skip dataset document ingest", + " --cleanup none|on-success|always Cleanup policy when ingesting", + " --threshold Repeatable thresholds (e.g. min.recallAtK=0.75)", + " --ci CI mode (non-interactive)", + " --yes, -y Allow dangerous operations when explicitly enabled", + " --allow-assets Allow documents[].assets ingestion (advanced)", + " --allow-custom-prefix Allow scopePrefix outside eval:* (dangerous)", + " --include-ndcg Compute nDCG@k (optional)", + ].join("\\n") + ); +} + +async function readConfigFile(projectRoot: string): Promise { + const abs = path.join(projectRoot, ".unrag/eval/config.json"); + if (!(await fileExists(abs))) return null; + const raw = await readFile(abs, "utf8"); + try { + return JSON.parse(raw); + } catch (e) { + const msg = e instanceof Error ? 
e.message : String(e); + throw new Error(\`Failed to parse .unrag/eval/config.json: \${msg}\`); + } +} + +function sanitizeMode(v: any): EvalMode | undefined { + if (v === "retrieve" || v === "retrieve+rerank") return v; + return undefined; +} + +function sanitizeCleanup(v: any): EvalCleanupPolicy | undefined { + if (v === "none" || v === "on-success" || v === "always") return v; + return undefined; +} + +async function main() { + const projectRoot = path.join(process.cwd()); + await loadEnvFilesBestEffort(projectRoot); + + const cli = parseArgs(process.argv.slice(2)); + const cfg = await readConfigFile(projectRoot); + + const datasetPath = cli.dataset ?? cfg?.dataset ?? ".unrag/eval/datasets/sample.json"; + if (!datasetPath) throw new Error("--dataset is required"); + + const engine = createUnragEngine(); + + const mode = sanitizeMode(cli.mode ?? cfg?.mode) ?? undefined; + const cleanup = sanitizeCleanup(cli.cleanup ?? cfg?.cleanup) ?? undefined; + + const result = await runEval({ + engine, + datasetPath, + mode, + topK: typeof cli.topK === "number" ? cli.topK : undefined, + rerankTopK: typeof cli.rerankTopK === "number" ? cli.rerankTopK : undefined, + scopePrefix: typeof cli.scopePrefix === "string" ? cli.scopePrefix : undefined, + ingest: typeof cli.ingest === "boolean" ? cli.ingest : (typeof cfg?.ingest === "boolean" ? cfg.ingest : undefined), + cleanup, + thresholds: mergeThresholds(cfg?.thresholds, cli.thresholds), + ci: Boolean(cli.ci), + allowAssets: Boolean(cli.allowAssets), + allowNonEvalPrefix: Boolean(cli.allowNonEvalPrefix), + yes: Boolean(cli.yes), + includeNdcg: Boolean(cli.includeNdcg), + }); + + const outputDir = cli.outputDir ?? cfg?.outputDir ?? result.outputDir; + + const reportPath = await writeEvalReport(outputDir, result.report); + const summaryPath = await writeEvalSummaryMd(outputDir, result.report); + + let diffPaths: { json: string; md: string } | null = null; + const baselinePath = cli.baseline ?? cfg?.baseline; + if (baselinePath) { + const baseline = await readEvalReportFromFile(baselinePath); + const diff = diffEvalReports({ baseline, candidate: result.report, baselinePath, candidatePath: reportPath }); + const diffJson = await writeEvalDiffJson(outputDir, diff); + const diffMd = await writeEvalDiffMd(outputDir, diff); + diffPaths = { json: diffJson, md: diffMd }; + } + + console.log( + [ + \`[unrag:eval] Wrote report: \${reportPath}\`, + \`[unrag:eval] Wrote summary: \${summaryPath}\`, + diffPaths ? \`[unrag:eval] Wrote diff: \${diffPaths.json} (+ \${diffPaths.md})\` : "", + result.thresholdFailures.length > 0 + ? \`[unrag:eval] Threshold failures:\\n- \${result.thresholdFailures.join("\\n- ")}\` + : \`[unrag:eval] Thresholds: pass\`, + ] + .filter(Boolean) + .join("\\n") + ); + + process.exitCode = result.exitCode; +} + +main().catch((err) => { + const msg = err instanceof Error ? err.stack ?? 
err.message : String(err); + console.error(\`[unrag:eval] Error: \${msg}\`); + process.exitCode = 2; +}); +`; +} + diff --git a/packages/unrag/cli/lib/packageJson.ts b/packages/unrag/cli/lib/packageJson.ts index e578111..65d4e7b 100644 --- a/packages/unrag/cli/lib/packageJson.ts +++ b/packages/unrag/cli/lib/packageJson.ts @@ -207,7 +207,7 @@ export function depsForEmbeddingProvider(provider: EmbeddingProviderName) { return { deps, devDeps }; } -export type BatteryName = "reranker"; +export type BatteryName = "reranker" | "eval"; export function depsForBattery(battery: BatteryName) { const deps: Record = {}; @@ -218,6 +218,10 @@ export function depsForBattery(battery: BatteryName) { deps["@ai-sdk/cohere"] = "^3.0.1"; } + if (battery === "eval") { + // Intentionally no deps: runner is dependency-free and uses project wiring. + } + return { deps, devDeps }; } diff --git a/packages/unrag/cli/lib/preset.ts b/packages/unrag/cli/lib/preset.ts index 1aa61a5..bf6e952 100644 --- a/packages/unrag/cli/lib/preset.ts +++ b/packages/unrag/cli/lib/preset.ts @@ -11,6 +11,7 @@ export type PresetPayloadV1 = { modules: { extractors: string[]; connectors: string[]; + batteries?: string[]; }; config?: unknown; }; @@ -25,6 +26,9 @@ function isPresetPayloadV1(x: unknown): x is PresetPayloadV1 { if (!["drizzle", "prisma", "raw-sql"].includes(String(o.install.storeAdapter))) return false; if (typeof o.install.aliasBase !== "string") return false; if (!Array.isArray(o.modules.extractors) || !Array.isArray(o.modules.connectors)) return false; + if ("batteries" in o.modules && o.modules.batteries != null && !Array.isArray(o.modules.batteries)) { + return false; + } return true; } diff --git a/packages/unrag/registry/eval/dataset.ts b/packages/unrag/registry/eval/dataset.ts new file mode 100644 index 0000000..6cae293 --- /dev/null +++ b/packages/unrag/registry/eval/dataset.ts @@ -0,0 +1,224 @@ +import { readFile } from "node:fs/promises"; + +export type EvalMode = "retrieve" | "retrieve+rerank"; + +export type EvalDatasetV1 = { + version: "1"; + id: string; + description?: string; + defaults: { + topK?: number; + /** Required in this implementation for isolation + deterministic cleanup. */ + scopePrefix: string; + mode?: EvalMode; + /** + * In `retrieve+rerank` mode, how many candidates to retrieve before reranking. + * If omitted, the runner will default to `topK * 3` (clamped to at least `topK`). + */ + rerankTopK?: number; + /** + * Optional default thresholds (lowest precedence). + * CLI/config should override this. + */ + thresholds?: Partial; + }; + documents?: EvalDatasetDocument[]; + queries: EvalDatasetQuery[]; +}; + +export type EvalDatasetDocument = { + sourceId: string; + content?: string; + loaderRef?: string; + metadata?: Record; + /** + * Optional rich media inputs (advanced). By default, the runner requires an explicit opt-in. + * Shape is compatible with Unrag's `AssetInput[]` JSON form. + */ + assets?: unknown; +}; + +export type EvalDatasetQuery = { + id: string; + query: string; + topK?: number; + scopePrefix?: string; + /** + * In `retrieve+rerank` mode, overrides dataset `defaults.rerankTopK` for this query. 
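+   * The runner clamps the effective value to at least the query's `topK` (see `clampRerankTopK` in the runner).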
+ */ + rerankTopK?: number; + relevant: { + sourceIds: string[]; + }; + notes?: string; +}; + +export type EvalThresholds = { + min: Partial<{ + hitAtK: number; + recallAtK: number; + mrrAtK: number; + }>; + max: Partial<{ + p95TotalMs: number; + }>; +}; + +function err(path: string, msg: string): Error { + return new Error(`[unrag:eval] Invalid dataset at ${path}: ${msg}`); +} + +function isObject(x: unknown): x is Record { + return Boolean(x) && typeof x === "object" && !Array.isArray(x); +} + +function asNonEmptyString(x: unknown, path: string): string { + if (typeof x !== "string" || x.trim().length === 0) { + throw err(path, "must be a non-empty string"); + } + return x; +} + +function asOptionalNumber(x: unknown, path: string): number | undefined { + if (x === undefined) return undefined; + if (typeof x !== "number" || !Number.isFinite(x)) { + throw err(path, "must be a finite number"); + } + return x; +} + +function asStringArray(x: unknown, path: string): string[] { + if (!Array.isArray(x)) throw err(path, "must be an array"); + const out: string[] = []; + for (let i = 0; i < x.length; i++) { + const v = x[i]; + if (typeof v !== "string" || v.trim().length === 0) { + throw err(`${path}[${i}]`, "must be a non-empty string"); + } + out.push(v); + } + return out; +} + +function parseThresholds(x: unknown, path: string): Partial | undefined { + if (x === undefined) return undefined; + if (!isObject(x)) throw err(path, "must be an object"); + const min = isObject(x.min) ? x.min : undefined; + const max = isObject(x.max) ? x.max : undefined; + + const out: Partial = {}; + if (min) { + out.min = {}; + if (min.hitAtK !== undefined) out.min.hitAtK = asOptionalNumber(min.hitAtK, `${path}.min.hitAtK`); + if (min.recallAtK !== undefined) out.min.recallAtK = asOptionalNumber(min.recallAtK, `${path}.min.recallAtK`); + if (min.mrrAtK !== undefined) out.min.mrrAtK = asOptionalNumber(min.mrrAtK, `${path}.min.mrrAtK`); + } + if (max) { + out.max = {}; + if (max.p95TotalMs !== undefined) out.max.p95TotalMs = asOptionalNumber(max.p95TotalMs, `${path}.max.p95TotalMs`); + } + return out; +} + +export function parseEvalDataset(json: unknown): EvalDatasetV1 { + if (!isObject(json)) throw err("$", "must be an object"); + + const version = json.version; + if (version !== "1") throw err("$.version", 'must be "1"'); + + const id = asNonEmptyString(json.id, "$.id"); + const description = + json.description === undefined ? undefined : asNonEmptyString(json.description, "$.description"); + + if (!isObject(json.defaults)) throw err("$.defaults", "must be an object"); + const defaults = json.defaults; + const scopePrefix = asNonEmptyString(defaults.scopePrefix, "$.defaults.scopePrefix"); + const topK = asOptionalNumber(defaults.topK, "$.defaults.topK"); + const rerankTopK = asOptionalNumber(defaults.rerankTopK, "$.defaults.rerankTopK"); + const mode = + defaults.mode === undefined + ? 
undefined + : ((): EvalMode => { + const v = asNonEmptyString(defaults.mode, "$.defaults.mode"); + if (v !== "retrieve" && v !== "retrieve+rerank") { + throw err("$.defaults.mode", 'must be "retrieve" or "retrieve+rerank"'); + } + return v; + })(); + + const thresholds = parseThresholds(defaults.thresholds, "$.defaults.thresholds"); + + const documents = (() => { + if (json.documents === undefined) return undefined; + if (!Array.isArray(json.documents)) throw err("$.documents", "must be an array"); + const out: EvalDatasetDocument[] = []; + for (let i = 0; i < json.documents.length; i++) { + const d = json.documents[i]; + if (!isObject(d)) throw err(`$.documents[${i}]`, "must be an object"); + const sourceId = asNonEmptyString(d.sourceId, `$.documents[${i}].sourceId`); + const content = d.content === undefined ? undefined : asNonEmptyString(d.content, `$.documents[${i}].content`); + const loaderRef = d.loaderRef === undefined ? undefined : asNonEmptyString(d.loaderRef, `$.documents[${i}].loaderRef`); + if (!content && !loaderRef) { + throw err(`$.documents[${i}]`, 'must include "content" or "loaderRef"'); + } + const metadata = d.metadata === undefined ? undefined : (isObject(d.metadata) ? (d.metadata as Record) : (() => { throw err(`$.documents[${i}].metadata`, "must be an object"); })()); + const assets = d.assets; + out.push({ sourceId, content, loaderRef, metadata, assets }); + } + return out; + })(); + + if (!Array.isArray(json.queries) || json.queries.length === 0) { + throw err("$.queries", "must be a non-empty array"); + } + const queries: EvalDatasetQuery[] = []; + for (let i = 0; i < json.queries.length; i++) { + const q = json.queries[i]; + if (!isObject(q)) throw err(`$.queries[${i}]`, "must be an object"); + const qid = asNonEmptyString(q.id, `$.queries[${i}].id`); + const query = asNonEmptyString(q.query, `$.queries[${i}].query`); + const qTopK = asOptionalNumber(q.topK, `$.queries[${i}].topK`); + const qScopePrefix = q.scopePrefix === undefined ? undefined : asNonEmptyString(q.scopePrefix, `$.queries[${i}].scopePrefix`); + const qRerankTopK = asOptionalNumber(q.rerankTopK, `$.queries[${i}].rerankTopK`); + if (!isObject(q.relevant)) throw err(`$.queries[${i}].relevant`, "must be an object"); + const relevantSourceIds = asStringArray(q.relevant.sourceIds, `$.queries[${i}].relevant.sourceIds`); + const notes = q.notes === undefined ? undefined : asNonEmptyString(q.notes, `$.queries[${i}].notes`); + queries.push({ + id: qid, + query, + topK: qTopK, + scopePrefix: qScopePrefix, + rerankTopK: qRerankTopK, + relevant: { sourceIds: relevantSourceIds }, + notes, + }); + } + + return { + version: "1", + id, + ...(description ? { description } : {}), + defaults: { + scopePrefix, + ...(topK !== undefined ? { topK } : {}), + ...(mode ? { mode } : {}), + ...(rerankTopK !== undefined ? { rerankTopK } : {}), + ...(thresholds ? { thresholds } : {}), + }, + ...(documents ? { documents } : {}), + queries, + }; +} + +export async function readEvalDatasetFromFile(datasetPath: string): Promise { + const raw = await readFile(datasetPath, "utf8"); + let json: unknown; + try { + json = JSON.parse(raw); + } catch (e) { + const msg = e instanceof Error ? 
e.message : String(e); + throw new Error(`[unrag:eval] Failed to parse dataset JSON (${datasetPath}): ${msg}`); + } + return parseEvalDataset(json); +} + diff --git a/packages/unrag/registry/eval/index.ts b/packages/unrag/registry/eval/index.ts new file mode 100644 index 0000000..e1cd9d3 --- /dev/null +++ b/packages/unrag/registry/eval/index.ts @@ -0,0 +1,39 @@ +/** + * Eval harness battery module. + * + * Install via `unrag add battery eval`. + * + * This module is designed to be vendored into user repos and executed via a + * project-local script (e.g. `scripts/unrag-eval.ts`) so users can audit and customize. + */ + +export { readEvalDatasetFromFile, parseEvalDataset } from "./dataset"; +export type { + EvalDatasetV1, + EvalDatasetDocument, + EvalDatasetQuery, + EvalMode, + EvalThresholds, +} from "./dataset"; + +export { computeMetricsAtK, uniqueSourceIdsInOrder } from "./metrics"; +export type { EvalMetricsAtK } from "./metrics"; + +export { runEval } from "./runner"; +export type { EvalRunArgs, EvalRunOutput } from "./runner"; + +export { + readEvalReportFromFile, + writeEvalReport, + writeEvalSummaryMd, + diffEvalReports, + writeEvalDiffJson, + writeEvalDiffMd, +} from "./report"; +export type { + EvalReportV1, + EvalQueryResult, + EvalCleanupPolicy, + EvalDiffV1, +} from "./report"; + diff --git a/packages/unrag/registry/eval/metrics.ts b/packages/unrag/registry/eval/metrics.ts new file mode 100644 index 0000000..3b62eb6 --- /dev/null +++ b/packages/unrag/registry/eval/metrics.ts @@ -0,0 +1,85 @@ +export type EvalMetricsAtK = { + hitAtK: number; + recallAtK: number; + precisionAtK: number; + mrrAtK: number; + ndcgAtK?: number; +}; + +export function uniqueSourceIdsInOrder(sourceIds: string[]): string[] { + const seen = new Set(); + const out: string[] = []; + for (const id of sourceIds) { + if (!id) continue; + if (seen.has(id)) continue; + seen.add(id); + out.push(id); + } + return out; +} + +export function computeMetricsAtK(args: { + /** Ranked list of retrieved sourceIds (deduped internally). */ + retrievedSourceIds: string[]; + /** Set of relevant document sourceIds. */ + relevantSourceIds: string[]; + k: number; + includeNdcg?: boolean; +}): EvalMetricsAtK { + const k = Math.max(1, Math.floor(args.k)); + const relevant = new Set(args.relevantSourceIds); + const ranked = uniqueSourceIdsInOrder(args.retrievedSourceIds).slice(0, k); + + let hits = 0; + let firstRelevantRank: number | null = null; + for (let i = 0; i < ranked.length; i++) { + const sid = ranked[i]!; + if (relevant.has(sid)) { + hits++; + if (firstRelevantRank === null) firstRelevantRank = i + 1; // 1-indexed + } + } + + const hitAtK = hits > 0 ? 1 : 0; + const recallAtK = + args.relevantSourceIds.length === 0 ? 0 : hits / args.relevantSourceIds.length; + const precisionAtK = hits / k; + const mrrAtK = firstRelevantRank ? 1 / firstRelevantRank : 0; + + const out: EvalMetricsAtK = { hitAtK, recallAtK, precisionAtK, mrrAtK }; + + if (args.includeNdcg) { + out.ndcgAtK = computeNdcgAtK({ rankedSourceIds: ranked, relevant, k }); + } + + return out; +} + +function computeNdcgAtK(args: { + rankedSourceIds: string[]; + relevant: Set; + k: number; +}): number { + const k = Math.max(1, Math.floor(args.k)); + const ranked = args.rankedSourceIds.slice(0, k); + + // Binary relevance DCG + let dcg = 0; + for (let i = 0; i < ranked.length; i++) { + const rel = args.relevant.has(ranked[i]!) ? 
1 : 0; + if (rel === 0) continue; + // rank position is (i+1); discount uses log2(i+2) + dcg += rel / Math.log2(i + 2); + } + + // Ideal DCG: all relevant docs at the top (binary) + const idealRelevantCount = Math.min(args.relevant.size, k); + let idcg = 0; + for (let i = 0; i < idealRelevantCount; i++) { + idcg += 1 / Math.log2(i + 2); + } + + if (idcg === 0) return 0; + return dcg / idcg; +} + diff --git a/packages/unrag/registry/eval/report.ts b/packages/unrag/registry/eval/report.ts new file mode 100644 index 0000000..e704644 --- /dev/null +++ b/packages/unrag/registry/eval/report.ts @@ -0,0 +1,342 @@ +import path from "node:path"; +import { mkdir, writeFile, readFile } from "node:fs/promises"; + +import type { EvalMode, EvalThresholds } from "./dataset"; +import type { EvalMetricsAtK } from "./metrics"; + +export type EvalCleanupPolicy = "none" | "on-success" | "always"; + +export type EvalReportV1 = { + version: "1"; + createdAt: string; // ISO string + dataset: { + id: string; + version: "1"; + description?: string; + }; + config: { + mode: EvalMode; + topK: number; + /** + * In `retrieve+rerank` mode, the default candidate retrieval size. + * If omitted, the runner default is derived per-query as `topK * 3`. + */ + rerankTopK?: number; + scopePrefix: string; + ingest: boolean; + cleanup: EvalCleanupPolicy; + includeNdcg: boolean; + }; + engine: { + embeddingModel?: string; + rerankerName?: string; + rerankerModel?: string; + }; + results: { + queries: EvalQueryResult[]; + aggregates: EvalAggregateBlock; + timings: EvalTimingAggregates; + thresholdsApplied?: Partial; + thresholdFailures?: string[]; + passed?: boolean; + }; +}; + +export type EvalQueryResult = { + id: string; + query: string; + topK: number; + /** In `retrieve+rerank` mode, how many candidates were retrieved before reranking. */ + rerankTopK?: number; + scopePrefix: string; + relevant: { sourceIds: string[] }; + retrieved: { + sourceIds: string[]; + metrics: EvalMetricsAtK; + durationsMs: { + embeddingMs: number; + retrievalMs: number; + totalMs: number; + }; + }; + reranked?: { + sourceIds: string[]; + metrics: EvalMetricsAtK; + durationsMs: { + rerankMs: number; + totalMs: number; + }; + meta?: { + rerankerName?: string; + model?: string; + }; + warnings?: string[]; + }; + notes?: string; +}; + +export type EvalAggregateBlock = { + retrieved: EvalAggregatesForStage; + reranked?: EvalAggregatesForStage; +}; + +export type EvalAggregatesForStage = { + mean: EvalMetricsAtK; + median: EvalMetricsAtK; +}; + +export type EvalTimingAggregates = { + embeddingMs: Percentiles; + retrievalMs: Percentiles; + retrieveTotalMs: Percentiles; + rerankMs?: Percentiles; + rerankTotalMs?: Percentiles; + /** End-to-end total per query (retrieve total + rerank total when present). 
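+   * Computed in the runner per query as `retrieved.durationsMs.totalMs + (reranked?.durationsMs.totalMs ?? 0)`.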
*/ + totalMs: Percentiles; +}; + +export type Percentiles = { + p50: number; + p95: number; +}; + +export async function ensureDir(dir: string): Promise { + await mkdir(dir, { recursive: true }); +} + +export async function writeEvalReport(outputDir: string, report: EvalReportV1): Promise { + await ensureDir(outputDir); + const outPath = path.join(outputDir, "report.json"); + await writeFile(outPath, JSON.stringify(report, null, 2) + "\n", "utf8"); + return outPath; +} + +export async function writeEvalSummaryMd(outputDir: string, report: EvalReportV1): Promise { + await ensureDir(outputDir); + const outPath = path.join(outputDir, "summary.md"); + const lines: string[] = []; + lines.push(`# Unrag Eval Report`); + lines.push(``); + lines.push(`- Dataset: \`${report.dataset.id}\``); + lines.push(`- Mode: \`${report.config.mode}\``); + lines.push(`- topK: \`${report.config.topK}\``); + if (report.config.mode === "retrieve+rerank") { + lines.push( + `- rerankTopK: \`${typeof report.config.rerankTopK === "number" ? report.config.rerankTopK : "topK*3"}\`` + ); + } + lines.push(`- scopePrefix: \`${report.config.scopePrefix}\``); + lines.push(`- ingest: \`${report.config.ingest}\``); + lines.push(`- createdAt: \`${report.createdAt}\``); + lines.push(``); + + const stageLines = (label: string, a: EvalAggregatesForStage) => { + lines.push(`## ${label}`); + lines.push(``); + lines.push(`| metric | mean | median |`); + lines.push(`| --- | ---: | ---: |`); + lines.push(`| hit@k | ${a.mean.hitAtK.toFixed(3)} | ${a.median.hitAtK.toFixed(3)} |`); + lines.push(`| recall@k | ${a.mean.recallAtK.toFixed(3)} | ${a.median.recallAtK.toFixed(3)} |`); + lines.push(`| precision@k | ${a.mean.precisionAtK.toFixed(3)} | ${a.median.precisionAtK.toFixed(3)} |`); + lines.push(`| mrr@k | ${a.mean.mrrAtK.toFixed(3)} | ${a.median.mrrAtK.toFixed(3)} |`); + if (report.config.includeNdcg) { + lines.push(`| ndcg@k | ${(a.mean.ndcgAtK ?? 0).toFixed(3)} | ${(a.median.ndcgAtK ?? 0).toFixed(3)} |`); + } + lines.push(``); + }; + + stageLines("Retrieved", report.results.aggregates.retrieved); + if (report.results.aggregates.reranked) stageLines("Reranked", report.results.aggregates.reranked); + + // Worst queries by recall@k (post-rerank if present) + const sortKey = (q: EvalQueryResult) => + (q.reranked?.metrics.recallAtK ?? q.retrieved.metrics.recallAtK); + const worst = [...report.results.queries].sort((a, b) => sortKey(a) - sortKey(b)).slice(0, 10); + lines.push(`## Worst queries`); + lines.push(``); + lines.push(`| id | recall@k | hit@k | mrr@k |`); + lines.push(`| --- | ---: | ---: | ---: |`); + for (const q of worst) { + const m = q.reranked?.metrics ?? q.retrieved.metrics; + lines.push(`| \`${q.id}\` | ${m.recallAtK.toFixed(3)} | ${m.hitAtK.toFixed(0)} | ${m.mrrAtK.toFixed(3)} |`); + } + lines.push(``); + + if (Array.isArray(report.results.thresholdFailures) && report.results.thresholdFailures.length > 0) { + lines.push(`## Threshold failures`); + lines.push(``); + for (const f of report.results.thresholdFailures) lines.push(`- ${f}`); + lines.push(``); + } + + await writeFile(outPath, lines.join("\n") + "\n", "utf8"); + return outPath; +} + +export async function readEvalReportFromFile(reportPath: string): Promise { + const raw = await readFile(reportPath, "utf8"); + let json: unknown; + try { + json = JSON.parse(raw); + } catch (e) { + const msg = e instanceof Error ? 
e.message : String(e); + throw new Error(`[unrag:eval] Failed to parse report JSON (${reportPath}): ${msg}`); + } + if (!json || typeof json !== "object") { + throw new Error(`[unrag:eval] Invalid report JSON (${reportPath}): must be an object`); + } + const r = json as any; + if (r.version !== "1") { + throw new Error(`[unrag:eval] Unsupported report version (${reportPath}): ${String(r.version)}`); + } + return r as EvalReportV1; +} + +export function percentiles(values: number[]): Percentiles { + const xs = values.filter((v) => Number.isFinite(v)).slice().sort((a, b) => a - b); + if (xs.length === 0) return { p50: 0, p95: 0 }; + return { + p50: quantile(xs, 0.5), + p95: quantile(xs, 0.95), + }; +} + +function quantile(sorted: number[], q: number): number { + if (sorted.length === 0) return 0; + const pos = (sorted.length - 1) * q; + const base = Math.floor(pos); + const rest = pos - base; + const a = sorted[base]!; + const b = sorted[Math.min(base + 1, sorted.length - 1)]!; + return a + rest * (b - a); +} + +export type EvalDiffV1 = { + version: "1"; + createdAt: string; + baseline: { reportPath: string; datasetId: string; createdAt?: string }; + candidate: { reportPath?: string; datasetId: string; createdAt?: string }; + deltas: { + retrieved: Partial; + reranked?: Partial; + p95TotalMs?: number; + }; + worstRegressions: Array<{ + id: string; + deltaRecallAtK: number; + baselineRecallAtK: number; + candidateRecallAtK: number; + }>; +}; + +export function diffEvalReports(args: { + baseline: EvalReportV1; + candidate: EvalReportV1; + baselinePath: string; + candidatePath?: string; +}): EvalDiffV1 { + const b = args.baseline; + const c = args.candidate; + + const deltaStage = (bm: EvalMetricsAtK, cm: EvalMetricsAtK): Partial => ({ + hitAtK: cm.hitAtK - bm.hitAtK, + recallAtK: cm.recallAtK - bm.recallAtK, + precisionAtK: cm.precisionAtK - bm.precisionAtK, + mrrAtK: cm.mrrAtK - bm.mrrAtK, + ...(typeof bm.ndcgAtK === "number" || typeof cm.ndcgAtK === "number" + ? { ndcgAtK: (cm.ndcgAtK ?? 0) - (bm.ndcgAtK ?? 0) } + : {}), + }); + + const retrieved = deltaStage(b.results.aggregates.retrieved.mean, c.results.aggregates.retrieved.mean); + + const reranked = + b.results.aggregates.reranked && c.results.aggregates.reranked + ? deltaStage(b.results.aggregates.reranked.mean, c.results.aggregates.reranked.mean) + : undefined; + + const p95TotalMs = c.results.timings.totalMs.p95 - b.results.timings.totalMs.p95; + + // Worst regressions by recall@k (post-rerank if present) + const baselineById = new Map( + b.results.queries.map((q) => [ + q.id, + q.reranked?.metrics.recallAtK ?? q.retrieved.metrics.recallAtK, + ]) + ); + const candidateById = new Map( + c.results.queries.map((q) => [ + q.id, + q.reranked?.metrics.recallAtK ?? q.retrieved.metrics.recallAtK, + ]) + ); + + const ids = Array.from(new Set([...baselineById.keys(), ...candidateById.keys()])); + const regressions = ids + .map((id) => { + const br = baselineById.get(id) ?? 0; + const cr = candidateById.get(id) ?? 0; + return { + id, + deltaRecallAtK: cr - br, + baselineRecallAtK: br, + candidateRecallAtK: cr, + }; + }) + .sort((a, b) => a.deltaRecallAtK - b.deltaRecallAtK) + .slice(0, 20); + + return { + version: "1", + createdAt: new Date().toISOString(), + baseline: { reportPath: args.baselinePath, datasetId: b.dataset.id, createdAt: b.createdAt }, + candidate: { reportPath: args.candidatePath, datasetId: c.dataset.id, createdAt: c.createdAt }, + deltas: { retrieved, ...(reranked ? 
{ reranked } : {}), p95TotalMs }, + worstRegressions: regressions, + }; +} + +export async function writeEvalDiffJson(outputDir: string, diff: EvalDiffV1): Promise { + await ensureDir(outputDir); + const outPath = path.join(outputDir, "diff.json"); + await writeFile(outPath, JSON.stringify(diff, null, 2) + "\n", "utf8"); + return outPath; +} + +export async function writeEvalDiffMd(outputDir: string, diff: EvalDiffV1): Promise { + await ensureDir(outputDir); + const outPath = path.join(outputDir, "diff.md"); + const lines: string[] = []; + lines.push(`# Unrag Eval Diff`); + lines.push(``); + lines.push(`- Baseline: \`${diff.baseline.datasetId}\``); + lines.push(`- Candidate: \`${diff.candidate.datasetId}\``); + lines.push(``); + lines.push(`## Aggregate deltas (mean)`); + lines.push(``); + lines.push(`| metric | retrieved Δ | reranked Δ |`); + lines.push(`| --- | ---: | ---: |`); + const fmt = (n: number | undefined) => (typeof n === "number" ? n.toFixed(3) : "—"); + lines.push(`| hit@k | ${fmt(diff.deltas.retrieved.hitAtK)} | ${fmt(diff.deltas.reranked?.hitAtK)} |`); + lines.push(`| recall@k | ${fmt(diff.deltas.retrieved.recallAtK)} | ${fmt(diff.deltas.reranked?.recallAtK)} |`); + lines.push(`| precision@k | ${fmt(diff.deltas.retrieved.precisionAtK)} | ${fmt(diff.deltas.reranked?.precisionAtK)} |`); + lines.push(`| mrr@k | ${fmt(diff.deltas.retrieved.mrrAtK)} | ${fmt(diff.deltas.reranked?.mrrAtK)} |`); + lines.push(``); + if (typeof diff.deltas.p95TotalMs === "number") { + lines.push(`- p95 total ms Δ: \`${diff.deltas.p95TotalMs.toFixed(1)}ms\``); + lines.push(``); + } + lines.push(`## Worst recall regressions`); + lines.push(``); + lines.push(`| id | Δ recall@k | baseline | candidate |`); + lines.push(`| --- | ---: | ---: | ---: |`); + for (const r of diff.worstRegressions) { + lines.push( + `| \`${r.id}\` | ${r.deltaRecallAtK.toFixed(3)} | ${r.baselineRecallAtK.toFixed(3)} | ${r.candidateRecallAtK.toFixed(3)} |` + ); + } + lines.push(``); + await writeFile(outPath, lines.join("\n") + "\n", "utf8"); + return outPath; +} + diff --git a/packages/unrag/registry/eval/runner.ts b/packages/unrag/registry/eval/runner.ts new file mode 100644 index 0000000..7580f61 --- /dev/null +++ b/packages/unrag/registry/eval/runner.ts @@ -0,0 +1,450 @@ +import type { AssetInput, Metadata } from "../core/types"; +import type { ContextEngine } from "../core/context-engine"; + +import { + readEvalDatasetFromFile, + type EvalDatasetV1, + type EvalMode, + type EvalThresholds, +} from "./dataset"; +import { computeMetricsAtK, uniqueSourceIdsInOrder, type EvalMetricsAtK } from "./metrics"; +import { + percentiles, + type EvalCleanupPolicy, + type EvalQueryResult, + type EvalReportV1, +} from "./report"; + +export type EvalRunArgs = { + engine: ContextEngine; + datasetPath: string; + /** Overrides dataset defaults. */ + mode?: EvalMode; + /** Overrides dataset defaults. */ + topK?: number; + /** + * In `retrieve+rerank` mode, overrides dataset `defaults.rerankTopK`. + * If omitted, the runner will default to `topK * 3` per query (clamped to at least `topK`). + */ + rerankTopK?: number; + /** Overrides dataset defaults. */ + scopePrefix?: string; + /** If true, ingest dataset documents before running queries (default: true when dataset has documents). */ + ingest?: boolean; + cleanup?: EvalCleanupPolicy; + includeNdcg?: boolean; + /** + * Allow documents[].assets and pass them through to `engine.ingest`. + * Default false because URL-based assets introduce network variance / SSRF risks. 
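+   * The generated `scripts/unrag-eval.ts` sets this from the `--allow-assets` flag.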
+ */ + allowAssets?: boolean; + /** + * Safety guardrail: scope prefixes should usually be namespaced as `eval:...`. + * When false, non-eval prefixes are rejected. + */ + allowNonEvalPrefix?: boolean; + /** + * Safety guardrail: when deleting a non-eval prefix, require explicit confirmation. + * (The generated script will only set this true for non-interactive `--yes` runs.) + */ + confirmedDangerousDelete?: boolean; + /** Optional baseline report loaded by the caller (diffing lives outside runner core). */ + baselineReportPath?: string; + /** Optional thresholds (higher precedence than dataset defaults). */ + thresholds?: Partial; + /** + * Loader hook for documents with `loaderRef` instead of inline `content`. + * The generated script provides a stub that users can customize. + */ + loadDocumentByRef?: (ref: string) => Promise; +}; + +export type EvalRunOutput = { + report: EvalReportV1; + exitCode: 0 | 1; + thresholdFailures: string[]; +}; + +const now = () => performance.now(); + +export async function runEval(args: EvalRunArgs): Promise { + const dataset = await readEvalDatasetFromFile(args.datasetPath); + + const includeNdcg = Boolean(args.includeNdcg); + const cleanup: EvalCleanupPolicy = args.cleanup ?? "none"; + + const mode: EvalMode = args.mode ?? dataset.defaults.mode ?? "retrieve"; + const topK: number = args.topK ?? dataset.defaults.topK ?? 10; + const scopePrefix: string = (args.scopePrefix ?? dataset.defaults.scopePrefix).trim(); + if (!scopePrefix) throw new Error(`[unrag:eval] Missing scopePrefix (dataset.defaults.scopePrefix)`); + + // Guardrails around delete-by-prefix + const isEvalNamespaced = scopePrefix.startsWith("eval:"); + if (!isEvalNamespaced && !args.allowNonEvalPrefix) { + throw new Error( + `[unrag:eval] Refusing to run with scopePrefix="${scopePrefix}" because it does not start with "eval:". ` + + `Use --allow-non-eval-prefix and --yes only if you understand the delete-by-prefix risk.` + ); + } + + const datasetHasDocs = Array.isArray(dataset.documents) && dataset.documents.length > 0; + const ingest = args.ingest ?? datasetHasDocs; + + const thresholdConfig: Partial = deepMergeThresholds( + dataset.defaults.thresholds ?? {}, + args.thresholds ?? {} + ); + + // Optional ingest stage (isolated by scopePrefix) + if (ingest && datasetHasDocs) { + if (!isEvalNamespaced && !args.confirmedDangerousDelete) { + throw new Error( + `[unrag:eval] Refusing to delete non-eval scopePrefix="${scopePrefix}" without confirmation. ` + + `Re-run with --yes (and keep the prefix narrowly scoped).` + ); + } + + await args.engine.delete({ sourceIdPrefix: scopePrefix }); + + for (const doc of dataset.documents ?? []) { + const sourceId = doc.sourceId; + if (!sourceId.startsWith(scopePrefix)) { + throw new Error( + `[unrag:eval] Dataset document sourceId "${sourceId}" does not start with scopePrefix "${scopePrefix}". ` + + `To keep eval isolated, ensure dataset documents are namespaced under defaults.scopePrefix.` + ); + } + + const content = await resolveDocumentContent(doc, args.loadDocumentByRef); + + if (doc.assets !== undefined && !args.allowAssets) { + throw new Error( + `[unrag:eval] Dataset includes documents[].assets but assets are disabled by default for safety. ` + + `Re-run with --allow-assets if you understand the SSRF/network variance implications.` + ); + } + + await args.engine.ingest({ + sourceId, + content, + metadata: normalizeMetadata(doc.metadata), + assets: (doc.assets as AssetInput[] | undefined) ?? 
undefined, + }); + } + } + + // Query loop + const queryResults: EvalQueryResult[] = []; + let embeddingModel: string | undefined; + + for (const q of dataset.queries) { + const qTopK = q.topK ?? topK; + const qScopePrefix = (q.scopePrefix ?? scopePrefix).trim(); + const qRerankTopK = + mode === "retrieve+rerank" + ? clampRerankTopK({ + topK: qTopK, + rerankTopK: + q.rerankTopK ?? + args.rerankTopK ?? + dataset.defaults.rerankTopK ?? + qTopK * 3, + }) + : undefined; + + const retrieved = await args.engine.retrieve({ + query: q.query, + topK: mode === "retrieve+rerank" ? qRerankTopK : qTopK, + scope: { sourceId: qScopePrefix }, + }); + embeddingModel = embeddingModel ?? retrieved.embeddingModel; + + const retrievedSourceIds = uniqueSourceIdsInOrder(retrieved.chunks.map((c) => c.sourceId)); + const retrievedMetrics = computeMetricsAtK({ + retrievedSourceIds, + relevantSourceIds: q.relevant.sourceIds, + k: qTopK, + includeNdcg, + }); + + let rerankedBlock: EvalQueryResult["reranked"] | undefined; + if (mode === "retrieve+rerank") { + const rerankStart = now(); + const reranked = await args.engine.rerank({ + query: q.query, + candidates: retrieved.chunks, + topK: qTopK, + onMissingReranker: "throw", + onMissingText: "skip", + }); + const rerankTotalMs = now() - rerankStart; + + const rerankedSourceIds = uniqueSourceIdsInOrder(reranked.chunks.map((c) => c.sourceId)); + const rerankedMetrics = computeMetricsAtK({ + retrievedSourceIds: rerankedSourceIds, + relevantSourceIds: q.relevant.sourceIds, + k: qTopK, + includeNdcg, + }); + + rerankedBlock = { + sourceIds: rerankedSourceIds, + metrics: rerankedMetrics, + durationsMs: { + rerankMs: reranked.durations.rerankMs, + totalMs: Math.max(reranked.durations.totalMs, rerankTotalMs), + }, + meta: { + rerankerName: reranked.meta?.rerankerName, + model: reranked.meta?.model, + }, + warnings: reranked.warnings, + }; + } + + queryResults.push({ + id: q.id, + query: q.query, + topK: qTopK, + ...(qRerankTopK ? { rerankTopK: qRerankTopK } : {}), + scopePrefix: qScopePrefix, + relevant: { sourceIds: q.relevant.sourceIds }, + retrieved: { + sourceIds: retrievedSourceIds, + metrics: retrievedMetrics, + durationsMs: retrieved.durations, + }, + ...(rerankedBlock ? { reranked: rerankedBlock } : {}), + ...(q.notes ? { notes: q.notes } : {}), + }); + } + + // Cleanup policy (post-run) + if (ingest && datasetHasDocs) { + const shouldCleanup = + cleanup === "always" || + (cleanup === "on-success" && true); // errors would have thrown already + if (shouldCleanup) { + await args.engine.delete({ sourceIdPrefix: scopePrefix }); + } + } + + // Aggregates + const retrievedAgg = aggregatesFor(queryResults.map((q) => q.retrieved.metrics)); + const rerankedAgg = + mode === "retrieve+rerank" + ? aggregatesFor( + queryResults.map((q) => q.reranked?.metrics).filter(Boolean) as EvalMetricsAtK[] + ) + : undefined; + + const timingAgg = buildTimingAggregates(mode, queryResults); + + // Threshold evaluation (apply to final stage: reranked if present else retrieved) + const { failures: thresholdFailures, passed } = evaluateThresholds({ + thresholds: thresholdConfig, + mode, + aggregates: rerankedAgg ?? retrievedAgg, + p95TotalMs: timingAgg.totalMs.p95, + }); + + const createdAt = new Date().toISOString(); + const report: EvalReportV1 = { + version: "1", + createdAt, + dataset: { + id: dataset.id, + version: "1", + ...(dataset.description ? { description: dataset.description } : {}), + }, + config: { + mode, + topK, + ...(mode === "retrieve+rerank" && (args.rerankTopK ?? 
dataset.defaults.rerankTopK) !== undefined + ? { rerankTopK: clampRerankTopK({ topK, rerankTopK: args.rerankTopK ?? dataset.defaults.rerankTopK! }) } + : {}), + scopePrefix, + ingest, + cleanup, + includeNdcg, + }, + engine: { + embeddingModel, + rerankerName: queryResults.find((q) => q.reranked)?.reranked?.meta?.rerankerName, + rerankerModel: queryResults.find((q) => q.reranked)?.reranked?.meta?.model, + }, + results: { + queries: queryResults, + aggregates: { + retrieved: retrievedAgg, + ...(rerankedAgg ? { reranked: rerankedAgg } : {}), + }, + timings: timingAgg, + thresholdsApplied: thresholdConfig, + thresholdFailures, + passed, + }, + }; + + return { + report, + thresholdFailures, + exitCode: passed ? 0 : 1, + }; +} + +async function resolveDocumentContent( + doc: EvalDatasetV1["documents"][number], + loadByRef: EvalRunArgs["loadDocumentByRef"] +): Promise { + if (typeof doc.content === "string" && doc.content.trim().length > 0) return doc.content; + if (typeof doc.loaderRef === "string" && doc.loaderRef.trim().length > 0) { + if (!loadByRef) { + throw new Error( + `[unrag:eval] Dataset document uses loaderRef="${doc.loaderRef}" but no loadDocumentByRef hook was provided.` + ); + } + const content = await loadByRef(doc.loaderRef); + if (typeof content !== "string" || content.trim().length === 0) { + throw new Error( + `[unrag:eval] loadDocumentByRef("${doc.loaderRef}") returned empty content.` + ); + } + return content; + } + throw new Error(`[unrag:eval] Dataset document is missing both content and loaderRef.`); +} + +function normalizeMetadata(input: unknown): Metadata | undefined { + if (!input || typeof input !== "object" || Array.isArray(input)) return undefined; + const out: Record = {}; + for (const [k, v] of Object.entries(input as Record)) { + if (v === undefined) continue; + if (v === null || typeof v === "string" || typeof v === "number" || typeof v === "boolean") { + out[k] = v; + continue; + } + if (Array.isArray(v)) { + const xs = v.filter( + (x) => x === null || typeof x === "string" || typeof x === "number" || typeof x === "boolean" + ); + out[k] = xs; + continue; + } + // Drop unsupported nested objects for stability/diff-friendliness. + } + return out as Metadata; +} + +function aggregatesFor(metrics: EvalMetricsAtK[]) { + const mean = { + hitAtK: meanOf(metrics.map((m) => m.hitAtK)), + recallAtK: meanOf(metrics.map((m) => m.recallAtK)), + precisionAtK: meanOf(metrics.map((m) => m.precisionAtK)), + mrrAtK: meanOf(metrics.map((m) => m.mrrAtK)), + ...(metrics.some((m) => typeof m.ndcgAtK === "number") + ? { ndcgAtK: meanOf(metrics.map((m) => m.ndcgAtK ?? 0)) } + : {}), + }; + + const median = { + hitAtK: medianOf(metrics.map((m) => m.hitAtK)), + recallAtK: medianOf(metrics.map((m) => m.recallAtK)), + precisionAtK: medianOf(metrics.map((m) => m.precisionAtK)), + mrrAtK: medianOf(metrics.map((m) => m.mrrAtK)), + ...(metrics.some((m) => typeof m.ndcgAtK === "number") + ? { ndcgAtK: medianOf(metrics.map((m) => m.ndcgAtK ?? 0)) } + : {}), + }; + + return { mean, median }; +} + +function meanOf(xs: number[]): number { + const ys = xs.filter((v) => Number.isFinite(v)); + if (ys.length === 0) return 0; + return ys.reduce((a, b) => a + b, 0) / ys.length; +} + +function medianOf(xs: number[]): number { + const ys = xs.filter((v) => Number.isFinite(v)).slice().sort((a, b) => a - b); + if (ys.length === 0) return 0; + const mid = Math.floor(ys.length / 2); + if (ys.length % 2 === 1) return ys[mid]!; + return (ys[mid - 1]! + ys[mid]!) 
/ 2; +} + +function buildTimingAggregates(mode: EvalMode, qs: EvalQueryResult[]) { + const embedding = qs.map((q) => q.retrieved.durationsMs.embeddingMs); + const retrieval = qs.map((q) => q.retrieved.durationsMs.retrievalMs); + const retrieveTotal = qs.map((q) => q.retrieved.durationsMs.totalMs); + + const total = qs.map((q) => q.retrieved.durationsMs.totalMs + (q.reranked?.durationsMs.totalMs ?? 0)); + + if (mode !== "retrieve+rerank") { + return { + embeddingMs: percentiles(embedding), + retrievalMs: percentiles(retrieval), + retrieveTotalMs: percentiles(retrieveTotal), + totalMs: percentiles(total), + }; + } + + const rerankMs = qs.map((q) => q.reranked?.durationsMs.rerankMs ?? 0); + const rerankTotalMs = qs.map((q) => q.reranked?.durationsMs.totalMs ?? 0); + + return { + embeddingMs: percentiles(embedding), + retrievalMs: percentiles(retrieval), + retrieveTotalMs: percentiles(retrieveTotal), + rerankMs: percentiles(rerankMs), + rerankTotalMs: percentiles(rerankTotalMs), + totalMs: percentiles(total), + }; +} + +function deepMergeThresholds( + base: Partial, + override: Partial +): Partial { + const out: Partial = { + min: { ...(base.min ?? {}) }, + max: { ...(base.max ?? {}) }, + }; + if (override.min) Object.assign(out.min!, override.min); + if (override.max) Object.assign(out.max!, override.max); + return out; +} + +function evaluateThresholds(args: { + thresholds: Partial; + mode: EvalMode; + aggregates: { mean: EvalMetricsAtK; median: EvalMetricsAtK }; + p95TotalMs: number; +}): { failures: string[]; passed: boolean } { + const failures: string[] = []; + const min = args.thresholds.min ?? {}; + const max = args.thresholds.max ?? {}; + + if (typeof min.hitAtK === "number" && args.aggregates.mean.hitAtK < min.hitAtK) { + failures.push(`min.hitAtK: expected >= ${min.hitAtK}, got ${args.aggregates.mean.hitAtK.toFixed(3)}`); + } + if (typeof min.recallAtK === "number" && args.aggregates.mean.recallAtK < min.recallAtK) { + failures.push(`min.recallAtK: expected >= ${min.recallAtK}, got ${args.aggregates.mean.recallAtK.toFixed(3)}`); + } + if (typeof min.mrrAtK === "number" && args.aggregates.mean.mrrAtK < min.mrrAtK) { + failures.push(`min.mrrAtK: expected >= ${min.mrrAtK}, got ${args.aggregates.mean.mrrAtK.toFixed(3)}`); + } + if (typeof max.p95TotalMs === "number" && args.p95TotalMs > max.p95TotalMs) { + failures.push(`max.p95TotalMs: expected <= ${max.p95TotalMs}, got ${args.p95TotalMs.toFixed(1)}ms`); + } + + return { failures, passed: failures.length === 0 }; +} + +function clampRerankTopK(args: { topK: number; rerankTopK: number }): number { + const topK = Math.max(1, Math.floor(args.topK)); + const requested = Math.floor(args.rerankTopK); + if (!Number.isFinite(requested) || requested <= 0) return topK * 3; + return Math.max(topK, requested); +} diff --git a/packages/unrag/registry/manifest.json b/packages/unrag/registry/manifest.json index c916fba..a26122c 100644 --- a/packages/unrag/registry/manifest.json +++ b/packages/unrag/registry/manifest.json @@ -355,6 +355,15 @@ "envVars": [ { "name": "COHERE_API_KEY", "required": true, "notes": "Cohere API key for reranking." 
} ] + }, + { + "id": "eval", + "displayName": "Eval Harness", + "description": "Deterministic retrieval evaluation runner (reports + CI thresholds + baseline diffs).", + "status": "available", + "docsPath": "/docs/batteries/eval", + "deps": {}, + "devDeps": {} } ] } diff --git a/packages/unrag/test/add-battery-eval.test.ts b/packages/unrag/test/add-battery-eval.test.ts new file mode 100644 index 0000000..08c95c8 --- /dev/null +++ b/packages/unrag/test/add-battery-eval.test.ts @@ -0,0 +1,76 @@ +import { test, expect, describe, beforeEach, afterEach } from "bun:test"; +import path from "node:path"; +import { mkdir, rm, writeFile, readFile } from "node:fs/promises"; +import { addCommand } from "../cli/commands/add"; + +const workspaceTmpRoot = path.join(process.cwd(), "tmp", "test-runs"); + +async function writeJson(filePath: string, data: unknown) { + await mkdir(path.dirname(filePath), { recursive: true }); + await writeFile(filePath, JSON.stringify(data, null, 2) + "\n", "utf8"); +} + +async function readJson(filePath: string): Promise { + const raw = await readFile(filePath, "utf8"); + return JSON.parse(raw) as T; +} + +async function pathExists(p: string) { + try { + await readFile(p); + return true; + } catch { + return false; + } +} + +describe("unrag add battery eval", () => { + let runDir: string; + let originalCwd: string; + + beforeEach(async () => { + originalCwd = process.cwd(); + runDir = path.join(workspaceTmpRoot, crypto.randomUUID()); + await rm(runDir, { recursive: true, force: true }); + await mkdir(runDir, { recursive: true }); + }); + + afterEach(async () => { + process.chdir(originalCwd); + await rm(runDir, { recursive: true, force: true }); + }); + + test("vendors eval battery and generates scaffolding + scripts", async () => { + await writeJson(path.join(runDir, "package.json"), { + name: "proj", + private: true, + type: "module", + dependencies: {}, + scripts: {}, + }); + + await writeJson(path.join(runDir, "unrag.json"), { + installDir: "lib/unrag", + storeAdapter: "raw-sql", + aliasBase: "@unrag", + version: 1, + batteries: [], + }); + + process.chdir(runDir); + await addCommand(["battery", "eval", "--yes", "--no-install"]); + + expect(await pathExists(path.join(runDir, "lib/unrag/eval/index.ts"))).toBe(true); + expect(await pathExists(path.join(runDir, ".unrag/eval/datasets/sample.json"))).toBe(true); + expect(await pathExists(path.join(runDir, ".unrag/eval/config.json"))).toBe(true); + expect(await pathExists(path.join(runDir, "scripts/unrag-eval.ts"))).toBe(true); + + const pkg = await readJson<{ scripts?: Record }>(path.join(runDir, "package.json")); + expect(pkg.scripts?.["unrag:eval"]).toContain("scripts/unrag-eval.ts"); + expect(pkg.scripts?.["unrag:eval:ci"]).toContain("--ci"); + + const cfg = await readJson<{ batteries?: string[] }>(path.join(runDir, "unrag.json")); + expect(cfg.batteries).toEqual(["eval"]); + }); +}); + diff --git a/packages/unrag/test/eval-dataset.test.ts b/packages/unrag/test/eval-dataset.test.ts new file mode 100644 index 0000000..d1342af --- /dev/null +++ b/packages/unrag/test/eval-dataset.test.ts @@ -0,0 +1,75 @@ +import { describe, expect, test } from "bun:test"; +import { parseEvalDataset } from "../registry/eval/dataset"; + +describe("eval dataset parsing", () => { + test("parses minimal valid dataset", () => { + const ds = parseEvalDataset({ + version: "1", + id: "mini", + defaults: { scopePrefix: "eval:mini:", topK: 5, mode: "retrieve" }, + queries: [ + { + id: "q1", + query: "hello", + relevant: { sourceIds: ["eval:mini:doc:a"] 
}, + }, + ], + }); + + expect(ds.version).toBe("1"); + expect(ds.id).toBe("mini"); + expect(ds.defaults.scopePrefix).toBe("eval:mini:"); + expect(ds.queries.length).toBe(1); + }); + + test("parses rerankTopK (defaults + per-query override)", () => { + const ds = parseEvalDataset({ + version: "1", + id: "mini", + defaults: { scopePrefix: "eval:mini:", topK: 5, mode: "retrieve+rerank", rerankTopK: 15 }, + queries: [ + { + id: "q1", + query: "hello", + rerankTopK: 20, + relevant: { sourceIds: ["eval:mini:doc:a"] }, + }, + ], + }); + + expect(ds.defaults.rerankTopK).toBe(15); + expect(ds.queries[0]?.rerankTopK).toBe(20); + }); + + test("requires defaults.scopePrefix", () => { + expect(() => + parseEvalDataset({ + version: "1", + id: "bad", + defaults: {}, + queries: [ + { + id: "q1", + query: "hello", + relevant: { sourceIds: ["x"] }, + }, + ], + }) + ).toThrow(); + }); + + test("requires documents to include content or loaderRef", () => { + expect(() => + parseEvalDataset({ + version: "1", + id: "bad-docs", + defaults: { scopePrefix: "eval:bad:" }, + documents: [{ sourceId: "eval:bad:doc:1" }], + queries: [ + { id: "q1", query: "hello", relevant: { sourceIds: ["eval:bad:doc:1"] } }, + ], + }) + ).toThrow(); + }); +}); + diff --git a/packages/unrag/test/eval-metrics.test.ts b/packages/unrag/test/eval-metrics.test.ts new file mode 100644 index 0000000..78ee4ea --- /dev/null +++ b/packages/unrag/test/eval-metrics.test.ts @@ -0,0 +1,34 @@ +import { describe, expect, test } from "bun:test"; +import { computeMetricsAtK, uniqueSourceIdsInOrder } from "../registry/eval/metrics"; + +describe("eval metrics", () => { + test("uniqueSourceIdsInOrder de-dupes while preserving first occurrence", () => { + const out = uniqueSourceIdsInOrder(["a", "b", "a", "c", "b"]); + expect(out).toEqual(["a", "b", "c"]); + }); + + test("computes hit/precision/recall/mrr correctly", () => { + const m = computeMetricsAtK({ + retrievedSourceIds: ["x", "a", "b", "c"], + relevantSourceIds: ["a", "c"], + k: 3, + }); + + // top3 = [x,a,b] -> hits=1 + expect(m.hitAtK).toBe(1); + expect(m.recallAtK).toBeCloseTo(1 / 2, 6); + expect(m.precisionAtK).toBeCloseTo(1 / 3, 6); + expect(m.mrrAtK).toBeCloseTo(1 / 2, 6); // first relevant at rank 2 + }); + + test("nDCG is 0 when no relevant docs exist", () => { + const m = computeMetricsAtK({ + retrievedSourceIds: ["a", "b"], + relevantSourceIds: [], + k: 2, + includeNdcg: true, + }); + expect(m.ndcgAtK).toBe(0); + }); +}); + diff --git a/packages/unrag/test/eval-report-diff.test.ts b/packages/unrag/test/eval-report-diff.test.ts new file mode 100644 index 0000000..d87c821 --- /dev/null +++ b/packages/unrag/test/eval-report-diff.test.ts @@ -0,0 +1,65 @@ +import { describe, expect, test } from "bun:test"; +import { diffEvalReports, type EvalReportV1 } from "../registry/eval/report"; + +function reportWithRecall(recallAtK: number): EvalReportV1 { + return { + version: "1", + createdAt: new Date().toISOString(), + dataset: { id: "ds", version: "1" }, + config: { + mode: "retrieve", + topK: 10, + scopePrefix: "eval:ds:", + ingest: false, + cleanup: "none", + includeNdcg: false, + }, + engine: {}, + results: { + queries: [ + { + id: "q1", + query: "x", + topK: 10, + scopePrefix: "eval:ds:", + relevant: { sourceIds: ["eval:ds:doc:a"] }, + retrieved: { + sourceIds: ["eval:ds:doc:a"], + metrics: { hitAtK: 1, recallAtK, precisionAtK: recallAtK / 10, mrrAtK: 1 }, + durationsMs: { embeddingMs: 1, retrievalMs: 1, totalMs: 2 }, + }, + }, + ], + aggregates: { + retrieved: { + mean: { hitAtK: 1, recallAtK, 
precisionAtK: recallAtK / 10, mrrAtK: 1 }, + median: { hitAtK: 1, recallAtK, precisionAtK: recallAtK / 10, mrrAtK: 1 }, + }, + }, + timings: { + embeddingMs: { p50: 1, p95: 1 }, + retrievalMs: { p50: 1, p95: 1 }, + retrieveTotalMs: { p50: 2, p95: 2 }, + totalMs: { p50: 2, p95: 2 }, + }, + }, + }; +} + +describe("eval report diff", () => { + test("diff computes deltas", () => { + const baseline = reportWithRecall(0.5); + const candidate = reportWithRecall(0.8); + + const diff = diffEvalReports({ + baseline, + candidate, + baselinePath: "/baseline/report.json", + candidatePath: "/candidate/report.json", + }); + + expect(diff.deltas.retrieved.recallAtK).toBeCloseTo(0.3, 6); + expect(diff.worstRegressions.length).toBeGreaterThan(0); + }); +}); + diff --git a/packages/unrag/test/eval-runner-thresholds.test.ts b/packages/unrag/test/eval-runner-thresholds.test.ts new file mode 100644 index 0000000..41a26c6 --- /dev/null +++ b/packages/unrag/test/eval-runner-thresholds.test.ts @@ -0,0 +1,221 @@ +import { describe, expect, test, beforeEach, afterEach } from "bun:test"; +import path from "node:path"; +import { mkdir, rm, writeFile } from "node:fs/promises"; + +import { runEval } from "../registry/eval/runner"; + +const workspaceTmpRoot = path.join(process.cwd(), "tmp", "test-runs"); + +async function writeJson(filePath: string, data: unknown) { + await mkdir(path.dirname(filePath), { recursive: true }); + await writeFile(filePath, JSON.stringify(data, null, 2) + "\n", "utf8"); +} + +describe("eval runner thresholds", () => { + let runDir: string; + + beforeEach(async () => { + runDir = path.join(workspaceTmpRoot, crypto.randomUUID()); + await rm(runDir, { recursive: true, force: true }); + await mkdir(runDir, { recursive: true }); + }); + + afterEach(async () => { + await rm(runDir, { recursive: true, force: true }); + }); + + test("fails thresholds with exitCode=1", async () => { + const datasetPath = path.join(runDir, "dataset.json"); + await writeJson(datasetPath, { + version: "1", + id: "t", + defaults: { scopePrefix: "eval:t:", topK: 2, mode: "retrieve" }, + queries: [ + { + id: "q1", + query: "refund window", + relevant: { sourceIds: ["eval:t:doc:refund"] }, + }, + ], + }); + + const engine = { + retrieve: async () => ({ + chunks: [ + { + id: "c1", + documentId: "d1", + sourceId: "eval:t:doc:other", + index: 0, + content: "other", + tokenCount: 1, + metadata: {}, + score: 0.9, + }, + ], + embeddingModel: "test-embed", + durations: { embeddingMs: 1, retrievalMs: 1, totalMs: 2 }, + }), + rerank: async () => { + throw new Error("not used"); + }, + ingest: async () => { + throw new Error("not used"); + }, + delete: async () => {}, + } as any; + + const result = await runEval({ + engine, + datasetPath, + ingest: false, + thresholds: { min: { recallAtK: 0.75 } }, + }); + + expect(result.exitCode).toBe(1); + expect(result.thresholdFailures.length).toBeGreaterThan(0); + }); + + test("passes thresholds when recall meets minimum", async () => { + const datasetPath = path.join(runDir, "dataset.json"); + await writeJson(datasetPath, { + version: "1", + id: "t2", + defaults: { scopePrefix: "eval:t2:", topK: 2, mode: "retrieve" }, + queries: [ + { + id: "q1", + query: "refund window", + relevant: { sourceIds: ["eval:t2:doc:refund"] }, + }, + ], + }); + + const engine = { + retrieve: async () => ({ + chunks: [ + { + id: "c1", + documentId: "d1", + sourceId: "eval:t2:doc:refund", + index: 0, + content: "refund policy", + tokenCount: 2, + metadata: {}, + score: 0.9, + }, + ], + embeddingModel: "test-embed", + 
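+        // The runner surfaces this as report.engine.embeddingModel (asserted below).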
durations: { embeddingMs: 1, retrievalMs: 1, totalMs: 2 }, + }), + rerank: async () => { + throw new Error("not used"); + }, + ingest: async () => { + throw new Error("not used"); + }, + delete: async () => {}, + } as any; + + const result = await runEval({ + engine, + datasetPath, + ingest: false, + thresholds: { min: { recallAtK: 0.5 } }, + }); + + expect(result.exitCode).toBe(0); + expect(result.thresholdFailures.length).toBe(0); + expect(result.report.engine.embeddingModel).toBe("test-embed"); + }); + + test("in retrieve+rerank mode, retrieves rerankTopK candidates and thresholds apply to reranked stage", async () => { + const datasetPath = path.join(runDir, "dataset.json"); + await writeJson(datasetPath, { + version: "1", + id: "t3", + defaults: { scopePrefix: "eval:t3:", topK: 2, rerankTopK: 6, mode: "retrieve+rerank" }, + queries: [ + { + id: "q1", + query: "refund window", + relevant: { sourceIds: ["eval:t3:doc:refund"] }, + }, + ], + }); + + let observedRetrieveTopK: number | null = null; + let observedRerankCandidateCount: number | null = null; + let observedRerankTopK: number | null = null; + + const makeChunk = (sourceId: string, id: string) => ({ + id, + documentId: "d1", + sourceId, + index: 0, + content: `content for ${sourceId}`, + tokenCount: 1, + metadata: {}, + score: 0.9, + }); + + const candidates = [ + makeChunk("eval:t3:doc:other-1", "c1"), + makeChunk("eval:t3:doc:other-2", "c2"), + makeChunk("eval:t3:doc:other-3", "c3"), + makeChunk("eval:t3:doc:other-4", "c4"), + makeChunk("eval:t3:doc:refund", "c5"), // relevant is outside topK=2 + makeChunk("eval:t3:doc:other-5", "c6"), + ]; + + const engine = { + retrieve: async (input: any) => { + observedRetrieveTopK = input?.topK ?? null; + return { + chunks: candidates, + embeddingModel: "test-embed", + durations: { embeddingMs: 1, retrievalMs: 1, totalMs: 2 }, + }; + }, + rerank: async (input: any) => { + observedRerankCandidateCount = Array.isArray(input?.candidates) ? input.candidates.length : null; + observedRerankTopK = input?.topK ?? null; + return { + chunks: [candidates[4], candidates[0]].filter(Boolean), // promote relevant + ranking: [], + meta: { rerankerName: "test-reranker", model: "test-model" }, + durations: { rerankMs: 3, totalMs: 3 }, + warnings: [], + }; + }, + ingest: async () => { + throw new Error("not used"); + }, + delete: async () => {}, + } as any; + + const result = await runEval({ + engine, + datasetPath, + ingest: false, + thresholds: { min: { recallAtK: 1 } }, + }); + + expect(observedRetrieveTopK).toBe(6); + expect(observedRerankCandidateCount).toBe(6); + expect(observedRerankTopK).toBe(2); + + expect(result.report.config.mode).toBe("retrieve+rerank"); + expect(result.report.config.rerankTopK).toBe(6); + expect(result.report.results.queries[0]?.rerankTopK).toBe(6); + + // Retrieved@2 misses the relevant doc; reranked@2 finds it. + expect(result.report.results.queries[0]?.retrieved.metrics.recallAtK).toBe(0); + expect(result.report.results.queries[0]?.reranked?.metrics.recallAtK).toBe(1); + + // Thresholds should apply to reranked stage in retrieve+rerank mode. 
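+    // min.recallAtK=1 would fail against retrieved@2 (recall 0) but passes against reranked@2 (recall 1).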
+ expect(result.exitCode).toBe(0); + expect(result.thresholdFailures.length).toBe(0); + }); +}); + diff --git a/packages/unrag/test/init.test.ts b/packages/unrag/test/init.test.ts index 3436a42..39efd78 100644 --- a/packages/unrag/test/init.test.ts +++ b/packages/unrag/test/init.test.ts @@ -284,6 +284,60 @@ describe("unrag@latest init", () => { expect(pkg.dependencies?.ai).toBeTruthy(); expect(pkg.dependencies?.["@ai-sdk/openai"]).toBeTruthy(); }); + + test("installs batteries from preset (eval) as part of init", async () => { + await writeJson(path.join(runDir, "package.json"), { + name: "proj", + private: true, + type: "module", + dependencies: {}, + }); + + process.chdir(runDir); + + const originalFetch = globalThis.fetch; + globalThis.fetch = (async () => { + return new Response( + JSON.stringify({ + version: 1, + createdAt: new Date().toISOString(), + install: { installDir: "lib/unrag", storeAdapter: "drizzle", aliasBase: "@unrag" }, + modules: { extractors: [], connectors: [], batteries: ["eval"] }, + config: { + defaults: { chunking: { chunkSize: 200, chunkOverlap: 40 }, retrieval: { topK: 8 } }, + embedding: { provider: "ai", config: { type: "text", model: "openai/text-embedding-3-small", timeoutMs: 15000 } }, + engine: { storage: { storeChunkContent: true, storeDocumentContent: true } }, + }, + }), + { status: 200, headers: { "content-type": "application/json" } } + ); + }) as any; + + try { + await initCommand([ + "--preset", + "https://example.com/preset.json", + "--no-install", + ]); + } finally { + globalThis.fetch = originalFetch; + } + + // Battery module code should be vendored. + expect(await pathExists(path.join(runDir, "lib/unrag", "eval", "index.ts"))).toBe(true); + + // Eval scaffolding should be created. + expect(await pathExists(path.join(runDir, ".unrag/eval/datasets/sample.json"))).toBe(true); + expect(await pathExists(path.join(runDir, ".unrag/eval/config.json"))).toBe(true); + expect(await pathExists(path.join(runDir, "scripts/unrag-eval.ts"))).toBe(true); + + const unragJson = await readJson(path.join(runDir, "unrag.json")); + expect(unragJson.batteries).toEqual(["eval"]); + + const pkg = await readJson(path.join(runDir, "package.json")); + expect(pkg.scripts?.["unrag:eval"]).toBeTruthy(); + expect(pkg.scripts?.["unrag:eval:ci"]).toBeTruthy(); + }); }); diff --git a/specs/EVAL_HARNESS_SPEC.md b/specs/EVAL_HARNESS_SPEC.md deleted file mode 100644 index 1b88ace..0000000 --- a/specs/EVAL_HARNESS_SPEC.md +++ /dev/null @@ -1,337 +0,0 @@ -## Unrag Eval Harness — Detailed Spec (Sprint 1) - -### Summary -The **Eval Harness** is an **optional, vendored module + lightweight CLI workflow** that helps users **measure retrieval quality and regressions** for Unrag-based RAG systems. - -It focuses on **retrieval evaluation** (and optionally **retrieval + reranking**) rather than end-to-end “answer quality” grading. - -### Why this exists (value) -- **Prevent regressions** when changing embedding models, chunking, store config, or prompts upstream. -- **Make tuning measurable**: chunk size/overlap, topK, scope strategy, metadata patterns. -- **Support CI gates** (e.g. “recall@10 must be ≥ 0.75”). -- **Keep Unrag’s philosophy**: users own the code; harness is **auditable** and **customizable**. - ---- - -## Goals / Non-goals - -### Goals -- **Standard dataset format** for retrieval evaluation. -- **Deterministic evaluation runner** that produces machine-readable JSON + human-readable summaries. 
-- Metrics for: - - **Document-level relevance** (primary) - - **Chunk-level relevance** (optional) - - **Latency** (secondary) -- Workflows for: - - local runs - - CI runs with thresholds - - comparing two runs (baseline vs candidate) -- First-class support for evaluating: - - **Vector retrieval only** - - **Vector retrieval + rerank** (Sprint 1 companion) - -### Non-goals (explicitly out of scope for Sprint 1) -- LLM-as-judge answer grading, citations scoring, hallucination grading. -- Multi-step agent evaluation, tool-use evaluation. -- Hybrid search evaluation (planned Sprint 2). -- Metadata filter evaluation (planned Sprint 2, though datasets should be future-proof). -- Hosting a dashboard or uploading results to a SaaS (export files only). - ---- - -## High-level architecture - -### Core concept -The harness runs a series of **queries** against an Unrag engine and scores the returned results against **ground-truth relevance labels**. - -### Execution stages -1. **Dataset load**: parse dataset file(s). -2. **Index setup** (optional): ingest documents from dataset into the configured store using an isolated scope/prefix. -3. **Query loop**: - - call `engine.retrieve({ query, topK, scope })` - - optionally apply reranker on retrieved candidates -4. **Scoring**: - - compute metrics per query - - aggregate metrics across the dataset -5. **Reporting**: - - write JSON report (canonical output) - - optionally write markdown summary + diff vs baseline -6. **Exit code**: - - success/fail based on thresholds (CI-friendly) - ---- - -## Integration with Unrag (what gets vendored) - -### Expected vendored files (proposed) -These are *targets*; exact paths can be finalized during implementation: -- `lib/unrag/eval/runner.ts`: dataset runner and orchestration -- `lib/unrag/eval/dataset.ts`: dataset parsing + validation -- `lib/unrag/eval/metrics.ts`: metric implementations -- `lib/unrag/eval/report.ts`: report types + writers -- `scripts/unrag-eval.ts`: project entrypoint script that loads `createUnragEngine()` and executes the runner -- `.unrag/eval/`: default location for datasets and run artifacts - -### How the harness gets installed (options) -Pick one (implementation choice). The spec supports either: -- **Option A (preferred)**: `unrag add eval` (new “battery” kind alongside connectors/extractors) -- **Option B**: `unrag eval setup` (new command that writes scripts + vendored eval module) - ---- - -## UX / Commands (spec) - -### Setup -- `bunx unrag@latest eval setup` - - Generates `scripts/unrag-eval.ts` - - Creates `.unrag/eval/` with example dataset(s) - - Adds `package.json` scripts: - - `unrag:eval` - - `unrag:eval:ci` - -### Run locally -- `bun run unrag:eval -- --dataset .unrag/eval/datasets/sample.json` - -### CI run (strict) -- `bun run unrag:eval:ci -- --dataset .unrag/eval/datasets/sample.json` - - writes `.unrag/eval/runs//*.json` - - returns non-zero exit code if thresholds fail - -### Compare two runs -- `bun run unrag:eval -- --dataset ... 
--baseline .unrag/eval/runs//report.json` - - produces `diff.json` + `diff.md` - ---- - -## Dataset format (v1) - -### Design principles -- **Human-editable** and **diff-friendly** -- Avoid brittle “exact chunk id” ground truth (chunk IDs are generated at ingest) -- Prefer **document-level labels** using Unrag’s stable identifier: `sourceId` -- Allow future expansion for metadata filters and hybrid search - -### File format -- JSON (required for v1) -- YAML optional later (non-goal for v1) - -### Dataset schema (conceptual) -Top-level fields: -- `version`: `"1"` -- `id`: dataset identifier -- `description`: optional -- `defaults`: - - `topK`: default retrieval topK used when query doesn’t specify it - - `scopePrefix`: default `sourceId` prefix applied to retrieval (and ingestion scope) - - `mode`: `"retrieve"` or `"retrieve+rerank"` (default `"retrieve"`) -- `documents` (optional): documents to ingest for the dataset -- `queries`: list of evaluation queries - -#### `documents[]` -Each document is the unit of ground truth relevance. -- `sourceId` (required): stable logical id -- `content` (required unless `loaderRef` provided): text content to ingest -- `metadata` (optional): stored metadata (JSON) -- `assets` (optional): Unrag assets (URLs/bytes not recommended in eval; see safety) -- `loaderRef` (optional): string key allowing the project script to load content (e.g. from filesystem) - -#### `queries[]` -- `id` (required): stable query id -- `query` (required): query text -- `topK` (optional): overrides dataset default -- `scopePrefix` (optional): overrides dataset default -- `relevant` (required): ground truth definition - - `sourceIds`: list of relevant `sourceId`s (document-level) - - `anyOfSourceIdPrefixes` (optional): list of acceptable prefixes (future-proofing) - - `metadataConstraints` (optional, future): intended for Sprint 2 (metadata filters) -- `notes` (optional): human notes - -### Example dataset (illustrative) -This is a *spec example*; the actual file generated by setup can differ: - -```json -{ - "version": "1", - "id": "help-center-mini", - "description": "Tiny dataset to validate retrieval changes.", - "defaults": { - "topK": 10, - "scopePrefix": "eval:help-center:", - "mode": "retrieve" - }, - "documents": [ - { - "sourceId": "eval:help-center:doc:refund-policy", - "content": "Refunds are available within 30 days..." - } - ], - "queries": [ - { - "id": "q_refund_window", - "query": "How long do I have to request a refund?", - "relevant": { "sourceIds": ["eval:help-center:doc:refund-policy"] } - } - ] -} -``` - ---- - -## Ground truth & scoring model - -### Primary relevance unit: document-level -Unrag retrieves **chunks**, but chunking strategies change frequently. -Therefore, the harness will score relevance primarily at the **document level**: -- A retrieved chunk counts as relevant if `chunk.sourceId` is in `relevant.sourceIds` -- Multiple relevant documents are supported - -### Secondary relevance unit: chunk-level (optional v1) -If users want more granular checks, allow **chunk matchers**: -- `containsText`: any retrieved chunk content contains a substring/regex (fragile; optional) -- `metadataKeyEquals`: metadata keys match expected values (more stable) - -Note: chunk-level must be opt-in and clearly documented as more brittle. 
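-
-To make the document-level rule concrete, a minimal scoring sketch for a single query is shown below, mapping retrieved chunks onto the metrics defined in the next section. This is a *spec sketch*: the chunk shape and function name are placeholders, not the vendored API; the only assumption is that each retrieved chunk exposes the `sourceId` it was ingested under.
-
-```ts
-// Illustrative sketch: document-level scoring for one query.
-// A retrieved chunk counts toward a document if it carries that document's sourceId.
-type RetrievedChunk = { sourceId: string };
-
-function scoreDocumentLevel(
-  retrieved: RetrievedChunk[],
-  relevantSourceIds: string[],
-  k: number
-) {
-  const topK = retrieved.slice(0, k);
-  const relevant = new Set(relevantSourceIds);
-
-  // Collapse chunks to unique documents before counting matches.
-  const retrievedDocs = new Set(topK.map((c) => c.sourceId));
-  const matched = [...retrievedDocs].filter((id) => relevant.has(id));
-  const firstRelevantRank = topK.findIndex((c) => relevant.has(c.sourceId));
-
-  return {
-    hitAtK: matched.length > 0 ? 1 : 0,
-    recallAtK: relevant.size > 0 ? matched.length / relevant.size : 0,
-    mrrAtK: firstRelevantRank >= 0 ? 1 / (firstRelevantRank + 1) : 0,
-  };
-}
-```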
- ---- - -## Metrics (v1) - -### Per-query metrics (document-level) -Given retrieved ranked list \(R\) of size \(k\) and set of relevant documents \(G\): -- **hit@k**: 1 if any relevant document appears in top-k; else 0 -- **recall@k**: \(|R \cap G| / |G|\) using unique `sourceId`s in top-k -- **precision@k**: \(|R \cap G| / k\) -- **MRR@k**: reciprocal rank of first relevant item within k; else 0 -- **nDCG@k** (optional): binary relevance; discounted gain by rank - -### Aggregated metrics -Across queries: -- Mean and median for each metric -- Optional breakdowns: - - by query tag/group (if dataset includes tags later) - - by scopePrefix (if multiple) - -### Operational metrics (secondary) -Capture per-query timings: -- embedding time -- store retrieval time -- rerank time (if enabled) -- total - -Report p50/p95 across dataset. - ---- - -## Rerank evaluation (Sprint 1 integration) - -### Modes -- **retrieve**: score the store’s returned ranking as-is -- **retrieve+rerank**: retrieve topK candidates, apply reranker, then score reranked order - -### What to report -For each query, include: -- metrics before rerank -- metrics after rerank -- delta - -This makes reranker impact measurable and regression-safe. - ---- - -## Indexing / ingestion behavior in eval - -### Default approach (recommended) -The harness can ingest dataset documents into the user’s existing Postgres store, but it must be **isolated**: -- enforce a required `scopePrefix` like `eval:::` or similar -- or ingest documents with sourceIds already namespaced in dataset - -### Cleanup policy -Configurable: -- `cleanup: "none" | "on-success" | "always"` - -### Safety and reproducibility constraints -- Avoid URL-based assets by default (network variance). -- If assets are used, require explicit opt-in flag and document SSRF/allowlist concerns. - ---- - -## Reports and artifacts - -### Canonical output: JSON report -Write a single JSON report file per run: -- path: `.unrag/eval/runs/-/report.json` -- includes: - - dataset info (id, version) - - runner config (topK, mode, scopePrefix, engine/provider names) - - per-query results (metrics + retrieved ids + timings) - - aggregates (means, medians, p50/p95 timings) - - thresholds applied + pass/fail outcome - -### Optional human output -- `.unrag/eval/runs/.../summary.md`: - - headline metrics - - worst queries - - regressions vs baseline (if provided) - -### Diff report (baseline comparison) -Given baseline report \(B\) and candidate report \(C\): -- produce `diff.json` with metric deltas and worst regressions -- produce `diff.md` summary for PR review - ---- - -## CI thresholds (gating) - -### Threshold config (v1) -Allow thresholds to be specified: -- via CLI flags OR -- in a `.unrag/eval/config.json` file OR -- in dataset `defaults.thresholds` (lowest priority) - -Supported thresholds: -- `min.hitAtK` (e.g. ≥ 0.90) -- `min.recallAtK` -- `min.mrrAtK` -- `max.p95TotalMs` (optional) - -CI behavior: -- exit code **0** if all thresholds pass -- exit code **1** if thresholds fail -- exit code **2** if runner errored (invalid dataset, config, runtime failure) - ---- - -## Configuration surface (runner) - -### Runner inputs (conceptual) -- `datasetPath` -- `mode`: retrieve | retrieve+rerank -- `topK` -- `scopePrefix` -- `ingest`: - - enabled - - cleanup policy -- `baselineReportPath` (optional) -- `thresholds` (optional) -- `outputDir` - -### Engine wiring -The harness **must not own** DB connection logic. 
-It should call the project’s `createUnragEngine()` from `unrag.config.ts` and run through it. - ---- - -## Compatibility & future-proofing (Sprint 2) -This spec is intentionally shaped to support: -- **metadata filters**: dataset can include `metadataConstraints` per query; runner can pass these into retrieval once Unrag supports it. -- **hybrid search**: runner can add a “retrieval strategy” field and compare vector-only vs hybrid. - ---- - -## Open questions (to resolve during implementation) -- **Installation shape**: `unrag add eval` vs `unrag eval setup`. -- **Where to store thresholds**: dataset vs separate config file. -- **Default scope strategy**: whether runner should auto-prefix sourceIds per run or require dataset to include namespaced `sourceId`s. -- **nDCG** inclusion: include in v1 or defer. -- **Loader hooks**: how to support large corpora without embedding raw document text in dataset JSON. -
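-
-However these questions are resolved, the intended wiring between the project entrypoint, the vendored runner, and the exit codes from the CI thresholds section is sketched below. This is a *spec sketch*, not a finalized API: `runEval`, its option shape, the report's `thresholdFailures` field, and the import paths are assumed; only `createUnragEngine()` from `unrag.config.ts` and the exit-code contract come from this spec.
-
-```ts
-// scripts/unrag-eval.ts (illustrative): run the vendored eval runner through the
-// project's own engine and map the outcome onto the documented exit codes.
-// Import paths assume the default scripts/ + lib/unrag layout; adjust as needed.
-import { createUnragEngine } from "../unrag.config";
-import { runEval } from "../lib/unrag/eval/runner";
-
-async function main() {
-  const engine = createUnragEngine();
-
-  const report = await runEval({
-    engine,
-    datasetPath: ".unrag/eval/datasets/sample.json",
-    mode: "retrieve",
-    outputDir: ".unrag/eval/runs",
-    ingest: { enabled: true, cleanup: "on-success" },
-    // Thresholds could equally be read from .unrag/eval/config.json or CLI flags.
-    thresholds: { min: { hitAtK: 0.9, recallAtK: 0.75 } },
-  });
-
-  // Exit 1 signals a CI gating failure; the JSON report is still written either way.
-  process.exit(report.thresholdFailures.length > 0 ? 1 : 0);
-}
-
-main().catch((err) => {
-  console.error(err);
-  process.exit(2); // runner error: invalid dataset, bad config, or runtime failure
-});
-```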