BetterStacks · pingSubhajit · Jan 9, 2026 · Jan 9, 2026 · Jan 9, 2026 · Jan 9, 2026
diff --git a/apps/web/app/api/presets/route.ts b/apps/web/app/api/presets/route.ts
@@ -29,6 +29,7 @@ type WizardStateV1 = {
   modules: {
     extractors: string[];
     connectors: string[];
+    batteries?: string[];
   };
   defaults: {
     chunkSize: number;
@@ -61,6 +62,7 @@ type PresetPayloadV1 = {
   modules: {
     extractors: string[];
     connectors: string[];
+    batteries: string[];
   };
   config: {
     defaults: {
@@ -93,6 +95,9 @@ function isWizardStateV1(x: unknown): x is WizardStateV1 {
   const o = x as any;
   if (o.v !== 1) return false;
   if (!o.install || !o.modules || !o.defaults || !o.embedding || !o.storage) return false;
+  if (o.modules && "batteries" in o.modules && o.modules.batteries != null && !Array.isArray(o.modules.batteries)) {
+    return false;
+  }
   return true;
 }
 
@@ -107,6 +112,9 @@ function normalizeWizardState(input: WizardStateV1): WizardStateV1 {
   const connectors = Array.isArray(input.modules.connectors)
     ? input.modules.connectors.map(String).filter(Boolean)
     : [];
+  const batteries = Array.isArray((input.modules as any).batteries)
+    ? (input.modules as any).batteries.map(String).filter(Boolean)
+    : [];
 
   const chunkSize = Number(input.defaults.chunkSize) || 200;
   const chunkOverlap = Number(input.defaults.chunkOverlap) || 40;
@@ -140,7 +148,7 @@ function normalizeWizardState(input: WizardStateV1): WizardStateV1 {
   return {
     v: 1,
     install: { installDir, storeAdapter, aliasBase },
-    modules: { extractors, connectors },
+    modules: { extractors, connectors, batteries },
     defaults: { chunkSize, chunkOverlap, topK },
     embedding: { type: embeddingType, provider: embeddingProvider, model, timeoutMs },
     storage: { storeChunkContent, storeDocumentContent },
@@ -160,6 +168,7 @@ function makePresetFromWizard(state: WizardStateV1): PresetPayloadV1 {
     modules: {
       extractors: state.modules.extractors,
       connectors: state.modules.connectors,
+      batteries: (state.modules.batteries ?? []).map(String).filter(Boolean),
     },
     config: {
       defaults: {
@@ -242,6 +251,11 @@ export async function POST(req: NextRequest) {
       .filter((c: any) => c.status === "available")
       .map((c: any) => String(c.id))
   );
+  const allowedBatteries = new Set(
+    (manifest.batteries ?? [])
+      .filter((b: any) => b.status === "available")
+      .map((b: any) => String(b.id))
+  );
 
   const unknownExtractors = state.modules.extractors.filter((x) => !allowedExtractors.has(x));
   if (unknownExtractors.length > 0) {
@@ -259,6 +273,15 @@ export async function POST(req: NextRequest) {
     );
   }
 
+  const batteryIds = (state.modules.batteries ?? []).map(String).filter(Boolean);
+  const unknownBatteries = batteryIds.filter((x) => !allowedBatteries.has(x));
+  if (unknownBatteries.length > 0) {
+    return NextResponse.json(
+      { error: "Unknown or unavailable batteries", unknownBatteries },
+      { status: 400 }
+    );
+  }
+
   const preset = makePresetFromWizard(state);
   const id = newPresetId();
   const key = `unrag:preset:${id}`;

diff --git a/apps/web/app/docs/[[...slug]]/page.tsx b/apps/web/app/docs/[[...slug]]/page.tsx
@@ -9,16 +9,27 @@ import { notFound } from 'next/navigation';
 import { getMDXComponents } from '@/mdx-components';
 import type { Metadata } from 'next';
 import { createRelativeLink } from 'fumadocs-ui/mdx';
+import SystemBanner from '@/components/ui/system-banner';
 
 export default async function Page(props: PageProps<'/docs/[[...slug]]'>) {
   const params = await props.params;
   const page = source.getPage(params.slug);
   if (!page) notFound();
 
   const MDX = page.data.body;
+  const slug = params.slug ?? [];
+  const isExperimentalFeature =
+    slug[0] === 'eval' 
+    || (slug[0] === 'batteries' && slug[1] === 'eval');
 
   return (
     <DocsPage toc={page.data.toc} full={page.data.full}>
+      <SystemBanner
+        text="Experimental Feature"
+        color="bg-amber-600"
+        size="xs"
+        show={isExperimentalFeature}
+      />
       <DocsTitle>{page.data.title}</DocsTitle>
       <DocsDescription>{page.data.description}</DocsDescription>
       <DocsBody>

diff --git a/apps/web/app/install/install-wizard-client.tsx b/apps/web/app/install/install-wizard-client.tsx
@@ -984,6 +984,7 @@ function BatteryCard({
   onToggle: () => void;
 }) {
   const isAvailable = status === 'available';
+  const isExperimental = id === 'eval';
 
   return (
     <ClickableCard
@@ -1015,6 +1016,11 @@ function BatteryCard({
             >
               {isAvailable ? 'available' : 'coming soon'}
             </span>
+            {isExperimental ? (
+              <span className="text-[10px] px-2 py-0.5 rounded-full border bg-amber-500/10 text-amber-400/80 border-amber-500/20">
+                experimental
+              </span>
+            ) : null}
             {isAvailable && docsHref ? (
               <div className="ml-auto">
                 <DocsIconLink href={docsHref} label={`${displayName || id} docs`} />
@@ -1197,6 +1203,16 @@ export default function InstallWizardClient() {
     }));
   }, [state.embedding.type, state.embedding.provider, state.embedding.model, selectedEmbeddingModelOption, setState]);
 
+  /** Returns the one-off package runner command for a package manager; anything other than bun/pnpm/yarn yields 'npx'. */
+  function pmExecBase(pm: 'bun' | 'npm' | 'pnpm' | 'yarn') {
+    switch (pm) {
+      case 'bun': return 'bunx';
+      case 'pnpm': return 'pnpm dlx';
+      case 'yarn': return 'yarn dlx';
+      default: return 'npx';
+    }
+  }
+
   const commandPreview = useMemo(() => {
     if (presetId) {
       return `bunx unrag@latest init --yes --preset ${presetId}`;
@@ -1216,14 +1232,7 @@ export default function InstallWizardClient() {
 
   const installCommand = useMemo(() => {
     if (!presetId) return null;
-    const base =
-      pkgManager === 'bun'
-        ? 'bunx'
-        : pkgManager === 'pnpm'
-          ? 'pnpm dlx'
-          : pkgManager === 'yarn'
-            ? 'yarn dlx'
-            : 'npx';
+    const base = pmExecBase(pkgManager);
     return `${base} unrag@latest init --yes --preset ${presetId}`;
   }, [pkgManager, presetId]);
 

diff --git a/apps/web/components/ui/system-banner.tsx b/apps/web/components/ui/system-banner.tsx
@@ -0,0 +1,50 @@
+interface SystemBannerProps {
+  text?: string; // Label rendered inside the badge; SystemBanner defaults it to "Development Mode".
+  color?: string; // Class name, or a "#"-prefixed value applied as an inline backgroundColor; defaults to "bg-orange-500".
+  size?: "xs" | "sm" | "md" | "lg"; // Key into sizeClasses for the badge; defaults to "xs".
+  show?: boolean; // When false the component returns null; defaults to true.
+}
+
+const sizeClasses: Record<NonNullable<SystemBannerProps["size"]>, string> = { // Font-size and padding classes for each size option.
+  xs: "text-[10px] px-1 py-0.5",
+  sm: "text-xs px-2 py-0.5",
+  md: "text-sm px-3 py-1",
+  lg: "text-base px-4 py-1.5"
+};
+
+/** Renders a thin fixed top bar with a small text badge below it, or nothing when `show` is false. */
+export default function SystemBanner(props: SystemBannerProps) {
+  const {
+    text = "Development Mode",
+    color = "bg-orange-500",
+    size = "xs",
+    show = true
+  } = props;
+  if (!show) {
+    return null;
+  }
+  // A "#"-prefixed color becomes an inline background; any other value is used as a class name.
+  const isHexColor = typeof color === "string" && color.startsWith("#");
+  const colorClass = isHexColor ? "" : color;
+  const colorStyle = isHexColor ? { backgroundColor: color } : undefined;
+  return (
+    <div
+      className={`
+        fixed top-0 left-0 w-full h-0.5 z-50 flex justify-center
+        ${colorClass}
+      `}
+      style={colorStyle}
+    >
+      <span
+        className={`
+          absolute -bottom-4 text-white font-bold rounded shadow-md
+          ${sizeClasses[size]}
+          ${colorClass}
+        `}
+        style={colorStyle}
+      >
+        {text}
+      </span>
+    </div>
+  );
+}
diff --git a/apps/web/content/docs/batteries/eval.mdx b/apps/web/content/docs/batteries/eval.mdx
@@ -0,0 +1,96 @@
+---
+title: Eval Harness
+description: Measure and improve retrieval quality with deterministic evaluation, metrics, and CI integration.
+---
+
+<Callout type="warn">
+Evaluation is currently **experimental**. It’s safe to use, but expect some CLI flags, report fields, and defaults to change as the harness matures.
+</Callout>
+
+The eval harness is a battery that adds retrieval evaluation capabilities to your Unrag installation. It gives you a structured way to define test datasets, run your retrieval pipeline against them, compute standard metrics (hit@k, recall@k, precision@k, MRR@k), and track quality changes over time.
+
+Unlike the reranker battery, which adds a new method to your engine, the eval harness is primarily a development and CI tool. You use it to measure how well your retrieval works, catch regressions before they reach production, and make informed decisions when tuning chunking, embeddings, or adding reranking.
+
+## Installing the eval battery
+
+```bash
+bunx unrag@latest add battery eval
+```
+
+This creates several files:
+
+<Files>
+  <Folder name="lib/unrag" defaultOpen>
+    <Folder name="eval">
+      <File name="index.ts" />
+      <File name="dataset.ts" />
+      <File name="metrics.ts" />
+      <File name="runner.ts" />
+      <File name="report.ts" />
+    </Folder>
+  </Folder>
+  <Folder name=".unrag/eval" defaultOpen>
+    <Folder name="datasets" defaultOpen>
+      <File name="sample.json" />
+    </Folder>
+    <File name="config.json" />
+  </Folder>
+  <Folder name="scripts" defaultOpen>
+    <File name="unrag-eval.ts" />
+  </Folder>
+</Files>
+
+It also adds two npm scripts to your `package.json`:
+
+```json
+{
+  "scripts": {
+    "unrag:eval": "bun run scripts/unrag-eval.ts -- --dataset .unrag/eval/datasets/sample.json",
+    "unrag:eval:ci": "bun run scripts/unrag-eval.ts -- --dataset .unrag/eval/datasets/sample.json --ci"
+  }
+}
+```
+
+## Running your first eval
+
+After installation, run the sample evaluation:
+
+```bash
+bun run unrag:eval
+```
+
+The harness will ingest the sample documents, run the test queries, and write report files. You'll see output like:
+
+```
+[unrag:eval] Wrote report: .unrag/eval/runs/<timestamp>-sample/report.json
+[unrag:eval] Wrote summary: .unrag/eval/runs/<timestamp>-sample/summary.md
+[unrag:eval] Thresholds: pass
+```
+
+## Full documentation
+
+The eval harness is a substantial feature with its own documentation section covering everything from dataset design to CI integration:
+
+<Cards>
+  <Card title="Evaluation Overview" href="/docs/eval">
+    Why retrieval evaluation matters and how the harness works
+  </Card>
+  <Card title="Getting Started" href="/docs/eval/getting-started">
+    Complete setup guide with your first evaluation
+  </Card>
+  <Card title="Dataset Format" href="/docs/eval/datasets">
+    How to structure documents, queries, and ground truth
+  </Card>
+  <Card title="Understanding Metrics" href="/docs/eval/metrics">
+    What each metric measures and how to interpret results
+  </Card>
+  <Card title="Running Evals" href="/docs/eval/running-evals">
+    All configuration options and CLI flags
+  </Card>
+  <Card title="CI Integration" href="/docs/eval/ci-integration">
+    Automated quality gates and threshold checking
+  </Card>
+  <Card title="Comparing Runs" href="/docs/eval/comparing-runs">
+    Baseline diffs and tracking changes over time
+  </Card>
+</Cards>
diff --git a/apps/web/content/docs/batteries/index.mdx b/apps/web/content/docs/batteries/index.mdx
@@ -12,6 +12,7 @@ The difference is scope. Extractors transform content types (PDFs into text). Co
 | Battery | Description | Status |
 |---------|-------------|--------|
 | [Reranker](/docs/batteries/reranker) | Second-stage reranking using Cohere or custom models | Available |
+| [Eval Harness](/docs/batteries/eval) | Deterministic retrieval evaluation with metrics and CI integration | Experimental |
 
 ## Installing a battery
 
@@ -29,7 +30,7 @@ After installation, you wire the battery into your engine configuration. Each ba
 
 The core Unrag engine handles the fundamental RAG operations: chunking text, generating embeddings, storing vectors, and running similarity search. These operations cover most use cases and keep the default installation small.
 
-But production RAG systems often need more. Reranking can significantly improve precision by reordering initial retrieval results using a more expensive relevance model. Evaluation harnesses help you measure and improve retrieval quality. Hybrid search combines vector similarity with keyword matching.
+But production RAG systems often need more. Reranking can significantly improve precision by reordering initial retrieval results using a more expensive relevance model. Hybrid search combines vector similarity with keyword matching.
 
 Rather than bundling these features into the core (adding complexity and dependencies everyone pays for), Unrag provides them as optional batteries. Install what you need, skip what you don't. The code is vendored into your project, so you can read it, understand it, and modify it if your requirements differ from the defaults.
 

diff --git a/apps/web/content/docs/batteries/meta.json b/apps/web/content/docs/batteries/meta.json
@@ -2,5 +2,5 @@
   "title": "Batteries",
   "badge": "New",
   "description": "Optional modules that add capabilities like reranking to your Unrag installation.",
-  "pages": ["index", "reranker"]
+  "pages": ["index", "reranker", "eval"]
 }
diff --git a/apps/web/content/docs/batteries/reranker.mdx b/apps/web/content/docs/batteries/reranker.mdx
@@ -1,6 +1,5 @@
 ---
 title: Reranker
-new: true
 description: Improve retrieval precision with second-stage reranking using Cohere or custom models.
 ---