diff --git a/.changeset/0000-export-command-change.md b/.changeset/0000-export-command-change.md
new file mode 100644
index 00000000..12562b8b
--- /dev/null
+++ b/.changeset/0000-export-command-change.md
@@ -0,0 +1,5 @@
+---
+"evalite": major
+---
+
+The export command now uses the storage specified in the config, and automatically runs the evals first if the storage is empty.
diff --git a/.changeset/0000-in-memory-default.md b/.changeset/0000-in-memory-default.md
new file mode 100644
index 00000000..d5bb9e51
--- /dev/null
+++ b/.changeset/0000-in-memory-default.md
@@ -0,0 +1,5 @@
+---
+"evalite": major
+---
+
+Changed the default storage to in-memory. SQLite storage is still available via config.
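+
+If you still want results to persist across runs, opt back into SQLite explicitly. A minimal sketch, using the `createSqliteStorage` helper documented in this release:
+
+```ts
+// evalite.config.ts
+import { defineConfig } from "evalite/config";
+import { createSqliteStorage } from "evalite/sqlite-storage";
+
+export default defineConfig({
+  // Persist results to disk instead of the new in-memory default
+  storage: () => createSqliteStorage("./evalite.db"),
+});
+```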
diff --git a/.changeset/0000-optional-scorer-name.md b/.changeset/0000-optional-scorer-name.md
new file mode 100644
index 00000000..7fa0a370
--- /dev/null
+++ b/.changeset/0000-optional-scorer-name.md
@@ -0,0 +1,5 @@
+---
+"evalite": patch
+---
+
+Made scorer `name` field optional. When using pre-built scorers, name and description are now automatically extracted from the scorer's return value.
diff --git a/.changeset/0000-remove-implicit-vitest-config.md b/.changeset/0000-remove-implicit-vitest-config.md
new file mode 100644
index 00000000..b48ea4d0
--- /dev/null
+++ b/.changeset/0000-remove-implicit-vitest-config.md
@@ -0,0 +1,34 @@
+---
+"evalite": major
+---
+
+Removed implicit reading of vitest.config.ts/vite.config.ts files. Users must now explicitly pass Vite config via evalite.config.ts using the new `viteConfig` option. This change makes configuration more explicit and less confusing.
+
+**Migration Guide:**
+
+Before:
+
+```ts
+// vitest.config.ts was automatically read
+export default defineConfig({
+ test: {
+ testTimeout: 60000,
+ },
+});
+```
+
+After:
+
+```ts
+// evalite.config.ts
+import { defineConfig } from "evalite/config";
+import viteConfig from "./vite.config.ts";
+
+export default defineConfig({
+ viteConfig: viteConfig,
+ // Note: testTimeout, maxConcurrency, and setupFiles
+ // must be at root level, not in viteConfig.test
+ testTimeout: 60000,
+ setupFiles: ["./setup.ts"],
+});
+```
diff --git a/.changeset/0000-remove-streaming.md b/.changeset/0000-remove-streaming.md
new file mode 100644
index 00000000..1c2835a6
--- /dev/null
+++ b/.changeset/0000-remove-streaming.md
@@ -0,0 +1,5 @@
+---
+"evalite": minor
+---
+
+Removed streaming text support from tasks. Resolve streams before returning from `task()` (e.g., `await result.text` when using the AI SDK).
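+
+For example, with the AI SDK, resolve the stream inside the task before returning it (a minimal sketch, mirroring the streaming example in the traces docs):
+
+```ts
+import { evalite } from "evalite";
+import { streamText } from "ai";
+import { openai } from "@ai-sdk/openai";
+
+evalite("Streaming Eval", {
+  data: [{ input: "Hello" }],
+  task: async (input) => {
+    const result = await streamText({
+      model: openai("gpt-4"),
+      prompt: input,
+    });
+
+    // Resolve the stream before returning - tasks can no longer return streams
+    return await result.text;
+  },
+  scorers: [],
+});
+```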
diff --git a/.changeset/0000-rerun-button.md b/.changeset/0000-rerun-button.md
new file mode 100644
index 00000000..4a21ed41
--- /dev/null
+++ b/.changeset/0000-rerun-button.md
@@ -0,0 +1,5 @@
+---
+"evalite": patch
+---
+
+Added a rerun button to the UI in watch and serve modes
diff --git a/.changeset/0000-variant-only.md b/.changeset/0000-variant-only.md
new file mode 100644
index 00000000..33562353
--- /dev/null
+++ b/.changeset/0000-variant-only.md
@@ -0,0 +1,5 @@
+---
+"evalite": patch
+---
+
+Added `only` option to variants in `evalite.each()` to selectively run specific variants.
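+
+Example, mirroring the `evalite.each()` docs in this release:
+
+```ts
+evalite.each([
+  { name: "gpt-4", input: "gpt-4" },
+  // Only this variant will run while `only` is set
+  { name: "gpt-3.5-turbo", input: "gpt-3.5-turbo", only: true },
+])("Model Comparison", {
+  data: [{ input: "Hello" }],
+  task: async (input, model) => {
+    // ...
+  },
+});
+```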
diff --git a/.changeset/0234-auto-dotenv-support.md b/.changeset/0234-auto-dotenv-support.md
new file mode 100644
index 00000000..97589b1b
--- /dev/null
+++ b/.changeset/0234-auto-dotenv-support.md
@@ -0,0 +1,5 @@
+---
+"evalite": minor
+---
+
+Support .env files by default via dotenv/config. Environment variables from .env files are now automatically loaded without any configuration needed. Users no longer need to manually add `setupFiles: ["dotenv/config"]` to their evalite.config.ts.
diff --git a/.changeset/angry-dogs-sort.md b/.changeset/angry-dogs-sort.md
new file mode 100644
index 00000000..95d5cef1
--- /dev/null
+++ b/.changeset/angry-dogs-sort.md
@@ -0,0 +1,5 @@
+---
+"evalite": patch
+---
+
+Passing UI messages (from the AI SDK) directly into Evalite now spawns a custom UI for them.
diff --git a/.changeset/long-olives-give.md b/.changeset/long-olives-give.md
new file mode 100644
index 00000000..6fe11fd6
--- /dev/null
+++ b/.changeset/long-olives-give.md
@@ -0,0 +1,5 @@
+---
+"evalite": major
+---
+
+Renamed storage API entities: evals -> suites, results -> evals. This will likely break existing SQLite databases when released, so a migration will be needed.
diff --git a/.changeset/pre.json b/.changeset/pre.json
new file mode 100644
index 00000000..38595685
--- /dev/null
+++ b/.changeset/pre.json
@@ -0,0 +1,26 @@
+{
+ "mode": "pre",
+ "tag": "beta",
+ "initialVersions": {
+ "evalite": "0.19.0",
+ "evalite-docs": "0.0.1",
+ "evalite-tests": "0.0.11",
+ "example": "0.0.11"
+ },
+ "changesets": [
+ "0000-export-command-change",
+ "0000-in-memory-default",
+ "0000-optional-scorer-name",
+ "0000-remove-implicit-vitest-config",
+ "0000-remove-streaming",
+ "0000-rerun-button",
+ "0000-variant-only",
+ "0234-auto-dotenv-support",
+ "angry-dogs-sort",
+ "long-olives-give",
+ "real-phones-join",
+ "table-rendering",
+ "thick-birds-design",
+ "wet-clocks-camp"
+ ]
+}
diff --git a/.changeset/real-phones-join.md b/.changeset/real-phones-join.md
new file mode 100644
index 00000000..c6b27aab
--- /dev/null
+++ b/.changeset/real-phones-join.md
@@ -0,0 +1,5 @@
+---
+"evalite-ui": patch
+---
+
+Added an overlay to the backdrop when viewing a trace
diff --git a/.changeset/sixty-jeans-melt.md b/.changeset/sixty-jeans-melt.md
new file mode 100644
index 00000000..cfa5f74c
--- /dev/null
+++ b/.changeset/sixty-jeans-melt.md
@@ -0,0 +1,5 @@
+---
+"evalite": major
+---
+
+Dropped compatibility with autoevals, and implemented our own built-in library of scorers.
diff --git a/.changeset/table-rendering.md b/.changeset/table-rendering.md
new file mode 100644
index 00000000..dbdb1bfb
--- /dev/null
+++ b/.changeset/table-rendering.md
@@ -0,0 +1,5 @@
+---
+"evalite": patch
+---
+
+UI now renders simple arrays of objects and flat objects as markdown tables instead of JSON trees for better readability
diff --git a/.changeset/thick-birds-design.md b/.changeset/thick-birds-design.md
new file mode 100644
index 00000000..8b6fcf5a
--- /dev/null
+++ b/.changeset/thick-birds-design.md
@@ -0,0 +1,5 @@
+---
+"evalite": patch
+---
+
+Made better-sqlite3 an optional peer dependency
diff --git a/.changeset/wet-clocks-camp.md b/.changeset/wet-clocks-camp.md
new file mode 100644
index 00000000..b833f033
--- /dev/null
+++ b/.changeset/wet-clocks-camp.md
@@ -0,0 +1,5 @@
+---
+"evalite-ui": minor
+---
+
+Add the ability to search and filter evals in the UI
diff --git a/.github/workflows/preview.yml b/.github/workflows/preview.yml
index f3be05f8..00257408 100644
--- a/.github/workflows/preview.yml
+++ b/.github/workflows/preview.yml
@@ -10,11 +10,14 @@ jobs:
preview:
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@v4
+ - uses: actions/checkout@v5
with:
fetch-depth: 0
- uses: pnpm/action-setup@v4
+
+ - run: git status
+
- uses: actions/setup-node@v4
with:
node-version: 22.x
diff --git a/apps/docs/.gitignore b/apps/docs/.gitignore
new file mode 100644
index 00000000..9e429e49
--- /dev/null
+++ b/apps/docs/.gitignore
@@ -0,0 +1,26 @@
+# deps
+/node_modules
+
+# generated content
+.source
+
+# test & build
+/coverage
+/.next/
+/out/
+/build
+*.tsbuildinfo
+
+# misc
+.DS_Store
+*.pem
+/.pnp
+.pnp.js
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+
+# others
+.env*.local
+.vercel
+next-env.d.ts
\ No newline at end of file
diff --git a/apps/docs/README.md b/apps/docs/README.md
new file mode 100644
index 00000000..9b7bba9e
--- /dev/null
+++ b/apps/docs/README.md
@@ -0,0 +1,45 @@
+# docs
+
+This is a Next.js application generated with
+[Create Fumadocs](https://github.com/fuma-nama/fumadocs).
+
+Run development server:
+
+```bash
+npm run dev
+# or
+pnpm dev
+# or
+yarn dev
+```
+
+Open http://localhost:3000 with your browser to see the result.
+
+## Explore
+
+In the project, you can see:
+
+- `lib/source.ts`: Code for content source adapter, [`loader()`](https://fumadocs.dev/docs/headless/source-api) provides the interface to access your content.
+- `lib/layout.shared.tsx`: Shared options for layouts; optional, but recommended to keep.
+
+| Route | Description |
+| ------------------------- | ------------------------------------------------------ |
+| `app/(home)` | The route group for your landing page and other pages. |
+| `app/docs` | The documentation layout and pages. |
+| `app/api/search/route.ts` | The Route Handler for search. |
+
+### Fumadocs MDX
+
+A `source.config.ts` config file has been included; you can customise options such as the frontmatter schema.
+
+Read the [Introduction](https://fumadocs.dev/docs/mdx) for further details.
+
+## Learn More
+
+To learn more about Next.js and Fumadocs, take a look at the following
+resources:
+
+- [Next.js Documentation](https://nextjs.org/docs) - learn about Next.js
+ features and API.
+- [Learn Next.js](https://nextjs.org/learn) - an interactive Next.js tutorial.
+- [Fumadocs](https://fumadocs.dev) - learn about Fumadocs
diff --git a/apps/docs/app/(home)/components/cta-section.tsx b/apps/docs/app/(home)/components/cta-section.tsx
new file mode 100644
index 00000000..ca37708d
--- /dev/null
+++ b/apps/docs/app/(home)/components/cta-section.tsx
@@ -0,0 +1,84 @@
+import Link from "next/link";
+import { buttonVariants } from "@/components/ui/button";
+import { cn } from "@/lib/cn";
+import { ArrowRight, BookOpen, Code, Rocket } from "lucide-react";
+
+export function CTASection() {
+ return (
+
+
+
+
+
+ Start building better AI apps
+
+
+ Get started with Evalite in minutes. Write your first eval and see
+ results instantly.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ TypeScript Native
+
+
+ Write evals in TypeScript with full type safety and IntelliSense
+ support.
+
+
+
+
+
+
+
+
+
+ Local Development
+
+
+ Run everything locally. No API keys, no cloud services, just
+ your code.
+
+
+ );
+}
diff --git a/apps/docs/app/(home)/components/testimonials-section.tsx b/apps/docs/app/(home)/components/testimonials-section.tsx
new file mode 100644
index 00000000..2337070a
--- /dev/null
+++ b/apps/docs/app/(home)/components/testimonials-section.tsx
@@ -0,0 +1,217 @@
+"use client";
+
+import { motion, useAnimationFrame, useMotionValue } from "motion/react";
+import { useEffect, useRef, useState } from "react";
+
+interface Testimonial {
+ id: number;
+ name: string;
+ content: string;
+ avatar?: string;
+}
+
+const testimonials: Testimonial[] = [
+ {
+ id: 1,
+ name: "Pontus Abrahamsson",
+ avatar:
+ "https://pbs.twimg.com/profile_images/1755611130368770048/JwLEqyeo_400x400.jpg",
+ content:
+ "If you're building an AI assistant, test with Evalite: - Write evals - Run a local server on localhost - Capture traces and more.",
+ },
+ {
+ id: 2,
+ name: "Alex Rivera",
+ avatar:
+ "https://pbs.twimg.com/profile_images/1330495170287169537/fX2ugXxX_400x400.jpg",
+ content:
+ "Evalite is great for writing lightweight evals, especially for generateObject / generateText calls.",
+ },
+ {
+ id: 3,
+ name: "sockthedev",
+ avatar:
+ "https://pbs.twimg.com/profile_images/1569584517161324544/po3hKnjN_400x400.jpg",
+ content:
+ "evalite is incredible for helping to iterate and improve your prompts for ai workflows. i was able to make insane gains on the security and dependability of my prompts for something really critical to the business. so nice to gain confidence. fantastic stuff @mattpocockuk",
+ },
+ {
+ id: 4,
+ name: "Alexander Hirdman",
+ avatar:
+ "https://pbs.twimg.com/profile_images/1951566988934893568/GOtdBhrb_400x400.jpg",
+ content:
+ "Finally got to implement some evals with @mattpocockuk evalite. So good, now I don’t know how I managed without it. Simple but powerful.",
+ },
+];
+
+function TestimonialCard({ testimonial }: { testimonial: Testimonial }) {
+ return (
+
+ );
+}
diff --git a/apps/docs/components/ui/button.tsx b/apps/docs/components/ui/button.tsx
new file mode 100644
index 00000000..f515ee5c
--- /dev/null
+++ b/apps/docs/components/ui/button.tsx
@@ -0,0 +1,28 @@
+import { cva, type VariantProps } from "class-variance-authority";
+
+const variants = {
+ primary: "bg-fd-primary text-fd-primary-foreground hover:bg-fd-primary/80",
+ outline: "border hover:bg-fd-accent hover:text-fd-accent-foreground",
+ ghost: "hover:bg-fd-accent hover:text-fd-accent-foreground",
+ secondary:
+ "border bg-fd-secondary text-fd-secondary-foreground hover:bg-fd-accent hover:text-fd-accent-foreground",
+} as const;
+
+export const buttonVariants = cva(
+ "inline-flex items-center justify-center rounded-md p-2 text-sm font-medium transition-colors duration-100 disabled:pointer-events-none disabled:opacity-50 focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-fd-ring",
+ {
+ variants: {
+ variant: variants,
+ // fumadocs use `color` instead of `variant`
+ color: variants,
+ size: {
+ sm: "gap-1 px-2 py-1.5 text-xs",
+ icon: "p-1.5 [&_svg]:size-5",
+ "icon-sm": "p-1.5 [&_svg]:size-4.5",
+ "icon-xs": "p-1 [&_svg]:size-4",
+ },
+ },
+ }
+);
+
+export type ButtonProps = VariantProps<typeof buttonVariants>;
diff --git a/apps/docs/content/docs/api/cli.mdx b/apps/docs/content/docs/api/cli.mdx
new file mode 100644
index 00000000..24a51109
--- /dev/null
+++ b/apps/docs/content/docs/api/cli.mdx
@@ -0,0 +1,177 @@
+---
+title: CLI
+---
+
+The `evalite` command-line interface for running evaluations.
+
+## Commands
+
+### `evalite` (default)
+
+Alias for `evalite run`. Runs evals once and exits.
+
+```bash
+evalite
+```
+
+### `evalite run`
+
+Run evals once and exit. This is the default command when no subcommand is specified.
+
+```bash
+evalite run
+evalite run path/to/eval.eval.ts
+```
+
+**Positional Arguments:**
+
+- `[path]` (optional) - Path filter to run specific eval files. If not provided, runs all `.eval.ts` files.
+
+**Flags:**
+
+- `--threshold <number>` - Fails the process if the score is below the threshold. Specified as 0-100. Default is 100.
+- `--outputPath <path>` - Path to write test results in JSON format after evaluation completes.
+- `--hideTable` - Hides the detailed table output in the CLI.
+
+**Examples:**
+
+```bash
+# Run all evals
+evalite run
+
+# Run specific eval file
+evalite run example.eval.ts
+
+# Fail if score drops below 80%
+evalite run --threshold 80
+
+# Export results to JSON
+evalite run --outputPath results.json
+
+# Hide detailed table
+evalite run --hideTable
+```
+
+### `evalite watch`
+
+Watch evals for file changes and re-run automatically. Starts the UI server at `http://localhost:3006`.
+
+```bash
+evalite watch
+evalite watch path/to/eval.eval.ts
+```
+
+**Positional Arguments:**
+
+- `[path]` (optional) - Path filter to watch specific eval files.
+
+**Flags:**
+
+- `--threshold <number>` - Fails the process if the score is below the threshold. Specified as 0-100. Default is 100.
+- `--hideTable` - Hides the detailed table output in the CLI.
+
+**Note:** `--outputPath` is not supported in watch mode.
+
+**Examples:**
+
+```bash
+# Watch all evals
+evalite watch
+
+# Watch specific eval
+evalite watch example.eval.ts
+
+# Watch with hidden table (useful for debugging with console.log)
+evalite watch --hideTable
+```
+
+### `evalite serve`
+
+Run evals once and serve the UI without watching for changes. Useful when evals take a long time to run.
+
+```bash
+evalite serve
+evalite serve path/to/eval.eval.ts
+```
+
+**Positional Arguments:**
+
+- `[path]` (optional) - Path filter to run specific eval files.
+
+**Flags:**
+
+- `--threshold <number>` - Fails the process if the score is below the threshold. Specified as 0-100. Default is 100.
+- `--outputPath <path>` - Path to write test results in JSON format after evaluation completes.
+- `--hideTable` - Hides the detailed table output in the CLI.
+
+**Examples:**
+
+```bash
+# Run once and serve UI
+evalite serve
+
+# Serve specific eval results
+evalite serve example.eval.ts
+```
+
+### `evalite export`
+
+Export a static, standalone HTML bundle of the UI that can be viewed offline or uploaded as a CI artifact.
+
+```bash
+evalite export
+```
+
+**Flags:**
+
+- `--output <dir>` - Output directory for static export. Default: `./evalite-export`
+- `--runId <id>` - Specific run ID to export. Default: latest run
+
+**Examples:**
+
+```bash
+# Export latest run to default directory
+evalite export
+
+# Export to custom directory
+evalite export --output ./my-export
+
+# Export specific run
+evalite export --runId 123
+
+# Export and specify both options
+evalite export --output ./artifacts --runId 42
+```
+
+**Note:** If no runs exist in storage, `evalite export` will automatically run evaluations first.
+
+## Global Flags
+
+All commands support these flags:
+
+- `--help` - Show help for the command
+- `--version` - Show version information
+
+## Configuration
+
+CLI behavior can be configured via [evalite.config.ts](/api/define-config):
+
+```typescript
+// evalite.config.ts
+import { defineConfig } from "evalite/config";
+
+export default defineConfig({
+ scoreThreshold: 80, // Default threshold for all runs
+ hideTable: true, // Hide table by default
+ server: {
+ port: 3006, // UI server port
+ },
+});
+```
+
+## See Also
+
+- [runEvalite()](/api/run-evalite) - Run evals programmatically from Node.js
+- [defineConfig()](/api/define-config) - Configure Evalite behavior
+- [Watch Mode](/tips/watch-mode) - Tips for using watch mode effectively
+- [CI/CD](/tips/run-evals-on-ci-cd) - Running evals in continuous integration
diff --git a/apps/docs/content/docs/api/create-scorer.mdx b/apps/docs/content/docs/api/create-scorer.mdx
new file mode 100644
index 00000000..143429a4
--- /dev/null
+++ b/apps/docs/content/docs/api/create-scorer.mdx
@@ -0,0 +1,219 @@
+---
+title: createScorer()
+---
+
+Create a reusable scorer function for evaluating LLM outputs.
+
+## Signature
+
+```typescript
+createScorer(opts: {
+ name: string;
+ description?: string;
+ scorer: (input: {
+ input: TInput;
+ output: TOutput;
+ expected?: TExpected;
+  }) => Promise<number | { score: number; metadata?: unknown }> | number | { score: number; metadata?: unknown };
+}): Scorer
+```
+
+## Parameters
+
+### `opts.name`
+
+**Type:** `string` (required)
+
+The name of the scorer. Displayed in the UI and test output.
+
+```typescript
+createScorer({
+ name: "Exact Match",
+ scorer: ({ output, expected }) => (output === expected ? 1 : 0),
+});
+```
+
+### `opts.description`
+
+**Type:** `string` (optional)
+
+A description of what the scorer evaluates. Helps document scoring logic.
+
+```typescript
+createScorer({
+ name: "Length Check",
+ description: "Checks if output is at least 10 characters",
+ scorer: ({ output }) => (output.length >= 10 ? 1 : 0),
+});
+```
+
+### `opts.scorer`
+
+**Type:** `(input: { input, output, expected }) => number | { score: number; metadata?: unknown }`
+
+The scoring function. Receives input, output, and expected values. Must return:
+
+- A number between 0 and 1, or
+- An object with `score` (0-1) and optional `metadata`
+
+```typescript
+createScorer({
+ name: "Word Count",
+ scorer: ({ output }) => {
+ const wordCount = output.split(" ").length;
+ return {
+ score: wordCount >= 10 ? 1 : 0,
+ metadata: { wordCount },
+ };
+ },
+});
+```
+
+## Return Value
+
+Returns a `Scorer` function that can be used in the `scorers` array of [evalite()](/api/evalite).
+
+## Usage
+
+### Basic Scorer
+
+```typescript
+import { createScorer, evalite } from "evalite";
+
+const exactMatch = createScorer({
+ name: "Exact Match",
+ scorer: ({ output, expected }) => {
+ return output === expected ? 1 : 0;
+ },
+});
+
+evalite("My Eval", {
+ data: [{ input: "Hello", expected: "Hi" }],
+ task: async (input) => callLLM(input),
+ scorers: [exactMatch],
+});
+```
+
+### Scorer with Metadata
+
+```typescript
+const lengthChecker = createScorer({
+ name: "Length Check",
+ description: "Validates output length is within acceptable range",
+ scorer: ({ output }) => {
+ const length = output.length;
+ const isValid = length >= 10 && length <= 100;
+
+ return {
+ score: isValid ? 1 : 0,
+ metadata: {
+ length,
+ minLength: 10,
+ maxLength: 100,
+ },
+ };
+ },
+});
+```
+
+### Async Scorer
+
+Scorers can be async for LLM-based evaluation:
+
+```typescript
+const llmScorer = createScorer({
+ name: "LLM Judge",
+ description: "Uses GPT-4 to evaluate output quality",
+ scorer: async ({ output, expected }) => {
+ const response = await openai.chat.completions.create({
+ model: "gpt-4",
+ messages: [
+ {
+ role: "system",
+ content: "Rate the output quality from 0 to 1.",
+ },
+ {
+ role: "user",
+ content: `Output: ${output}\nExpected: ${expected}`,
+ },
+ ],
+ });
+
+ const score = parseFloat(response.choices[0].message.content);
+ return score;
+ },
+});
+```
+
+### Reusable Scorers
+
+Create a library of scorers to reuse across evals:
+
+```typescript
+// scorers.ts
+import { createScorer } from "evalite";
+
+export const hasEmoji = createScorer({
+ name: "Has Emoji",
+ scorer: ({ output }) => (/\p{Emoji}/u.test(output) ? 1 : 0),
+});
+
+export const containsKeyword = (keyword: string) =>
+ createScorer({
+ name: `Contains "${keyword}"`,
+ scorer: ({ output }) => (output.includes(keyword) ? 1 : 0),
+ });
+
+// my-eval.eval.ts
+import { evalite } from "evalite";
+import { hasEmoji, containsKeyword } from "./scorers";
+
+evalite("My Eval", {
+ data: [{ input: "Hello" }],
+ task: async (input) => callLLM(input),
+ scorers: [hasEmoji, containsKeyword("greeting")],
+});
+```
+
+## Inline Scorers
+
+You can also define scorers inline without `createScorer()`:
+
+```typescript
+evalite("My Eval", {
+ data: [{ input: "Hello", expected: "Hi" }],
+ task: async (input) => callLLM(input),
+ scorers: [
+ // Inline scorer (same shape as createScorer opts)
+ {
+ name: "Exact Match",
+ scorer: ({ output, expected }) => (output === expected ? 1 : 0),
+ },
+ ],
+});
+```
+
+Both approaches are equivalent. Use `createScorer()` when you want to reuse the scorer across multiple evals.
+
+## Using Third-Party Scorers
+
+Evalite is compatible with scorers from [autoevals](https://github.com/braintrustdata/autoevals):
+
+```typescript
+import { evalite } from "evalite";
+import { Levenshtein, Factuality } from "autoevals";
+
+evalite("My Eval", {
+ data: [{ input: "Hello", expected: "Hi there!" }],
+ task: async (input) => callLLM(input),
+ scorers: [
+ Levenshtein, // String similarity
+ Factuality, // Fact checking
+ ],
+});
+```
+
+## See Also
+
+- [Scorers Guide](/guides/scorers) - Overview of scoring strategies
+- [evalite()](/api/evalite) - Using scorers in evals
diff --git a/apps/docs/content/docs/api/define-config.mdx b/apps/docs/content/docs/api/define-config.mdx
new file mode 100644
index 00000000..99b3f5a0
--- /dev/null
+++ b/apps/docs/content/docs/api/define-config.mdx
@@ -0,0 +1,219 @@
+---
+title: defineConfig()
+---
+
+Type-safe helper for defining Evalite configuration in `evalite.config.ts`.
+
+## Signature
+
+```typescript
+defineConfig(config: {
+  storage?: () => Evalite.Storage | Promise<Evalite.Storage>;
+ server?: {
+ port?: number;
+ };
+ scoreThreshold?: number;
+ hideTable?: boolean;
+ testTimeout?: number;
+ maxConcurrency?: number;
+ trialCount?: number;
+ setupFiles?: string[];
+}): Evalite.Config
+```
+
+## Usage
+
+Create an `evalite.config.ts` file in your project root:
+
+```typescript
+// evalite.config.ts
+import { defineConfig } from "evalite/config";
+
+export default defineConfig({
+ testTimeout: 60000,
+ maxConcurrency: 100,
+ scoreThreshold: 80,
+});
+```
+
+## Options
+
+### `storage`
+
+**Type:** `() => Evalite.Storage | Promise<Evalite.Storage>`
+
+**Default:** In-memory storage (data lost after process exits)
+
+Factory function to create a custom storage backend. Use `createSqliteStorage()` for persistent storage.
+
+```typescript
+import { defineConfig } from "evalite/config";
+import { createSqliteStorage } from "evalite/sqlite-storage";
+
+export default defineConfig({
+ storage: () => createSqliteStorage("./custom.db"),
+});
+```
+
+See [Storage](/api/storage) for more details.
+
+### `server.port`
+
+**Type:** `number`
+
+**Default:** `3006`
+
+Port for the Evalite UI server.
+
+```typescript
+export default defineConfig({
+ server: {
+ port: 8080,
+ },
+});
+```
+
+### `scoreThreshold`
+
+**Type:** `number` (0-100)
+
+**Default:** `100`
+
+Minimum average score threshold. If the average score falls below this threshold, the process will exit with code 1.
+
+```typescript
+export default defineConfig({
+ scoreThreshold: 80, // Fail if average score < 80
+});
+```
+
+Useful for CI/CD pipelines where you want to fail the build if evals don't meet a quality threshold.
+
+### `hideTable`
+
+**Type:** `boolean`
+
+**Default:** `false`
+
+Hide the detailed results table in terminal output. Keeps the score summary but removes the detailed table.
+
+```typescript
+export default defineConfig({
+ hideTable: true,
+});
+```
+
+Useful when debugging with `console.log` to see logs more clearly.
+
+### `testTimeout`
+
+**Type:** `number` (milliseconds)
+
+**Default:** `30000` (30 seconds)
+
+Maximum time a test can run before timing out.
+
+```typescript
+export default defineConfig({
+ testTimeout: 60000, // 60 seconds
+});
+```
+
+### `maxConcurrency`
+
+**Type:** `number`
+
+**Default:** `5`
+
+Maximum number of test cases to run in parallel.
+
+```typescript
+export default defineConfig({
+ maxConcurrency: 100, // Run up to 100 tests in parallel
+});
+```
+
+Useful for optimizing performance and managing API rate limits.
+
+### `trialCount`
+
+**Type:** `number`
+
+**Default:** `1`
+
+Number of times to run each test case. Useful for measuring variance in non-deterministic evaluations.
+
+```typescript
+export default defineConfig({
+ trialCount: 3, // Run each test case 3 times
+});
+```
+
+Can also be set per-eval in the [evalite()](/api/evalite) function.
+
+### `setupFiles`
+
+**Type:** `string[]`
+
+**Default:** `[]`
+
+Array of file paths to run before tests. Useful for loading custom environment setup.
+
+```typescript
+export default defineConfig({
+ setupFiles: ["./custom-setup.ts"],
+});
+```
+
+**Note:** `.env` files are loaded automatically via `dotenv/config` - no need to configure them here.
+
+## Complete Example
+
+```typescript
+// evalite.config.ts
+import { defineConfig } from "evalite/config";
+import { createSqliteStorage } from "evalite/sqlite-storage";
+
+export default defineConfig({
+ // Persistent storage
+ storage: () => createSqliteStorage("./evalite.db"),
+
+ // Server configuration
+ server: {
+ port: 3006,
+ },
+
+ // Quality threshold
+ scoreThreshold: 75,
+
+ // Test execution
+ testTimeout: 60000,
+ maxConcurrency: 50,
+ trialCount: 1,
+
+ // UI preferences
+ hideTable: false,
+
+ // Setup
+ setupFiles: ["./test-setup.ts"],
+});
+```
+
+## Supported File Names
+
+Evalite will look for configuration in these files (in order):
+
+- `evalite.config.ts`
+- `evalite.config.mts`
+- `evalite.config.js`
+- `evalite.config.mjs`
+
+## Vitest Integration
+
+Evalite no longer reads `vitest.config.ts` or `vite.config.ts` implicitly. To reuse an existing Vite config, import it in `evalite.config.ts` and pass it to `defineConfig` via the `viteConfig` option.
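+
+A sketch of that setup, assuming your existing Vite config lives at `./vite.config.ts`:
+
+```typescript
+// evalite.config.ts
+import { defineConfig } from "evalite/config";
+import viteConfig from "./vite.config.ts";
+
+export default defineConfig({
+  viteConfig,
+  // testTimeout, maxConcurrency, and setupFiles belong at the root level,
+  // not inside viteConfig.test
+  testTimeout: 60000,
+});
+```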
+
+## See Also
+
+- [Configuration Guide](/guides/configuration) - Overview of configuration options
+- [Storage](/api/storage) - Custom storage backends
+- [CLI](/api/cli) - Command-line flags that override config
diff --git a/apps/docs/content/docs/api/evalite-file.mdx b/apps/docs/content/docs/api/evalite-file.mdx
new file mode 100644
index 00000000..2cc15a0e
--- /dev/null
+++ b/apps/docs/content/docs/api/evalite-file.mdx
@@ -0,0 +1,246 @@
+---
+title: EvaliteFile
+---
+
+Utilities for working with images, audio, video, and other media files in evaluations.
+
+## Overview
+
+`EvaliteFile` provides methods for referencing files in your evals. Evalite automatically handles file storage and display in the UI.
+
+## Methods
+
+### `EvaliteFile.fromPath()`
+
+Reference a file on disk without loading it into memory.
+
+**Signature:**
+
+```typescript
+EvaliteFile.fromPath(path: string): Evalite.File
+```
+
+**Parameters:**
+
+- `path` - File path relative to your project root or absolute path
+
+**Returns:** An `Evalite.File` object that can be used in data, task outputs, traces, or columns.
+
+**Example:**
+
+```typescript
+import { evalite, EvaliteFile } from "evalite";
+
+evalite("Image Analysis", {
+ data: [
+ {
+ input: EvaliteFile.fromPath("./images/cat.jpg"),
+ expected: "A cat sitting on a couch",
+ },
+ ],
+ task: async (input) => {
+ console.log(input.path); // "./images/cat.jpg"
+
+ // Use the file path with your LLM
+ const response = await analyzeLLM(input.path);
+ return response;
+ },
+ scorers: [],
+});
+```
+
+### `EvaliteFile.isEvaliteFile()`
+
+Check if a value is an `Evalite.File` object.
+
+**Signature:**
+
+```typescript
+EvaliteFile.isEvaliteFile(value: unknown): value is Evalite.File
+```
+
+**Example:**
+
+```typescript
+import { EvaliteFile } from "evalite";
+
+const file = EvaliteFile.fromPath("./image.jpg");
+console.log(EvaliteFile.isEvaliteFile(file)); // true
+console.log(EvaliteFile.isEvaliteFile("./image.jpg")); // false
+```
+
+## Automatic File Detection
+
+Evalite automatically detects and handles `Uint8Array` (Buffer) objects without requiring `EvaliteFile`:
+
+```typescript
+import { evalite } from "evalite";
+import { readFileSync } from "fs";
+
+evalite("Image Eval", {
+ data: [
+ {
+ // Evalite automatically handles Buffers
+ input: readFileSync("./image.jpg"),
+ expected: readFileSync("./expected.jpg"),
+ },
+ ],
+ task: async (input) => {
+ // Return a Buffer - Evalite handles it automatically
+ return readFileSync("./output.jpg");
+ },
+ scorers: [],
+});
+```
+
+When Evalite detects a `Uint8Array`, it:
+
+1. Saves the file to `./node_modules/.evalite/files/`
+2. References the cached file in the UI
+3. Displays the file based on its type (image, audio, video, etc.)
+
+## Usage in Different Contexts
+
+### In Data (Input/Expected)
+
+```typescript
+evalite("My Eval", {
+ data: [
+ {
+ input: EvaliteFile.fromPath("./input.jpg"),
+ expected: EvaliteFile.fromPath("./expected.jpg"),
+ },
+ ],
+ task: async (input) => {
+ // ...
+ },
+});
+```
+
+### In Task Output
+
+```typescript
+evalite("My Eval", {
+ data: [{ input: "Generate an image" }],
+ task: async (input) => {
+ const imageBuffer = await generateImage(input);
+ // Return Buffer or EvaliteFile
+ return imageBuffer; // Automatically handled
+ },
+});
+```
+
+### In Traces
+
+```typescript
+import { reportTrace } from "evalite/traces";
+
+evalite("My Eval", {
+ data: [{ input: "Hello" }],
+ task: async (input) => {
+ const imageInput = readFileSync("./input.jpg");
+
+ reportTrace({
+ input: imageInput, // File in trace
+ output: "Analysis complete",
+ });
+
+ return "Done";
+ },
+});
+```
+
+### In Columns
+
+```typescript
+evalite("My Eval", {
+ data: [{ input: "Hello" }],
+ task: async (input) => {
+ return "Output";
+ },
+ columns: () => [
+ {
+ label: "Debug Image",
+ value: EvaliteFile.fromPath("./debug.jpg"),
+ },
+ ],
+});
+```
+
+## When to Use fromPath() vs Buffers
+
+**Use `EvaliteFile.fromPath()` when:**
+
+- File is already on disk
+- Want to avoid loading large files into memory
+- Need to reference the file path in your task
+
+**Use Buffers (automatic detection) when:**
+
+- File is generated in memory
+- File comes from an API response
+- Working with base64 or other in-memory formats
+
+## Complete Example
+
+```typescript
+import { evalite, EvaliteFile } from "evalite";
+import { readFileSync } from "fs";
+import { reportTrace } from "evalite/traces";
+
+evalite("Multi-Modal Analysis", {
+ data: async () => {
+ return [
+ {
+ // Mix of file references and buffers
+ input: {
+ image: EvaliteFile.fromPath("./images/cat.jpg"),
+ audio: readFileSync("./audio/meow.mp3"),
+ },
+ expected: "A cat meowing",
+ },
+ ];
+ },
+ task: async (input) => {
+ // Trace with file
+ reportTrace({
+ input: input.image,
+ output: "Processing...",
+ });
+
+ const result = await analyzeMultiModal(input);
+
+ return result;
+ },
+ columns: ({ output }) => [
+ {
+ label: "Visualization",
+ value: readFileSync("./viz.png"),
+ },
+ ],
+ scorers: [
+ {
+ name: "Match",
+ scorer: ({ output, expected }) => {
+ return output === expected ? 1 : 0;
+ },
+ },
+ ],
+});
+```
+
+## File Storage
+
+All files (whether from `EvaliteFile.fromPath()` or auto-detected Buffers) are stored in:
+
+```
+./node_modules/.evalite/files/
+```
+
+This cache is gitignored by default. Files are referenced by content hash to avoid duplicates.
+
+## See Also
+
+- [Images and Media Guide](/tips/images-and-media) - Working with multi-modal data
+- [evalite()](/api/evalite) - Main evaluation function
+- [Traces](/api/traces) - Adding traces to track nested calls
diff --git a/apps/docs/content/docs/api/evalite.mdx b/apps/docs/content/docs/api/evalite.mdx
new file mode 100644
index 00000000..4b6ddc32
--- /dev/null
+++ b/apps/docs/content/docs/api/evalite.mdx
@@ -0,0 +1,244 @@
+---
+title: evalite()
+---
+
+The main function for defining evaluations in `.eval.ts` files.
+
+## Signature
+
+```typescript
+evalite(
+ evalName: string,
+ opts: {
+ data: Array<{ input: TInput; expected?: TExpected; only?: boolean }>
+      | (() => Promise<Array<{ input: TInput; expected?: TExpected; only?: boolean }>>);
+    task: (input: TInput) => Promise<TOutput> | TOutput;
+    scorers?: Array<Scorer | ScorerOpts>;
+    columns?: (opts: { input: TInput; output: TOutput; expected?: TExpected }) =>
+      Promise<Array<{ label: string; value: unknown }>> |
+      Array<{ label: string; value: unknown }>;
+ trialCount?: number;
+ }
+): void
+```
+
+## Parameters
+
+### `evalName`
+
+**Type:** `string`
+
+The name of your evaluation. This appears in the UI and test output.
+
+```typescript
+evalite("Greeting Generator", {
+ // ...
+});
+```
+
+### `opts.data`
+
+**Type:** `Array<{ input: TInput; expected?: TExpected; only?: boolean }>` or `() => Promise<Array<{ input: TInput; expected?: TExpected; only?: boolean }>>`
+
+The dataset for your evaluation. Each item becomes a separate test case.
+
+Can be an array or an async function that returns an array.
+
+```typescript
+// Static array
+evalite("My Eval", {
+ data: [
+ { input: "Hello", expected: "Hi there!" },
+ { input: "Goodbye", expected: "See you later!" },
+ ],
+ // ...
+});
+
+// Async function
+evalite("My Eval", {
+ data: async () => {
+ const dataset = await fetch("/api/dataset").then((r) => r.json());
+ return dataset;
+ },
+ // ...
+});
+```
+
+**`only` flag:** Mark specific data points to run exclusively during development:
+
+```typescript
+evalite("My Eval", {
+ data: [
+ { input: "test1", only: true }, // Only this will run
+ { input: "test2" },
+ { input: "test3" },
+ ],
+ // ...
+});
+```
+
+### `opts.task`
+
+**Type:** `(input: TInput) => Promise<TOutput> | TOutput`
+
+The function to test. Receives input from data, returns output to be scored.
+
+```typescript
+evalite("My Eval", {
+ data: [{ input: "Hello" }],
+ task: async (input) => {
+ const response = await openai.chat.completions.create({
+ model: "gpt-4",
+ messages: [{ role: "user", content: input }],
+ });
+ return response.choices[0].message.content;
+ },
+ // ...
+});
+```
+
+### `opts.scorers`
+
+**Type:** `Array<Scorer | ScorerOpts>` (optional)
+
+Functions that evaluate the output quality. Each scorer returns a score between 0 and 1.
+
+```typescript
+evalite("My Eval", {
+ data: [{ input: "Hello", expected: "Hi" }],
+ task: async (input) => callLLM(input),
+ scorers: [
+ // Inline scorer
+ {
+ name: "Exact Match",
+ scorer: ({ output, expected }) => {
+ return output === expected ? 1 : 0;
+ },
+ },
+ // Using createScorer
+ createScorer({
+ name: "Length Check",
+ scorer: ({ output }) => {
+ return output.length > 10 ? 1 : 0;
+ },
+ }),
+ ],
+});
+```
+
+See [createScorer()](/api/create-scorer) for more details.
+
+### `opts.columns`
+
+**Type:** `(opts: { input, output, expected }) => Promise<Array<{ label, value }>> | Array<{ label, value }>` (optional)
+
+Custom columns to display in the UI alongside input/output/expected.
+
+```typescript
+evalite("My Eval", {
+ data: [{ input: "Hello" }],
+ task: async (input) => callLLM(input),
+ columns: ({ output }) => [
+ { label: "Word Count", value: output.split(" ").length },
+ { label: "Has Emoji", value: /\p{Emoji}/u.test(output) },
+ ],
+});
+```
+
+### `opts.trialCount`
+
+**Type:** `number` (optional, default: `1`)
+
+Number of times to run each test case. Useful for measuring variance in non-deterministic evaluations.
+
+```typescript
+evalite("My Eval", {
+ data: [{ input: "Hello" }],
+ task: async (input) => callLLM(input),
+ trialCount: 5, // Run each data point 5 times
+});
+```
+
+Can also be set globally in [defineConfig()](/api/define-config).
+
+## Methods
+
+### `evalite.skip()`
+
+Skip an entire evaluation.
+
+```typescript
+evalite.skip("My Eval", {
+ data: [{ input: "Hello" }],
+ task: async (input) => callLLM(input),
+});
+```
+
+### `evalite.each()`
+
+Run the same evaluation with different variants (e.g., comparing models or prompts).
+
+```typescript
+evalite.each([
+ { name: "gpt-4", input: "gpt-4" },
+ { name: "gpt-3.5-turbo", input: "gpt-3.5-turbo" },
+])("Model Comparison", {
+ data: [{ input: "Hello" }],
+ task: async (input, model) => {
+ const response = await openai.chat.completions.create({
+ model,
+ messages: [{ role: "user", content: input }],
+ });
+ return response.choices[0].message.content;
+ },
+});
+```
+
+**`only` flag:** Mark specific variants to run exclusively during development:
+
+```typescript
+evalite.each([
+ { name: "gpt-4", input: "gpt-4" },
+ { name: "gpt-3.5-turbo", input: "gpt-3.5-turbo", only: true }, // Only this variant will run
+])("Model Comparison", {
+ data: [{ input: "Hello" }],
+ task: async (input, model) => {
+ // ...
+ },
+});
+```
+
+See [Comparing Different Approaches](/tips/comparing-different-approaches) for more details.
+
+## Example
+
+```typescript
+// example.eval.ts
+import { evalite } from "evalite";
+import { Levenshtein } from "autoevals";
+
+evalite("Greeting Generator", {
+ data: async () => {
+ return [
+ { input: "Hello", expected: "Hi there!" },
+ { input: "Good morning", expected: "Good morning to you!" },
+ { input: "Howdy", expected: "Howdy partner!" },
+ ];
+ },
+ task: async (input) => {
+ const response = await openai.chat.completions.create({
+ model: "gpt-4",
+ messages: [
+ {
+ role: "system",
+ content: "Generate a friendly greeting response.",
+ },
+ { role: "user", content: input },
+ ],
+ });
+ return response.choices[0].message.content;
+ },
+ scorers: [Levenshtein],
+ columns: ({ output }) => [{ label: "Length", value: output.length }],
+});
+```
diff --git a/apps/docs/content/docs/api/meta.json b/apps/docs/content/docs/api/meta.json
new file mode 100644
index 00000000..2f4a1912
--- /dev/null
+++ b/apps/docs/content/docs/api/meta.json
@@ -0,0 +1,13 @@
+{
+ "title": "Reference",
+ "pages": [
+ "evalite",
+ "cli",
+ "define-config",
+ "create-scorer",
+ "evalite-file",
+ "traces",
+ "run-evalite",
+ "storage"
+ ]
+}
diff --git a/apps/docs/content/docs/api/run-evalite.mdx b/apps/docs/content/docs/api/run-evalite.mdx
new file mode 100644
index 00000000..b9f3912f
--- /dev/null
+++ b/apps/docs/content/docs/api/run-evalite.mdx
@@ -0,0 +1,294 @@
+---
+title: runEvalite()
+---
+
+Run evaluations programmatically from Node.js scripts or custom tooling.
+
+## Signature
+
+```typescript
+runEvalite(opts: {
+ mode: "run-once-and-exit" | "watch-for-file-changes" | "run-once-and-serve";
+ path?: string;
+ cwd?: string;
+ scoreThreshold?: number;
+ outputPath?: string;
+ hideTable?: boolean;
+ storage?: Evalite.Storage;
+}): Promise<void>
+```
+
+## Parameters
+
+### `opts.mode`
+
+**Type:** `"run-once-and-exit" | "watch-for-file-changes" | "run-once-and-serve"` (required)
+
+The execution mode for running evals.
+
+**Modes:**
+
+- `"run-once-and-exit"` - Run evals once and exit. Ideal for CI/CD pipelines.
+- `"watch-for-file-changes"` - Watch for file changes and re-run automatically. Starts the UI server.
+- `"run-once-and-serve"` - Run evals once and serve the UI without watching for changes.
+
+```typescript
+import { runEvalite } from "evalite/runner";
+
+// CI/CD mode
+await runEvalite({
+ mode: "run-once-and-exit",
+});
+
+// Development mode with watch
+await runEvalite({
+ mode: "watch-for-file-changes",
+});
+
+// Run once and keep UI open
+await runEvalite({
+ mode: "run-once-and-serve",
+});
+```
+
+### `opts.path`
+
+**Type:** `string` (optional)
+
+Path filter to run specific eval files. If not provided, runs all `.eval.ts` files.
+
+```typescript
+await runEvalite({
+ mode: "run-once-and-exit",
+ path: "my-eval.eval.ts",
+});
+```
+
+### `opts.cwd`
+
+**Type:** `string` (optional)
+
+The working directory to run evals from. Defaults to `process.cwd()`.
+
+```typescript
+await runEvalite({
+ mode: "run-once-and-exit",
+ cwd: "/path/to/my/project",
+});
+```
+
+### `opts.scoreThreshold`
+
+**Type:** `number` (optional, 0-100)
+
+Minimum average score threshold. If the average score falls below this threshold, the process will exit with code 1.
+
+```typescript
+await runEvalite({
+ mode: "run-once-and-exit",
+ scoreThreshold: 80, // Fail if average score < 80
+});
+```
+
+Useful for CI/CD pipelines where you want to fail the build if evals don't meet a quality threshold.
+
+### `opts.outputPath`
+
+**Type:** `string` (optional)
+
+Path to write test results in JSON format after evaluation completes.
+
+```typescript
+await runEvalite({
+ mode: "run-once-and-exit",
+ outputPath: "./results.json",
+});
+```
+
+The exported JSON contains the complete run data including all evals, results, scores, and traces.
+
+**Note:** Not supported in `watch-for-file-changes` mode.
+
+### `opts.hideTable`
+
+**Type:** `boolean` (optional, default: `false`)
+
+Hide the detailed results table in terminal output. Keeps the score summary but removes the detailed table.
+
+```typescript
+await runEvalite({
+ mode: "watch-for-file-changes",
+ hideTable: true, // Useful for debugging with console.log
+});
+```
+
+### `opts.storage`
+
+**Type:** `Evalite.Storage` (optional)
+
+Custom storage backend instance. If not provided, uses the storage from `evalite.config.ts` or defaults to in-memory storage.
+
+```typescript
+import { runEvalite } from "evalite/runner";
+import { createSqliteStorage } from "evalite/sqlite-storage";
+
+await runEvalite({
+ mode: "run-once-and-exit",
+ storage: createSqliteStorage("./custom.db"),
+});
+```
+
+See [Storage](/api/storage) for more details.
+
+## Usage Examples
+
+### Basic CI/CD Script
+
+```typescript
+import { runEvalite } from "evalite/runner";
+
+async function runTests() {
+ try {
+ await runEvalite({
+ mode: "run-once-and-exit",
+ scoreThreshold: 75,
+ outputPath: "./results.json",
+ });
+ console.log("All evals passed!");
+ } catch (error) {
+ console.error("Evals failed:", error);
+ process.exit(1);
+ }
+}
+
+runTests();
+```
+
+### Development Script
+
+```typescript
+import { runEvalite } from "evalite/runner";
+
+// Run specific eval in watch mode
+await runEvalite({
+ mode: "watch-for-file-changes",
+ path: "chat.eval.ts",
+ hideTable: true,
+});
+```
+
+### Custom Storage
+
+```typescript
+import { runEvalite } from "evalite/runner";
+import { createSqliteStorage } from "evalite/sqlite-storage";
+
+const storage = createSqliteStorage("./evalite.db");
+
+await runEvalite({
+ mode: "run-once-and-exit",
+ storage,
+});
+```
+
+### Multi-Environment Testing
+
+```typescript
+import { runEvalite } from "evalite/runner";
+
+const environments = [
+ { name: "staging", url: "https://staging.example.com" },
+ { name: "production", url: "https://example.com" },
+];
+
+for (const env of environments) {
+ console.log(`Running evals for ${env.name}...`);
+
+ process.env.API_URL = env.url;
+
+ await runEvalite({
+ mode: "run-once-and-exit",
+ scoreThreshold: 80,
+ outputPath: `./results-${env.name}.json`,
+ });
+}
+```
+
+### Parallel Eval Execution
+
+```typescript
+import { runEvalite } from "evalite/runner";
+
+// Run multiple eval sets in parallel
+await Promise.all([
+ runEvalite({
+ mode: "run-once-and-exit",
+ path: "chat.eval.ts",
+ }),
+ runEvalite({
+ mode: "run-once-and-exit",
+ path: "completion.eval.ts",
+ }),
+]);
+```
+
+## Configuration Priority
+
+Options merge in this order (highest to lowest priority):
+
+1. Function arguments (`opts`)
+2. Config file (`evalite.config.ts`)
+3. Defaults
+
+Example:
+
+```typescript
+// evalite.config.ts
+export default defineConfig({
+ scoreThreshold: 70,
+ hideTable: true,
+});
+
+// script.ts
+await runEvalite({
+ mode: "run-once-and-exit",
+ scoreThreshold: 80, // Overrides config (80 used)
+ // hideTable not specified, uses config (true)
+});
+```
+
+## Error Handling
+
+The function throws an error if:
+
+- Evals fail to run
+- Score threshold is not met
+- Invalid options are provided
+
+```typescript
+try {
+ await runEvalite({
+ mode: "run-once-and-exit",
+ scoreThreshold: 90,
+ });
+} catch (error) {
+ console.error("Eval run failed:", error);
+ // Handle error or exit
+ process.exit(1);
+}
+```
+
+## Return Value
+
+Returns a `Promise`. The function completes when:
+
+- `run-once-and-exit`: All evals finish
+- `watch-for-file-changes`: Never (runs indefinitely)
+- `run-once-and-serve`: All evals finish, but UI server keeps process alive
+
+## See Also
+
+- [CLI](/api/cli) - Command-line interface
+- [defineConfig()](/api/define-config) - Configuration file
+- [Storage](/api/storage) - Custom storage backends
+- [Run Evals Programmatically Guide](/tips/run-evals-programmatically) - More examples
diff --git a/apps/docs/content/docs/api/storage.mdx b/apps/docs/content/docs/api/storage.mdx
new file mode 100644
index 00000000..1d3b4462
--- /dev/null
+++ b/apps/docs/content/docs/api/storage.mdx
@@ -0,0 +1,335 @@
+---
+title: Storage
+---
+
+Storage backends for persisting evaluation results. Evalite provides built-in SQLite and in-memory storage, plus a Storage interface for custom implementations.
+
+## Built-in Storage
+
+### `createSqliteStorage()`
+
+Create a SQLite storage backend for persistent storage.
+
+**Signature:**
+
+```typescript
+createSqliteStorage(dbLocation: string): Promise<SqliteStorage>
+```
+
+**Parameters:**
+
+- `dbLocation` - Path to the SQLite database file (e.g., `"./evalite.db"`)
+
+**Returns:** A Promise that resolves to a `SqliteStorage` instance implementing the `Evalite.Storage` interface.
+
+**Example:**
+
+```typescript
+import { defineConfig } from "evalite/config";
+import { createSqliteStorage } from "evalite/sqlite-storage";
+
+export default defineConfig({
+ storage: () => createSqliteStorage("./evalite.db"),
+});
+```
+
+**Features:**
+
+- Persistent storage across runs
+- Automatic schema management
+- History tracking for comparing runs
+- Opt-in via config (in-memory storage is the default)
+
+### `createInMemoryStorage()`
+
+Create an in-memory storage backend. Data is lost when the process exits.
+
+**Signature:**
+
+```typescript
+createInMemoryStorage(): InMemoryStorage
+```
+
+**Returns:** An `InMemoryStorage` instance implementing the `Evalite.Storage` interface.
+
+**Example:**
+
+```typescript
+import { defineConfig } from "evalite/config";
+import { createInMemoryStorage } from "evalite/in-memory-storage";
+
+export default defineConfig({
+ storage: () => createInMemoryStorage(),
+});
+```
+
+**Features:**
+
+- Fast (no I/O operations)
+- No persistence
+- Useful for testing or ephemeral runs
+
+## Storage Interface
+
+The `Evalite.Storage` interface allows you to implement custom storage backends (e.g., PostgreSQL, Turso, cloud storage).
+
+### Interface Definition
+
+```typescript
+interface Storage {
+ runs: {
+    create(opts: CreateOpts): Promise<Run>;
+    getMany(opts?: GetManyOpts): Promise<Run[]>;
+  };
+
+  suites: {
+    create(opts: CreateOpts): Promise<Suite>;
+    update(opts: UpdateOpts): Promise<Suite>;
+    getMany(opts?: GetManyOpts): Promise<Suite[]>;
+  };
+
+  evals: {
+    create(opts: CreateOpts): Promise<Eval>;
+    update(opts: UpdateOpts): Promise<Eval>;
+    getMany(opts?: GetManyOpts): Promise<Eval[]>;
+  };
+
+  scores: {
+    create(opts: CreateOpts): Promise<Score>;
+    getMany(opts?: GetManyOpts): Promise<Score[]>;
+  };
+
+  traces: {
+    create(opts: CreateOpts): Promise<Trace>;
+    getMany(opts?: GetManyOpts): Promise<Trace[]>;
+  };
+
+  close(): Promise<void>;
+  [Symbol.asyncDispose](): Promise<void>;
+}
+```
+
+### Entity Types
+
+Storage backends must return these entity types:
+
+**Run:**
+
+```typescript
+type Run = {
+ id: number;
+ runType: "full" | "partial";
+ created_at: string; // ISO 8601 timestamp
+};
+```
+
+**Suite:**
+
+```typescript
+type Suite = {
+ id: number;
+ run_id: number;
+ name: string;
+ status: "fail" | "success" | "running";
+ filepath: string;
+ duration: number; // milliseconds
+ created_at: string;
+ variant_name?: string;
+ variant_group?: string;
+};
+```
+
+**Eval:**
+
+```typescript
+type Eval = {
+ id: number;
+ suite_id: number;
+ duration: number; // milliseconds
+ input: unknown;
+ output: unknown;
+ expected?: unknown;
+ created_at: string;
+ col_order: number;
+ status: "fail" | "success" | "running";
+ rendered_columns?: unknown;
+ trial_index?: number | null;
+};
+```
+
+**Score:**
+
+```typescript
+type Score = {
+ id: number;
+ eval_id: number;
+ name: string;
+ score: number; // 0-1
+ description?: string;
+ metadata?: unknown;
+ created_at: string;
+};
+```
+
+**Trace:**
+
+```typescript
+type Trace = {
+ id: number;
+ eval_id: number;
+ input: unknown;
+ output: unknown;
+ usage?: {
+ inputTokens: number;
+ outputTokens: number;
+ totalTokens: number;
+ };
+ start: number; // timestamp
+ end: number; // timestamp
+ created_at: string;
+};
+```
+
+## Implementing Custom Storage
+
+Create a class that implements the `Evalite.Storage` interface:
+
+```typescript
+import type { Evalite } from "evalite/types";
+
+export class PostgresStorage implements Evalite.Storage {
+ constructor(private connectionString: string) {}
+
+ runs = {
+ async create(opts: Evalite.Storage.Runs.CreateOpts) {
+ // Insert run into Postgres
+ // Return Evalite.Storage.Entities.Run
+ },
+ async getMany(opts?: Evalite.Storage.Runs.GetManyOpts) {
+ // Query runs from Postgres
+ // Return Evalite.Storage.Entities.Run[]
+ },
+ };
+
+ suites = {
+ async create(opts: Evalite.Storage.Suites.CreateOpts) {
+ // ...
+ },
+ async update(opts: Evalite.Storage.Suites.UpdateOpts) {
+ // ...
+ },
+ async getMany(opts?: Evalite.Storage.Suites.GetManyOpts) {
+ // ...
+ },
+ };
+
+ evals = {
+ async create(opts: Evalite.Storage.Evals.CreateOpts) {
+ // ...
+ },
+ async update(opts: Evalite.Storage.Evals.UpdateOpts) {
+ // ...
+ },
+ async getMany(opts?: Evalite.Storage.Evals.GetManyOpts) {
+ // ...
+ },
+ };
+
+ scores = {
+ async create(opts: Evalite.Storage.Scores.CreateOpts) {
+ // ...
+ },
+ async getMany(opts?: Evalite.Storage.Scores.GetManyOpts) {
+ // ...
+ },
+ };
+
+ traces = {
+ async create(opts: Evalite.Storage.Traces.CreateOpts) {
+ // ...
+ },
+ async getMany(opts?: Evalite.Storage.Traces.GetManyOpts) {
+ // ...
+ },
+ };
+
+ async close() {
+ // Close database connection
+ }
+
+ async [Symbol.asyncDispose]() {
+ await this.close();
+ }
+}
+
+// Factory function
+export const createPostgresStorage = (
+ connectionString: string
+): PostgresStorage => {
+ return new PostgresStorage(connectionString);
+};
+```
+
+### Using Custom Storage
+
+```typescript
+// evalite.config.ts
+import { defineConfig } from "evalite/config";
+import { createPostgresStorage } from "./postgres-storage";
+
+export default defineConfig({
+ storage: () => createPostgresStorage(process.env.DATABASE_URL),
+});
+```
+
+## Storage Lifecycle
+
+Storage instances are managed using the `await using` syntax:
+
+```typescript
+import { createSqliteStorage } from "evalite/sqlite-storage";
+
+await using storage = await createSqliteStorage("./evalite.db");
+
+// Use storage...
+// Automatically closed when leaving scope
+```
+
+Implement `[Symbol.asyncDispose]()` to ensure proper cleanup.
+
+## Query Options
+
+### Common Query Patterns
+
+**Get latest run:**
+
+```typescript
+const runs = await storage.runs.getMany({ limit: 1 });
+const latestRun = runs[0];
+```
+
+**Get suites for a run:**
+
+```typescript
+const suites = await storage.suites.getMany({ run_id: runId });
+```
+
+**Get evals with scores:**
+
+```typescript
+const evals = await storage.evals.getMany({ suite_id: suiteId });
+const scores = await storage.scores.getMany({ eval_id: evalId });
+```
+
+## Best Practices
+
+1. **Use SQLite for persistence** - Recommended when you want results and run history to survive across runs
+2. **Use in-memory for CI** - The default; faster, with no cleanup needed
+3. **Implement proper cleanup** - Use `close()` and `[Symbol.asyncDispose]()`
+4. **Handle JSON fields** - input/output/expected/metadata are stored as JSON (see the sketch after this list)
+5. **Index appropriately** - Optimize queries for run_id, suite_id, eval_id lookups
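+
+A minimal sketch of point 4, assuming a `better-sqlite3` backend and a hypothetical `evals` table:
+
+```typescript
+import Database from "better-sqlite3";
+
+const db = new Database("./custom.db");
+
+// Serialize arbitrary values to JSON text when writing...
+function insertEval(suiteId: number, input: unknown, output: unknown) {
+  db.prepare(
+    "INSERT INTO evals (suite_id, input, output) VALUES (?, ?, ?)"
+  ).run(suiteId, JSON.stringify(input), JSON.stringify(output));
+}
+
+// ...and parse them back when reading.
+function getEvals(suiteId: number) {
+  return db
+    .prepare("SELECT * FROM evals WHERE suite_id = ?")
+    .all(suiteId)
+    .map((row: any) => ({
+      ...row,
+      input: JSON.parse(row.input),
+      output: JSON.parse(row.output),
+    }));
+}
+```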
+
+## See Also
+
+- [defineConfig()](/api/define-config) - Configure storage in config file
+- [runEvalite()](/api/run-evalite) - Pass storage instance programmatically
diff --git a/apps/docs/content/docs/api/traces.mdx b/apps/docs/content/docs/api/traces.mdx
new file mode 100644
index 00000000..08867794
--- /dev/null
+++ b/apps/docs/content/docs/api/traces.mdx
@@ -0,0 +1,296 @@
+---
+title: Traces
+---
+
+Track nested LLM calls and intermediate steps within your evaluations.
+
+## Overview
+
+Traces allow you to record individual LLM calls or processing steps that occur during task execution. They appear in the Evalite UI alongside your main input/output, helping you debug and understand the full execution flow.
+
+## Functions
+
+### `reportTrace()`
+
+Manually report a trace for custom LLM calls or processing steps.
+
+**Signature:**
+
+```typescript
+reportTrace(trace: {
+ input: unknown;
+ output: unknown;
+ usage?: {
+ inputTokens: number;
+ outputTokens: number;
+ totalTokens: number;
+ };
+ start?: number;
+ end?: number;
+}): void
+```
+
+**Parameters:**
+
+- `input` - The input to the operation (e.g., prompt, messages)
+- `output` - The output from the operation (e.g., LLM response)
+- `usage` (optional) - Token usage statistics
+- `start` (optional) - Start timestamp (milliseconds). Defaults to current time.
+- `end` (optional) - End timestamp (milliseconds). Defaults to current time.
+
+**Example:**
+
+```typescript
+import { evalite } from "evalite";
+import { reportTrace } from "evalite/traces";
+
+evalite("Multi-Step Analysis", {
+ data: [{ input: "Analyze this text" }],
+ task: async (input) => {
+ // First LLM call
+ reportTrace({
+ input: { prompt: "Summarize: " + input },
+ output: { text: "Summary of the text" },
+ usage: {
+ inputTokens: 50,
+ outputTokens: 20,
+ totalTokens: 70,
+ },
+ });
+
+ // Second LLM call
+ reportTrace({
+ input: { prompt: "Translate to Spanish: Summary of the text" },
+ output: { text: "Resumen del texto" },
+ usage: {
+ inputTokens: 30,
+ outputTokens: 15,
+ totalTokens: 45,
+ },
+ });
+
+ return "Final result";
+ },
+ scorers: [],
+});
+```
+
+**Usage with timestamps:**
+
+```typescript
+const start = performance.now();
+const result = await callLLM(input);
+const end = performance.now();
+
+reportTrace({
+ input,
+ output: result,
+ start,
+ end,
+});
+```
+
+### `traceAISDKModel()`
+
+Automatically trace all calls made with a Vercel AI SDK model.
+
+**Signature:**
+
+```typescript
+traceAISDKModel(model: LanguageModelV2): LanguageModelV2
+```
+
+**Parameters:**
+
+- `model` - A Vercel AI SDK language model (from `@ai-sdk/openai`, etc.)
+
+**Returns:** A wrapped model that automatically reports traces.
+
+**Example:**
+
+```typescript
+import { evalite } from "evalite";
+import { traceAISDKModel } from "evalite/ai-sdk";
+import { openai } from "@ai-sdk/openai";
+import { generateText } from "ai";
+
+// Wrap your model
+const tracedModel = traceAISDKModel(openai("gpt-4"));
+
+evalite("AI SDK Eval", {
+ data: [{ input: "Hello" }],
+ task: async (input) => {
+ // All calls with this model are automatically traced
+ const result = await generateText({
+ model: tracedModel,
+ prompt: input,
+ });
+
+ return result.text;
+ },
+ scorers: [],
+});
+```
+
+**With streaming:**
+
+```typescript
+import { streamText } from "ai";
+
+const tracedModel = traceAISDKModel(openai("gpt-4"));
+
+evalite("Streaming Eval", {
+ data: [{ input: "Hello" }],
+ task: async (input) => {
+ const result = await streamText({
+ model: tracedModel,
+ prompt: input,
+ });
+
+ // Process the stream before returning
+ const text = await result.text;
+ return text;
+ },
+ scorers: [],
+});
+```
+
+## Enabling Traces
+
+Traces are only recorded when the `EVALITE_REPORT_TRACES` environment variable is set:
+
+```bash
+EVALITE_REPORT_TRACES=true evalite watch
+```
+
+Or in your `.env` file:
+
+```
+EVALITE_REPORT_TRACES=true
+```
+
+This prevents unnecessary overhead when traces aren't needed.
+
+## What Gets Traced
+
+### With `reportTrace()`
+
+You control exactly what gets traced:
+
+```typescript
+reportTrace({
+ input: "Whatever you want to log",
+ output: { any: "data structure" },
+});
+```
+
+### With `traceAISDKModel()`
+
+Automatically traces:
+
+- Full prompt/messages
+- Model responses (text and tool calls)
+- Token usage
+- Timing information
+
+## Viewing Traces in the UI
+
+Traces appear in the Evalite UI under each test case:
+
+1. Navigate to an eval result
+2. Click on a specific test case
+3. View the "Traces" section to see all nested calls
+4. Inspect input, output, and timing for each trace
+
+## Complete Example
+
+```typescript
+import { evalite } from "evalite";
+import { reportTrace } from "evalite/traces";
+import { traceAISDKModel } from "evalite/ai-sdk";
+import { openai } from "@ai-sdk/openai";
+import { generateText } from "ai";
+
+const tracedModel = traceAISDKModel(openai("gpt-4"));
+
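+// Hypothetical helpers, stubbed here so the example is self-contained:
+const extractIntent = async (query: string) => `question about: ${query}`;
+const formatResponse = (text: string) => text.trim();
+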
+evalite("Research Agent", {
+ data: [
+ {
+ input: "What is the capital of France?",
+ expected: "Paris",
+ },
+ ],
+ task: async (input) => {
+ // Step 1: Extract intent (manually traced)
+ const intent = await extractIntent(input);
+ reportTrace({
+ input: { query: input },
+ output: { intent },
+ });
+
+ // Step 2: Generate response (automatically traced via AI SDK)
+ const result = await generateText({
+ model: tracedModel,
+ prompt: `Answer this question: ${input}`,
+ });
+
+ // Step 3: Format result (manually traced)
+ const formatted = formatResponse(result.text);
+ reportTrace({
+ input: { raw: result.text },
+ output: { formatted },
+ });
+
+ return formatted;
+ },
+ scorers: [
+ {
+ name: "Exact Match",
+ scorer: ({ output, expected }) => {
+ return output === expected ? 1 : 0;
+ },
+ },
+ ],
+});
+```
+
+## Best Practices
+
+1. **Use `traceAISDKModel()` for AI SDK calls** - Automatic tracing with rich context
+2. **Use `reportTrace()` for custom logic** - Track non-LLM steps (parsing, validation, etc.)
+3. **Include usage data when available** - Helps track costs and performance
+4. **Keep trace data relevant** - Don't trace every small operation, focus on meaningful steps
+5. **Enable only when needed** - Use `EVALITE_REPORT_TRACES=true` during development/debugging
+
+## Troubleshooting
+
+### Traces not appearing
+
+Make sure `EVALITE_REPORT_TRACES=true` is set:
+
+```bash
+EVALITE_REPORT_TRACES=true evalite watch
+```
+
+### Error: "reportTrace must be called inside an evalite eval"
+
+`reportTrace()` can only be called within the `task` function of an eval:
+
+```typescript
+// âś… Correct
+evalite("My Eval", {
+ data: [{ input: "test" }],
+ task: async (input) => {
+ reportTrace({ input, output: "result" }); // Works
+ return "result";
+ },
+});
+
+// ❌ Wrong
+reportTrace({ input: "test", output: "result" }); // Outside eval
+```
+
+## See Also
+
+- [Adding Traces Guide](/tips/adding-traces) - Overview and examples
+- [Vercel AI SDK Integration](/tips/vercel-ai-sdk) - Using AI SDK with Evalite
+- [evalite()](/api/evalite) - Main evaluation function
diff --git a/apps/evalite-docs/src/content/docs/guides/configuration.mdx b/apps/docs/content/docs/guides/configuration.mdx
similarity index 72%
rename from apps/evalite-docs/src/content/docs/guides/configuration.mdx
rename to apps/docs/content/docs/guides/configuration.mdx
index 2c4ca11f..bb63db09 100644
--- a/apps/evalite-docs/src/content/docs/guides/configuration.mdx
+++ b/apps/docs/content/docs/guides/configuration.mdx
@@ -2,8 +2,6 @@
title: Configuration
---
-import { Steps } from "@astrojs/starlight/components";
-
Since **Evalite is based on Vitest**, you can configure eval behavior using Vitest's configuration options. Each data point in your eval becomes a separate Vitest test case, which means all Vitest configuration options work with Evalite.
## Evalite Configuration
@@ -68,36 +66,3 @@ export default defineConfig({
testTimeout: 60000, // 60 seconds
});
```
-
-### Running Evals Multiple Times
-
-Run each test case multiple times to measure variance in non-deterministic evaluations.
-
-Configure globally in `evalite.config.ts`:
-
-```ts
-// evalite.config.ts
-import { defineConfig } from "evalite/config";
-
-export default defineConfig({
- trialCount: 3, // Run each test case 3 times
-});
-```
-
-Or override per-eval in the `evalite()` call:
-
-```ts
-evalite("Non-deterministic eval", {
- data: () => [{ input: "Alice", expected: "Alice" }],
- task: async (input) => {
- // Non-deterministic task
- return getRandomGreeting(input);
- },
- scorers: [
- /* ... */
- ],
- trialCount: 5, // Override config: run 5 times
-});
-```
-
-Note: Per-eval `trialCount` overrides `evalite.config.ts` if both are present.
diff --git a/apps/docs/content/docs/guides/dev-loop.mdx b/apps/docs/content/docs/guides/dev-loop.mdx
new file mode 100644
index 00000000..a453d3bf
--- /dev/null
+++ b/apps/docs/content/docs/guides/dev-loop.mdx
@@ -0,0 +1,94 @@
+---
+title: The Dev Loop
+---
+
+Speed up development by running evals efficiently with watch mode, file filtering, and selective execution.
+
+## Watch Mode
+
+Watch mode re-runs evals when files change for fast iteration.
+
+```bash
+evalite watch
+```
+
+Watches `.eval.ts` files and re-runs on changes.
+
+### Hide Table Output
+
+When debugging with `console.log`, hide the detailed results table so your logs are easier to see:
+
+```bash
+evalite watch --hideTable
+```
+
+Keeps the score summary but removes the results table.
+
+### Serve Mode Alternative
+
+Run evals once and serve UI without re-running on changes:
+
+```bash
+evalite serve
+```
+
+Runs once, keeps UI at `http://localhost:3006`. Tests won't re-run on file changes.
+
+To re-run after changes, restart `evalite serve`.
+
+Useful for slow-running evals.
+
+## Run Specific Files
+
+Run specific eval files instead of all evals:
+
+```bash
+evalite my-eval.eval.ts
+```
+
+Multiple files:
+
+```bash
+evalite eval1.eval.ts eval2.eval.ts
+```
+
+Works with `watch` and `serve`:
+
+```bash
+evalite watch my-eval.eval.ts
+evalite serve my-eval.eval.ts
+```
+
+Useful when working on a specific eval without running the full suite.
+
+## Skip Entire Evals
+
+Use `evalite.skip()` to skip an eval without running it:
+
+```ts
+evalite.skip("My Eval", {
+ data: () => [],
+  task: async () => "",
+  scorers: [],
+});
+```
+
+Useful for temporarily disabling an eval during development.
+
+## Focus on Specific Test Cases
+
+Use the `only` flag on data entries to focus on specific inputs:
+
+```ts
+evalite("My Eval", {
+ data: () => [
+ { input: "test1", expected: "output1" },
+ { input: "test2", expected: "output2", only: true },
+ { input: "test3", expected: "output3" },
+ ],
+ task: async (input) => {
+    // Only runs for "test2"
+    return input;
+ },
+});
+```
+
+When any data entry has `only: true`, only those test cases run.
diff --git a/apps/docs/content/docs/guides/meta.json b/apps/docs/content/docs/guides/meta.json
new file mode 100644
index 00000000..a8edac75
--- /dev/null
+++ b/apps/docs/content/docs/guides/meta.json
@@ -0,0 +1,10 @@
+{
+ "title": "Guides",
+ "pages": [
+ "what-is-evalite",
+ "quickstart",
+ "dev-loop",
+ "scorers",
+ "configuration"
+ ]
+}
diff --git a/apps/docs/content/docs/guides/quickstart.mdx b/apps/docs/content/docs/guides/quickstart.mdx
new file mode 100644
index 00000000..e834522e
--- /dev/null
+++ b/apps/docs/content/docs/guides/quickstart.mdx
@@ -0,0 +1,124 @@
+---
+title: Quickstart
+description: A guide to setting up Evalite in your project.
+---
+
+import { Step, Steps } from "fumadocs-ui/components/steps";
+
+We're going to walk through setting up Evalite in an existing project.
+
+
+
+
+
+Install `evalite`, `vitest`, and a scoring library like `autoevals`:
+
+```npm
+npm install -D evalite vitest autoevals
+```
+
+
+
+
+
+Add an `eval:dev` script to your package.json:
+
+```json
+{
+ "scripts": {
+ "eval:dev": "evalite watch"
+ }
+}
+```
+
+
+
+
+
+Create your first eval:
+
+```ts
+// my-eval.eval.ts
+
+import { evalite } from "evalite";
+import { Levenshtein } from "autoevals";
+
+evalite("My Eval", {
+ // An array of test data
+ // - TODO: Replace with your test data
+ data: [{ input: "Hello", expected: "Hello World!" }],
+ // The task to perform
+ // - TODO: Replace with your LLM call
+ task: async (input) => {
+ return input + " World!";
+ },
+ // The scoring methods for the eval
+ scorers: [Levenshtein],
+});
+```
+
+
+
+`.eval.ts` is the extension Evalite looks for when scanning for evals.
+
+
+
+
+
+
+
+Run the `eval:dev` script:
+
+```npm
+npm run eval:dev
+```
+
+This runs `evalite`, which:
+
+- Resolves `data` to get the test data
+- Runs the `task` function on each data entry
+- Scores the output of the `task` function using the `scorers`
+- Saves the results to a SQLite database in `node_modules/.evalite`
+
+It then:
+
+- Shows a UI for viewing the traces, scores, inputs and outputs at http://localhost:3006.
+- If you only ran one eval, it also shows a table summarizing the eval in the terminal.
+
+
+
+
+
+Open http://localhost:3006 in your browser to view the results of the eval.
+
+
+
+
+
+### What Next?
+
+Head to the [AI SDK example](/examples/ai-sdk) to see a fully-fleshed out example of Evalite in action.
+
+### Troubleshooting
+
+#### Error: Could not locate the bindings file
+
+Some users have experienced issues running `evalite watch`, with their package manager reporting the following error message:
+
+```
+Command failed, Error: Could not locate the bindings file.
+```
+
+This error is related to the `better-sqlite3` package. To resolve this, you can try the following steps:
+
+- Rebuild `better-sqlite3`:
+
+```bash
+pnpm rebuild better-sqlite3
+```
+
+- Approve the rebuild of the package with:
+
+```bash
+pnpm approve-builds
+```
diff --git a/apps/docs/content/docs/guides/scorers.mdx b/apps/docs/content/docs/guides/scorers.mdx
new file mode 100644
index 00000000..b281d057
--- /dev/null
+++ b/apps/docs/content/docs/guides/scorers.mdx
@@ -0,0 +1,126 @@
+---
+title: Scorers
+---
+
+Scorers are used to score the output of your LLM call.
+
+[Autoevals](https://github.com/braintrustdata/autoevals) is a great library of scorers to get you started.
+
+## Inline Scorers
+
+If you don't need your scorer to be reusable, you can define it inline.
+
+```ts
+import { evalite } from "evalite";
+
+evalite("My Eval", {
+ data: [{ input: "Hello" }],
+ task: async (input) => {
+ return input + " World!";
+ },
+ scorers: [
+ {
+ name: "Contains Paris",
+ description: "Checks if the output contains the word 'Paris'.",
+ scorer: ({ output }) => {
+ return output.includes("Paris") ? 1 : 0;
+ },
+ },
+ ],
+});
+```
+
+## Creating Reusable Scorers
+
+If you have a scorer you want to use across multiple files, you can use `createScorer` to create a reusable scorer.
+
+```ts
+import { createScorer } from "evalite";
+
+const containsParis = createScorer({
+ name: "Contains Paris",
+ description: "Checks if the output contains the word 'Paris'.",
+ scorer: ({ output }) => {
+ return output.includes("Paris") ? 1 : 0;
+ },
+});
+
+evalite("My Eval", {
+ data: [{ input: "Hello" }],
+ task: async (input) => {
+ return input + " World!";
+ },
+ scorers: [containsParis],
+});
+```
+
+The `name` and `description` of the scorer will be displayed in the Evalite UI.
+
+## Score Properties
+
+The `scorer` function receives three properties on the object it is passed:
+
+```ts
+import { createScorer } from "evalite";
+
+const containsParis = createScorer({
+ name: "Contains Paris",
+ description: "Checks if the output contains the word 'Paris'.",
+ scorer: ({ input, output, expected }) => {
+ // input comes from `data`
+ // expected also comes from `data`
+ // output is the output of `task`
+ return output.includes("Paris") ? 1 : 0;
+ },
+});
+```
+
+These are typed using the three type arguments passed to `createScorer`:
+
+```ts
+import { createScorer } from "evalite";
+
+const containsParis = createScorer<
+ string, // Type of 'input'
+ string, // Type of 'output'
+ string // Type of 'expected'
+>({
+ name: "Contains Word",
+ description: "Checks if the output contains the specified word.",
+ scorer: ({ output, input, expected }) => {
+ // output is typed as string!
+ return output.includes(expected) ? 1 : 0;
+ },
+});
+```
+
+If the `expected` type argument is omitted, it will be inferred from the type of `output`.
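+
+For example, a minimal sketch that only passes two type arguments (the scorer name here is purely illustrative):
+
+```ts
+import { createScorer } from "evalite";
+
+const matchesExpected = createScorer<
+  string, // Type of 'input'
+  string // Type of 'output' - 'expected' is inferred as string too
+>({
+  name: "Matches Expected",
+  description: "Checks if the output equals the expected string.",
+  scorer: ({ output, expected }) => {
+    return output === expected ? 1 : 0;
+  },
+});
+```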
+
+## Scorer Metadata
+
+You can provide metadata along with your custom scorer:
+
+```ts
+import { createScorer } from "evalite";
+
+const containsParis = createScorer({
+ name: "Contains Paris",
+ description: "Checks if the output contains the word 'Paris'.",
+  scorer: ({ output }) => {
+ return {
+ score: output.includes("Paris") ? 1 : 0,
+ metadata: {
+ // Can be anything!
+ },
+ };
+ },
+});
+```
+
+This will be visible along with the score in the Evalite UI.
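+
+As a sketch, an LLM-as-a-judge scorer might record the judge's rationale in its metadata (`gradeWithLLM` is a hypothetical helper, not an Evalite API):
+
+```ts
+import { createScorer } from "evalite";
+
+// Hypothetical judging helper - replace with your own LLM call
+const gradeWithLLM = async (output: string) => ({
+  score: output.length > 0 ? 1 : 0,
+  rationale: "Placeholder rationale from the judge model.",
+});
+
+const llmJudge = createScorer<string, string>({
+  name: "LLM Judge",
+  description: "Uses an LLM to grade the output.",
+  scorer: async ({ output }) => {
+    const { score, rationale } = await gradeWithLLM(output);
+    return {
+      score,
+      metadata: { rationale },
+    };
+  },
+});
+```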
+
+
+
+This is especially useful for debugging LLM-as-a-judge evals. In autoevals' `Factuality` scorer, for example, the metadata includes a rationale explaining why the scorer gave the score it did.
+
+
diff --git a/apps/evalite-docs/src/content/docs/what-is-evalite.mdx b/apps/docs/content/docs/guides/what-is-evalite.mdx
similarity index 96%
rename from apps/evalite-docs/src/content/docs/what-is-evalite.mdx
rename to apps/docs/content/docs/guides/what-is-evalite.mdx
index eceb2ed6..3d5a8450 100644
--- a/apps/evalite-docs/src/content/docs/what-is-evalite.mdx
+++ b/apps/docs/content/docs/guides/what-is-evalite.mdx
@@ -2,8 +2,6 @@
title: What Is Evalite?
---
-import { Aside, Steps } from "@astrojs/starlight/components";
-
Evalite runs your evals locally. Evals are like tests, but for AI-powered apps.
So Evalite is like Jest or Vitest, but for apps that use AI.
diff --git a/apps/docs/content/docs/index.mdx b/apps/docs/content/docs/index.mdx
new file mode 100644
index 00000000..f3044520
--- /dev/null
+++ b/apps/docs/content/docs/index.mdx
@@ -0,0 +1,29 @@
+---
+title: Evalite Documentation
+description: Test AI-powered apps in TypeScript
+---
+
+## Welcome to Evalite
+
+Evalite is a TypeScript-native, local-first tool, built on Vitest, for testing LLM-powered apps. It allows developers to write evaluations (evals) as `.eval.ts` files that run like tests.
+
+## Key Features
+
+- **Local-first**: Run evals locally on your machine with complete control of your data
+- **TypeScript-native**: Write evals in TypeScript with full type safety
+- **Built on Vitest**: Use all the same tools (mocks, lifecycle hooks) you're used to
+- **Live reload**: Local server with hot reload for rapid iteration
+- **Traces & Scorers**: Capture traces, build custom scorers, and much more
+
+## Getting Started
+
+Check out the [Quickstart](/docs/guides/quickstart) guide to get started with Evalite.
+
+## Popular Pages
+
+
+
+
+
+
+
diff --git a/apps/docs/content/docs/meta.json b/apps/docs/content/docs/meta.json
new file mode 100644
index 00000000..4d41b9c1
--- /dev/null
+++ b/apps/docs/content/docs/meta.json
@@ -0,0 +1,4 @@
+{
+ "title": "Documentation",
+ "pages": ["index", "guides", "tips", "api"]
+}
diff --git a/apps/evalite-docs/src/content/docs/guides/traces.mdx b/apps/docs/content/docs/tips/adding-traces.mdx
similarity index 50%
rename from apps/evalite-docs/src/content/docs/guides/traces.mdx
rename to apps/docs/content/docs/tips/adding-traces.mdx
index f62e79e6..7ff2d82b 100644
--- a/apps/evalite-docs/src/content/docs/guides/traces.mdx
+++ b/apps/docs/content/docs/tips/adding-traces.mdx
@@ -1,14 +1,12 @@
---
-title: Traces
+title: Adding Traces
---
-import { Aside } from "@astrojs/starlight/components";
-
-Traces are used to track the behaviour of each individual call to an LLM inside your task.
+Track timing, token usage, and input/output for each LLM call within your task using traces.
## `reportTrace`
-You can report a trace by calling `reportTrace` inside an `evalite` eval:
+Report a trace by calling `reportTrace` inside an `evalite` eval:
```ts
import { evalite, type Evalite } from "evalite";
@@ -48,33 +46,10 @@ evalite("My Eval", {
});
```
-
-
-## `traceAISDKModel`
-
-If you're using the [Vercel AI SDK](https://sdk.vercel.ai/docs/introduction), you can automatically report traces by wrapping your model in `traceAISDKModel` function:
-
-```ts
-import { traceAISDKModel } from "evalite/ai-sdk";
-import { generateText } from "ai";
-import { openai } from "@ai-sdk/openai";
-
-// All calls to this model will be recorded in evalite!
-const tracedModel = traceAISDKModel(openai("gpt-4o-mini"));
-
-const result = await generateText({
- model: tracedModel,
- system: `Answer the question concisely.`,
- prompt: `What is the capital of France?`,
-});
-```
-
-
+If you're using the Vercel AI SDK, see the [Vercel AI SDK](/docs/tips/vercel-ai-sdk) tip for automatic tracing with `traceAISDKModel`.
diff --git a/apps/evalite-docs/src/content/docs/guides/variant-comparison.mdx b/apps/docs/content/docs/tips/comparing-different-approaches.mdx
similarity index 78%
rename from apps/evalite-docs/src/content/docs/guides/variant-comparison.mdx
rename to apps/docs/content/docs/tips/comparing-different-approaches.mdx
index 029e4142..83b91b4c 100644
--- a/apps/evalite-docs/src/content/docs/guides/variant-comparison.mdx
+++ b/apps/docs/content/docs/tips/comparing-different-approaches.mdx
@@ -1,17 +1,14 @@
---
-title: Variant Comparison
-description: Compare multiple task variants using evalite.each()
+title: Comparing Different Approaches
---
-import { Aside } from "@astrojs/starlight/components";
+A/B test different models, prompts, or configurations on the same dataset using `evalite.each()`.
-## Overview
+## What You Can Compare
-`evalite.each()` enables comparing multiple task variants (models, prompts, configs) within a single eval. This lets you:
-
-- Compare different models on the same dataset
-- A/B test prompt strategies
-- Test different config parameters (temperature, system prompts, etc.)
+- Different models on the same dataset
+- Prompt strategies (direct vs chain-of-thought vs few-shot)
+- Config parameters (temperature, system prompts, etc.)
## Basic Usage
diff --git a/apps/evalite-docs/src/content/docs/guides/customizing-the-ui.mdx b/apps/docs/content/docs/tips/customize-the-ui.mdx
similarity index 93%
rename from apps/evalite-docs/src/content/docs/guides/customizing-the-ui.mdx
rename to apps/docs/content/docs/tips/customize-the-ui.mdx
index 78ad14c6..45012060 100644
--- a/apps/evalite-docs/src/content/docs/guides/customizing-the-ui.mdx
+++ b/apps/docs/content/docs/tips/customize-the-ui.mdx
@@ -1,12 +1,12 @@
---
-title: Customizing The UI
+title: Customize The UI
---
-import { Aside } from "@astrojs/starlight/components";
+Customize which columns appear in the Evalite UI to show only the data you care about.
## Creating Custom Columns
-By default, the Evalite UI renders the input, expected and output columns:
+By default, Evalite renders input, expected and output columns:
| Input | Expected | Output |
| ------------------------ | --------------------------- | ---------------- |
diff --git a/apps/evalite-docs/src/content/docs/guides/multi-modal.mdx b/apps/docs/content/docs/tips/images-and-media.mdx
similarity index 93%
rename from apps/evalite-docs/src/content/docs/guides/multi-modal.mdx
rename to apps/docs/content/docs/tips/images-and-media.mdx
index 8f747424..bf3437a0 100644
--- a/apps/evalite-docs/src/content/docs/guides/multi-modal.mdx
+++ b/apps/docs/content/docs/tips/images-and-media.mdx
@@ -1,10 +1,8 @@
---
-title: Multi-Modal
+title: Images And Media
---
-import { Aside } from "@astrojs/starlight/components";
-
-Evalite can handle not just text responses, but media like images, audio, and video.
+Test multi-modal LLM features by including images, audio, and video in your evals.
## Files In Memory
@@ -31,10 +29,10 @@ writeFileSync("path/to/new-file.jpg", fileContents);
It doesn't matter what the file extension is - when you read it into memory, it'll be a `Buffer`.
-
+
### Evalite And Files In Memory
diff --git a/apps/docs/content/docs/tips/meta.json b/apps/docs/content/docs/tips/meta.json
new file mode 100644
index 00000000..082ecc43
--- /dev/null
+++ b/apps/docs/content/docs/tips/meta.json
@@ -0,0 +1,11 @@
+{
+ "title": "Tips",
+ "pages": [
+ "customize-the-ui",
+ "comparing-different-approaches",
+ "adding-traces",
+ "vercel-ai-sdk",
+ "images-and-media",
+ "run-evals-on-ci-cd"
+ ]
+}
diff --git a/apps/evalite-docs/src/content/docs/guides/ci.mdx b/apps/docs/content/docs/tips/run-evals-on-ci-cd.mdx
similarity index 77%
rename from apps/evalite-docs/src/content/docs/guides/ci.mdx
rename to apps/docs/content/docs/tips/run-evals-on-ci-cd.mdx
index a0ab56ec..9022c855 100644
--- a/apps/evalite-docs/src/content/docs/guides/ci.mdx
+++ b/apps/docs/content/docs/tips/run-evals-on-ci-cd.mdx
@@ -2,7 +2,7 @@
title: CI/CD
---
-Evalite integrates seamlessly into CI/CD pipelines, allowing you to validate LLM-powered features as part of your automated testing workflow.
+Integrate Evalite into CI/CD pipelines to validate LLM-powered features automatically.
## Static UI Export
@@ -137,13 +137,49 @@ Executes all evals and exits.
## Score Thresholds
-Fail CI builds if scores fall below threshold:
+Require evals to pass a minimum score threshold. Useful for CI/CD pipelines where failing evals should block deployments.
+
+### Using the Threshold Flag
+
+Pass `--threshold` to set a minimum required score:
+
+```bash
+evalite --threshold=50 # Score must be >= 50
+```
+
+If the average score falls below the threshold, the process exits with code 1.
+
+### Works with All Modes
+
+```bash
+evalite watch --threshold=70
+evalite serve --threshold=80
+```
+
+### Configuration File
+
+Alternatively, set `scoreThreshold` in `evalite.config.ts`:
+
+```ts
+// evalite.config.ts
+import { defineConfig } from "evalite/config";
+
+export default defineConfig({
+ scoreThreshold: 80, // Fail if average score < 80
+});
+```
+
+CLI flag takes precedence over config file setting.
+
+### CI/CD Usage
+
+Typical CI workflow:
```bash
-evalite --threshold=70
+evalite --threshold=75
```
-Exits with code 1 if average score < 70.
+The process exits with a non-zero code if the threshold is not met, failing the CI job.
## JSON Export
diff --git a/apps/evalite-docs/src/content/docs/examples/ai-sdk.md b/apps/docs/content/docs/tips/vercel-ai-sdk.mdx
similarity index 85%
rename from apps/evalite-docs/src/content/docs/examples/ai-sdk.md
rename to apps/docs/content/docs/tips/vercel-ai-sdk.mdx
index 215e5ea0..49394d3d 100644
--- a/apps/evalite-docs/src/content/docs/examples/ai-sdk.md
+++ b/apps/docs/content/docs/tips/vercel-ai-sdk.mdx
@@ -1,14 +1,12 @@
---
-title: AI SDK
+title: Vercel AI SDK
---
-Vercel's [AI SDK](https://sdk.vercel.ai/docs/introduction) is a great way to get started with AI in your apps.
+Use Evalite with Vercel's [AI SDK](https://sdk.vercel.ai/docs/introduction) to trace LLM calls and test AI-powered features.
-It abstracts away the differences between different AI providers, so you can **switch between them easily**.
+## Automatic Tracing with `traceAISDKModel`
-## Tracing
-
-You can use the `traceAISDKModel` function to trace the calls to the AI SDK:
+Wrap your AI SDK model in `traceAISDKModel` to automatically track all LLM calls:
```ts
// my-eval.eval.ts
@@ -48,6 +46,12 @@ evalite("Test Capitals", {
});
```
+
+
+`traceAISDKModel` is a no-op in production, so you can leave it in your code without worrying about performance.
+
+
+
## Testing Whole Conversations
You can also pass messages to the `input` property of the eval. To get autocomplete, you can pass the `CoreMessage` type to the `evalite` function as a type argument.
diff --git a/apps/docs/eslint.config.mjs b/apps/docs/eslint.config.mjs
new file mode 100644
index 00000000..85bff042
--- /dev/null
+++ b/apps/docs/eslint.config.mjs
@@ -0,0 +1,26 @@
+import { dirname } from "path";
+import { fileURLToPath } from "url";
+import { FlatCompat } from "@eslint/eslintrc";
+
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = dirname(__filename);
+
+const compat = new FlatCompat({
+ baseDirectory: __dirname,
+});
+
+const eslintConfig = [
+ ...compat.extends("next/core-web-vitals", "next/typescript"),
+ {
+ ignores: [
+ "node_modules/**",
+ ".next/**",
+ "out/**",
+ "build/**",
+ ".source/**",
+ "next-env.d.ts",
+ ],
+ },
+];
+
+export default eslintConfig;
diff --git a/apps/docs/lib/cn.ts b/apps/docs/lib/cn.ts
new file mode 100644
index 00000000..8e473dac
--- /dev/null
+++ b/apps/docs/lib/cn.ts
@@ -0,0 +1 @@
+export { twMerge as cn } from "tailwind-merge";
diff --git a/apps/docs/lib/get-llm-text.ts b/apps/docs/lib/get-llm-text.ts
new file mode 100644
index 00000000..91d04d0c
--- /dev/null
+++ b/apps/docs/lib/get-llm-text.ts
@@ -0,0 +1,10 @@
+import { source } from "@/lib/source";
+import type { InferPageType } from "fumadocs-core/source";
+
+export async function getLLMText(page: InferPageType) {
+ const processed = await page.data.getText("processed");
+
+ return `# ${page.data.title} (${page.url})
+
+${processed}`;
+}
diff --git a/apps/docs/lib/layout.shared.tsx b/apps/docs/lib/layout.shared.tsx
new file mode 100644
index 00000000..4fe0f767
--- /dev/null
+++ b/apps/docs/lib/layout.shared.tsx
@@ -0,0 +1,11 @@
+import Logo from "@/components/logo";
+import type { BaseLayoutProps } from "fumadocs-ui/layouts/shared";
+
+export function baseOptions(): BaseLayoutProps {
+ return {
+ nav: {
+      title: <Logo />,
+ },
+ githubUrl: "https://github.com/mattpocock/evalite",
+ };
+}
diff --git a/apps/docs/lib/source.ts b/apps/docs/lib/source.ts
new file mode 100644
index 00000000..d9938ad8
--- /dev/null
+++ b/apps/docs/lib/source.ts
@@ -0,0 +1,27 @@
+import { docs } from "@/.source";
+import { type InferPageType, loader } from "fumadocs-core/source";
+import { lucideIconsPlugin } from "fumadocs-core/source/lucide-icons";
+
+// See https://fumadocs.dev/docs/headless/source-api for more info
+export const source = loader({
+ baseUrl: "/docs",
+ source: docs.toFumadocsSource(),
+ plugins: [lucideIconsPlugin()],
+});
+
+export function getPageImage(page: InferPageType) {
+ const segments = [...page.slugs, "image.png"];
+
+ return {
+ segments,
+ url: `/og/docs/${segments.join("/")}`,
+ };
+}
+
+export async function getLLMText(page: InferPageType) {
+ const processed = await page.data.getText("processed");
+
+ return `# ${page.data.title}
+
+${processed}`;
+}
diff --git a/apps/docs/lib/utils.ts b/apps/docs/lib/utils.ts
new file mode 100644
index 00000000..a5ef1935
--- /dev/null
+++ b/apps/docs/lib/utils.ts
@@ -0,0 +1,6 @@
+import { clsx, type ClassValue } from "clsx";
+import { twMerge } from "tailwind-merge";
+
+export function cn(...inputs: ClassValue[]) {
+ return twMerge(clsx(inputs));
+}
diff --git a/apps/docs/mdx-components.tsx b/apps/docs/mdx-components.tsx
new file mode 100644
index 00000000..b21a1618
--- /dev/null
+++ b/apps/docs/mdx-components.tsx
@@ -0,0 +1,9 @@
+import defaultMdxComponents from "fumadocs-ui/mdx";
+import type { MDXComponents } from "mdx/types";
+
+export function getMDXComponents(components?: MDXComponents): MDXComponents {
+ return {
+ ...defaultMdxComponents,
+ ...components,
+ };
+}
diff --git a/apps/docs/next.config.mjs b/apps/docs/next.config.mjs
new file mode 100644
index 00000000..dfe557c3
--- /dev/null
+++ b/apps/docs/next.config.mjs
@@ -0,0 +1,21 @@
+import { createMDX } from "fumadocs-mdx/next";
+
+const withMDX = createMDX();
+
+/** @type {import('next').NextConfig} */
+const config = {
+ reactStrictMode: true,
+ images: {
+ qualities: [75, 100],
+ },
+ async rewrites() {
+ return [
+ {
+ source: "/docs/:path*.mdx",
+ destination: "/llms.mdx/:path*",
+ },
+ ];
+ },
+};
+
+export default withMDX(config);
diff --git a/apps/docs/package.json b/apps/docs/package.json
new file mode 100644
index 00000000..70a02128
--- /dev/null
+++ b/apps/docs/package.json
@@ -0,0 +1,40 @@
+{
+ "name": "docs",
+ "version": "0.0.0",
+ "private": true,
+ "scripts": {
+ "build": "next build",
+ "dev": "next dev",
+ "start": "next start",
+ "postinstall": "fumadocs-mdx",
+ "lint": "eslint"
+ },
+ "dependencies": {
+ "@icons-pack/react-simple-icons": "^13.8.0",
+ "class-variance-authority": "^0.7.1",
+ "clsx": "^2.1.1",
+ "fumadocs-core": "16.0.8",
+ "fumadocs-mdx": "13.0.5",
+ "fumadocs-ui": "16.0.8",
+ "geist": "^1.5.1",
+ "lucide-react": "^0.552.0",
+ "motion": "^12.23.24",
+ "next": "16.0.1",
+ "react": "^19.2.0",
+ "react-dom": "^19.2.0",
+ "tailwind-merge": "^3.3.1"
+ },
+ "devDependencies": {
+ "@eslint/eslintrc": "^3.3.1",
+ "@tailwindcss/postcss": "^4.1.16",
+ "@types/mdx": "^2.0.13",
+ "@types/node": "^24.10.0",
+ "@types/react": "^19.2.2",
+ "@types/react-dom": "^19.2.2",
+ "eslint": "^9.39.1",
+ "eslint-config-next": "16.0.1",
+ "postcss": "^8.5.6",
+ "tailwindcss": "^4.1.16",
+ "typescript": "^5.9.3"
+ }
+}
diff --git a/apps/docs/postcss.config.mjs b/apps/docs/postcss.config.mjs
new file mode 100644
index 00000000..c2ddf748
--- /dev/null
+++ b/apps/docs/postcss.config.mjs
@@ -0,0 +1,5 @@
+export default {
+ plugins: {
+ "@tailwindcss/postcss": {},
+ },
+};
diff --git a/apps/docs/proxy.ts b/apps/docs/proxy.ts
new file mode 100644
index 00000000..c53a2698
--- /dev/null
+++ b/apps/docs/proxy.ts
@@ -0,0 +1,16 @@
+import { NextRequest, NextResponse } from "next/server";
+import { isMarkdownPreferred, rewritePath } from "fumadocs-core/negotiation";
+
+const { rewrite: rewriteLLM } = rewritePath("/docs/*path", "/llms.mdx/*path");
+
+export default function proxy(request: NextRequest) {
+ if (isMarkdownPreferred(request)) {
+ const result = rewriteLLM(request.nextUrl.pathname);
+
+ if (result) {
+ return NextResponse.rewrite(new URL(result, request.nextUrl));
+ }
+ }
+
+ return NextResponse.next();
+}
diff --git a/apps/docs/public/hero-dark.png b/apps/docs/public/hero-dark.png
new file mode 100644
index 00000000..25b2d108
Binary files /dev/null and b/apps/docs/public/hero-dark.png differ
diff --git a/apps/docs/public/hero.png b/apps/docs/public/hero.png
new file mode 100644
index 00000000..5e067459
Binary files /dev/null and b/apps/docs/public/hero.png differ
diff --git a/apps/evalite-docs/public/og-image.jpg b/apps/docs/public/og-image.jpg
similarity index 100%
rename from apps/evalite-docs/public/og-image.jpg
rename to apps/docs/public/og-image.jpg
diff --git a/apps/docs/source.config.ts b/apps/docs/source.config.ts
new file mode 100644
index 00000000..1965eca6
--- /dev/null
+++ b/apps/docs/source.config.ts
@@ -0,0 +1,27 @@
+import {
+ defineConfig,
+ defineDocs,
+ frontmatterSchema,
+ metaSchema,
+} from "fumadocs-mdx/config";
+
+// You can customise Zod schemas for frontmatter and `meta.json` here
+// see https://fumadocs.dev/docs/mdx/collections
+export const docs = defineDocs({
+ dir: "content/docs",
+ docs: {
+ schema: frontmatterSchema,
+ postprocess: {
+ includeProcessedMarkdown: true,
+ },
+ },
+ meta: {
+ schema: metaSchema,
+ },
+});
+
+export default defineConfig({
+ mdxOptions: {
+ // MDX options
+ },
+});
diff --git a/apps/docs/tsconfig.json b/apps/docs/tsconfig.json
new file mode 100644
index 00000000..f2b5062f
--- /dev/null
+++ b/apps/docs/tsconfig.json
@@ -0,0 +1,36 @@
+{
+ "compilerOptions": {
+ "baseUrl": ".",
+ "target": "ESNext",
+ "lib": ["dom", "dom.iterable", "esnext"],
+ "allowJs": true,
+ "skipLibCheck": true,
+ "strict": true,
+ "forceConsistentCasingInFileNames": true,
+ "noEmit": true,
+ "esModuleInterop": true,
+ "module": "esnext",
+ "moduleResolution": "bundler",
+ "resolveJsonModule": true,
+ "isolatedModules": true,
+ "jsx": "react-jsx",
+ "incremental": true,
+ "paths": {
+ "@/*": ["./*"],
+ "@/.source": [".source"]
+ },
+ "plugins": [
+ {
+ "name": "next"
+ }
+ ]
+ },
+ "include": [
+ "next-env.d.ts",
+ "**/*.ts",
+ "**/*.tsx",
+ ".next/types/**/*.ts",
+ ".next/dev/types/**/*.ts"
+ ],
+ "exclude": ["node_modules"]
+}
diff --git a/apps/evalite-docs/.gitignore b/apps/evalite-docs/.gitignore
deleted file mode 100644
index 6240da8b..00000000
--- a/apps/evalite-docs/.gitignore
+++ /dev/null
@@ -1,21 +0,0 @@
-# build output
-dist/
-# generated types
-.astro/
-
-# dependencies
-node_modules/
-
-# logs
-npm-debug.log*
-yarn-debug.log*
-yarn-error.log*
-pnpm-debug.log*
-
-
-# environment variables
-.env
-.env.production
-
-# macOS-specific files
-.DS_Store
diff --git a/apps/evalite-docs/CLAUDE.md b/apps/evalite-docs/CLAUDE.md
deleted file mode 100644
index 7c19bbfe..00000000
--- a/apps/evalite-docs/CLAUDE.md
+++ /dev/null
@@ -1 +0,0 @@
-Make sure that when adding new pages, they also get added to astro.config.mts.
diff --git a/apps/evalite-docs/README.md b/apps/evalite-docs/README.md
deleted file mode 100644
index e09bf55f..00000000
--- a/apps/evalite-docs/README.md
+++ /dev/null
@@ -1,55 +0,0 @@
-# Starlight Starter Kit: Basics
-
-[](https://starlight.astro.build)
-
-```
-npm create astro@latest -- --template starlight
-```
-
-[](https://stackblitz.com/github/withastro/starlight/tree/main/examples/basics)
-[](https://codesandbox.io/p/sandbox/github/withastro/starlight/tree/main/examples/basics)
-[](https://app.netlify.com/start/deploy?repository=https://github.com/withastro/starlight&create_from_path=examples/basics)
-[](https://vercel.com/new/clone?repository-url=https%3A%2F%2Fgithub.com%2Fwithastro%2Fstarlight%2Ftree%2Fmain%2Fexamples%2Fbasics&project-name=my-starlight-docs&repository-name=my-starlight-docs)
-
-> 🧑‍🚀 **Seasoned astronaut?** Delete this file. Have fun!
-
-## 🚀 Project Structure
-
-Inside of your Astro + Starlight project, you'll see the following folders and files:
-
-```
-.
-├── public/
-├── src/
-│ ├── assets/
-│ ├── content/
-│ │ ├── docs/
-│ │ └── config.ts
-│ └── env.d.ts
-├── astro.config.mjs
-├── package.json
-└── tsconfig.json
-```
-
-Starlight looks for `.md` or `.mdx` files in the `src/content/docs/` directory. Each file is exposed as a route based on its file name.
-
-Images can be added to `src/assets/` and embedded in Markdown with a relative link.
-
-Static assets, like favicons, can be placed in the `public/` directory.
-
-## đź§ž Commands
-
-All commands are run from the root of the project, from a terminal:
-
-| Command | Action |
-| :------------------------ | :----------------------------------------------- |
-| `npm install` | Installs dependencies |
-| `npm run dev` | Starts local dev server at `localhost:4321` |
-| `npm run build` | Build your production site to `./dist/` |
-| `npm run preview` | Preview your build locally, before deploying |
-| `npm run astro ...` | Run CLI commands like `astro add`, `astro check` |
-| `npm run astro -- --help` | Get help using the Astro CLI |
-
-## đź‘€ Want to learn more?
-
-Check out [Starlight’s docs](https://starlight.astro.build/), read [the Astro documentation](https://docs.astro.build), or jump into the [Astro Discord server](https://astro.build/chat).
diff --git a/apps/evalite-docs/astro.config.mts b/apps/evalite-docs/astro.config.mts
deleted file mode 100644
index 6f366886..00000000
--- a/apps/evalite-docs/astro.config.mts
+++ /dev/null
@@ -1,163 +0,0 @@
-import { defineConfig } from "astro/config";
-import starlight from "@astrojs/starlight";
-
-// https://astro.build/config
-export default defineConfig({
- integrations: [
- starlight({
- title: "Evalite",
- favicon: "/favicon.ico",
- editLink: {
- baseUrl:
- "https://github.com/mattpocock/evalite/edit/main/apps/evalite-docs",
- },
- head: [
- {
- tag: "meta",
- attrs: {
- property: "og:url",
- content: "https://evalite.dev",
- },
- },
- {
- tag: "meta",
- attrs: {
- property: "og:image",
- content: "https://evalite.dev/og-image.jpg",
- },
- },
- {
- tag: "meta",
- attrs: {
- property: "og:image:width",
- content: "1280",
- },
- },
- {
- tag: "meta",
- attrs: {
- property: "og:image:height",
- content: "640",
- },
- },
- {
- tag: "meta",
- attrs: {
- property: "og:image:alt",
- content: "Evalite Logo",
- },
- },
- {
- tag: "meta",
- attrs: {
- name: "twitter:card",
- content: "summary_large_image",
- },
- },
- {
- tag: "meta",
- attrs: {
- name: "twitter:image",
- content: "https://evalite.dev/og-image.jpg",
- },
- },
- {
- tag: "script",
- attrs: {
- src: "https://www.googletagmanager.com/gtag/js?id=G-KBWLHSRCHD",
- async: true,
- },
- },
- {
- tag: "script",
- content: `
- window.dataLayer = window.dataLayer || [];
- function gtag(){dataLayer.push(arguments);}
- gtag('js', new Date());
- gtag('config', 'G-KBWLHSRCHD');
- `,
- },
- ],
- social: {
- github: "https://github.com/mattpocock/evalite",
- discord: "https://mattpocock.com/ai-discord",
- },
- logo: {
- light: "./src/assets/logo-light.svg",
- dark: "./src/assets/logo-dark.svg",
- },
- sidebar: [
- {
- label: "Getting Started",
- items: [
- {
- label: "What Is Evalite?",
- slug: "what-is-evalite",
- },
- {
- label: "Quickstart",
- slug: "quickstart",
- },
- ],
- },
- {
- label: "Guides",
- items: [
- {
- label: "Environment Variables",
- slug: "guides/environment-variables",
- },
- {
- label: "Scorers",
- slug: "guides/scorers",
- },
- {
- label: "Traces",
- slug: "guides/traces",
- },
- {
- label: "A/B Testing",
- slug: "guides/variant-comparison",
- },
- {
- label: "Multi-Modal",
- slug: "guides/multi-modal",
- },
- {
- label: "Configuration",
- slug: "guides/configuration",
- },
- {
- label: "Streams",
- slug: "guides/streams",
- },
- {
- label: "CLI",
- slug: "guides/cli",
- },
- {
- label: "Running Programmatically",
- slug: "guides/running-programmatically",
- },
- {
- label: "CI/CD",
- slug: "guides/ci",
- },
- {
- label: "Skipping Evals",
- slug: "guides/skipping",
- },
- {
- label: "Customizing The UI",
- slug: "guides/customizing-the-ui",
- },
- ],
- },
- {
- label: "Integrations",
- items: [{ label: "Vercel AI SDK", slug: "examples/ai-sdk" }],
- },
- ],
- }),
- ],
-});
diff --git a/apps/evalite-docs/package.json b/apps/evalite-docs/package.json
deleted file mode 100644
index 448c8355..00000000
--- a/apps/evalite-docs/package.json
+++ /dev/null
@@ -1,18 +0,0 @@
-{
- "name": "evalite-docs",
- "type": "module",
- "private": true,
- "version": "0.0.1",
- "scripts": {
- "dev": "astro dev",
- "start": "astro dev",
- "build": "astro build",
- "preview": "astro preview",
- "astro": "astro"
- },
- "dependencies": {
- "@astrojs/starlight": "^0.29.2",
- "astro": "^4.16.10",
- "sharp": "^0.32.5"
- }
-}
diff --git a/apps/evalite-docs/public/hero.webp b/apps/evalite-docs/public/hero.webp
deleted file mode 100644
index 8758421c..00000000
Binary files a/apps/evalite-docs/public/hero.webp and /dev/null differ
diff --git a/apps/evalite-docs/src/assets/houston.webp b/apps/evalite-docs/src/assets/houston.webp
deleted file mode 100644
index 930c1649..00000000
Binary files a/apps/evalite-docs/src/assets/houston.webp and /dev/null differ
diff --git a/apps/evalite-docs/src/assets/logo-dark.svg b/apps/evalite-docs/src/assets/logo-dark.svg
deleted file mode 100644
index d1aae00e..00000000
--- a/apps/evalite-docs/src/assets/logo-dark.svg
+++ /dev/null
@@ -1,10 +0,0 @@
-
diff --git a/apps/evalite-docs/src/assets/logo-light.svg b/apps/evalite-docs/src/assets/logo-light.svg
deleted file mode 100644
index f48a55bd..00000000
--- a/apps/evalite-docs/src/assets/logo-light.svg
+++ /dev/null
@@ -1,10 +0,0 @@
-
diff --git a/apps/evalite-docs/src/components/Header.astro b/apps/evalite-docs/src/components/Header.astro
new file mode 100644
index 00000000..145452fb
--- /dev/null
+++ b/apps/evalite-docs/src/components/Header.astro
@@ -0,0 +1,15 @@
+---
+import type { Props } from "@astrojs/starlight/props";
+---
+
+
+ These are the docs for the beta version of Evalite. Install with pnpm add evalite@beta
+
+
diff --git a/apps/evalite-docs/src/content/config.ts b/apps/evalite-docs/src/content/config.ts
deleted file mode 100644
index a4eec59b..00000000
--- a/apps/evalite-docs/src/content/config.ts
+++ /dev/null
@@ -1,6 +0,0 @@
-import { defineCollection } from "astro:content";
-import { docsSchema } from "@astrojs/starlight/schema";
-
-export const collections = {
- docs: defineCollection({ schema: docsSchema() }),
-};
diff --git a/apps/evalite-docs/src/content/docs/api/scorers/answer-correctness.mdx b/apps/evalite-docs/src/content/docs/api/scorers/answer-correctness.mdx
new file mode 100644
index 00000000..dfe36de3
--- /dev/null
+++ b/apps/evalite-docs/src/content/docs/api/scorers/answer-correctness.mdx
@@ -0,0 +1,155 @@
+---
+title: answerCorrectness
+---
+
+Checks if your AI's answer is correct by comparing it to a reference answer. Combines factual accuracy (75%) and semantic similarity (25%) by default.
+
+**When to use**: When you need comprehensive answer evaluation that balances exact correctness with semantic equivalence. Ideal for QA systems where both factual accuracy and meaning matter.
+
+**When NOT to use**: If you only care about exact facts (use [faithfulness](/api/scorers/faithfulness)), or only semantic similarity (use [answerSimilarity](/api/scorers/answer-similarity)). Not suitable for creative tasks where divergence from reference is desired.
+
+## Example
+
+```ts
+import { openai } from "@ai-sdk/openai";
+import { evalite } from "evalite";
+import { answerCorrectness } from "evalite/scorers";
+
+evalite("Answer Correctness", {
+ data: [
+ {
+ input: "What is the capital of France?",
+ expected: {
+ reference: "Paris is the capital of France.",
+ },
+ },
+ {
+ input: "Who invented the telephone?",
+ expected: {
+ reference:
+ "Alexander Graham Bell invented the telephone. The telephone was patented in 1876.",
+ },
+ },
+ ],
+ task: async (input) => {
+ // Your AI task here
+ return "Paris is the capital of France and has many museums.";
+ },
+ scorers: [
+ {
+ scorer: ({ input, output, expected }) =>
+ answerCorrectness({
+ question: input,
+ answer: output,
+ reference: expected.reference,
+ model: openai("gpt-4o-mini"),
+ embeddingModel: openai.embedding("text-embedding-3-small"),
+ }),
+ },
+ ],
+});
+```
+
+## Signature
+
+```ts
+function answerCorrectness(opts: {
+ question: string;
+ answer: string;
+ reference: string;
+ model: LanguageModel;
+ embeddingModel: EmbeddingModel;
+ weights?: [number, number];
+ beta?: number;
+}): Promise<{
+ name: string;
+ description: string;
+ score: number;
+ metadata: {
+ classification: {
+ TP: Array<{ statement: string; reason: string }>;
+ FP: Array<{ statement: string; reason: string }>;
+ FN: Array<{ statement: string; reason: string }>;
+ };
+ factualityScore: number;
+ similarityScore: number;
+ responseStatements: string[];
+ referenceStatements: string[];
+ };
+}>;
+```
+
+## Parameters
+
+### question
+
+**Type:** `string`
+
+The question being asked.
+
+### answer
+
+**Type:** `string`
+
+The AI's answer to evaluate.
+
+### reference
+
+**Type:** `string`
+
+Reference answer for comparison. Should be a complete, accurate answer.
+
+### model
+
+**Type:** `LanguageModel`
+
+Language model to use for evaluation.
+
+### embeddingModel
+
+**Type:** `EmbeddingModel`
+
+Embedding model to use for semantic similarity calculation.
+
+### weights (optional)
+
+**Type:** `[number, number]`
+**Default:** `[0.75, 0.25]`
+
+Weights for combining factuality and similarity scores: `[factualityWeight, similarityWeight]`. Default weighs factual accuracy at 75% and semantic similarity at 25%.
+
+### beta (optional)
+
+**Type:** `number`
+**Default:** `1.0`
+
+Beta parameter for F-beta score calculation. `beta > 1` favors recall (catching all reference statements), `beta < 1` favors precision (avoiding false positives).
+
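+As a sketch, here is how the scorer call from the example above might pass custom weights and beta (the values are illustrative, not recommendations):
+
+```ts
+answerCorrectness({
+  question: input,
+  answer: output,
+  reference: expected.reference,
+  model: openai("gpt-4o-mini"),
+  embeddingModel: openai.embedding("text-embedding-3-small"),
+  weights: [0.5, 0.5], // weigh factuality and similarity equally
+  beta: 2, // favor recall when computing the factuality F-beta score
+});
+```
+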
+import { Aside } from "@astrojs/starlight/components";
+
+
+
+## Return Value
+
+Returns an object with:
+
+- `name`: "Answer Correctness"
+- `description`: Description of what was evaluated
+- `score`: Number between 0-1 (weighted combination of factuality and similarity)
+- `metadata`: Object containing:
+ - `classification`: TP (true positives), FP (false positives), FN (false negatives) with statements and reasons
+ - `factualityScore`: F-beta score based on statement classification
+ - `similarityScore`: Cosine similarity between embeddings
+ - `responseStatements`: Decomposed statements from the answer
+ - `referenceStatements`: Decomposed statements from the reference
+
+## See Also
+
+- [createScorer()](/api/create-scorer)
+- [faithfulness](/api/scorers/faithfulness)
+- [exactMatch](/api/scorers/exact-match)
diff --git a/apps/evalite-docs/src/content/docs/api/scorers/answer-relevancy.mdx b/apps/evalite-docs/src/content/docs/api/scorers/answer-relevancy.mdx
new file mode 100644
index 00000000..a81d81ad
--- /dev/null
+++ b/apps/evalite-docs/src/content/docs/api/scorers/answer-relevancy.mdx
@@ -0,0 +1,131 @@
+---
+title: answerRelevancy
+---
+
+Checks if your AI actually answered the question asked (vs going off-topic or being evasive).
+
+**When to use**: When you want to catch answers that are technically correct but don't address what was asked. Perfect for customer support bots, Q&A systems, or any scenario where staying on-topic matters.
+
+**When NOT to use**: If your use case allows tangential or exploratory responses, or when creative interpretations of questions are desired.
+
+## Example
+
+```ts
+import { openai } from "@ai-sdk/openai";
+import { evalite } from "evalite";
+import { answerRelevancy } from "evalite/scorers";
+
+evalite("Answer Relevancy", {
+ data: [
+ {
+ input: "What is the capital of France?",
+ },
+ {
+ input: "Who invented the telephone?",
+ },
+ {
+ input: "What are the health benefits of exercise?",
+ },
+ ],
+ task: async (input) => {
+ if (input.includes("capital of France")) {
+ return "Paris is the capital of France. It's known for the Eiffel Tower and the Louvre Museum.";
+ } else if (input.includes("telephone")) {
+ return "Alexander Graham Bell is credited with inventing the telephone in 1876.";
+ } else if (input.includes("health benefits")) {
+ return "I don't know about that topic.";
+ }
+ return "I'm not sure about that.";
+ },
+ scorers: [
+ {
+ scorer: ({ input, output }) =>
+ answerRelevancy({
+ question: input,
+ answer: output,
+ model: openai("gpt-4o-mini"),
+ embeddingModel: openai.embedding("text-embedding-3-small"),
+ }),
+ },
+ ],
+});
+```
+
+## Signature
+
+```ts
+function answerRelevancy(opts: {
+ question: string;
+ answer: string;
+ model: LanguageModel;
+ embeddingModel: EmbeddingModel;
+}): Promise<{
+ name: string;
+ description: string;
+ score: number;
+ metadata: {
+ generatedQuestions: string[];
+ similarities: number[];
+ allNoncommittal: boolean;
+ };
+}>;
+```
+
+## Parameters
+
+### question
+
+**Type:** `string`
+
+The original question being asked.
+
+### answer
+
+**Type:** `string`
+
+The AI's answer to evaluate. Note: Only supports string output, not multi-turn.
+
+### model
+
+**Type:** `LanguageModel`
+
+Language model to use for generating hypothetical questions.
+
+### embeddingModel
+
+**Type:** `EmbeddingModel`
+
+Embedding model to use for computing semantic similarity between questions.
+
+import { Aside } from "@astrojs/starlight/components";
+
+
+
+## How It Works
+
+The scorer looks at your AI's answer and generates hypothetical questions it could be answering (3 by default). It then compares those generated questions to your original question using embeddings and cosine similarity. If they're similar, your AI stayed on topic.
+
+It also detects evasive or noncommittal answers like "I don't know" or "I'm not sure" and scores them as 0.
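+
+At its core, that comparison is a cosine similarity between embedding vectors. A minimal sketch of the idea (not the library's actual implementation):
+
+```ts
+// Cosine similarity between two embedding vectors
+function cosineSimilarity(a: number[], b: number[]): number {
+  const dot = a.reduce((sum, value, i) => sum + value * b[i], 0);
+  const normA = Math.sqrt(a.reduce((sum, value) => sum + value * value, 0));
+  const normB = Math.sqrt(b.reduce((sum, value) => sum + value * value, 0));
+  return dot / (normA * normB);
+}
+```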
+
+## Return Value
+
+Returns an object with:
+
+- `name`: "Answer Relevancy"
+- `description`: Description of what was evaluated
+- `score`: Number between 0-1 (mean cosine similarity of generated questions to original, or 0 if all answers are noncommittal)
+- `metadata`: Object containing:
+ - `generatedQuestions`: Array of hypothetical questions generated from the answer
+ - `similarities`: Array of cosine similarity scores for each generated question
+ - `allNoncommittal`: Boolean indicating if all generated questions were flagged as noncommittal
+
+## See Also
+
+- [createScorer()](/api/create-scorer)
+- [faithfulness](/api/scorers/faithfulness)
+- [answerCorrectness](/api/scorers/answer-correctness)
diff --git a/apps/evalite-docs/src/content/docs/api/scorers/answer-similarity.mdx b/apps/evalite-docs/src/content/docs/api/scorers/answer-similarity.mdx
new file mode 100644
index 00000000..25a896ef
--- /dev/null
+++ b/apps/evalite-docs/src/content/docs/api/scorers/answer-similarity.mdx
@@ -0,0 +1,99 @@
+---
+title: answerSimilarity
+---
+
+Checks how similar your AI's answer is to the expected answer in meaning (not exact words). This is a "soft" comparison using embeddings and cosine similarity.
+
+**When to use**: When there are many valid ways to express the correct answer and you want to allow flexibility in phrasing.
+
+**When NOT to use**: When you need to verify specific facts (use answerCorrectness or faithfulness), or need exact matches (use exactMatch).
+
+## Example
+
+```ts
+import { openai } from "@ai-sdk/openai";
+import { evalite } from "evalite";
+import { answerSimilarity } from "evalite/scorers";
+
+evalite("Answer Similarity", {
+ data: [
+ {
+ input: "What is the capital of France?",
+ expected: {
+ reference: "Paris is the capital of France.",
+ },
+ },
+ ],
+ task: async (input) => {
+ return "The capital city of France is Paris.";
+ },
+ scorers: [
+ {
+ scorer: ({ output, expected }) =>
+ answerSimilarity({
+ answer: output,
+ reference: expected.reference,
+ embeddingModel: openai.embedding("text-embedding-3-small"),
+ }),
+ },
+ ],
+});
+```
+
+## Signature
+
+```ts
+async function answerSimilarity(opts: {
+ answer: string;
+ reference: string;
+ embeddingModel: EmbeddingModel;
+}): Promise<{
+ name: string;
+ description: string;
+ score: number;
+}>;
+```
+
+## Parameters
+
+### answer
+
+**Type:** `string`
+
+The AI's answer to evaluate.
+
+### reference
+
+**Type:** `string`
+
+Reference answer for comparison (complete, accurate answer).
+
+### embeddingModel
+
+**Type:** `EmbeddingModel`
+
+Embedding model to use for semantic similarity. Supports any AI SDK embedding model.
+
+import { Aside } from "@astrojs/starlight/components";
+
+
+
+## Return Value
+
+Returns an object with:
+
+- `name`: "Answer Similarity"
+- `description`: Description of what was evaluated
+- `score`: Number between 0-1 (cosine similarity between embeddings)
+
+## See Also
+
+- [createScorer()](/api/create-scorer)
+- [exactMatch](/api/scorers/exact-match)
+- [answerCorrectness](/api/scorers/answer-correctness)
+- [SAS Paper](https://arxiv.org/pdf/2108.06130.pdf) - Research paper this scorer is based on
diff --git a/apps/evalite-docs/src/content/docs/api/scorers/contains.mdx b/apps/evalite-docs/src/content/docs/api/scorers/contains.mdx
new file mode 100644
index 00000000..1facc799
--- /dev/null
+++ b/apps/evalite-docs/src/content/docs/api/scorers/contains.mdx
@@ -0,0 +1,68 @@
+---
+title: contains
+---
+
+Checks if your AI's output contains the expected substring anywhere in the text. Returns 1 if found, 0 otherwise.
+
+**When to use**: To verify specific keywords or phrases appear in the response, regardless of surrounding text.
+
+**When NOT to use**: When you need exact matches (use exactMatch) or semantic similarity (use answerSimilarity).
+
+## Example
+
+```ts
+import { evalite } from "evalite";
+import { contains } from "evalite/scorers";
+
+evalite("Contains", {
+ data: [
+ {
+ input: "What is the capital of France?",
+ expected: {
+ reference: "Paris",
+ },
+ },
+ ],
+ task: async (input) => {
+ return "Paris is the capital of France";
+ },
+ scorers: [
+ {
+ scorer: ({ output, expected }) =>
+ contains({
+ actual: output,
+ expected: expected.reference,
+ }),
+ },
+ ],
+});
+```
+
+## Signature
+
+```ts
+async function contains(opts: { actual: string; expected: string }): Promise<{
+ name: string;
+ description: string;
+ score: number;
+}>;
+```
+
+## Parameters
+
+### actual
+
+**Type:** `string`
+
+The actual output to check.
+
+### expected
+
+**Type:** `string`
+
+Substring that should appear anywhere in output.
+
+## See Also
+
+- [createScorer()](/api/create-scorer)
+- [exactMatch](/api/scorers/exact-match)
diff --git a/apps/evalite-docs/src/content/docs/api/scorers/context-recall.mdx b/apps/evalite-docs/src/content/docs/api/scorers/context-recall.mdx
new file mode 100644
index 00000000..9fbb4b82
--- /dev/null
+++ b/apps/evalite-docs/src/content/docs/api/scorers/context-recall.mdx
@@ -0,0 +1,123 @@
+---
+title: contextRecall
+---
+
+Checks if your retrieval system (like RAG) is finding the right documents. Compares the correct answer to what's in your retrieved documents.
+
+**When to use**: To diagnose and improve your document retrieval. Helps identify when you're not fetching relevant documents. Low score means retrieval is missing important info. High score means you retrieved the right stuff.
+
+**When NOT to use**: If you don't have a retrieval system, or if your AI should use general knowledge beyond retrieved docs.
+
+## Example
+
+```ts
+import { openai } from "@ai-sdk/openai";
+import { evalite } from "evalite";
+import { contextRecall } from "evalite/scorers";
+
+evalite("RAG Context Recall", {
+ data: [
+ {
+ input: "When did the Space Shuttle program end?",
+ expected: {
+ answer:
+ "The Space Shuttle program ended in 2011 with the final flight of Atlantis on July 21, 2011.",
+ groundTruth: [
+ "NASA's Space Shuttle program operated from 1981 to 2011, completing 135 missions.",
+ "The final Space Shuttle mission was STS-135, flown by Atlantis in July 2011.",
+ ],
+ },
+ },
+ ],
+ task: async (input) => {
+ // Your RAG system here
+ return "The Space Shuttle program ended in 2011.";
+ },
+ scorers: [
+ {
+ scorer: ({ input, expected }) =>
+ contextRecall({
+ question: input,
+ answer: expected.answer,
+ groundTruth: expected.groundTruth,
+ model: openai("gpt-4o-mini"),
+ }),
+ },
+ ],
+});
+```
+
+## Signature
+
+```ts
+async function contextRecall(opts: {
+ question: string;
+ answer: string;
+ groundTruth: string[];
+ model: LanguageModel;
+}): Promise<{
+ name: string;
+ description: string;
+ score: number;
+ metadata: {
+ classifications: Array<{
+ statement: string;
+ reason: string;
+ attributed: number;
+ }>;
+ reason: string;
+ };
+}>;
+```
+
+## Parameters
+
+### question
+
+**Type:** `string`
+
+The question being asked.
+
+### answer
+
+**Type:** `string`
+
+The reference answer to evaluate against the retrieved context. Note: Only supports string output, not multi-turn.
+
+### groundTruth
+
+**Type:** `string[]`
+
+Array of retrieved context documents/passages that should support the reference answer.
+
+### model
+
+**Type:** `LanguageModel`
+
+Language model to use for evaluation.
+
+import { Aside } from "@astrojs/starlight/components";
+
+
+
+## Return Value
+
+Returns an object with:
+
+- `name`: "Context Recall"
+- `description`: Description of what was evaluated
+- `score`: Number between 0-1 (percentage of statements from reference answer attributed to retrieved contexts)
+- `metadata`: Object containing:
+ - `classifications`: Array of evaluations for each statement with `statement`, `reason`, and `attributed` fields
+ - `reason`: Summary string explaining the score
+
+## See Also
+
+- [createScorer()](/api/create-scorer)
+- [faithfulness](/api/scorers/faithfulness)
+- [answerCorrectness](/api/scorers/answer-correctness)
diff --git a/apps/evalite-docs/src/content/docs/api/scorers/exact-match.mdx b/apps/evalite-docs/src/content/docs/api/scorers/exact-match.mdx
new file mode 100644
index 00000000..5e856618
--- /dev/null
+++ b/apps/evalite-docs/src/content/docs/api/scorers/exact-match.mdx
@@ -0,0 +1,68 @@
+---
+title: exactMatch
+---
+
+Checks if your AI's output exactly matches the expected text character-for-character. Returns 1 if they match, 0 otherwise.
+
+**When to use**: For testing structured outputs like JSON, code, or specific phrases that must be exact.
+
+**When NOT to use**: When slight variations in wording are acceptable (use answerSimilarity instead).
+
+## Example
+
+```ts
+import { evalite } from "evalite";
+import { exactMatch } from "evalite/scorers";
+
+evalite("Exact Match", {
+ data: [
+ {
+ input: "What is the capital of France?",
+ expected: {
+ reference: "Paris is the capital of France",
+ },
+ },
+ ],
+ task: async (input) => {
+ return "Paris is the capital of France";
+ },
+ scorers: [
+ {
+ scorer: ({ output, expected }) =>
+ exactMatch({
+ actual: output,
+ expected: expected.reference,
+ }),
+ },
+ ],
+});
+```
+
+## Signature
+
+```ts
+async function exactMatch(opts: { actual: string; expected: string }): Promise<{
+ name: string;
+ description: string;
+ score: number;
+}>;
+```
+
+## Parameters
+
+### actual
+
+**Type:** `string`
+
+The actual output to check.
+
+### expected
+
+**Type:** `string`
+
+The exact string that output should match.
+
+## See Also
+
+- [createScorer()](/api/create-scorer)
+- [contains](/api/scorers/contains)
diff --git a/apps/evalite-docs/src/content/docs/api/scorers/faithfulness.mdx b/apps/evalite-docs/src/content/docs/api/scorers/faithfulness.mdx
new file mode 100644
index 00000000..6ef4c972
--- /dev/null
+++ b/apps/evalite-docs/src/content/docs/api/scorers/faithfulness.mdx
@@ -0,0 +1,114 @@
+---
+title: faithfulness
+---
+
+Checks if your AI is making things up or sticking to the provided information.
+
+**When to use**: Essential for RAG systems where accuracy matters (medical, legal, financial). Catches when your AI invents facts not in your documents.
+
+**When NOT to use**: If your AI should add knowledge beyond the context, or for creative tasks where invention is desired.
+
+## Example
+
+```ts
+import { openai } from "@ai-sdk/openai";
+import { evalite } from "evalite";
+import { faithfulness } from "evalite/scorers";
+
+evalite("RAG Faithfulness", {
+ data: [
+ {
+ input: "What programming languages does John know?",
+ expected: {
+ groundTruth: [
+ "John is a software engineer at XYZ Corp. He specializes in backend development using Python and Go. He has 5 years of experience in the industry.",
+ ],
+ },
+ },
+ ],
+ task: async () => {
+ return "John knows Python, Go, and JavaScript. He also invented TypeScript and works at Google.";
+ },
+ scorers: [
+ {
+ scorer: ({ input, output, expected }) =>
+ faithfulness({
+ question: input,
+ answer: output,
+ groundTruth: expected.groundTruth,
+ model: openai("gpt-4o-mini"),
+ }),
+ },
+ ],
+});
+```
+
+## Signature
+
+```ts
+function faithfulness(opts: {
+ question: string;
+ answer: string;
+ groundTruth: string[];
+ model: LanguageModel;
+}): Promise<{
+ name: string;
+ description: string;
+ score: number;
+ metadata: Array<{
+ statement: string;
+ reason: string;
+ verdict: number;
+ }>;
+}>;
+```
+
+## Parameters
+
+### question
+
+**Type:** `string`
+
+The question being asked.
+
+### answer
+
+**Type:** `string`
+
+The AI's answer to evaluate. Note: Only supports string output, not multi-turn.
+
+### groundTruth
+
+**Type:** `string[]`
+
+Array of source documents/passages that should support all claims.
+
+### model
+
+**Type:** `LanguageModel`
+
+Language model to use for evaluation.
+
+import { Aside } from "@astrojs/starlight/components";
+
+
+
+## Return Value
+
+Returns an object with:
+
+- `name`: "Faithfulness"
+- `description`: Description of what was evaluated
+- `score`: Number between 0-1 (percentage of claims supported by ground truth)
+- `metadata`: Array of verdicts for each statement with `statement`, `reason`, and `verdict` fields
+
+## See Also
+
+- [createScorer()](/api/create-scorer)
+- [contextRecall](/api/scorers/context-recall)
+- [answerCorrectness](/api/scorers/answer-correctness)
diff --git a/apps/evalite-docs/src/content/docs/api/scorers/index.mdx b/apps/evalite-docs/src/content/docs/api/scorers/index.mdx
new file mode 100644
index 00000000..a17bf75d
--- /dev/null
+++ b/apps/evalite-docs/src/content/docs/api/scorers/index.mdx
@@ -0,0 +1,55 @@
+---
+title: Built-in Scorers
+---
+
+Ready-to-use scorer functions for common evaluation patterns. Import from `evalite/scorers`.
+
+```ts
+import { exactMatch, faithfulness, toolCallAccuracy } from "evalite/scorers";
+```
+
+Evalite's built-in scorers are deeply integrated with the [Vercel AI SDK](https://sdk.vercel.ai/), making it easy to evaluate LLM outputs using standardized models and embeddings. Many scorers require AI SDK models for LLM-based evaluation.
+
+## String Scorers
+
+Simple deterministic scorers for text matching. No AI SDK required.
+
+- [**exactMatch**](/api/scorers/exact-match) - Exact string comparison
+- [**contains**](/api/scorers/contains) - Substring matching
+
+## RAG Scorers
+
+LLM-based scorers for evaluating retrieval-augmented generation systems. These require AI SDK models.
+
+- [**faithfulness**](/api/scorers/faithfulness) - Detects hallucinations by checking if answers stick to provided context
+- [**answerSimilarity**](/api/scorers/answer-similarity) - Measures semantic similarity between answers using embeddings
+- [**answerCorrectness**](/api/scorers/answer-correctness) - Comprehensive evaluation combining factual accuracy and semantic similarity
+- [**answerRelevancy**](/api/scorers/answer-relevancy) - Checks if AI actually answered the question (vs going off-topic)
+- [**contextRecall**](/api/scorers/context-recall) - Evaluates if retrieval system found the right documents
+
+## Advanced Scorers
+
+Specialized scorers for specific use cases.
+
+- [**toolCallAccuracy**](/api/scorers/tool-call-accuracy) - Verifies AI is calling correct functions with correct parameters
+- [**noiseSensitivity**](/api/scorers/noise-sensitivity) - Diagnoses if AI is misled by irrelevant documents in RAG systems
+
+## Quick Reference
+
+| Scorer | AI SDK Required | Use Case |
+| ----------------- | ---------------------- | ----------------------------- |
+| exactMatch | No | Exact string matching |
+| contains | No | Substring matching |
+| faithfulness | Yes (LLM) | RAG hallucination detection |
+| answerSimilarity | Yes (Embeddings) | Semantic similarity |
+| answerCorrectness | Yes (LLM + Embeddings) | Comprehensive evaluation |
+| answerRelevancy | Yes (LLM + Embeddings) | On-topic checking |
+| contextRecall | Yes (LLM) | Retrieval quality |
+| toolCallAccuracy | No | Function calling verification |
+| noiseSensitivity | Yes (LLM) | RAG debugging |
+
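+A single eval can combine a deterministic string scorer with an LLM-based one. The sketch below reuses the patterns from the individual scorer pages; the shape of `expected` is just for illustration:
+
+```ts
+import { openai } from "@ai-sdk/openai";
+import { evalite } from "evalite";
+import { exactMatch, faithfulness } from "evalite/scorers";
+
+evalite("Mixed Scorers", {
+  data: [
+    {
+      input: "What is the capital of France?",
+      expected: {
+        reference: "Paris is the capital of France",
+        groundTruth: ["Paris is the capital and largest city of France."],
+      },
+    },
+  ],
+  task: async () => {
+    return "Paris is the capital of France";
+  },
+  scorers: [
+    {
+      // Deterministic string check: no model needed
+      scorer: ({ output, expected }) =>
+        exactMatch({ actual: output, expected: expected.reference }),
+    },
+    {
+      // LLM-based hallucination check: requires an AI SDK model
+      scorer: ({ input, output, expected }) =>
+        faithfulness({
+          question: input,
+          answer: output,
+          groundTruth: expected.groundTruth,
+          model: openai("gpt-4o-mini"),
+        }),
+    },
+  ],
+});
+```
+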
+## See Also
+
+- [Scorers Guide](/guides/scorers) - Learn how to create custom scorers
+- [createScorer()](/api/create-scorer) - API for building reusable scorers
+- [Vercel AI SDK](/tips/vercel-ai-sdk) - AI SDK integration guide
diff --git a/apps/evalite-docs/src/content/docs/api/scorers/noise-sensitivity.mdx b/apps/evalite-docs/src/content/docs/api/scorers/noise-sensitivity.mdx
new file mode 100644
index 00000000..ae32bfa4
--- /dev/null
+++ b/apps/evalite-docs/src/content/docs/api/scorers/noise-sensitivity.mdx
@@ -0,0 +1,153 @@
+---
+title: noiseSensitivity
+---
+
+Checks if your AI is being misled by irrelevant documents in retrieval results.
+
+**When to use**: For debugging RAG systems with accuracy issues. It helps diagnose whether problems come from retrieving bad documents or from your AI misusing the good documents it retrieved.
+
+**When NOT to use**: Skip for non-RAG systems, or when you haven't identified accuracy issues yet (start with faithfulness first).
+
+## Example
+
+```ts
+import { openai } from "@ai-sdk/openai";
+import { evalite } from "evalite";
+import { noiseSensitivity } from "evalite/scorers";
+
+evalite("RAG Noise Sensitivity", {
+ data: [
+ {
+ input: "What is the capital of France?",
+ expected: {
+ reference: "Paris is the capital of France.",
+ groundTruth: [
+ "Paris is the capital and largest city of France. It is located in the north-central part of the country.",
+ "Lyon is the third-largest city in France and an important cultural center.",
+ "Marseille is a major French port city on the Mediterranean coast.",
+ ],
+ },
+ },
+ ],
+ task: async () => {
+ return "Lyon is the capital of France. Paris is the largest city in France.";
+ },
+ scorers: [
+ {
+ scorer: ({ input, output, expected }) =>
+ noiseSensitivity({
+ question: input,
+ answer: output,
+ reference: expected.reference,
+ groundTruth: expected.groundTruth,
+ model: openai("gpt-4o-mini"),
+ mode: "relevant",
+ }),
+ },
+ ],
+});
+```
+
+## Signature
+
+```ts
+function noiseSensitivity(opts: {
+ question: string;
+ answer: string;
+ reference: string;
+ groundTruth: string[];
+ model: LanguageModel;
+ mode?: "relevant" | "irrelevant";
+}): Promise<{
+ name: string;
+ description: string;
+ score: number;
+ metadata: {
+ referenceStatements: string[];
+ answerStatements: string[];
+ incorrectStatements: string[];
+ relevantContextIndices: number[];
+ irrelevantContextIndices: number[];
+ mode: "relevant" | "irrelevant";
+ retrievedToGroundTruth: boolean[][];
+ retrievedToAnswer: boolean[][];
+ groundTruthToAnswer: boolean[];
+ };
+}>;
+```
+
+## Parameters
+
+### question
+
+**Type:** `string`
+
+The question being asked.
+
+### answer
+
+**Type:** `string`
+
+The AI's answer to evaluate. Note: Only supports string output, not multi-turn.
+
+### reference
+
+**Type:** `string`
+
+Correct/reference answer to the question.
+
+### groundTruth
+
+**Type:** `string[]`
+
+Array of retrieved context documents.
+
+### model
+
+**Type:** `LanguageModel`
+
+Language model to use for evaluation.
+
+### mode
+
+**Type:** `"relevant" | "irrelevant"`
+**Default:** `"relevant"`
+
+Evaluation mode:
+
+- `"relevant"`: Measures how often your AI makes mistakes even when correct docs are present
+- `"irrelevant"`: Measures how often your AI is confused by wrong/irrelevant documents
+
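+Here is a sketch of the same evaluation in `irrelevant` mode, reusing the data from the example above to measure how much the off-topic documents mislead the answer:
+
+```ts
+import { openai } from "@ai-sdk/openai";
+import { noiseSensitivity } from "evalite/scorers";
+
+// Same data as the example above, but scoring how much the
+// irrelevant documents (Lyon, Marseille) mislead the answer.
+const result = await noiseSensitivity({
+  question: "What is the capital of France?",
+  answer: "Lyon is the capital of France. Paris is the largest city in France.",
+  reference: "Paris is the capital of France.",
+  groundTruth: [
+    "Paris is the capital and largest city of France. It is located in the north-central part of the country.",
+    "Lyon is the third-largest city in France and an important cultural center.",
+    "Marseille is a major French port city on the Mediterranean coast.",
+  ],
+  model: openai("gpt-4o-mini"),
+  mode: "irrelevant",
+});
+
+console.log(result.score, result.metadata.irrelevantContextIndices);
+```
+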
+import { Aside } from "@astrojs/starlight/components";
+
+
+
+## Return Value
+
+Returns an object with:
+
+- `name`: "Noise Sensitivity"
+- `description`: Description of what was evaluated
+- `score`: Number between 0-1 (percentage of incorrect statements attributed to relevant or irrelevant contexts, depending on mode)
+- `metadata`: Extensive attribution data including:
+ - `referenceStatements`: Statements decomposed from reference answer
+ - `answerStatements`: Statements decomposed from AI answer
+ - `incorrectStatements`: AI statements not supported by reference
+ - `relevantContextIndices`: Indices of contexts containing reference information
+ - `irrelevantContextIndices`: Indices of contexts not containing reference information
+ - `mode`: Mode used for evaluation
+ - `retrievedToGroundTruth`: Boolean matrix mapping contexts to reference statements
+ - `retrievedToAnswer`: Boolean matrix mapping contexts to answer statements
+ - `groundTruthToAnswer`: Boolean array mapping reference to answer statements
+
+## See Also
+
+- [createScorer()](/api/create-scorer)
+- [faithfulness](/api/scorers/faithfulness)
+- [contextRecall](/api/scorers/context-recall)
+- [answerCorrectness](/api/scorers/answer-correctness)
diff --git a/apps/evalite-docs/src/content/docs/api/scorers/tool-call-accuracy.mdx b/apps/evalite-docs/src/content/docs/api/scorers/tool-call-accuracy.mdx
new file mode 100644
index 00000000..a6990961
--- /dev/null
+++ b/apps/evalite-docs/src/content/docs/api/scorers/tool-call-accuracy.mdx
@@ -0,0 +1,235 @@
+---
+title: toolCallAccuracy
+---
+
+Checks if your AI is calling the right functions with the right parameters.
+
+**When to use**: For AI agents that need to call external functions/APIs. Essential for:
+
+- Multi-step agents that orchestrate API calls
+- Assistants that execute user commands via tools
+- Workflow automation systems
+- Testing function calling reliability
+
+**When NOT to use**: For simple text generation tasks without function calling. Use `exactMatch` or `answerSimilarity` instead.
+
+## Example
+
+```ts
+import { openai } from "@ai-sdk/openai";
+import { generateText, tool } from "ai";
+import { evalite } from "evalite";
+import { toolCallAccuracy } from "evalite/scorers";
+import { z } from "zod";
+
+evalite("Tool Calls", {
+ data: [
+ {
+ input: "What calendar events do I have today?",
+ expected: [
+ {
+ toolName: "getCalendarEvents",
+ input: {
+ date: "2024-04-27",
+ },
+ },
+ ],
+ },
+ ],
+ task: async (input) => {
+ const result = await generateText({
+ model: openai("gpt-4o-mini"),
+ prompt: input,
+ tools: {
+ getCalendarEvents: tool({
+ description: "Get calendar events",
+ inputSchema: z.object({
+ date: z.string(),
+ }),
+ execute: async ({ date }) => {
+ return `You have a meeting with John on ${date}`;
+ },
+ }),
+ },
+ system: `You are a helpful assistant. Today's date is 2024-04-27.`,
+ });
+
+ return result.toolCalls;
+ },
+ scorers: [
+ {
+ scorer: ({ output, expected }) => {
+ return toolCallAccuracy({
+ actualCalls: output,
+ expectedCalls: expected,
+ });
+ },
+ },
+ ],
+});
+```
+
+## Signature
+
+```ts
+async function toolCallAccuracy(opts: {
+ actualCalls: ToolCall[];
+ expectedCalls: ToolCall[];
+ mode?: "exact" | "flexible";
+ weights?: Partial<{
+ exact: number;
+ nameOnly: number;
+ extraPenalty: number;
+ wrongPenalty: number;
+ }>;
+}): Promise<{
+ name: string;
+ description: string;
+ score: number;
+ metadata: {
+ mode: "exact" | "flexible";
+ exactMatches: number;
+ nameOnlyMatches: number;
+ // Mode-specific fields
+ };
+}>;
+
+type ToolCall = {
+ toolName: string;
+ input?: unknown;
+};
+```
+
+## Parameters
+
+### actualCalls
+
+**Type:** `ToolCall[]`
+
+The actual tool calls made by the AI. Typically the `toolCalls` array from your AI SDK response.
+
+### expectedCalls
+
+**Type:** `ToolCall[]`
+
+Expected tool calls as an array of objects with `toolName` and optional `input` properties.
+
+### mode
+
+**Type:** `"exact" | "flexible"`
+**Default:** `"exact"`
+
+Comparison mode:
+
+- `"exact"`: Calls must happen in exact order with exact arguments
+- `"flexible"`: Calls can happen in any order, only checks correct functions were called
+
+### weights
+
+**Type:** `Partial<{ exact: number; nameOnly: number; extraPenalty: number; wrongPenalty: number }>`
+
+Custom weights for scoring components:
+
+- `exact` (default: 1.0) - Full credit for correct function + correct arguments
+- `nameOnly` (default: 0.5) - Partial credit for correct function + wrong arguments
+- `extraPenalty` (default: 0.25) - Penalty for extra calls (flexible mode only)
+- `wrongPenalty` (default: 0.25) - Penalty for wrong/missing calls (exact mode only)
+
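+For example, here is a sketch of overriding the defaults to give less partial credit for wrong arguments (the weight values are illustrative):
+
+```ts
+import { toolCallAccuracy } from "evalite/scorers";
+
+const result = await toolCallAccuracy({
+  actualCalls: [{ toolName: "createTask", input: { title: "Buy bread" } }],
+  expectedCalls: [{ toolName: "createTask", input: { title: "Buy milk" } }],
+  mode: "exact",
+  weights: {
+    // Less partial credit when the right tool is called with the wrong arguments
+    nameOnly: 0.25,
+    // Stronger penalty for wrong or missing calls (exact mode)
+    wrongPenalty: 0.5,
+  },
+});
+```
+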
+## Scoring Modes
+
+### Exact Mode
+
+Enforces strict ordering. Calls must appear in the same sequence as expected.
+
+```ts
+toolCallAccuracy({
+ actualCalls: [
+ { toolName: "getTasks" },
+ { toolName: "createTask", input: { title: "Buy milk" } },
+ ],
+ expectedCalls: [
+ { toolName: "getTasks" },
+ { toolName: "createTask", input: { title: "Buy milk" } },
+ ],
+ mode: "exact",
+});
+// Score: 1.0 (perfect match)
+```
+
+### Flexible Mode
+
+Allows calls in any order. Useful for multi-step agents where execution order varies.
+
+```ts
+toolCallAccuracy({
+ actualCalls: [
+ { toolName: "createTask", input: { title: "Buy milk" } },
+ { toolName: "getTasks" }, // Order swapped
+ ],
+ expectedCalls: [
+ { toolName: "getTasks" },
+ { toolName: "createTask", input: { title: "Buy milk" } },
+ ],
+ mode: "flexible",
+});
+// Score: 1.0 (still perfect - order doesn't matter)
+```
+
+## Return Value
+
+Returns an object with:
+
+- `name`: "Tool Call Accuracy"
+- `description`: "Checks if the tool calls are correct"
+- `score`: Number between 0-1
+- `metadata`: Object with match counts and mode-specific details
+
+### Exact Mode Metadata
+
+```ts
+{
+ mode: "exact",
+ exactMatches: number,
+ nameOnlyMatches: number,
+ wrongOrMissing: number,
+ totals: {
+ reference: number,
+ output: number
+ }
+}
+```
+
+### Flexible Mode Metadata
+
+```ts
+{
+ mode: "flexible",
+ exactMatches: number,
+ nameOnlyMatches: number,
+ extras: number,
+ missing: number,
+ totals: {
+ reference: number,
+ output: number
+ },
+ details: {
+ matches: ToolCall[],
+ nameOnlyMatches: ToolCall[],
+ extras: ToolCall[],
+ missingToolCalls: ToolCall[]
+ }
+}
+```
+
+import { Aside } from "@astrojs/starlight/components";
+
+
+
+## See Also
+
+- [createScorer()](/api/create-scorer)
+- [Vercel AI SDK](/tips/vercel-ai-sdk)
+- [exactMatch](/api/scorers/exact-match)
diff --git a/apps/evalite-docs/src/content/docs/guides/cli.mdx b/apps/evalite-docs/src/content/docs/guides/cli.mdx
deleted file mode 100644
index b581c2a4..00000000
--- a/apps/evalite-docs/src/content/docs/guides/cli.mdx
+++ /dev/null
@@ -1,90 +0,0 @@
----
-title: CLI
----
-
-## Watch Mode
-
-You can run Evalite in watch mode by running `evalite watch`:
-
-```bash
-evalite watch
-```
-
-This will watch for changes to your `.eval.ts` files and re-run the evals when they change.
-
-> [!IMPORTANT]
->
-> I strongly recommend implementing a caching layer in your LLM calls when using watch mode. This will keep your evals running fast and avoid burning through your API credits.
-
-### Hiding the Table Output
-
-When debugging with `console.log`, the detailed table output can make it harder to see your logs. You can hide it with `--hideTable`:
-
-```bash
-evalite watch --hideTable
-```
-
-This keeps the score summary but removes the detailed results table from the CLI output.
-
-## Serve Mode
-
-You can run evals once and serve the UI without re-running on file changes:
-
-```bash
-evalite serve
-```
-
-This runs your evals once and keeps the UI server running at `http://localhost:3006`. Unlike watch mode, tests won't re-run when files change.
-
-Since evals can take a while to run, this can be a useful alternative to watch mode.
-
-To re-run evals after making changes, restart `evalite serve`.
-
-## Running Specific Files
-
-You can run specific files by passing them as arguments:
-
-```bash
-evalite my-eval.eval.ts
-```
-
-This also works for `watch` and `serve` modes:
-
-```bash
-evalite watch my-eval.eval.ts
-evalite serve my-eval.eval.ts
-```
-
-## Threshold
-
-You can tell Evalite that your evals must pass a specific score by passing `--threshold`:
-
-```bash
-evalite --threshold=50 # Score must be greater than or equal to 50
-
-evalite watch --threshold=70 # Also works in watch mode
-```
-
-This is useful for running on CI. If the score threshold is not met, it will fail the process.
-
-## Export Command
-
-Export eval results as a static HTML bundle:
-
-```bash
-evalite export
-```
-
-This exports the latest run to `./evalite-export` by default.
-
-### Options
-
-- `--output` - Custom output directory
-- `--runId` - Export specific run ID
-- `--basePath` - Base path for non-root hosting (must start with `/`)
-
-```bash
-evalite export --basePath=/evals-123 --output=./my-export
-```
-
-See the [CI/CD guide](/guides/ci) for full documentation on exporting and viewing static UI bundles.
diff --git a/apps/evalite-docs/src/content/docs/guides/environment-variables.mdx b/apps/evalite-docs/src/content/docs/guides/environment-variables.mdx
deleted file mode 100644
index 24e15b49..00000000
--- a/apps/evalite-docs/src/content/docs/guides/environment-variables.mdx
+++ /dev/null
@@ -1,47 +0,0 @@
----
-title: Environment Variables
----
-
-import { Steps } from "@astrojs/starlight/components";
-
-To call your LLM from a third-party service, you'll likely need some environment variables to keep your API keys safe.
-
-## Setting Up Env Variables
-
-
-
-1. Create a `.env` file in the root of your project:
-
- ```
- // .env
- OPENAI_API_KEY=your-api-key
- ```
-
-2. Add `.env` to your `.gitignore`, if it's not already there
-
- ```
- // .gitignore
- .env
- ```
-
-3. Install `dotenv`:
-
- ```bash
- pnpm add -D dotenv
- ```
-
-4. Add an `evalite.config.ts` file:
-
- ```ts
- // evalite.config.ts
-
- import { defineConfig } from "evalite/config";
-
- export default defineConfig({
- setupFiles: ["dotenv/config"],
- });
- ```
-
-
-
-Now, your environment variables will be available in your evals.
diff --git a/apps/evalite-docs/src/content/docs/guides/running-programmatically.mdx b/apps/evalite-docs/src/content/docs/guides/running-programmatically.mdx
deleted file mode 100644
index 53252ea1..00000000
--- a/apps/evalite-docs/src/content/docs/guides/running-programmatically.mdx
+++ /dev/null
@@ -1,119 +0,0 @@
----
-title: Running Programmatically
----
-
-You can run Evalite programmatically using the Node API. This is useful when you want to integrate Evalite into your own scripts, CI/CD pipelines, or custom tooling.
-
-## Basic Usage
-
-Import the `runEvalite` function from `evalite/runner`:
-
-```typescript
-import { runEvalite } from "evalite/runner";
-
-await runEvalite({
- mode: "run-once-and-exit",
-});
-```
-
-That's it! The `path` and `cwd` parameters are optional and default to running all evals in the current directory.
-
-## Run Modes
-
-### Run Once and Exit
-
-This mode runs all evals once and exits. It's ideal for CI/CD pipelines:
-
-```typescript
-await runEvalite({
- mode: "run-once-and-exit",
-});
-```
-
-### Watch Mode
-
-This mode watches for file changes and re-runs evals automatically. It also starts the Evalite UI server:
-
-```typescript
-await runEvalite({
- mode: "watch-for-file-changes",
-});
-```
-
-> [!IMPORTANT]
->
-> I strongly recommend implementing a caching layer in your LLM calls when using watch mode. This will keep your evals running fast and avoid burning through your API credits.
-
-## Options
-
-### `path`
-
-Optional path filter to run specific eval files. Defaults to `undefined` (runs all evals):
-
-```typescript
-await runEvalite({
- path: "my-eval.eval.ts",
- mode: "run-once-and-exit",
-});
-```
-
-### `cwd`
-
-The working directory to run evals from. Defaults to `process.cwd()`:
-
-```typescript
-await runEvalite({
- cwd: "/path/to/my/project",
- mode: "run-once-and-exit",
-});
-```
-
-### `scoreThreshold`
-
-Set a minimum score threshold (0-100). If the average score falls below this threshold, the process will exit with a non-zero exit code:
-
-```typescript
-await runEvalite({
- mode: "run-once-and-exit",
- scoreThreshold: 80, // Fail if score is below 80
-});
-```
-
-This is particularly useful for CI/CD pipelines where you want to fail the build if evals don't meet a quality threshold.
-
-### `outputPath`
-
-Export the results to a JSON file after the run completes:
-
-```typescript
-await runEvalite({
- mode: "run-once-and-exit",
- outputPath: "./results.json",
-});
-```
-
-The exported JSON file contains the complete run data including all evals, results, scores, and traces.
-
-## Complete Example
-
-Here's a complete example that combines multiple options:
-
-```typescript
-import { runEvalite } from "evalite/runner";
-
-async function runEvals() {
- try {
- await runEvalite({
- mode: "run-once-and-exit",
- scoreThreshold: 75, // Fail if average score < 75
- outputPath: "./evalite-results.json", // Export results
- });
- console.log("All evals passed!");
- } catch (error) {
- console.error("Evals failed:", error);
- process.exit(1);
- }
-}
-
-runEvals();
-```
diff --git a/apps/evalite-docs/src/content/docs/guides/scorers.mdx b/apps/evalite-docs/src/content/docs/guides/scorers.mdx
deleted file mode 100644
index 5ff16b80..00000000
--- a/apps/evalite-docs/src/content/docs/guides/scorers.mdx
+++ /dev/null
@@ -1,246 +0,0 @@
----
-title: Scorers
----
-
-import { Aside } from "@astrojs/starlight/components";
-
-Scorers are used to score the output of your LLM call.
-
-[Autoevals](https://github.com/braintrustdata/autoevals) is a great library of scorers to get you started.
-
-## Inline Scorers
-
-If you don't need your scorer to be reusable, you can define it inline.
-
-```ts
-import { evalite } from "evalite";
-
-evalite("My Eval", {
- data: [{ input: "Hello" }],
- task: async (input) => {
- return input + " World!";
- },
- scorers: [
- {
- name: "Contains Paris",
- description: "Checks if the output contains the word 'Paris'.",
- scorer: ({ output }) => {
- return output.includes("Paris") ? 1 : 0;
- },
- },
- ],
-});
-```
-
-## Creating Reusable Scorers
-
-If you have a scorer you want to use across multiple files, you can use `createScorer` to create a reusable scorer.
-
-```ts
-import { createScorer } from "evalite";
-
-const containsParis = createScorer({
- name: "Contains Paris",
- description: "Checks if the output contains the word 'Paris'.",
- scorer: ({ output }) => {
- return output.includes("Paris") ? 1 : 0;
- },
-});
-
-evalite("My Eval", {
- data: [{ input: "Hello" }],
- task: async (input) => {
- return input + " World!";
- },
- scorers: [containsParis],
-});
-```
-
-The `name` and `description` of the scorer will be displayed in the Evalite UI.
-
-## Score Properties
-
-The `score` function receives three properties on the object passed:
-
-```ts
-import { createScorer } from "evalite";
-
-const containsParis = createScorer({
- name: "Contains Paris",
- description: "Checks if the output contains the word 'Paris'.",
- scorer: ({ input, output, expected }) => {
- // input comes from `data`
- // expected also comes from `data`
- // output is the output of `task`
- return output.includes("Paris") ? 1 : 0;
- },
-});
-```
-
-These are typed using the three type arguments passed to `createScorer`:
-
-```ts
-import { createScorer } from "evalite";
-
-const containsParis = createScorer<
- string, // Type of 'input'
- string, // Type of 'output'
- string // Type of 'expected'
->({
- name: "Contains Word",
- description: "Checks if the output contains the specified word.",
- scorer: ({ output, input, expected }) => {
- // output is typed as string!
- return output.includes(expected) ? 1 : 0;
- },
-});
-```
-
-If `expected` is omitted, it will be inferred from the type of `output`.
-
-## Scorer Metadata
-
-You can provide metadata along with your custom scorer:
-
-```ts
-import { createScorer } from "evalite";
-
-const containsParis = createScorer({
- name: "Contains Paris",
- description: "Checks if the output contains the word 'Paris'.",
- scorer: (output) => {
- return {
- score: output.includes("Paris") ? 1 : 0,
- metadata: {
- // Can be anything!
- },
- };
- },
-});
-```
-
-This will be visible along with the score in the Evalite UI.
-
-
-
-## Creating LLM-As-A-Judge Scorers
-
-Here is a brief guide on building your own LLM-as-a-judge scorer.
-
-We're looking to improve this feature with a first-class guide in the future.
-
-```ts
-import { openai } from "@ai-sdk/openai";
-import { generateObject } from "ai";
-import { createScorer } from "evalite";
-import { z } from "zod";
-
-/**
- * Factuality scorer using OpenAI's GPT-4o model.
- */
-export const Factuality = createScorer({
- name: "Factuality",
- scorer: async ({ input, expected, output }) => {
- return checkFactuality({
- question: input,
- groundTruth: expected!,
- submission: output,
- });
- },
-});
-
-/**
- * Checks the factuality of a submission, using
- * OpenAI's GPT-4o model.
- */
-const checkFactuality = async (opts: {
- question: string;
- groundTruth: string;
- submission: string;
-}) => {
- const { object } = await generateObject({
- model: openai("gpt-4o-2024-11-20"),
- /**
- * Prompt taken from autoevals:
- *
- * {@link https://github.com/braintrustdata/autoevals/blob/5aa20a0a9eb8fc9e07e9e5722ebf71c68d082f32/templates/factuality.yaml}
- */
- prompt: `
- You are comparing a submitted answer to an expert answer on a given question. Here is the data:
- [BEGIN DATA]
- ************
- [Question]: ${opts.question}
- ************
- [Expert]: ${opts.groundTruth}
- ************
- [Submission]: ${opts.submission}
- ************
- [END DATA]
-
- Compare the factual content of the submitted answer with the expert answer. Ignore any differences in style, grammar, or punctuation.
- The submitted answer may either be a subset or superset of the expert answer, or it may conflict with it. Determine which case applies. Answer the question by selecting one of the following options:
- (A) The submitted answer is a subset of the expert answer and is fully consistent with it.
- (B) The submitted answer is a superset of the expert answer and is fully consistent with it.
- (C) The submitted answer contains all the same details as the expert answer.
- (D) There is a disagreement between the submitted answer and the expert answer.
- (E) The answers differ, but these differences don't matter from the perspective of factuality.
- `,
- schema: z.object({
- answer: z.enum(["A", "B", "C", "D", "E"]).describe("Your selection."),
- rationale: z
- .string()
- .describe("Why you chose this answer. Be very detailed."),
- }),
- });
-
- /**
- * LLM's are well documented at being poor at generating
- */
- const scores = {
- A: 0.4,
- B: 0.6,
- C: 1,
- D: 0,
- E: 1,
- };
-
- return {
- score: scores[object.answer],
- metadata: {
- rationale: object.rationale,
- },
- };
-};
-
-/**
- * Use the Factuality eval like so:
- *
- * 1. The input (in data()) is a question.
- * 2. The expected output is the ground truth answer to the question.
- * 3. The output is the text to be evaluated.
- */
-evalite("Factuality", {
- data: [
- {
- // The question
- input: "What is the capital of France?",
-
- // The expected answer
- expected: "Paris",
- },
- ],
- task: async (input) => {
- // Technically correct, but a nightmare for non-LLM
- // scorers to evaluate.
- return (
- "The capital of France is a city that starts " +
- "with a letter P, and ends in 'aris'."
- );
- },
- scorers: [Factuality],
-});
-```
diff --git a/apps/evalite-docs/src/content/docs/guides/skipping.mdx b/apps/evalite-docs/src/content/docs/guides/skipping.mdx
deleted file mode 100644
index a443da01..00000000
--- a/apps/evalite-docs/src/content/docs/guides/skipping.mdx
+++ /dev/null
@@ -1,35 +0,0 @@
----
-title: Skipping Evals
----
-
-## Skipping Entire Evals
-
-You can use `evalite.skip()` to skip an entire eval without running it.
-
-```ts
-evalite.skip("My Eval", {
- data: () => [],
- task: () => {},
-});
-```
-
-This is useful when you want to temporarily disable an eval during development or testing.
-
-## Focusing on Specific Evals
-
-You can use the `only` flag on data entries to focus on specific inputs during development.
-
-```ts
-evalite("My Eval", {
- data: () => [
- { input: "test1", expected: "output1" },
- { input: "test2", expected: "output2", only: true },
- { input: "test3", expected: "output3" },
- ],
- task: async (input) => {
- // Only runs for "test2"
- },
-});
-```
-
-When any data entry has `only: true`, only those evals will be run.
diff --git a/apps/evalite-docs/src/content/docs/guides/storage.mdx b/apps/evalite-docs/src/content/docs/guides/storage.mdx
new file mode 100644
index 00000000..9c59adc8
--- /dev/null
+++ b/apps/evalite-docs/src/content/docs/guides/storage.mdx
@@ -0,0 +1,67 @@
+---
+title: Storage
+---
+
+You'll often want to persist the results of your evals so that you can compare results across time or share them with your team.
+
+Evalite uses **storage adapters** to persist your results. By default, Evalite stores results in memory, which is fast but ephemeral. For persistent storage across runs, you can use SQLite or build a custom storage adapter.
+
+## Default: In-Memory Storage
+
+Evalite uses in-memory storage by default. This is the recommended option for most users because:
+
+- **Fast**: No disk I/O overhead
+- **Simple**: Zero configuration required
+- **Ideal for local development**: Quick iteration without cluttering your filesystem
+
+Data is lost when the process exits, which keeps your workspace clean during active development.
+
+## Using SQLite Storage
+
+For persistent storage across runs, use the SQLite adapter. This is useful for:
+
+- **Compare runs over time**: Track how changes affect eval performance
+- **CI/CD environments**: Preserve results for reporting or archival
+- **Historical analysis**: Review past eval results
+
+### Setting Up SQLite Storage
+
+First, install the required peer dependency:
+
+```bash
+pnpm add -D better-sqlite3
+```
+
+Then configure Evalite to use SQLite storage:
+
+```ts
+// evalite.config.ts
+import { defineConfig } from "evalite/config";
+import { createSqliteStorage } from "evalite/sqlite-storage";
+
+export default defineConfig({
+ storage: () => createSqliteStorage("./evalite.db"),
+});
+```
+
+The database file will be created automatically at the specified path. Parent directories will also be created if they don't exist.
+
+## Custom Storage Adapters
+
+You can build your own storage adapter to integrate with remote databases, cloud storage, or custom data pipelines. This is useful for:
+
+- **Remote databases**: PostgreSQL, MySQL, MongoDB, etc.
+- **Enterprise requirements**: Integration with existing data infrastructure
+- **Custom workflows**: Specialized data processing or reporting
+
+To create a custom adapter, implement the `Evalite.Storage` interface. This interface defines methods for creating and querying five entity types:
+
+- **Runs**: Full or partial test runs
+- **Suites**: Test suites within runs
+- **Evals**: Individual eval executions
+- **Scores**: Scorer results for evals
+- **Traces**: Nested LLM call traces
+
+See the [Storage API Reference](/api/storage) for the complete interface definition, entity schemas, and implementation examples.
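+
+Wiring a custom adapter into the config mirrors the SQLite setup above. The following is only a sketch: `createMyPostgresStorage` is a hypothetical factory in your own code that returns an `Evalite.Storage` implementation.
+
+```ts
+// evalite.config.ts
+import { defineConfig } from "evalite/config";
+// Hypothetical: a module in your project that returns an Evalite.Storage implementation
+import { createMyPostgresStorage } from "./my-postgres-storage";
+
+export default defineConfig({
+  storage: () =>
+    createMyPostgresStorage({ connectionString: process.env.DATABASE_URL }),
+});
+```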
+
+In the future, the Evalite team plans to integrate with popular observability platforms, databases, and more. If you have a specific request, please [open an issue](https://github.com/mattpocock/evalite/issues/new) or implement a storage adapter yourself!
diff --git a/apps/evalite-docs/src/content/docs/guides/streams.md b/apps/evalite-docs/src/content/docs/guides/streams.md
deleted file mode 100644
index ce5fa063..00000000
--- a/apps/evalite-docs/src/content/docs/guides/streams.md
+++ /dev/null
@@ -1,26 +0,0 @@
----
-title: Streams
----
-
-You can handle streams in Evalite by returning any async iterable (including a `ReadableStream`) from your task. This means you can test functions like the AI SDK `streamText` function easily:
-
-```ts
-import { evalite } from "evalite";
-import { streamText } from "ai";
-import { openai } from "@ai-sdk/openai";
-import { Factuality } from "autoevals";
-
-evalite("My Eval", {
- data: [{ input: "What is the capital of France?", expected: "Paris" }],
- task: async (input) => {
- const result = await streamText({
- model: openai("your-model"),
- system: `Answer the question concisely.`,
- prompt: input,
- });
-
- return result.textStream;
- },
- scorers: [Factuality],
-});
-```
diff --git a/apps/evalite-docs/src/content/docs/index.mdx b/apps/evalite-docs/src/content/docs/index.mdx
deleted file mode 100644
index c4ed5224..00000000
--- a/apps/evalite-docs/src/content/docs/index.mdx
+++ /dev/null
@@ -1,39 +0,0 @@
----
-title: Test AI-powered apps in TypeScript
-description: Evalite makes evals simple. Test your AI-powered apps with a local dev server.
-template: splash
-hero:
- tagline: Evals are hard. Evalite makes them simple. Test your AI-powered apps with a local dev server.
- image:
- html:
- actions:
- - text: Get Started
- link: /quickstart/
- icon: right-arrow
----
-
-import { Card, CardGrid } from "@astrojs/starlight/components";
-
-## Features
-
-
-
- `.eval.ts` is the new `.test.ts`. Write evals in TypeScript with an
- instantly familiar API.
-
-
- Explore your outputs, traces and logs in a beautiful, interactive UI that
- runs on `localhost`.
-
-
- Works out of the box with your stack. Use mocks, fixtures, and more to
- simulate real-world scenarios.
-
-
- Work with any LLM, and test outputs against each other seamlessly.
-
-
- Export your evals as static HTML bundles for CI/CD. Use score thresholds to
- fail the build.
-
-
diff --git a/apps/evalite-docs/src/content/docs/quickstart.mdx b/apps/evalite-docs/src/content/docs/quickstart.mdx
deleted file mode 100644
index c26997da..00000000
--- a/apps/evalite-docs/src/content/docs/quickstart.mdx
+++ /dev/null
@@ -1,104 +0,0 @@
----
-title: Quickstart
-description: A guide in my new Starlight docs site.
----
-
-import { Aside, Steps } from "@astrojs/starlight/components";
-
-We're going to walk through setting up Evalite in an existing project.
-
-
-
-1. Install `evalite`, `vitest`, and a scoring library like `autoevals`:
-
- ```bash
- pnpm add -D evalite vitest autoevals
- ```
-
-2. Add an `eval:dev` script to your package.json:
-
- ```json
- {
- "scripts": {
- "eval:dev": "evalite watch"
- }
- }
- ```
-
-3. Create your first eval:
-
- ```ts
- // my-eval.eval.ts
-
- import { evalite } from "evalite";
- import { Levenshtein } from "autoevals";
-
- evalite("My Eval", {
- // An array of test data
- // - TODO: Replace with your test data
- data: [{ input: "Hello", expected: "Hello World!" }],
- // The task to perform
- // - TODO: Replace with your LLM call
- task: async (input) => {
- return input + " World!";
- },
- // The scoring methods for the eval
- scorers: [Levenshtein],
- });
- ```
-
-
-
-4. Run `pnpm run eval:dev`.
-
- ```bash
- pnpm run eval:dev
- ```
-
- This runs `evalite`, which runs the evals:
-
- - Runs the `data` function to get the test data
- - Runs the `task` function on each test data
- - Scores the output of the `task` function using the `scorers`
- - Saves the results to a sqlite database in `node_modules/.evalite`
-
- It then:
-
- - Shows a UI for viewing the traces, scores, inputs and outputs at http://localhost:3006.
- - If you only ran one eval, it also shows a table summarizing the eval in the terminal.
-
-5. Open http://localhost:3006 in your browser to view the results of the eval.
-
-
-
-### What Next?
-
-Head to the [AI SDK example](/examples/ai-sdk) to see a fully-fleshed out example of Evalite in action.
-
-### Troubleshooting
-
-##### Error: Could not locate the bindings file
-
-Some users experienced issues running `evalite watch`. Your package manager will report the following error message:
-
-```
-Command failed, Error: Could not locate the bindings file.
-```
-
-This error is related to the `better-sqlite3` package. To resolve this, you can try the following steps:
-
-- Rebuild `better-sqlite3`:
-
-```bash
-pnpm rebuild better-sqlite3
-```
-
-- Approve the rebuild of the package with:
-
-```bash
-pnpm approve-builds
-```
diff --git a/apps/evalite-docs/src/env.d.ts b/apps/evalite-docs/src/env.d.ts
deleted file mode 100644
index acef35f1..00000000
--- a/apps/evalite-docs/src/env.d.ts
+++ /dev/null
@@ -1,2 +0,0 @@
-///
-///
diff --git a/apps/evalite-docs/tsconfig.json b/apps/evalite-docs/tsconfig.json
deleted file mode 100644
index bcbf8b50..00000000
--- a/apps/evalite-docs/tsconfig.json
+++ /dev/null
@@ -1,3 +0,0 @@
-{
- "extends": "astro/tsconfigs/strict"
-}
diff --git a/apps/evalite-docs/vercel.json b/apps/evalite-docs/vercel.json
new file mode 100644
index 00000000..e37154c4
--- /dev/null
+++ b/apps/evalite-docs/vercel.json
@@ -0,0 +1,10 @@
+{
+ "buildCommand": "cd ../.. && pnpm install && pnpm --filter evalite-docs build",
+ "outputDirectory": "dist",
+ "framework": "astro",
+ "git": {
+ "deploymentEnabled": {
+ "v1": true
+ }
+ }
+}
diff --git a/apps/evalite-ui/app/components/display-input.tsx b/apps/evalite-ui/app/components/display-input.tsx
index c2d781ab..0d94deb5 100644
--- a/apps/evalite-ui/app/components/display-input.tsx
+++ b/apps/evalite-ui/app/components/display-input.tsx
@@ -1,17 +1,26 @@
import type { Evalite } from "evalite/types";
import { EvaliteFile } from "evalite/utils";
+import type { DynamicToolCall, StaticToolCall, UIMessage } from "ai";
import {
AlertCircle,
ChevronDown,
ChevronUp,
DownloadIcon,
+ Code2,
+ Wrench,
} from "lucide-react";
import React, { Fragment, useLayoutEffect, useRef, useState } from "react";
import { JSONTree } from "react-json-tree";
-import ReactMarkdown from "react-markdown";
-import remarkGfm from "remark-gfm";
import { downloadFile, serveFile } from "~/sdk";
+import { Response } from "./response";
import { Button } from "./ui/button";
+import { cn } from "~/lib/utils";
+import {
+ analyzeForTableRendering,
+ analyzeForSingleRowTable,
+ tableDataToMarkdown,
+ singleRowTableToMarkdown,
+} from "~/utils/render-detection";
// Helper function to find single string value in an object and its path
const findSingleStringValue = (
@@ -84,12 +93,7 @@ const DisplayText = ({
overflow: "hidden",
}}
>
-
- {input}
-
+ {input}
{status === "showing-show-more-button" && shouldTruncateText && (
@@ -129,7 +133,7 @@ const DisplayJSON = ({
input: object;
name: string | undefined;
}) => {
- // Check if object has only one string value
+ // Check if object has only one string value (legacy behavior)
const singleStringResult = findSingleStringValue(input);
if (singleStringResult) {
@@ -137,11 +141,13 @@ const DisplayJSON = ({
return (