From 352fbc86ce75d9dbb07707d556bccc785ba962c9 Mon Sep 17 00:00:00 2001 From: Gordon Mickel Date: Wed, 26 Nov 2025 09:00:40 +0100 Subject: [PATCH 1/4] feat: add watchFiles support for Evalite watch mode --- .../src/content/docs/guides/cli.mdx | 126 +++ .../src/content/docs/guides/configuration.mdx | 1 + .../docs/guides/running-programmatically.mdx | 135 ++++ .../config-watchfiles/evalite.config.ts | 5 + .../fixtures/config-watchfiles/test.eval.ts | 7 + packages/evalite-tests/tests/test-utils.ts | 1 + .../evalite-tests/tests/watch-files.test.ts | 73 ++ packages/evalite/package-lock.json | 721 ++++++++++++++++++ packages/evalite/src/config.ts | 5 +- packages/evalite/src/run-evalite.ts | 23 +- packages/evalite/src/types.ts | 23 + 11 files changed, 1116 insertions(+), 4 deletions(-) create mode 100644 apps/evalite-docs/src/content/docs/guides/cli.mdx create mode 100644 apps/evalite-docs/src/content/docs/guides/running-programmatically.mdx create mode 100644 packages/evalite-tests/tests/fixtures/config-watchfiles/evalite.config.ts create mode 100644 packages/evalite-tests/tests/fixtures/config-watchfiles/test.eval.ts create mode 100644 packages/evalite-tests/tests/watch-files.test.ts create mode 100644 packages/evalite/package-lock.json diff --git a/apps/evalite-docs/src/content/docs/guides/cli.mdx b/apps/evalite-docs/src/content/docs/guides/cli.mdx new file mode 100644 index 00000000..00e48f3f --- /dev/null +++ b/apps/evalite-docs/src/content/docs/guides/cli.mdx @@ -0,0 +1,126 @@ +--- +title: CLI +--- + +## Watch Mode + +You can run Evalite in watch mode by running `evalite watch`: + +```bash +evalite watch +``` + +This will watch for changes to your `.eval.ts` files (and any additional files configured in `evalite.config.ts`) and re-run the evals when they change. + +> [!IMPORTANT] +> +> I strongly recommend implementing a caching layer in your LLM calls when using watch mode. This will keep your evals running fast and avoid burning through your API credits. + +### Watching Additional Files + +By default, `evalite watch` only triggers reruns when your `*.eval.ts` files change. + +If your evals depend on other files that Vitest can't automatically detect (e.g., prompt templates, external data files, or CLI build outputs), you can configure extra watch globs in `evalite.config.ts`: + +```ts +// evalite.config.ts +import { defineConfig } from "evalite/config"; + +export default defineConfig({ + watchFiles: [ + "src/**/*.ts", // helper / model code + "prompts/**/*", // prompt templates + "data/**/*.json", // test data + ], +}); +``` + +For monorepos, you can watch files across multiple packages: + +```ts +// evalite.config.ts at repo root +import { defineConfig } from "evalite/config"; + +export default defineConfig({ + watchFiles: ["apps/web/src/**/*.ts", "apps/web/prompts/**/*"], +}); +``` + +These globs are passed through to Vitest's [`forceRerunTriggers`](https://vitest.dev/config/#forcereruntriggers) option, so any change to a matching file will trigger a full eval rerun. + +> [!NOTE] +> Globs are resolved relative to the directory where you run evalite (the Evalite cwd). +> If the files you want to watch live outside that directory, run `evalite` from the project root or pass `cwd` in the Node API so that those paths fall under the same root. + +### Hiding the Table Output + +When debugging with `console.log`, the detailed table output can make it harder to see your logs. 
You can hide it with `--hideTable`: + +```bash +evalite watch --hideTable +``` + +This keeps the score summary but removes the detailed results table from the CLI output. + +## Serve Mode + +You can run evals once and serve the UI without re-running on file changes: + +```bash +evalite serve +``` + +This runs your evals once and keeps the UI server running at `http://localhost:3006`. Unlike watch mode, tests won't re-run when files change. + +Since evals can take a while to run, this can be a useful alternative to watch mode. + +To re-run evals after making changes, restart `evalite serve`. + +## Running Specific Files + +You can run specific files by passing them as arguments: + +```bash +evalite my-eval.eval.ts +``` + +This also works for `watch` and `serve` modes: + +```bash +evalite watch my-eval.eval.ts +evalite serve my-eval.eval.ts +``` + +## Threshold + +You can tell Evalite that your evals must pass a specific score by passing `--threshold`: + +```bash +evalite --threshold=50 # Score must be greater than or equal to 50 + +evalite watch --threshold=70 # Also works in watch mode +``` + +This is useful for running on CI. If the score threshold is not met, it will fail the process. + +## Export Command + +Export eval results as a static HTML bundle: + +```bash +evalite export +``` + +This exports the latest run to `./evalite-export` by default. + +### Options + +- `--output` - Custom output directory +- `--runId` - Export specific run ID +- `--basePath` - Base path for non-root hosting (must start with `/`) + +```bash +evalite export --basePath=/evals-123 --output=./my-export +``` + +See the [CI/CD guide](/guides/ci) for full documentation on exporting and viewing static UI bundles. diff --git a/apps/evalite-docs/src/content/docs/guides/configuration.mdx b/apps/evalite-docs/src/content/docs/guides/configuration.mdx index ca4769a1..cf19c21a 100644 --- a/apps/evalite-docs/src/content/docs/guides/configuration.mdx +++ b/apps/evalite-docs/src/content/docs/guides/configuration.mdx @@ -36,6 +36,7 @@ export default defineConfig({ - **`setupFiles`**: Array of file paths to run before tests (e.g., for loading environment variables). - **`cache`**: Enable or disable caching of AI SDK model outputs. Default is true. See [Vercel AI SDK](/tips/vercel-ai-sdk#caching) for details. - **`viteConfig`**: Pass through Vite/Vitest configuration options. This allows you to import and use your existing vite.config.ts explicitly. +- **`watchFiles`**: Extra file globs that trigger eval reruns in watch mode (globs are resolved relative to the directory where you run Evalite). See [Watching Additional Files](/guides/cli#watching-additional-files). ## Important Configuration Options diff --git a/apps/evalite-docs/src/content/docs/guides/running-programmatically.mdx b/apps/evalite-docs/src/content/docs/guides/running-programmatically.mdx new file mode 100644 index 00000000..a6fea965 --- /dev/null +++ b/apps/evalite-docs/src/content/docs/guides/running-programmatically.mdx @@ -0,0 +1,135 @@ +--- +title: Running Programmatically +--- + +You can run Evalite programmatically using the Node API. This is useful when you want to integrate Evalite into your own scripts, CI/CD pipelines, or custom tooling. + +## Basic Usage + +Import the `runEvalite` function from `evalite/runner`: + +```typescript +import { runEvalite } from "evalite/runner"; + +await runEvalite({ + mode: "run-once-and-exit", +}); +``` + +That's it! The `path` and `cwd` parameters are optional and default to running all evals in the current directory. 
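If you only want to run a subset of evals, both options can be passed explicitly. Here is a minimal sketch (the file and directory names are hypothetical):

```typescript
import { runEvalite } from "evalite/runner";

// Run a single eval file from an explicit working directory.
// "my-eval.eval.ts" and "/path/to/my/project" are placeholder values.
await runEvalite({
  path: "my-eval.eval.ts",
  cwd: "/path/to/my/project",
  mode: "run-once-and-exit",
});
```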
+ +## Run Modes + +### Run Once and Exit + +This mode runs all evals once and exits. It's ideal for CI/CD pipelines: + +```typescript +await runEvalite({ + mode: "run-once-and-exit", +}); +``` + +### Watch Mode + +This mode watches for file changes and re-runs evals automatically. It also starts the Evalite UI server: + +```typescript +await runEvalite({ + mode: "watch-for-file-changes", +}); +``` + +> [!IMPORTANT] +> +> I strongly recommend implementing a caching layer in your LLM calls when using watch mode. This will keep your evals running fast and avoid burning through your API credits. + +## Options + +### `path` + +Optional path filter to run specific eval files. Defaults to `undefined` (runs all evals): + +```typescript +await runEvalite({ + path: "my-eval.eval.ts", + mode: "run-once-and-exit", +}); +``` + +### `cwd` + +The working directory to run evals from. Defaults to `process.cwd()`: + +```typescript +await runEvalite({ + cwd: "/path/to/my/project", + mode: "run-once-and-exit", +}); +``` + +### `scoreThreshold` + +Set a minimum score threshold (0-100). If the average score falls below this threshold, the process will exit with a non-zero exit code: + +```typescript +await runEvalite({ + mode: "run-once-and-exit", + scoreThreshold: 80, // Fail if score is below 80 +}); +``` + +This is particularly useful for CI/CD pipelines where you want to fail the build if evals don't meet a quality threshold. + +### `outputPath` + +Export the results to a JSON file after the run completes: + +```typescript +await runEvalite({ + mode: "run-once-and-exit", + outputPath: "./results.json", +}); +``` + +The exported JSON file contains the complete run data including all evals, results, scores, and traces. + +### `watchFiles` + +Specify extra file globs that trigger eval reruns in watch mode. This overrides any `watchFiles` setting in `evalite.config.ts`: + +```typescript +await runEvalite({ + mode: "watch-for-file-changes", + watchFiles: ["src/**/*.ts", "prompts/**/*", "data/**/*.json"], +}); +``` + +This is useful when your evals depend on files that aren't automatically detected as dependencies (e.g., prompt templates, external data files). + +> [!TIP] > `watchFiles` globs are evaluated relative to the `cwd` you pass (or `process.cwd()` if unspecified). +> If your watched files live outside the default working directory, point `cwd` at the project root so Vitest can see those changes. 
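For example, a minimal sketch for a monorepo (the layout below is hypothetical) points `cwd` at the repo root and lists globs relative to it:

```typescript
import { runEvalite } from "evalite/runner";

// Hypothetical layout: evals live under apps/web, prompt templates at the repo root.
await runEvalite({
  mode: "watch-for-file-changes",
  cwd: "/path/to/repo", // the globs below resolve relative to this directory
  watchFiles: ["apps/web/src/**/*.ts", "prompts/**/*"],
});
```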
+ +## Complete Example + +Here's a complete example that combines multiple options: + +```typescript +import { runEvalite } from "evalite/runner"; + +async function runEvals() { + try { + await runEvalite({ + mode: "run-once-and-exit", + scoreThreshold: 75, // Fail if average score < 75 + outputPath: "./evalite-results.json", // Export results + }); + console.log("All evals passed!"); + } catch (error) { + console.error("Evals failed:", error); + process.exit(1); + } +} + +runEvals(); +``` diff --git a/packages/evalite-tests/tests/fixtures/config-watchfiles/evalite.config.ts b/packages/evalite-tests/tests/fixtures/config-watchfiles/evalite.config.ts new file mode 100644 index 00000000..28f2aec6 --- /dev/null +++ b/packages/evalite-tests/tests/fixtures/config-watchfiles/evalite.config.ts @@ -0,0 +1,5 @@ +import { defineConfig } from "evalite/config"; + +export default defineConfig({ + watchFiles: ["src/**/*.ts", "data/**/*.json"], +}); diff --git a/packages/evalite-tests/tests/fixtures/config-watchfiles/test.eval.ts b/packages/evalite-tests/tests/fixtures/config-watchfiles/test.eval.ts new file mode 100644 index 00000000..caeecfe3 --- /dev/null +++ b/packages/evalite-tests/tests/fixtures/config-watchfiles/test.eval.ts @@ -0,0 +1,7 @@ +import { evalite } from "evalite"; + +evalite("WatchFiles Config Test", { + data: () => [{ input: "hello", expected: "hello" }], + task: async (input) => input, + scorers: [], +}); diff --git a/packages/evalite-tests/tests/test-utils.ts b/packages/evalite-tests/tests/test-utils.ts index 6d955079..862c43fa 100644 --- a/packages/evalite-tests/tests/test-utils.ts +++ b/packages/evalite-tests/tests/test-utils.ts @@ -69,6 +69,7 @@ export const loadFixture = async ( * Enable cache for AI SDK model outputs. */ cacheEnabled?: boolean; + watchFiles?: string[]; }) => { const result = await runEvalite({ ...opts, diff --git a/packages/evalite-tests/tests/watch-files.test.ts b/packages/evalite-tests/tests/watch-files.test.ts new file mode 100644 index 00000000..760c2ed1 --- /dev/null +++ b/packages/evalite-tests/tests/watch-files.test.ts @@ -0,0 +1,73 @@ +import { expect, it } from "vitest"; +import { configDefaults } from "vitest/config"; +import { getEvalsAsRecordViaStorage, loadFixture } from "./test-utils.js"; + +it("watchFiles in evalite.config.ts should configure Vitest forceRerunTriggers", async () => { + await using fixture = await loadFixture("config-watchfiles"); + + const vitest = await fixture.run({ + mode: "run-once-and-exit", + }); + + // Verify the forceRerunTriggers includes our watchFiles + const forceRerunTriggers = vitest.config.forceRerunTriggers; + + expect(forceRerunTriggers).toContain("src/**/*.ts"); + expect(forceRerunTriggers).toContain("data/**/*.json"); + + // Verify Vitest defaults are preserved (use configDefaults to stay resilient to Vitest changes) + for (const pattern of configDefaults.forceRerunTriggers) { + expect(forceRerunTriggers).toContain(pattern); + } + + const evals = await getEvalsAsRecordViaStorage(fixture.storage); + + // Should complete successfully + expect(evals["WatchFiles Config Test"]).toHaveLength(1); + expect(evals["WatchFiles Config Test"]?.[0]?.status).toBe("success"); +}); + +it("watchFiles passed to runEvalite should override evalite.config.ts", async () => { + await using fixture = await loadFixture("config-watchfiles"); + + // Override the config's watchFiles with different values + const vitest = await fixture.run({ + mode: "run-once-and-exit", + watchFiles: ["custom/**/*.md"], + }); + + const forceRerunTriggers = 
vitest.config.forceRerunTriggers; + + // Should contain the override value + expect(forceRerunTriggers).toContain("custom/**/*.md"); + + // Should NOT contain the config file values since we overrode them + expect(forceRerunTriggers).not.toContain("src/**/*.ts"); + expect(forceRerunTriggers).not.toContain("data/**/*.json"); + + // Should still include Vitest defaults + for (const pattern of configDefaults.forceRerunTriggers) { + expect(forceRerunTriggers).toContain(pattern); + } +}); + +it("empty watchFiles array should not add any extra triggers", async () => { + await using fixture = await loadFixture("config-watchfiles"); + + // Override with empty array - should result in only Vitest defaults + const vitest = await fixture.run({ + mode: "run-once-and-exit", + watchFiles: [], + }); + + const forceRerunTriggers = vitest.config.forceRerunTriggers; + + // Should NOT contain the config file values since we overrode with empty array + expect(forceRerunTriggers).not.toContain("src/**/*.ts"); + expect(forceRerunTriggers).not.toContain("data/**/*.json"); + + // Should still include Vitest defaults (this verifies we didn't break anything) + for (const pattern of configDefaults.forceRerunTriggers) { + expect(forceRerunTriggers).toContain(pattern); + } +}); diff --git a/packages/evalite/package-lock.json b/packages/evalite/package-lock.json new file mode 100644 index 00000000..2df591ba --- /dev/null +++ b/packages/evalite/package-lock.json @@ -0,0 +1,721 @@ +{ + "name": "evalite", + "version": "0.19.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "evalite", + "version": "0.19.0", + "dependencies": { + "@ai-sdk/provider": "^2.0.0", + "@fastify/static": "^8.2.0", + "@fastify/websocket": "11.2.0", + "@stricli/auto-complete": "^1.2.0", + "@stricli/core": "^1.2.0", + "@vitest/runner": "^4.0.0", + "@vitest/utils": "^4.0.1", + "better-sqlite3": "^11.6.0", + "fastify": "^5.6.1", + "file-type": "^19.6.0", + "jiti": "^2.6.1", + "table": "^6.9.0", + "tinyrainbow": "^3.0.3" + }, + "bin": { + "evalite": "dist/bin.js" + }, + "devDependencies": { + "@types/better-sqlite3": "^7.6.13", + "@types/ws": "^8.18.1", + "ai": "^5.0.59", + "autoevals": "^0.0.131", + "unstorage": "^1.17.1" + } + }, + "../../node_modules/.pnpm/@ai-sdk+provider@2.0.0/node_modules/@ai-sdk/provider": { + "version": "2.0.0", + "license": "Apache-2.0", + "dependencies": { + "json-schema": "^0.4.0" + }, + "devDependencies": { + "@types/json-schema": "7.0.15", + "@types/node": "20.17.24", + "@vercel/ai-tsconfig": "0.0.0", + "tsup": "^8", + "typescript": "5.8.3" + }, + "engines": { + "node": ">=18" + } + }, + "../../node_modules/.pnpm/@fastify+static@8.2.0/node_modules/@fastify/static": { + "version": "8.2.0", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/fastify" + }, + { + "type": "opencollective", + "url": "https://opencollective.com/fastify" + } + ], + "license": "MIT", + "dependencies": { + "@fastify/accept-negotiator": "^2.0.0", + "@fastify/send": "^4.0.0", + "content-disposition": "^0.5.4", + "fastify-plugin": "^5.0.0", + "fastq": "^1.17.1", + "glob": "^11.0.0" + }, + "devDependencies": { + "@fastify/compress": "^8.0.0", + "@fastify/pre-commit": "^2.1.0", + "@types/node": "^22.0.0", + "borp": "^0.20.0", + "c8": "^10.1.3", + "concat-stream": "^2.0.0", + "eslint": "^9.17.0", + "fastify": "^5.1.0", + "neostandard": "^0.12.0", + "pino": "^9.1.0", + "proxyquire": "^2.1.3", + "tsd": "^0.32.0" + } + }, + 
"../../node_modules/.pnpm/@fastify+websocket@11.2.0/node_modules/@fastify/websocket": { + "version": "11.2.0", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/fastify" + }, + { + "type": "opencollective", + "url": "https://opencollective.com/fastify" + } + ], + "license": "MIT", + "dependencies": { + "duplexify": "^4.1.3", + "fastify-plugin": "^5.0.0", + "ws": "^8.16.0" + }, + "devDependencies": { + "@fastify/pre-commit": "^2.1.0", + "@fastify/type-provider-typebox": "^5.0.0", + "@types/node": "^24.0.9", + "@types/ws": "^8.5.10", + "c8": "^10.1.3", + "eslint": "^9.17.0", + "fastify": "^5.0.0", + "fastify-tsconfig": "^3.0.0", + "neostandard": "^0.12.0", + "split2": "^4.2.0", + "tsd": "^0.32.0" + } + }, + "../../node_modules/.pnpm/@stricli+auto-complete@1.2.0/node_modules/@stricli/auto-complete": { + "version": "1.2.0", + "license": "Apache-2.0", + "dependencies": { + "@stricli/core": "^1.2.0" + }, + "bin": { + "auto-complete": "dist/bin/cli.js" + }, + "devDependencies": { + "@typescript-eslint/eslint-plugin": "^8.2.0", + "@typescript-eslint/parser": "^8.2.0", + "eslint": "^8.57.0", + "eslint-plugin-import": "^2.26.0", + "eslint-plugin-prettier": "^5.0.0", + "prettier": "^3.0.0", + "tsup": "^6.7.0", + "typescript": "5.6.x" + } + }, + "../../node_modules/.pnpm/@stricli+core@1.2.0/node_modules/@stricli/core": { + "version": "1.2.0", + "license": "Apache-2.0", + "devDependencies": { + "@types/chai": "^4.3.11", + "@types/fs-extra": "^11.0.4", + "@types/mocha": "^10.0.6", + "@types/sinon": "^17.0.2", + "@typescript-eslint/eslint-plugin": "^8.2.0", + "@typescript-eslint/parser": "^8.2.0", + "c8": "^8.0.1", + "chai": "^4.3.10", + "eslint": "^8.57.0", + "eslint-plugin-header": "^3.1.1", + "eslint-plugin-import": "^2.29.1", + "eslint-plugin-prettier": "^5.1.3", + "fs-extra": "^11.2.0", + "mocha": "^10.2.0", + "prettier": "^3.2.5", + "sinon": "^17.0.1", + "tsup": "^8.0.1", + "tsx": "^4.8.2", + "typescript": "5.6.x" + } + }, + "../../node_modules/.pnpm/@types+better-sqlite3@7.6.13/node_modules/@types/better-sqlite3": { + "version": "7.6.13", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/node": "*" + } + }, + "../../node_modules/.pnpm/@types+ws@8.18.1/node_modules/@types/ws": { + "version": "8.18.1", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/node": "*" + } + }, + "../../node_modules/.pnpm/@vitest+runner@4.0.1/node_modules/@vitest/runner": { + "version": "4.0.1", + "license": "MIT", + "dependencies": { + "@vitest/utils": "4.0.1", + "pathe": "^2.0.3" + }, + "funding": { + "url": "https://opencollective.com/vitest" + } + }, + "../../node_modules/.pnpm/@vitest+utils@4.0.1/node_modules/@vitest/utils": { + "version": "4.0.1", + "license": "MIT", + "dependencies": { + "@vitest/pretty-format": "4.0.1", + "tinyrainbow": "^3.0.3" + }, + "devDependencies": { + "@jridgewell/trace-mapping": "0.3.31", + "@types/estree": "^1.0.8", + "diff-sequences": "^29.6.3", + "loupe": "^3.2.1", + "tinyhighlight": "^0.3.2" + }, + "funding": { + "url": "https://opencollective.com/vitest" + } + }, + "../../node_modules/.pnpm/ai@5.0.59_zod@3.25.76/node_modules/ai": { + "version": "5.0.59", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/gateway": "1.0.32", + "@ai-sdk/provider": "2.0.0", + "@ai-sdk/provider-utils": "3.0.10", + "@opentelemetry/api": "1.9.0" + }, + "devDependencies": { + "@edge-runtime/vm": "^5.0.0", + "@types/json-schema": "7.0.15", + "@types/node": "20.17.24", + "@vercel/ai-tsconfig": "0.0.0", + "eslint": "8.57.1", + 
"eslint-config-vercel-ai": "0.0.0", + "tsup": "^7.2.0", + "typescript": "5.8.3", + "zod": "3.25.76" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.25.76 || ^4.1.8" + } + }, + "../../node_modules/.pnpm/autoevals@0.0.131_ws@8.18.0/node_modules/autoevals": { + "version": "0.0.131", + "dev": true, + "license": "MIT", + "dependencies": { + "ajv": "^8.17.1", + "compute-cosine-similarity": "^1.1.0", + "js-levenshtein": "^1.1.6", + "js-yaml": "^4.1.0", + "linear-sum-assignment": "^1.0.7", + "mustache": "^4.2.0", + "openai": "^4.104.0", + "zod": "^3.25.76", + "zod-to-json-schema": "^3.24.6" + }, + "devDependencies": { + "@rollup/plugin-yaml": "^4.1.2", + "@types/js-levenshtein": "^1.1.3", + "@types/js-yaml": "^4.0.9", + "@types/mustache": "^4.2.6", + "@types/node": "^20.19.11", + "msw": "^2.10.5", + "tsup": "^8.5.0", + "tsx": "^3.14.0", + "typedoc": "^0.25.13", + "typedoc-plugin-markdown": "^3.17.1", + "typescript": "^5.9.2", + "vitest": "^2.1.9" + } + }, + "../../node_modules/.pnpm/better-sqlite3@11.6.0/node_modules/better-sqlite3": { + "version": "11.6.0", + "hasInstallScript": true, + "license": "MIT", + "dependencies": { + "bindings": "^1.5.0", + "prebuild-install": "^7.1.1" + }, + "devDependencies": { + "chai": "^4.3.8", + "cli-color": "^2.0.3", + "fs-extra": "^11.1.1", + "mocha": "^10.2.0", + "nodemark": "^0.3.0", + "prebuild": "^13.0.0", + "sqlite": "^5.0.1", + "sqlite3": "^5.1.6" + } + }, + "../../node_modules/.pnpm/fastify@5.6.1/node_modules/fastify": { + "version": "5.6.1", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/fastify" + }, + { + "type": "opencollective", + "url": "https://opencollective.com/fastify" + } + ], + "license": "MIT", + "dependencies": { + "@fastify/ajv-compiler": "^4.0.0", + "@fastify/error": "^4.0.0", + "@fastify/fast-json-stringify-compiler": "^5.0.0", + "@fastify/proxy-addr": "^5.0.0", + "abstract-logging": "^2.0.1", + "avvio": "^9.0.0", + "fast-json-stringify": "^6.0.0", + "find-my-way": "^9.0.0", + "light-my-request": "^6.0.0", + "pino": "^9.0.0", + "process-warning": "^5.0.0", + "rfdc": "^1.3.1", + "secure-json-parse": "^4.0.0", + "semver": "^7.6.0", + "toad-cache": "^3.7.0" + }, + "devDependencies": { + "@fastify/pre-commit": "^2.1.0", + "@jsumners/line-reporter": "^1.0.1", + "@sinclair/typebox": "^0.34.13", + "@sinonjs/fake-timers": "^11.2.2", + "@stylistic/eslint-plugin": "^5.1.0", + "@stylistic/eslint-plugin-js": "^4.1.0", + "@types/node": "^24.0.12", + "ajv": "^8.12.0", + "ajv-errors": "^3.0.0", + "ajv-formats": "^3.0.1", + "ajv-i18n": "^4.2.0", + "ajv-merge-patch": "^5.0.1", + "autocannon": "^8.0.0", + "borp": "^0.20.0", + "branch-comparer": "^1.1.0", + "concurrently": "^9.1.2", + "cross-env": "^10.0.0", + "eslint": "^9.0.0", + "fast-json-body": "^1.1.0", + "fastify-plugin": "^5.0.0", + "fluent-json-schema": "^6.0.0", + "h2url": "^0.2.0", + "http-errors": "^2.0.0", + "joi": "^17.12.3", + "json-schema-to-ts": "^3.0.1", + "JSONStream": "^1.3.5", + "markdownlint-cli2": "^0.18.1", + "neostandard": "^0.12.0", + "node-forge": "^1.3.1", + "proxyquire": "^2.1.3", + "split2": "^4.2.0", + "tsd": "^0.32.0", + "typescript": "~5.9.2", + "undici": "^7.11.0", + "vary": "^1.1.2", + "yup": "^1.4.0" + } + }, + "../../node_modules/.pnpm/file-type@19.6.0/node_modules/file-type": { + "version": "19.6.0", + "license": "MIT", + "dependencies": { + "get-stream": "^9.0.1", + "strtok3": "^9.0.1", + "token-types": "^6.0.0", + "uint8array-extras": "^1.3.0" + }, + "devDependencies": { + "@tokenizer/token": "^0.3.0", + 
"@types/node": "^20.10.7", + "ava": "^6.0.1", + "commonmark": "^0.30.0", + "noop-stream": "^1.0.0", + "tsd": "^0.30.3", + "xo": "^0.56.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sindresorhus/file-type?sponsor=1" + } + }, + "../../node_modules/.pnpm/jiti@2.6.1/node_modules/jiti": { + "version": "2.6.1", + "license": "MIT", + "bin": { + "jiti": "lib/jiti-cli.mjs" + }, + "devDependencies": { + "@babel/core": "^7.28.4", + "@babel/helper-module-imports": "^7.27.1", + "@babel/helper-module-transforms": "^7.28.3", + "@babel/helper-plugin-utils": "^7.27.1", + "@babel/helper-simple-access": "^7.27.1", + "@babel/plugin-proposal-decorators": "^7.28.0", + "@babel/plugin-syntax-class-properties": "^7.12.13", + "@babel/plugin-syntax-import-assertions": "^7.27.1", + "@babel/plugin-syntax-jsx": "^7.27.1", + "@babel/plugin-transform-export-namespace-from": "^7.27.1", + "@babel/plugin-transform-react-jsx": "^7.27.1", + "@babel/plugin-transform-typescript": "^7.28.0", + "@babel/preset-typescript": "^7.27.1", + "@babel/template": "^7.27.2", + "@babel/traverse": "^7.28.4", + "@babel/types": "^7.28.4", + "@rspack/cli": "^1.5.8", + "@rspack/core": "^1.5.8", + "@types/babel__core": "^7.20.5", + "@types/babel__helper-module-imports": "^7.18.3", + "@types/babel__helper-plugin-utils": "^7.10.3", + "@types/babel__template": "^7.4.4", + "@types/babel__traverse": "^7.28.0", + "@types/node": "^24.6.1", + "@vitest/coverage-v8": "^3.2.4", + "acorn": "^8.15.0", + "babel-plugin-parameter-decorator": "^1.0.16", + "changelogen": "^0.6.2", + "config": "^4.1.1", + "consola": "^3.4.2", + "defu": "^6.1.4", + "destr": "^2.0.5", + "escape-string-regexp": "^5.0.0", + "eslint": "^9.36.0", + "eslint-config-unjs": "^0.5.0", + "estree-walker": "^3.0.3", + "etag": "^1.8.1", + "fast-glob": "^3.3.3", + "is-installed-globally": "^1.0.0", + "mime": "^4.1.0", + "mlly": "^1.8.0", + "moment-timezone": "^0.6.0", + "nano-jsx": "^0.2.0", + "pathe": "^2.0.3", + "pkg-types": "^2.3.0", + "preact": "^10.27.2", + "preact-render-to-string": "^6.6.2", + "prettier": "^3.6.2", + "react": "^19.1.1", + "react-dom": "^19.1.1", + "reflect-metadata": "^0.2.2", + "solid-js": "^1.9.9", + "std-env": "^3.9.0", + "terser-webpack-plugin": "^5.3.14", + "tinyexec": "^1.0.1", + "ts-loader": "^9.5.4", + "typescript": "^5.9.3", + "vitest": "^3.2.4", + "vue": "^3.5.22", + "yoctocolors": "^2.1.2", + "zod": "^4.1.11" + } + }, + "../../node_modules/.pnpm/table@6.9.0/node_modules/table": { + "version": "6.9.0", + "license": "BSD-3-Clause", + "dependencies": { + "ajv": "^8.0.1", + "lodash.truncate": "^4.4.2", + "slice-ansi": "^4.0.0", + "string-width": "^4.2.3", + "strip-ansi": "^6.0.1" + }, + "devDependencies": { + "@types/chai": "^4.2.16", + "@types/lodash.mapvalues": "^4.6.6", + "@types/lodash.truncate": "^4.4.6", + "@types/mocha": "^9.0.0", + "@types/node": "^14.14.37", + "@types/sinon": "^10.0.0", + "@types/slice-ansi": "^4.0.0", + "ajv-cli": "^5.0.0", + "ajv-keywords": "^5.0.0", + "chai": "^4.2.0", + "chalk": "^4.1.0", + "coveralls": "^3.1.0", + "eslint": "^7.32.0", + "eslint-config-canonical": "^25.0.0", + "gitdown": "^3.1.4", + "husky": "^4.3.6", + "js-beautify": "^1.14.0", + "lodash.mapvalues": "^4.6.0", + "mkdirp": "^1.0.4", + "mocha": "^8.2.1", + "nyc": "^15.1.0", + "semantic-release": "^17.3.1", + "sinon": "^12.0.1", + "ts-node": "^9.1.1", + "typescript": "4.5.2" + }, + "engines": { + "node": ">=10.0.0" + } + }, + "../../node_modules/.pnpm/tinyrainbow@3.0.3/node_modules/tinyrainbow": { + "version": "3.0.3", + "license": 
"MIT", + "engines": { + "node": ">=14.0.0" + } + }, + "../../node_modules/.pnpm/unstorage@1.17.1/node_modules/unstorage": { + "version": "1.17.1", + "dev": true, + "license": "MIT", + "dependencies": { + "anymatch": "^3.1.3", + "chokidar": "^4.0.3", + "destr": "^2.0.5", + "h3": "^1.15.4", + "lru-cache": "^10.4.3", + "node-fetch-native": "^1.6.7", + "ofetch": "^1.4.1", + "ufo": "^1.6.1" + }, + "devDependencies": { + "@azure/app-configuration": "^1.9.0", + "@azure/cosmos": "^4.5.0", + "@azure/data-tables": "^13.3.1", + "@azure/identity": "^4.11.1", + "@azure/keyvault-secrets": "^4.10.0", + "@azure/storage-blob": "^12.28.0", + "@capacitor/preferences": "^7.0.2", + "@cloudflare/workers-types": "^4.20250903.0", + "@deno/kv": "^0.12.0", + "@electric-sql/pglite": "^0.3.7", + "@libsql/client": "^0.15.14", + "@netlify/blobs": "^10.0.10", + "@planetscale/database": "^1.19.0", + "@types/deno": "^2.3.0", + "@types/ioredis-mock": "^8.2.6", + "@types/jsdom": "^21.1.7", + "@types/node": "^24.3.0", + "@upstash/redis": "^1.35.3", + "@vercel/blob": "^1.1.1", + "@vercel/functions": "^3.0.0", + "@vercel/kv": "^3.0.0", + "@vitest/coverage-v8": "^3.2.4", + "aws4fetch": "^1.0.20", + "azurite": "^3.35.0", + "better-sqlite3": "^12.2.0", + "changelogen": "^0.6.2", + "citty": "^0.1.6", + "db0": "^0.3.2", + "eslint": "^9.34.0", + "eslint-config-unjs": "^0.5.0", + "fake-indexeddb": "^6.2.2", + "get-port-please": "^3.2.0", + "idb-keyval": "^6.2.2", + "ioredis": "^5.7.0", + "ioredis-mock": "^8.9.0", + "jiti": "^2.5.1", + "jsdom": "^26.1.0", + "listhen": "^1.9.0", + "mitata": "^1.0.34", + "mlly": "^1.8.0", + "mongodb": "^6.19.0", + "mongodb-memory-server": "^10.2.0", + "prettier": "^3.6.2", + "scule": "^1.3.0", + "types-cloudflare-worker": "^1.2.0", + "typescript": "^5.9.2", + "unbuild": "^3.6.1", + "uploadthing": "^7.7.4", + "vite": "^7.1.4", + "vitest": "^3.2.4", + "wrangler": "^4.33.2" + }, + "peerDependencies": { + "@azure/app-configuration": "^1.8.0", + "@azure/cosmos": "^4.2.0", + "@azure/data-tables": "^13.3.0", + "@azure/identity": "^4.6.0", + "@azure/keyvault-secrets": "^4.9.0", + "@azure/storage-blob": "^12.26.0", + "@capacitor/preferences": "^6.0.3 || ^7.0.0", + "@deno/kv": ">=0.9.0", + "@netlify/blobs": "^6.5.0 || ^7.0.0 || ^8.1.0 || ^9.0.0 || ^10.0.0", + "@planetscale/database": "^1.19.0", + "@upstash/redis": "^1.34.3", + "@vercel/blob": ">=0.27.1", + "@vercel/functions": "^2.2.12 || ^3.0.0", + "@vercel/kv": "^1.0.1", + "aws4fetch": "^1.0.20", + "db0": ">=0.2.1", + "idb-keyval": "^6.2.1", + "ioredis": "^5.4.2", + "uploadthing": "^7.4.4" + }, + "peerDependenciesMeta": { + "@azure/app-configuration": { + "optional": true + }, + "@azure/cosmos": { + "optional": true + }, + "@azure/data-tables": { + "optional": true + }, + "@azure/identity": { + "optional": true + }, + "@azure/keyvault-secrets": { + "optional": true + }, + "@azure/storage-blob": { + "optional": true + }, + "@capacitor/preferences": { + "optional": true + }, + "@deno/kv": { + "optional": true + }, + "@netlify/blobs": { + "optional": true + }, + "@planetscale/database": { + "optional": true + }, + "@upstash/redis": { + "optional": true + }, + "@vercel/blob": { + "optional": true + }, + "@vercel/functions": { + "optional": true + }, + "@vercel/kv": { + "optional": true + }, + "aws4fetch": { + "optional": true + }, + "db0": { + "optional": true + }, + "idb-keyval": { + "optional": true + }, + "ioredis": { + "optional": true + }, + "uploadthing": { + "optional": true + } + } + }, + "node_modules/@ai-sdk/provider": { + "resolved": 
"../../node_modules/.pnpm/@ai-sdk+provider@2.0.0/node_modules/@ai-sdk/provider", + "link": true + }, + "node_modules/@fastify/static": { + "resolved": "../../node_modules/.pnpm/@fastify+static@8.2.0/node_modules/@fastify/static", + "link": true + }, + "node_modules/@fastify/websocket": { + "resolved": "../../node_modules/.pnpm/@fastify+websocket@11.2.0/node_modules/@fastify/websocket", + "link": true + }, + "node_modules/@stricli/auto-complete": { + "resolved": "../../node_modules/.pnpm/@stricli+auto-complete@1.2.0/node_modules/@stricli/auto-complete", + "link": true + }, + "node_modules/@stricli/core": { + "resolved": "../../node_modules/.pnpm/@stricli+core@1.2.0/node_modules/@stricli/core", + "link": true + }, + "node_modules/@types/better-sqlite3": { + "resolved": "../../node_modules/.pnpm/@types+better-sqlite3@7.6.13/node_modules/@types/better-sqlite3", + "link": true + }, + "node_modules/@types/ws": { + "resolved": "../../node_modules/.pnpm/@types+ws@8.18.1/node_modules/@types/ws", + "link": true + }, + "node_modules/@vitest/runner": { + "resolved": "../../node_modules/.pnpm/@vitest+runner@4.0.1/node_modules/@vitest/runner", + "link": true + }, + "node_modules/@vitest/utils": { + "resolved": "../../node_modules/.pnpm/@vitest+utils@4.0.1/node_modules/@vitest/utils", + "link": true + }, + "node_modules/ai": { + "resolved": "../../node_modules/.pnpm/ai@5.0.59_zod@3.25.76/node_modules/ai", + "link": true + }, + "node_modules/autoevals": { + "resolved": "../../node_modules/.pnpm/autoevals@0.0.131_ws@8.18.0/node_modules/autoevals", + "link": true + }, + "node_modules/better-sqlite3": { + "resolved": "../../node_modules/.pnpm/better-sqlite3@11.6.0/node_modules/better-sqlite3", + "link": true + }, + "node_modules/fastify": { + "resolved": "../../node_modules/.pnpm/fastify@5.6.1/node_modules/fastify", + "link": true + }, + "node_modules/file-type": { + "resolved": "../../node_modules/.pnpm/file-type@19.6.0/node_modules/file-type", + "link": true + }, + "node_modules/jiti": { + "resolved": "../../node_modules/.pnpm/jiti@2.6.1/node_modules/jiti", + "link": true + }, + "node_modules/table": { + "resolved": "../../node_modules/.pnpm/table@6.9.0/node_modules/table", + "link": true + }, + "node_modules/tinyrainbow": { + "resolved": "../../node_modules/.pnpm/tinyrainbow@3.0.3/node_modules/tinyrainbow", + "link": true + }, + "node_modules/unstorage": { + "resolved": "../../node_modules/.pnpm/unstorage@1.17.1/node_modules/unstorage", + "link": true + } + } +} diff --git a/packages/evalite/src/config.ts b/packages/evalite/src/config.ts index 5c382067..db7ae1ef 100644 --- a/packages/evalite/src/config.ts +++ b/packages/evalite/src/config.ts @@ -1,6 +1,6 @@ +import path from "node:path"; import { createJiti } from "jiti"; import { access } from "fs/promises"; -import path from "path"; import type { Evalite } from "./types.js"; /** @@ -17,7 +17,8 @@ import type { Evalite } from "./types.js"; * storage: () => createSqliteStorage("./custom.db"), * server: { port: 3001 }, * scoreThreshold: 80, - * hideTable: true + * hideTable: true, + * watchFiles: ["src/**\/*.ts", "prompts/**\/*"], * }) * ``` */ diff --git a/packages/evalite/src/run-evalite.ts b/packages/evalite/src/run-evalite.ts index ec0ef78f..764f7048 100644 --- a/packages/evalite/src/run-evalite.ts +++ b/packages/evalite/src/run-evalite.ts @@ -1,6 +1,7 @@ import { mkdir, writeFile } from "fs/promises"; import path from "path"; -import { Writable } from "stream"; +import type { Writable } from "stream"; +import { configDefaults } from "vitest/config"; 
import { createVitest, registerConsoleShortcuts } from "vitest/node"; import getPort from "get-port"; import { FILES_LOCATION } from "./backend-only-constants.js"; @@ -173,6 +174,7 @@ const exportResultsToJSON = async (opts: { * @param opts.mode - Execution mode: "watch-for-file-changes", "run-once-and-exit", "run-once-and-serve", or "run-once" * @param opts.scoreThreshold - Optional score threshold (0-100) to fail the process if scores are below * @param opts.outputPath - Optional path to write test results in JSON format after completion + * @param opts.watchFiles - Optional extra file globs that trigger reruns in watch mode (overrides evalite.config.ts if provided) * * @example * ```typescript @@ -190,6 +192,12 @@ const exportResultsToJSON = async (opts: { * mode: "watch-for-file-changes" * }); * + * // Watch mode with extra file triggers + * await runEvalite({ + * mode: "watch-for-file-changes", + * watchFiles: ["src/**.ts", "prompts/**"] + * }); + * * // Run specific eval file with custom working directory * await runEvalite({ * path: "tests/my-eval.eval.ts", @@ -211,6 +219,11 @@ export const runEvalite = async (opts: { disableServer?: boolean; cacheEnabled?: boolean; cacheDebug?: boolean; + /** + * Extra file globs that should trigger reruns in watch mode. + * Overrides `watchFiles` from evalite.config.ts if provided. + */ + watchFiles?: string[]; }) => { const cwd = opts.cwd ?? process.cwd(); const filesLocation = path.join(cwd, FILES_LOCATION); @@ -257,6 +270,12 @@ export const runEvalite = async (opts: { // 2. Add setupFiles from evalite.config.ts const setupFiles = ["evalite/env-setup-file", ...(config?.setupFiles || [])]; + // Evalite-level "extra watch files": + // Node API (opts.watchFiles) takes precedence over evalite.config.ts. + // If opts.watchFiles is defined (even []), it wins. + const watchFiles = + opts.watchFiles !== undefined ? opts.watchFiles : config?.watchFiles; + const filters = opts.path ? [opts.path] : undefined; process.env.EVALITE_REPORT_TRACES = "true"; @@ -282,7 +301,7 @@ export const runEvalite = async (opts: { } } - let exitCode: number | undefined = undefined; + let exitCode: number | undefined; // Merge user's viteConfig with evalite defaults const mergedViteConfig: ViteUserConfig = { diff --git a/packages/evalite/src/types.ts b/packages/evalite/src/types.ts index 2de53593..1e5818eb 100644 --- a/packages/evalite/src/types.ts +++ b/packages/evalite/src/types.ts @@ -154,6 +154,29 @@ export declare namespace Evalite { | "testNamePattern" >; }; + + /** + * Extra file globs that should trigger eval reruns in watch mode. + * + * Paths are interpreted the same way as Vitest's `test.forceRerunTriggers` + * (glob patterns, relative to the project root). + * + * This is useful when your evals depend on files that Vitest can't + * automatically detect as dependencies (e.g., prompt templates, external + * data files, or CLI build outputs). 
+ * + * @example + * ```ts + * export default defineConfig({ + * watchFiles: [ + * "src/**\/*.ts", // model / helper code + * "prompts/**\/*", // prompt templates + * "data/**\/*.json", // test data + * ] + * }) + * ``` + */ + watchFiles?: string[]; } export type RunType = "full" | "partial"; From 01ab38dcca450755adc5413aa94d35ec77a414cd Mon Sep 17 00:00:00 2001 From: Gordon Mickel Date: Wed, 26 Nov 2025 11:31:41 +0100 Subject: [PATCH 2/4] refactor: rename watchFiles option to forceRerunTriggers --- .../evalite-docs/src/content/docs/api/cli.mdx | 23 +++ .../src/content/docs/api/define-config.mdx | 60 ++++++++ .../src/content/docs/api/run-evalite.mdx | 18 +++ .../src/content/docs/guides/cli.mdx | 126 ---------------- .../src/content/docs/guides/configuration.mdx | 2 +- .../docs/guides/running-programmatically.mdx | 135 ------------------ .../config-watchfiles/evalite.config.ts | 2 +- packages/evalite-tests/tests/test-utils.ts | 11 +- .../evalite-tests/tests/watch-files.test.ts | 14 +- packages/evalite/src/config.ts | 2 +- packages/evalite/src/run-evalite.ts | 18 +-- packages/evalite/src/types.ts | 12 +- 12 files changed, 133 insertions(+), 290 deletions(-) delete mode 100644 apps/evalite-docs/src/content/docs/guides/cli.mdx delete mode 100644 apps/evalite-docs/src/content/docs/guides/running-programmatically.mdx diff --git a/apps/evalite-docs/src/content/docs/api/cli.mdx b/apps/evalite-docs/src/content/docs/api/cli.mdx index 905e2a48..da5df10a 100644 --- a/apps/evalite-docs/src/content/docs/api/cli.mdx +++ b/apps/evalite-docs/src/content/docs/api/cli.mdx @@ -74,6 +74,29 @@ evalite watch path/to/eval.eval.ts **Note:** `--outputPath` is not supported in watch mode. +#### Watching Additional Files + +By default, `evalite watch` only triggers reruns when your `*.eval.ts` files change. + +If your evals depend on other files that Vitest can't automatically detect (e.g., prompt templates, external data files, or CLI build outputs), you can configure extra watch globs in `evalite.config.ts`: + +```ts +// evalite.config.ts +import { defineConfig } from "evalite/config"; + +export default defineConfig({ + forceRerunTriggers: [ + "src/**/*.ts", // helper / model code + "prompts/**/*", // prompt templates + "data/**/*.json", // test data + ], +}); +``` + +These globs are passed through to Vitest's [`forceRerunTriggers`](https://vitest.dev/config/#forcereruntriggers) option, so any change to a matching file will trigger a full eval rerun. + +> **Note:** Globs are resolved relative to the directory where you run evalite (the Evalite cwd). + **Examples:** ```bash diff --git a/apps/evalite-docs/src/content/docs/api/define-config.mdx b/apps/evalite-docs/src/content/docs/api/define-config.mdx index 99b3f5a0..8c786c16 100644 --- a/apps/evalite-docs/src/content/docs/api/define-config.mdx +++ b/apps/evalite-docs/src/content/docs/api/define-config.mdx @@ -18,6 +18,9 @@ defineConfig(config: { maxConcurrency?: number; trialCount?: number; setupFiles?: string[]; + cache?: boolean; + viteConfig?: ViteUserConfig; + forceRerunTriggers?: string[]; }): Evalite.Config ``` @@ -167,6 +170,57 @@ export default defineConfig({ **Note:** `.env` files are loaded automatically via `dotenv/config` - no need to configure them here. +### `cache` + +**Type:** `boolean` + +**Default:** `true` + +Enable or disable caching of AI SDK model outputs. See [Vercel AI SDK caching](/tips/vercel-ai-sdk#caching) for details. 
+ +```typescript +export default defineConfig({ + cache: false, // Disable cache entirely +}); +``` + +### `viteConfig` + +**Type:** `ViteUserConfig` + +Pass-through Vite/Vitest configuration options. This allows you to import and use your existing `vite.config.ts` explicitly. + +```typescript +import { defineConfig } from "evalite/config"; +import viteConfig from "./vite.config.ts"; + +export default defineConfig({ + viteConfig: viteConfig, +}); +``` + +**Note:** `testTimeout`, `maxConcurrency`, and `setupFiles` must be configured at the root level of `evalite.config.ts`, not in `viteConfig.test`. + +### `forceRerunTriggers` + +**Type:** `string[]` + +**Default:** `[]` + +Extra file globs that trigger eval reruns in watch mode. This maps onto Vitest's [`forceRerunTriggers`](https://vitest.dev/config/#forcereruntriggers) option. + +```typescript +export default defineConfig({ + forceRerunTriggers: [ + "src/**/*.ts", // helper / model code + "prompts/**/*", // prompt templates + "data/**/*.json", // test data + ], +}); +``` + +Useful when your evals depend on files that Vitest can't automatically detect as dependencies (e.g., prompt templates, external data files). + ## Complete Example ```typescript @@ -196,6 +250,12 @@ export default defineConfig({ // Setup setupFiles: ["./test-setup.ts"], + + // Caching + cache: true, + + // Watch mode triggers + forceRerunTriggers: ["src/**/*.ts", "prompts/**/*"], }); ``` diff --git a/apps/evalite-docs/src/content/docs/api/run-evalite.mdx b/apps/evalite-docs/src/content/docs/api/run-evalite.mdx index b9f3912f..7d2d5a76 100644 --- a/apps/evalite-docs/src/content/docs/api/run-evalite.mdx +++ b/apps/evalite-docs/src/content/docs/api/run-evalite.mdx @@ -15,6 +15,7 @@ runEvalite(opts: { outputPath?: string; hideTable?: boolean; storage?: Evalite.Storage; + forceRerunTriggers?: string[]; }): Promise ``` @@ -140,6 +141,23 @@ await runEvalite({ See [Storage](/api/storage) for more details. +### `opts.forceRerunTriggers` + +**Type:** `string[]` (optional) + +Extra file globs that trigger eval reruns in watch mode. This overrides any `forceRerunTriggers` setting in `evalite.config.ts`. + +```typescript +await runEvalite({ + mode: "watch-for-file-changes", + forceRerunTriggers: ["src/**/*.ts", "prompts/**/*", "data/**/*.json"], +}); +``` + +This is useful when your evals depend on files that aren't automatically detected as dependencies (e.g., prompt templates, external data files). + +> **Tip:** `forceRerunTriggers` globs are evaluated relative to the `cwd` you pass (or `process.cwd()` if unspecified). + ## Usage Examples ### Basic CI/CD Script diff --git a/apps/evalite-docs/src/content/docs/guides/cli.mdx b/apps/evalite-docs/src/content/docs/guides/cli.mdx deleted file mode 100644 index 00e48f3f..00000000 --- a/apps/evalite-docs/src/content/docs/guides/cli.mdx +++ /dev/null @@ -1,126 +0,0 @@ ---- -title: CLI ---- - -## Watch Mode - -You can run Evalite in watch mode by running `evalite watch`: - -```bash -evalite watch -``` - -This will watch for changes to your `.eval.ts` files (and any additional files configured in `evalite.config.ts`) and re-run the evals when they change. - -> [!IMPORTANT] -> -> I strongly recommend implementing a caching layer in your LLM calls when using watch mode. This will keep your evals running fast and avoid burning through your API credits. - -### Watching Additional Files - -By default, `evalite watch` only triggers reruns when your `*.eval.ts` files change. 
- -If your evals depend on other files that Vitest can't automatically detect (e.g., prompt templates, external data files, or CLI build outputs), you can configure extra watch globs in `evalite.config.ts`: - -```ts -// evalite.config.ts -import { defineConfig } from "evalite/config"; - -export default defineConfig({ - watchFiles: [ - "src/**/*.ts", // helper / model code - "prompts/**/*", // prompt templates - "data/**/*.json", // test data - ], -}); -``` - -For monorepos, you can watch files across multiple packages: - -```ts -// evalite.config.ts at repo root -import { defineConfig } from "evalite/config"; - -export default defineConfig({ - watchFiles: ["apps/web/src/**/*.ts", "apps/web/prompts/**/*"], -}); -``` - -These globs are passed through to Vitest's [`forceRerunTriggers`](https://vitest.dev/config/#forcereruntriggers) option, so any change to a matching file will trigger a full eval rerun. - -> [!NOTE] -> Globs are resolved relative to the directory where you run evalite (the Evalite cwd). -> If the files you want to watch live outside that directory, run `evalite` from the project root or pass `cwd` in the Node API so that those paths fall under the same root. - -### Hiding the Table Output - -When debugging with `console.log`, the detailed table output can make it harder to see your logs. You can hide it with `--hideTable`: - -```bash -evalite watch --hideTable -``` - -This keeps the score summary but removes the detailed results table from the CLI output. - -## Serve Mode - -You can run evals once and serve the UI without re-running on file changes: - -```bash -evalite serve -``` - -This runs your evals once and keeps the UI server running at `http://localhost:3006`. Unlike watch mode, tests won't re-run when files change. - -Since evals can take a while to run, this can be a useful alternative to watch mode. - -To re-run evals after making changes, restart `evalite serve`. - -## Running Specific Files - -You can run specific files by passing them as arguments: - -```bash -evalite my-eval.eval.ts -``` - -This also works for `watch` and `serve` modes: - -```bash -evalite watch my-eval.eval.ts -evalite serve my-eval.eval.ts -``` - -## Threshold - -You can tell Evalite that your evals must pass a specific score by passing `--threshold`: - -```bash -evalite --threshold=50 # Score must be greater than or equal to 50 - -evalite watch --threshold=70 # Also works in watch mode -``` - -This is useful for running on CI. If the score threshold is not met, it will fail the process. - -## Export Command - -Export eval results as a static HTML bundle: - -```bash -evalite export -``` - -This exports the latest run to `./evalite-export` by default. - -### Options - -- `--output` - Custom output directory -- `--runId` - Export specific run ID -- `--basePath` - Base path for non-root hosting (must start with `/`) - -```bash -evalite export --basePath=/evals-123 --output=./my-export -``` - -See the [CI/CD guide](/guides/ci) for full documentation on exporting and viewing static UI bundles. diff --git a/apps/evalite-docs/src/content/docs/guides/configuration.mdx b/apps/evalite-docs/src/content/docs/guides/configuration.mdx index cf19c21a..7a6c1780 100644 --- a/apps/evalite-docs/src/content/docs/guides/configuration.mdx +++ b/apps/evalite-docs/src/content/docs/guides/configuration.mdx @@ -36,7 +36,7 @@ export default defineConfig({ - **`setupFiles`**: Array of file paths to run before tests (e.g., for loading environment variables). 
- **`cache`**: Enable or disable caching of AI SDK model outputs. Default is true. See [Vercel AI SDK](/tips/vercel-ai-sdk#caching) for details. - **`viteConfig`**: Pass through Vite/Vitest configuration options. This allows you to import and use your existing vite.config.ts explicitly. -- **`watchFiles`**: Extra file globs that trigger eval reruns in watch mode (globs are resolved relative to the directory where you run Evalite). See [Watching Additional Files](/guides/cli#watching-additional-files). +- **`forceRerunTriggers`**: Extra file globs that trigger eval reruns in watch mode. This maps onto Vitest's `test.forceRerunTriggers` option (globs resolved relative to the directory where you run Evalite). See [Watching Additional Files](/api/cli#watching-additional-files). ## Important Configuration Options diff --git a/apps/evalite-docs/src/content/docs/guides/running-programmatically.mdx b/apps/evalite-docs/src/content/docs/guides/running-programmatically.mdx deleted file mode 100644 index a6fea965..00000000 --- a/apps/evalite-docs/src/content/docs/guides/running-programmatically.mdx +++ /dev/null @@ -1,135 +0,0 @@ ---- -title: Running Programmatically ---- - -You can run Evalite programmatically using the Node API. This is useful when you want to integrate Evalite into your own scripts, CI/CD pipelines, or custom tooling. - -## Basic Usage - -Import the `runEvalite` function from `evalite/runner`: - -```typescript -import { runEvalite } from "evalite/runner"; - -await runEvalite({ - mode: "run-once-and-exit", -}); -``` - -That's it! The `path` and `cwd` parameters are optional and default to running all evals in the current directory. - -## Run Modes - -### Run Once and Exit - -This mode runs all evals once and exits. It's ideal for CI/CD pipelines: - -```typescript -await runEvalite({ - mode: "run-once-and-exit", -}); -``` - -### Watch Mode - -This mode watches for file changes and re-runs evals automatically. It also starts the Evalite UI server: - -```typescript -await runEvalite({ - mode: "watch-for-file-changes", -}); -``` - -> [!IMPORTANT] -> -> I strongly recommend implementing a caching layer in your LLM calls when using watch mode. This will keep your evals running fast and avoid burning through your API credits. - -## Options - -### `path` - -Optional path filter to run specific eval files. Defaults to `undefined` (runs all evals): - -```typescript -await runEvalite({ - path: "my-eval.eval.ts", - mode: "run-once-and-exit", -}); -``` - -### `cwd` - -The working directory to run evals from. Defaults to `process.cwd()`: - -```typescript -await runEvalite({ - cwd: "/path/to/my/project", - mode: "run-once-and-exit", -}); -``` - -### `scoreThreshold` - -Set a minimum score threshold (0-100). If the average score falls below this threshold, the process will exit with a non-zero exit code: - -```typescript -await runEvalite({ - mode: "run-once-and-exit", - scoreThreshold: 80, // Fail if score is below 80 -}); -``` - -This is particularly useful for CI/CD pipelines where you want to fail the build if evals don't meet a quality threshold. - -### `outputPath` - -Export the results to a JSON file after the run completes: - -```typescript -await runEvalite({ - mode: "run-once-and-exit", - outputPath: "./results.json", -}); -``` - -The exported JSON file contains the complete run data including all evals, results, scores, and traces. - -### `watchFiles` - -Specify extra file globs that trigger eval reruns in watch mode. 
This overrides any `watchFiles` setting in `evalite.config.ts`: - -```typescript -await runEvalite({ - mode: "watch-for-file-changes", - watchFiles: ["src/**/*.ts", "prompts/**/*", "data/**/*.json"], -}); -``` - -This is useful when your evals depend on files that aren't automatically detected as dependencies (e.g., prompt templates, external data files). - -> [!TIP] > `watchFiles` globs are evaluated relative to the `cwd` you pass (or `process.cwd()` if unspecified). -> If your watched files live outside the default working directory, point `cwd` at the project root so Vitest can see those changes. - -## Complete Example - -Here's a complete example that combines multiple options: - -```typescript -import { runEvalite } from "evalite/runner"; - -async function runEvals() { - try { - await runEvalite({ - mode: "run-once-and-exit", - scoreThreshold: 75, // Fail if average score < 75 - outputPath: "./evalite-results.json", // Export results - }); - console.log("All evals passed!"); - } catch (error) { - console.error("Evals failed:", error); - process.exit(1); - } -} - -runEvals(); -``` diff --git a/packages/evalite-tests/tests/fixtures/config-watchfiles/evalite.config.ts b/packages/evalite-tests/tests/fixtures/config-watchfiles/evalite.config.ts index 28f2aec6..a7975299 100644 --- a/packages/evalite-tests/tests/fixtures/config-watchfiles/evalite.config.ts +++ b/packages/evalite-tests/tests/fixtures/config-watchfiles/evalite.config.ts @@ -1,5 +1,5 @@ import { defineConfig } from "evalite/config"; export default defineConfig({ - watchFiles: ["src/**/*.ts", "data/**/*.json"], + forceRerunTriggers: ["src/**/*.ts", "data/**/*.json"], }); diff --git a/packages/evalite-tests/tests/test-utils.ts b/packages/evalite-tests/tests/test-utils.ts index 862c43fa..b1cff940 100644 --- a/packages/evalite-tests/tests/test-utils.ts +++ b/packages/evalite-tests/tests/test-utils.ts @@ -1,11 +1,12 @@ import { randomUUID } from "crypto"; +import type { Evalite } from "evalite"; +import { DB_LOCATION } from "evalite/backend-only-constants"; +import { createInMemoryStorage } from "evalite/in-memory-storage"; +import { runEvalite } from "evalite/runner"; import { cpSync, rmSync } from "fs"; import path from "path"; import { Writable } from "stream"; import stripAnsi from "strip-ansi"; -import type { Evalite } from "evalite"; -import { runEvalite } from "evalite/runner"; -import { createInMemoryStorage } from "evalite/in-memory-storage"; import type { Vitest } from "vitest/node"; const FIXTURES_DIR = path.join(import.meta.dirname, "./fixtures"); @@ -29,7 +30,7 @@ export const loadFixture = async ( const captured = captureStdout(); - let vitestInstance: Vitest | undefined = undefined; + let vitestInstance: Vitest | undefined; return { dir: dirPath, @@ -69,7 +70,7 @@ export const loadFixture = async ( * Enable cache for AI SDK model outputs. 
*/ cacheEnabled?: boolean; - watchFiles?: string[]; + forceRerunTriggers?: string[]; }) => { const result = await runEvalite({ ...opts, diff --git a/packages/evalite-tests/tests/watch-files.test.ts b/packages/evalite-tests/tests/watch-files.test.ts index 760c2ed1..e0dac10b 100644 --- a/packages/evalite-tests/tests/watch-files.test.ts +++ b/packages/evalite-tests/tests/watch-files.test.ts @@ -2,14 +2,14 @@ import { expect, it } from "vitest"; import { configDefaults } from "vitest/config"; import { getEvalsAsRecordViaStorage, loadFixture } from "./test-utils.js"; -it("watchFiles in evalite.config.ts should configure Vitest forceRerunTriggers", async () => { +it("forceRerunTriggers in evalite.config.ts should configure Vitest forceRerunTriggers", async () => { await using fixture = await loadFixture("config-watchfiles"); const vitest = await fixture.run({ mode: "run-once-and-exit", }); - // Verify the forceRerunTriggers includes our watchFiles + // Verify the forceRerunTriggers includes our configured triggers const forceRerunTriggers = vitest.config.forceRerunTriggers; expect(forceRerunTriggers).toContain("src/**/*.ts"); @@ -27,13 +27,13 @@ it("watchFiles in evalite.config.ts should configure Vitest forceRerunTriggers", expect(evals["WatchFiles Config Test"]?.[0]?.status).toBe("success"); }); -it("watchFiles passed to runEvalite should override evalite.config.ts", async () => { +it("forceRerunTriggers passed to runEvalite should override evalite.config.ts", async () => { await using fixture = await loadFixture("config-watchfiles"); - // Override the config's watchFiles with different values + // Override the config's forceRerunTriggers with different values const vitest = await fixture.run({ mode: "run-once-and-exit", - watchFiles: ["custom/**/*.md"], + forceRerunTriggers: ["custom/**/*.md"], }); const forceRerunTriggers = vitest.config.forceRerunTriggers; @@ -51,13 +51,13 @@ it("watchFiles passed to runEvalite should override evalite.config.ts", async () } }); -it("empty watchFiles array should not add any extra triggers", async () => { +it("empty forceRerunTriggers array should not add any extra triggers", async () => { await using fixture = await loadFixture("config-watchfiles"); // Override with empty array - should result in only Vitest defaults const vitest = await fixture.run({ mode: "run-once-and-exit", - watchFiles: [], + forceRerunTriggers: [], }); const forceRerunTriggers = vitest.config.forceRerunTriggers; diff --git a/packages/evalite/src/config.ts b/packages/evalite/src/config.ts index db7ae1ef..1419ec95 100644 --- a/packages/evalite/src/config.ts +++ b/packages/evalite/src/config.ts @@ -18,7 +18,7 @@ import type { Evalite } from "./types.js"; * server: { port: 3001 }, * scoreThreshold: 80, * hideTable: true, - * watchFiles: ["src/**\/*.ts", "prompts/**\/*"], + * forceRerunTriggers: ["src/**\/*.ts", "prompts/**\/*"], * }) * ``` */ diff --git a/packages/evalite/src/run-evalite.ts b/packages/evalite/src/run-evalite.ts index 764f7048..82084e41 100644 --- a/packages/evalite/src/run-evalite.ts +++ b/packages/evalite/src/run-evalite.ts @@ -174,7 +174,7 @@ const exportResultsToJSON = async (opts: { * @param opts.mode - Execution mode: "watch-for-file-changes", "run-once-and-exit", "run-once-and-serve", or "run-once" * @param opts.scoreThreshold - Optional score threshold (0-100) to fail the process if scores are below * @param opts.outputPath - Optional path to write test results in JSON format after completion - * @param opts.watchFiles - Optional extra file globs that trigger 
+ * @param opts.forceRerunTriggers - Optional extra file globs that trigger reruns in watch mode (overrides evalite.config.ts if provided)
  *
  * @example
  * ```typescript
@@ -195,7 +195,7 @@ const exportResultsToJSON = async (opts: {
  * // Watch mode with extra file triggers
  * await runEvalite({
  *   mode: "watch-for-file-changes",
- *   watchFiles: ["src/**.ts", "prompts/**"]
+ *   forceRerunTriggers: ["src/**\/*.ts", "prompts/**\/*"]
  * });
  *
  * // Run specific eval file with custom working directory
@@ -221,9 +221,9 @@ export const runEvalite = async (opts: {
   cacheDebug?: boolean;
   /**
    * Extra file globs that should trigger reruns in watch mode.
-   * Overrides `watchFiles` from evalite.config.ts if provided.
+   * Overrides `forceRerunTriggers` from evalite.config.ts if provided.
    */
-  watchFiles?: string[];
+  forceRerunTriggers?: string[];
 }) => {
   const cwd = opts.cwd ?? process.cwd();
   const filesLocation = path.join(cwd, FILES_LOCATION);
@@ -271,10 +271,12 @@ export const runEvalite = async (opts: {
   const setupFiles = ["evalite/env-setup-file", ...(config?.setupFiles || [])];
 
   // Evalite-level "extra watch files":
-  // Node API (opts.watchFiles) takes precedence over evalite.config.ts.
-  // If opts.watchFiles is defined (even []), it wins.
-  const watchFiles =
-    opts.watchFiles !== undefined ? opts.watchFiles : config?.watchFiles;
+  // Node API (opts.forceRerunTriggers) takes precedence over evalite.config.ts.
+  // If opts.forceRerunTriggers is defined (even []), it wins.
+  const extraForceRerunTriggers =
+    opts.forceRerunTriggers !== undefined
+      ? opts.forceRerunTriggers
+      : config?.forceRerunTriggers;
 
   const filters = opts.path ? [opts.path] : undefined;
   process.env.EVALITE_REPORT_TRACES = "true";
diff --git a/packages/evalite/src/types.ts b/packages/evalite/src/types.ts
index 1e5818eb..29208e3c 100644
--- a/packages/evalite/src/types.ts
+++ b/packages/evalite/src/types.ts
@@ -158,17 +158,17 @@ export declare namespace Evalite {
     /**
      * Extra file globs that should trigger eval reruns in watch mode.
      *
-     * Paths are interpreted the same way as Vitest's `test.forceRerunTriggers`
-     * (glob patterns, relative to the project root).
+     * This maps directly onto Vitest's `test.forceRerunTriggers` option
+     * (glob patterns, relative to the project root / Evalite cwd).
      *
      * This is useful when your evals depend on files that Vitest can't
-     * automatically detect as dependencies (e.g., prompt templates, external
-     * data files, or CLI build outputs).
+     * automatically detect as dependencies (e.g., prompt templates,
+     * external data files, or CLI build outputs).
      *
      * @example
      * ```ts
      * export default defineConfig({
-     *   watchFiles: [
+     *   forceRerunTriggers: [
      *     "src/**\/*.ts",   // model / helper code
      *     "prompts/**\/*",  // prompt templates
      *     "data/**\/*.json", // test data
      *   ],
      * })
      * ```
      */
-    watchFiles?: string[];
+    forceRerunTriggers?: string[];
   }
 
   export type RunType = "full" | "partial";

From b9086ad6e1c2b9b6d0487628238d68e2f4c1f6f9 Mon Sep 17 00:00:00 2001
From: Matt Pocock
Date: Wed, 3 Dec 2025 12:54:29 +0000
Subject: [PATCH 3/4] Fixed errors and made forceRerunTriggers work how it does in Vitest

---
 .../evalite-tests/tests/watch-files.test.ts | 24 ++++---------------
 packages/evalite/src/run-evalite.ts         |  7 +++---
 2 files changed, 8 insertions(+), 23 deletions(-)

diff --git a/packages/evalite-tests/tests/watch-files.test.ts b/packages/evalite-tests/tests/watch-files.test.ts
index e0dac10b..237af4fa 100644
--- a/packages/evalite-tests/tests/watch-files.test.ts
+++ b/packages/evalite-tests/tests/watch-files.test.ts
@@ -1,6 +1,5 @@
 import { expect, it } from "vitest";
-import { configDefaults } from "vitest/config";
-import { getEvalsAsRecordViaStorage, loadFixture } from "./test-utils.js";
+import { getSuitesAsRecordViaStorage, loadFixture } from "./test-utils.js";
 
 it("forceRerunTriggers in evalite.config.ts should configure Vitest forceRerunTriggers", async () => {
   await using fixture = await loadFixture("config-watchfiles");
@@ -15,16 +14,11 @@ it("forceRerunTriggers in evalite.config.ts should configure Vitest forceRerunTr
   expect(forceRerunTriggers).toContain("src/**/*.ts");
   expect(forceRerunTriggers).toContain("data/**/*.json");
 
-  // Verify Vitest defaults are preserved (use configDefaults to stay resilient to Vitest changes)
-  for (const pattern of configDefaults.forceRerunTriggers) {
-    expect(forceRerunTriggers).toContain(pattern);
-  }
-
-  const evals = await getEvalsAsRecordViaStorage(fixture.storage);
+  const suites = await getSuitesAsRecordViaStorage(fixture.storage);
 
   // Should complete successfully
-  expect(evals["WatchFiles Config Test"]).toHaveLength(1);
-  expect(evals["WatchFiles Config Test"]?.[0]?.status).toBe("success");
+  expect(suites["WatchFiles Config Test"]).toHaveLength(1);
+  expect(suites["WatchFiles Config Test"]?.[0]?.status).toBe("success");
 });
 
 it("forceRerunTriggers passed to runEvalite should override evalite.config.ts", async () => {
@@ -44,11 +38,6 @@ it("forceRerunTriggers passed to runEvalite should override evalite.config.ts",
   // Should NOT contain the config file values since we overrode them
   expect(forceRerunTriggers).not.toContain("src/**/*.ts");
   expect(forceRerunTriggers).not.toContain("data/**/*.json");
-
-  // Should still include Vitest defaults
-  for (const pattern of configDefaults.forceRerunTriggers) {
-    expect(forceRerunTriggers).toContain(pattern);
-  }
 });
 
 it("empty forceRerunTriggers array should not add any extra triggers", async () => {
@@ -65,9 +54,4 @@ it("empty forceRerunTriggers array should not add any extra triggers", async ()
   // Should NOT contain the config file values since we overrode with empty array
   expect(forceRerunTriggers).not.toContain("src/**/*.ts");
   expect(forceRerunTriggers).not.toContain("data/**/*.json");
-
-  // Should still include Vitest defaults (this verifies we didn't break anything)
-  for (const pattern of configDefaults.forceRerunTriggers) {
-    expect(forceRerunTriggers).toContain(pattern);
-  }
 });
diff --git a/packages/evalite/src/run-evalite.ts b/packages/evalite/src/run-evalite.ts
index 82084e41..37138462 100644
--- a/packages/evalite/src/run-evalite.ts
+++ b/packages/evalite/src/run-evalite.ts
@@ -273,10 +273,10 @@ export const runEvalite = async (opts: {
   // Evalite-level "extra watch files":
   // Node API (opts.forceRerunTriggers) takes precedence over evalite.config.ts.
   // If opts.forceRerunTriggers is defined (even []), it wins.
-  const extraForceRerunTriggers =
-    opts.forceRerunTriggers !== undefined
+  const forceRerunTriggers =
+    (opts.forceRerunTriggers !== undefined
       ? opts.forceRerunTriggers
-      : config?.forceRerunTriggers;
+      : config?.forceRerunTriggers) ?? configDefaults.forceRerunTriggers;
 
   const filters = opts.path ? [opts.path] : undefined;
   process.env.EVALITE_REPORT_TRACES = "true";
@@ -343,6 +343,7 @@ export const runEvalite = async (opts: {
         hideTable: hideTable,
       }),
     ],
+    forceRerunTriggers: forceRerunTriggers,
     root: cwd,
     include: ["**/*.eval.?(m)ts"],
     watch: opts.mode === "watch-for-file-changes",

From 802fece679f0b1178d63eb0a379b0d60b066548f Mon Sep 17 00:00:00 2001
From: Matt Pocock
Date: Wed, 3 Dec 2025 12:56:18 +0000
Subject: [PATCH 4/4] Changeset

---
 .changeset/busy-sheep-burn.md | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 .changeset/busy-sheep-burn.md

diff --git a/.changeset/busy-sheep-burn.md b/.changeset/busy-sheep-burn.md
new file mode 100644
index 00000000..ff44e517
--- /dev/null
+++ b/.changeset/busy-sheep-burn.md
@@ -0,0 +1,5 @@
+---
+"evalite": minor
+---
+
+Added forceRerunTriggers to the config to match Vitest's version.
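
A minimal usage sketch of the option this series ends up shipping, for reference only (the glob patterns below are illustrative placeholders, not values taken from the diffs above):

```ts
// evalite.config.ts — sketch only; adjust the globs to your project layout
import { defineConfig } from "evalite/config";

export default defineConfig({
  // Extra file globs that force an eval rerun in watch mode,
  // mirroring Vitest's test.forceRerunTriggers option.
  forceRerunTriggers: ["prompts/**/*", "data/**/*.json"],
});
```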