diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e7d85fe3f..cb998f93d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -142,61 +142,6 @@ jobs: echo "::error::Skill files are out of date. Run 'bun run generate:docs' locally and commit the result." exit 1 - eval-skill: - name: Eval SKILL.md - needs: [changes] - if: needs.changes.outputs.skill == 'true' - runs-on: ubuntu-latest - steps: - # For fork PRs: check if eval has already passed via commit status - - name: Detect fork - id: detect-fork - run: | - if [[ "${{ github.event_name }}" == "pull_request" && "${{ github.event.pull_request.head.repo.full_name }}" != "${{ github.repository }}" ]]; then - echo "is_fork=true" >> "$GITHUB_OUTPUT" - fi - - name: Check fork eval status - if: steps.detect-fork.outputs.is_fork == 'true' - env: - GH_TOKEN: ${{ github.token }} - run: | - SHA="${{ github.event.pull_request.head.sha }}" - STATUS=$(gh api "repos/${{ github.repository }}/commits/$SHA/statuses" \ - --jq '[.[] | select(.context == "eval-skill/fork")] | first | .state // "none"') - if [[ "$STATUS" != "success" ]]; then - echo "::error::Fork PR modifies skill files but eval has not passed for commit $SHA." - echo "::error::A maintainer must review the code and add the 'eval-skill' label." - exit 1 - fi - echo "Fork eval passed for $SHA" - # For internal PRs: run the eval directly - - uses: actions/checkout@v6 - if: steps.detect-fork.outputs.is_fork != 'true' - - uses: oven-sh/setup-bun@v2 - if: steps.detect-fork.outputs.is_fork != 'true' - - uses: actions/cache@v5 - if: steps.detect-fork.outputs.is_fork != 'true' - id: cache - with: - path: node_modules - key: node-modules-${{ hashFiles('bun.lock', 'patches/**') }} - - if: steps.detect-fork.outputs.is_fork != 'true' && steps.cache.outputs.cache-hit != 'true' - run: bun install --frozen-lockfile - - name: Generate docs and skill files - if: steps.detect-fork.outputs.is_fork != 'true' - run: bun run generate:schema && bun run generate:docs - - name: Eval SKILL.md - if: steps.detect-fork.outputs.is_fork != 'true' - run: bun run eval:skill - env: - ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} - - name: Upload eval results - if: always() && steps.detect-fork.outputs.is_fork != 'true' - uses: actions/upload-artifact@v7 - with: - name: skill-eval-results - path: test/skill-eval/results.json - lint: name: Lint & Typecheck needs: [changes] @@ -597,7 +542,7 @@ jobs: test-e2e: name: E2E Tests - needs: [build-binary] + needs: [build-binary, changes] runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 @@ -621,6 +566,9 @@ jobs: - name: E2E Tests env: SENTRY_CLI_BINARY: ${{ github.workspace }}/dist-bin/sentry-linux-x64 + # Pass API key only when skill files changed — the skill-eval e2e test + # auto-skips when the key is absent, so non-skill PRs aren't affected. + ANTHROPIC_API_KEY: ${{ needs.changes.outputs.skill == 'true' && secrets.ANTHROPIC_API_KEY || '' }} run: bun run test:e2e build-npm: @@ -726,7 +674,7 @@ jobs: ci-status: name: CI Status if: always() - needs: [changes, check-generated, eval-skill, build-binary, build-npm, build-docs, test-e2e, generate-patches, publish-nightly] + needs: [changes, check-generated, build-binary, build-npm, build-docs, test-e2e, generate-patches, publish-nightly] runs-on: ubuntu-latest permissions: {} steps: @@ -734,7 +682,7 @@ jobs: run: | # Check for explicit failures or cancellations in all jobs # generate-patches and publish-nightly are skipped on PRs — that's expected - results="${{ needs.check-generated.result }} ${{ needs.eval-skill.result }} ${{ needs.build-binary.result }} ${{ needs.build-npm.result }} ${{ needs.build-docs.result }} ${{ needs.test-e2e.result }} ${{ needs.generate-patches.result }} ${{ needs.publish-nightly.result }}" + results="${{ needs.check-generated.result }} ${{ needs.build-binary.result }} ${{ needs.build-npm.result }} ${{ needs.build-docs.result }} ${{ needs.test-e2e.result }} ${{ needs.generate-patches.result }} ${{ needs.publish-nightly.result }}" for result in $results; do if [[ "$result" == "failure" || "$result" == "cancelled" ]]; then echo "::error::CI failed" @@ -752,9 +700,4 @@ jobs: echo "::error::CI failed - upstream job failed causing check-generated to be skipped" exit 1 fi - if [[ "${{ needs.changes.outputs.skill }}" == "true" && "${{ needs.eval-skill.result }}" == "skipped" ]]; then - echo "::error::CI failed - upstream job failed causing eval-skill to be skipped" - exit 1 - fi - echo "CI passed" diff --git a/.github/workflows/eval-skill-fork.yml b/.github/workflows/eval-skill-fork.yml index 5380bf24b..f8ff6a120 100644 --- a/.github/workflows/eval-skill-fork.yml +++ b/.github/workflows/eval-skill-fork.yml @@ -46,6 +46,9 @@ jobs: - if: steps.cache.outputs.cache-hit != 'true' run: bun install --frozen-lockfile + - name: Generate docs and skill files + run: bun run generate:schema && bun run generate:docs + - name: Eval SKILL.md id: eval run: bun run eval:skill diff --git a/AGENTS.md b/AGENTS.md index 8d2560227..f53de10b3 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1007,6 +1007,33 @@ mock.module("./some-module", () => ({ * **All view subcommands should use \ \ positional pattern**: All \`\* view\` subcommands should follow a consistent \`\ \\` positional argument pattern where target is the optional \`org/project\` specifier. During migration, use opportunistic argument swapping with a stderr warning when args are in wrong order. This is an instance of the broader CLI UX auto-correction pattern: safe when input is already invalid, correction is unambiguous, warning goes to stderr. Normalize at command level, keep parsers pure. Model after \`gh\` CLI conventions. + +* **Sentry CLI markdown-first formatting pipeline replaces ad-hoc ANSI**: Formatters build CommonMark strings; \`renderMarkdown()\` renders to ANSI for TTY or raw markdown for non-TTY. Key helpers: \`colorTag()\`, \`mdKvTable()\`, \`mdRow()\`, \`mdTableHeader()\` (\`:\` suffix = right-aligned), \`renderTextTable()\`. \`isPlainOutput()\` checks \`SENTRY\_PLAIN\_OUTPUT\` > \`NO\_COLOR\` > \`!isTTY\`. Batch path: \`formatXxxTable()\`. Streaming path: \`StreamingTable\` (TTY) or raw markdown rows (plain). Both share \`buildXxxRowCells()\`. + + +* **Sentry dashboard API rejects discover/transaction-like widget types — use spans**: The Sentry Dashboard API rejects \`widgetType: 'discover'\` and \`widgetType: 'transaction-like'\` as deprecated. Use \`widgetType: 'spans'\` for new widgets. The codebase splits types into \`WIDGET\_TYPES\` (active, for creation) and \`ALL\_WIDGET\_TYPES\` (including deprecated, for parsing server responses). \`DashboardWidgetInputSchema\` must use \`ALL\_WIDGET\_TYPES\` so editing existing widgets with deprecated types passes Zod validation. \`validateWidgetEnums()\` in \`resolve.ts\` rejects deprecated types for new widget creation — but accepts \`skipDeprecatedCheck: true\` for the edit path, where \`effectiveDataset\` may inherit a deprecated type from the existing widget. Cross-validation (display vs dataset compatibility) still runs on effective values. Tests must use \`error-events\` instead of \`discover\`; it shares \`DISCOVER\_AGGREGATE\_FUNCTIONS\` including \`failure\_rate\`. + + +* **Sentry issue stats field: time-series controlled by groupStatsPeriod**: Sentry issue stats and list table layout: \`stats\` key depends on \`groupStatsPeriod\` (\`""\`, \`"14d"\`, \`"24h"\`, \`"auto"\`); \`statsPeriod\` controls window. \*\*Critical\*\*: \`count\` is period-scoped — use \`lifetime.count\` for true total. Issue list uses \`groupStatsPeriod: 'auto'\` for sparklines. Columns: SHORT ID, ISSUE, SEEN, AGE, TREND, EVENTS, USERS, TRIAGE. TREND hidden < 100 cols. \`--compact\` tri-state: explicit overrides; \`undefined\` triggers \`shouldAutoCompact(rowCount)\` — compact if \`3N + 3 > termHeight\`. Height formula \`3N + 3\` (last row has no trailing separator). + + +* **Sentry trace-logs API is org-scoped, not project-scoped**: The Sentry trace-logs endpoint (\`/organizations/{org}/trace-logs/\`) is org-scoped, so \`trace logs\` uses \`resolveOrg()\` not \`resolveOrgAndProject()\`. The endpoint is PRIVATE in Sentry source, excluded from the public OpenAPI schema — \`@sentry/api\` has no generated types. The hand-written \`TraceLogSchema\` in \`src/types/sentry.ts\` is required until Sentry makes it public. + + +* **SKILL.md is fully generated — edit source files, not output**: The skill files under \`plugins/sentry-cli/skills/sentry-cli/\` (SKILL.md + references/\*.md) are fully generated by \`bun run generate:skill\` (script/generate-skill.ts). CI runs this after every push via a \`github-actions\[bot]\` commit, overwriting any manual edits. To change skill content, edit the \*\*sources\*\*: (1) \`docs/src/content/docs/agent-guidance.md\` — embedded into SKILL.md's Agent Guidance section with heading levels bumped. (2) \`src/commands/\*/\` flag \`brief\` strings — generate the reference file flag descriptions. (3) \`docs/src/content/docs/commands/\*.md\` — examples extracted per command via marked AST parsing. After editing sources, run \`bun run generate:skill\` locally and commit both source and generated files. CI's \`bun run check:skill\` fails if generated files are stale. + + +* **Stricli route errors are uninterceptable — only post-run detection works**: Stricli route errors, exit codes, and OutputError — error propagation gaps: (1) Route failures are uninterceptable — Stricli writes to stderr and returns \`ExitCode.UnknownCommand\` internally. Only post-\`run()\` \`process.exitCode\` check works. \`exceptionWhileRunningCommand\` only fires for errors in command \`func()\`. (2) \`ExitCode.UnknownCommand\` is \`-5\`. Bun reads \`251\` (unsigned byte), Node reads \`-5\` — compare both. (3) \`OutputError\` in \`handleOutputError\` calls \`process.exit()\` immediately, bypassing telemetry and \`exceptionWhileRunningCommand\`. Top-level typos via \`defaultCommand:help\` → \`OutputError\` → \`process.exit(1)\` skip all error reporting. + + +* **Three Sentry APIs for span custom attributes with different capabilities**: Three Sentry endpoints serve span data with different custom attribute support: (1) \`/trace/{traceId}/\` — hierarchical tree; supports \`additional\_attributes\` query param to request named attributes (must enumerate). Returns \`measurements\` (web vitals, always populated with zeros for non-browser spans). (2) \`/projects/{org}/{project}/trace-items/{itemId}/?trace\_id={id}\&item\_type=spans\` — single span full detail; returns ALL attributes automatically as \`{name, type, value}\[]\`. No enumeration needed. Frontend span detail sidebar uses this. (3) \`/events/?dataset=spans\&field=X\` — list/search; requires explicit \`field\` params. The CLI's \`span view\` calls endpoint 2 via \`getSpanDetails()\` with p-limit(5) concurrency. \`trace view\` passes \`--fields\` to endpoint 1 as \`additional\_attributes\`. + + +* **Trace view measurements filtering strips zero-valued web vitals from JSON**: The \`/trace/{traceId}/\` endpoint returns hardcoded web vitals measurements (LCP, FCP, CLS, INP, TTFB + score ratios) on ALL spans, even non-browser ones where they're all zero. \`filterSpanMeasurements()\` in \`trace/view.ts\` recursively strips zero-valued entries from the JSON output. Non-zero measurements on root pageload spans are preserved. Uses destructuring + spread (not \`delete\`) to satisfy Biome's \`noDelete\` rule. + + +* **withAuthGuard returns discriminated Result type, not fallback+onError**: \`withAuthGuard\(fn)\` in \`src/lib/errors.ts\` returns a discriminated Result: \`{ ok: true, value: T } | { ok: false, error: unknown }\`. AuthErrors always re-throw (triggers bin.ts auto-login). All other errors are captured. Callers inspect \`result.ok\` to degrade gracefully. Used across 12+ files. + * **Sentry-derived terminal color palette tuned for dual-background contrast**: The CLI's chart/dashboard palette uses 10 colors derived from Sentry's categorical chart hues (\`static/app/utils/theme/scraps/tokens/color.tsx\` in getsentry/sentry), each adjusted to mid-luminance to achieve ≥3:1 contrast on both dark (#1e1e1e) and light (#f0f0f0) backgrounds. Key adjustments: orange darkened from #FF9838→#C06F20, green #67C800→#3D8F09, yellow #FFD00E→#9E8B18, purple lightened #5D3EB2→#8B6AC8, indigo #50219C→#7B50D0. Blurple (#7553FF), pink (#F0369A), magenta (#B82D90) used as-is. Teal (#228A83) added to fill a hue gap. ANSI 16-color codes were considered but rejected in favor of hex since the mid-luminance hex values provide guaranteed contrast regardless of terminal theme configuration. @@ -1024,6 +1051,15 @@ mock.module("./some-module", () => ({ * **Git worktree blocks branch checkout of branches used in other worktrees**: \`git checkout main\` fails with "already used by worktree at ..." when another worktree has that branch checked out. In this repo's worktree setup, use \`git checkout origin/main --detach\` or create feature branches from \`origin/main\` directly: \`git checkout -b fix/foo origin/main\`. This is a standard git worktree constraint but catches people off guard in the CLI repo which uses worktrees for parallel development. + +* **Dashboard tracemetrics dataset uses comma-separated aggregate format**: SDK v10+ custom metrics (, , ) emit envelope items. Dashboard widgets for these MUST use with aggregate format — e.g., . The parameter must match the SDK emission exactly: if no unit specified, for memory metrics, for uptime. only supports , , , , display types — no or . Widgets with always require . Sort expressions must reference aggregates present in . + + +* **Skill eval judge needs explicit CLI context about auto-detect and flags**: The skill eval judge (Haiku) reads the compact Command Reference literally. Without explicit context, it rejects valid plans because: (1) it treats \`\\` positional args as mandatory when they're optional via auto-detection, and (2) it rejects standard flags (\`--json\`, \`--query\`, \`--limit\`, \`--fields\`, \`--period\`) not shown in the compact reference. Fix: the judge prompt in \`test/skill-eval/helpers/judge.ts\` must include notes that positional args like \`\\` are optional (CLI auto-detects from DSN/config), and that commands support additional flags documented in separate reference files. Also guard against empty \`commandReference\` extraction in \`script/eval-skill.ts\` — log a warning instead of silently producing an adversarial prompt where no commands can satisfy the judge. + + +* **spansIndexed is not a valid Sentry dataset — use spans**: The Sentry Events/Explore API accepts 5 dataset values: \`spans\`, \`transactions\`, \`logs\`, \`errors\`, \`discover\`. The name \`spansIndexed\` is invalid and returns a generic HTTP 500 "Internal error" with no helpful validation message. This trips up AI agents and users. Valid datasets are documented in \`src/lib/api/datasets.ts\` (\`EVENTS\_API\_DATASETS\` constant) and in \`docs/commands/api.md\`. + * **Spinner stdout/stderr collision: log messages inside withProgress appear on spinner line**: The \`withProgress\` spinner in \`src/lib/polling.ts\` writes to stdout using \`\r\x1b\[K\` (no trailing newline). Consola logger writes to stderr. On a shared terminal, any \`log.info()\` called \*\*inside\*\* the \`withProgress\` callback appears on the same line as the spinner text because stderr doesn't know about stdout's carriage-return positioning. Fix pattern: propagate data out of the callback via return value, then call \`log.info()\` \*\*after\*\* \`withProgress\` completes (when the \`finally\` block has already cleared the spinner line). This affected \`downloadBinaryToTemp\` in \`upgrade.ts\` where \`log.info('Applied delta patch...')\` fired inside the spinner callback. @@ -1032,6 +1068,15 @@ mock.module("./some-module", () => ({ ### Pattern + +* **--fields dual role: output filtering + API field selection for span list**: In \`span list\`, \`--fields\` serves dual purpose: filters JSON output AND requests extra API fields. \`extractExtraApiFields()\` in \`span/list.ts\` checks each field name against \`OUTPUT\_TO\_API\_FIELD\` mapping (e.g., \`span\_id\`→\`id\`, \`op\`→\`span.op\`). Unknown names are treated as custom attributes and added to the \`field\` param in the events API request. \`FIELD\_GROUP\_ALIASES\` supports shorthand expansion (e.g., \`gen\_ai\` → 4 gen\_ai.\* fields). Extra fields survive Zod via \`SpanListItemSchema.passthrough()\` and are forwarded by \`spanListItemToFlatSpan(item, extraFieldNames)\` onto \`FlatSpan\`'s index signature. \`formatSpanTable()\` dynamically adds columns for extra attributes. + + +* **--since is an alias for --period via shared PERIOD\_ALIASES**: \`PERIOD\_ALIASES\` in \`src/lib/list-command.ts\` maps both \`t\` and \`since\` to \`period\`. All commands using \`LIST\_PERIOD\_FLAG\` get \`--since\` as an alias for \`--period\` automatically via spread \`...PERIOD\_ALIASES\`. This was added because AI agents and humans naturally try \`--since 1h\` instead of \`--period 1h\`. + + +* **Branch naming and commit message conventions for Sentry CLI**: Branch naming: \`feat/\\` or \`fix/\-\\` (e.g., \`feat/ghcr-nightly-distribution\`, \`fix/268-limit-auto-pagination\`). Commit message format: \`type(scope): description (#issue)\` (e.g., \`fix(issue-list): auto-paginate --limit beyond 100 (#268)\`, \`feat(nightly): distribute via GHCR instead of GitHub Releases\`). Types seen: fix, refactor, meta, release, feat. PRs are created as drafts via \`gh pr create --draft\`. Implementation plans are attached to commits via \`git notes add\` rather than in PR body or commit message. + * **ClearScreen yield token for in-place terminal refresh in buildCommand wrapper**: Commands needing in-place refresh yield a \`ClearScreen\` token from \`src/lib/formatters/output.ts\`. The \`handleYieldedValue\` function in \`buildCommand\` sets a \`pendingClear\` flag; when the next \`CommandOutput\` is rendered, \`renderCommandOutput\` prepends \`\x1b\[H\x1b\[J\` and writes everything in a \*\*single \`stdout.write()\` call\*\* — no flicker. In JSON/plain modes the clear is silently ignored. Pattern: \`yield ClearScreen()\` then \`yield CommandOutput(data)\`. Critical: never split clear and content into separate writes. Also: never add a redundant clear-screen inside a \`HumanRenderer.render()\` method — the \`ClearScreen\` token is the sole mechanism. The dashboard renderer originally had its own \`\x1b\[2J\x1b\[H\` prepend on re-renders, causing double clears; this was removed. @@ -1044,6 +1089,24 @@ mock.module("./some-module", () => ({ * **Shared flag constants in list-command.ts for cross-command consistency**: \`src/lib/list-command.ts\` exports shared Stricli flag definitions (\`FIELDS\_FLAG\`, \`FRESH\_FLAG\`, \`FRESH\_ALIASES\`) reused across all commands. When adding a new global-ish flag to multiple commands, define it once here as a const satisfying Stricli's flag shape, then spread into each command's \`flags\` object. The \`--fields\` flag is \`{ kind: 'parsed', parse: String, brief: '...', optional: true }\`. \`parseFieldsList()\` in \`formatters/json.ts\` handles comma-separated parsing with trim/dedup. \`writeJson()\` accepts an optional \`fields\` array and calls \`filterFields()\` before serialization. + +* **Pagination contextKey must include all query-varying parameters with escaping**: Pagination \`contextKey\` must encode every query-varying parameter (sort, query, period) with \`escapeContextKeyValue()\` (replaces \`|\` with \`%7C\`). Always provide a fallback before escaping since \`flags.period\` may be \`undefined\` in tests despite having a default: \`flags.period ? escapeContextKeyValue(flags.period) : "90d"\`. + + +* **PR review workflow: reply, resolve, amend, force-push**: PR review workflow: (1) Read unresolved threads via GraphQL, (2) make code changes, (3) run lint+typecheck+tests, (4) create a SEPARATE commit per review round (not amend) for incremental review, (5) push normally, (6) reply to comments via REST API, (7) resolve threads via GraphQL \`resolveReviewThread\`. Only amend+force-push when user explicitly asks or pre-commit hook modified files. + + +* **Redact sensitive flags in raw argv before sending to telemetry**: Telemetry context and argv redaction patterns: \`withTelemetry\` calls \`initTelemetryContext()\` BEFORE the callback — user ID, email, instance ID, runtime, and is\_self\_hosted tags are automatically set. For org context, read \`getDefaultOrganization()\` from SQLite (no API call). When sending raw argv, redact sensitive flags: \`SENSITIVE\_FLAGS\` in \`telemetry.ts\` (currently \`token\`). Scan for \`--token\`/\`-token\`, replace following value with \`\[REDACTED]\`. Handle both \`--flag value\` and \`--flag=value\` forms. \`setFlagContext\` handles parsed flags separately. + + +* **Stricli optional boolean flags produce tri-state (true/false/undefined)**: Stricli boolean flags with \`optional: true\` (no \`default\`) produce \`boolean | undefined\` in the flags type. \`--flag\` → \`true\`, \`--no-flag\` → \`false\`, omitted → \`undefined\`. This enables auto-detect patterns: explicit user choice overrides, \`undefined\` triggers heuristic. Used by \`--compact\` on issue list. The flag type must be \`readonly field?: boolean\` (not \`readonly field: boolean\`). This differs from \`default: false\` which always produces a defined boolean. + + +* **Testing Stricli command func() bodies via spyOn mocking**: Stricli/Bun test patterns: (1) Command func tests: \`const func = await cmd.loader()\`, then \`func.call(mockContext, flags, ...args)\`. \`loader()\` return type union causes LSP errors — false positives that pass \`tsc\`. File naming: \`\*.func.test.ts\`. (2) ESM prevents \`vi.spyOn\` on Node built-in exports. Workaround: test subclass that overrides the method calling the built-in. (3) Follow-mode uses \`setTimeout\`-based scheduling; test with \`interceptSigint()\` helper. \`Bun.sleep()\` has no AbortSignal so \`setTimeout\`/\`clearTimeout\` required. + + +* **validateWidgetEnums skipDeprecatedCheck for edit-path inherited datasets**: When editing a widget, \`effectiveDataset = flags.dataset ?? existing.widgetType\` may inherit a deprecated type (e.g., \`discover\`). The \`validateWidgetEnums\` deprecation check must be skipped for inherited values — only fire when the user explicitly passes \`--dataset\`. Solution: \`validateWidgetEnums(effectiveDisplay, effectiveDataset, { skipDeprecatedCheck: true })\` in \`edit.ts\`. The cross-validation between display type and dataset still runs on effective values, catching incompatible combos. The deprecation rejection helper \`rejectInvalidDataset()\` is extracted to keep \`validateWidgetEnums\` under Biome's complexity limit of 15. + * **SKILL.md generator must filter hidden Stricli flags**: \`script/generate-skill.ts\` introspects Stricli's route tree to auto-generate SKILL.md. \`FlagDef\` must include \`hidden?: boolean\`; \`extractFlags\` propagates it so \`generateCommandDoc\` filters out hidden flags alongside \`help\`/\`helpAll\`. Hidden flags from \`buildCommand\` (\`--log-level\`, \`--verbose\`) appear globally in \`docs/src/content/docs/commands/index.md\` Global Options section, pulled into SKILL.md via \`loadCommandsOverview\`. When \`cmd.jsonFields\` is present (from Zod schema registration), \`generateFullCommandDoc\` renders a markdown "JSON Fields" table with field name, type, and description columns in reference docs. diff --git a/script/eval-skill.ts b/script/eval-skill.ts index 47fcce906..3679a808e 100644 --- a/script/eval-skill.ts +++ b/script/eval-skill.ts @@ -4,9 +4,10 @@ * * Sends test prompts to agent models (Opus 4.6 + Sonnet 4.6) with SKILL.md * as context, then grades the planned commands on efficiency criteria. + * Commands are verified against the real CLI binary (via `-h`) to ground + * the LLM judge with empirical results. * * Requires ANTHROPIC_API_KEY env var for Anthropic API access. - * In CI, the key is stored in the "skill-eval" environment (protected). * * Usage: * bun run eval:skill @@ -17,6 +18,7 @@ * EVAL_AGENT_MODELS - Comma-separated model IDs (default: sonnet-4-6, opus-4-6) * EVAL_JUDGE_MODEL - Judge model ID (default: haiku-4-5) * EVAL_THRESHOLD - Minimum pass rate 0-1 (default: 0.75) + * SENTRY_CLI_BINARY - Path to pre-built binary (falls back to bun run src/bin.ts) */ import cases from "../test/skill-eval/cases.json"; diff --git a/test/e2e/skill-eval.test.ts b/test/e2e/skill-eval.test.ts new file mode 100644 index 000000000..ddb2c31b4 --- /dev/null +++ b/test/e2e/skill-eval.test.ts @@ -0,0 +1,86 @@ +/** + * SKILL.md Effectiveness Evaluation (E2E) + * + * Tests whether SKILL.md effectively guides LLMs to plan correct CLI commands. + * Uses the real CLI binary (via SENTRY_CLI_BINARY or dev mode) to verify + * that planned commands actually exist. + * + * Skips automatically when ANTHROPIC_API_KEY is not set. + * In CI, the key is only passed when skill-related files change. + */ + +import { afterAll, beforeAll, describe, expect, test } from "bun:test"; +import cases from "../skill-eval/cases.json"; +import { judgePlan } from "../skill-eval/helpers/judge.js"; +import { createClient } from "../skill-eval/helpers/llm-client.js"; +import { generatePlan } from "../skill-eval/helpers/planner.js"; +import type { CaseResult, TestCase } from "../skill-eval/helpers/types.js"; + +const SKILL_PATH = "plugins/sentry-cli/skills/sentry-cli/SKILL.md"; +const DEFAULT_THRESHOLD = 0.75; + +const apiKey = process.env.ANTHROPIC_API_KEY; + +/** + * The test preload mocks globalThis.fetch to block external network calls. + * This test needs real fetch for Anthropic API calls, so we restore it + * during the describe block and put the mock back when done. + */ +const originalFetch = (globalThis as { __originalFetch?: typeof fetch }) + .__originalFetch; + +describe.skipIf(!apiKey)("skill eval", () => { + const savedFetch = globalThis.fetch; + + beforeAll(() => { + if (originalFetch) { + globalThis.fetch = originalFetch; + } + }); + + afterAll(() => { + globalThis.fetch = savedFetch; + }); + const testCases = cases as unknown as TestCase[]; + const threshold = process.env.EVAL_THRESHOLD + ? Number.parseFloat(process.env.EVAL_THRESHOLD) + : DEFAULT_THRESHOLD; + + /** + * Run the full eval for a single model and assert it meets the threshold. + * Each model gets its own test so failures are attributed clearly. + */ + async function runEvalForModel(model: string): Promise { + const client = await createClient(apiKey as string); + const skillContent = await Bun.file(SKILL_PATH).text(); + + const results: CaseResult[] = []; + for (const testCase of testCases) { + const plan = await generatePlan( + client, + model, + skillContent, + testCase.prompt + ); + const result = await judgePlan(client, testCase, plan); + results.push(result); + } + + const passed = results.filter((r) => r.passed).length; + const score = passed / testCases.length; + // biome-ignore lint/suspicious/noMisplacedAssertion: called from test() via helper + expect(score).toBeGreaterThanOrEqual(threshold); + } + + test( + "claude-sonnet-4-6 meets threshold", + () => runEvalForModel("claude-sonnet-4-6"), + { timeout: 120_000 } + ); + + test( + "claude-opus-4-6 meets threshold", + () => runEvalForModel("claude-opus-4-6"), + { timeout: 120_000 } + ); +}); diff --git a/test/skill-eval/helpers/judge.ts b/test/skill-eval/helpers/judge.ts index 3afe99912..a5557c984 100644 --- a/test/skill-eval/helpers/judge.ts +++ b/test/skill-eval/helpers/judge.ts @@ -1,9 +1,10 @@ /** * Phase 2: Grade the agent's plan against test case criteria. * - * Two passes: + * Three passes: * 1. Deterministic — string matching for anti-patterns, expected-patterns, max-commands - * 2. LLM judge — coherence/quality check using a cheap model (Haiku 4.6) + * 2. Command verification — run each planned command with `-h` against the real binary + * 3. LLM judge — coherence/quality check using a cheap model (Haiku 4.5) */ import type { LLMClient } from "./llm-client.js"; @@ -15,6 +16,7 @@ import type { CriterionResult, TestCase, } from "./types.js"; +import { formatVerifications, verifyPlannedCommands } from "./verify.js"; /** * Evaluate a single deterministic criterion against the plan's commands. @@ -74,12 +76,16 @@ function evaluateDeterministic( /** * Use the LLM judge to evaluate overall plan quality. - * Returns null if the judge call fails. + * + * The judge receives empirical verification results from running each + * planned command with `-h` against the real binary — no command reference + * or "allowed list" is provided, keeping the judge independent. */ async function evaluateWithLLMJudge( client: LLMClient, prompt: string, - plan: AgentPlan + plan: AgentPlan, + verificationSummary: string ): Promise { const commandList = plan.commands .map((c, i) => `${i + 1}. \`${c.command}\` — ${c.purpose}`) @@ -87,6 +93,11 @@ async function evaluateWithLLMJudge( const judgePrompt = `You are evaluating whether an AI agent's CLI command plan is good. +The agent planned commands for the \`sentry\` CLI — a modern command-line tool (distinct from the legacy \`sentry-cli\`). + +We verified each planned command against the real CLI binary by running it with \`-h\`: +${verificationSummary} + The user asked: "${prompt}" The agent's plan: @@ -96,7 +107,7 @@ ${commandList} Notes: ${plan.notes} Evaluate the plan on overall quality. A good plan: -- Uses the right Sentry CLI commands for the task +- Uses commands verified as VALID above - Would actually work if executed - Is efficient (no unnecessary commands) - Directly addresses what the user asked for @@ -145,7 +156,9 @@ or /** * Evaluate a test case's plan against all its criteria. - * Runs deterministic checks first, then the LLM judge for overall quality. + * + * Runs deterministic checks first, then verifies commands against the + * real binary, then passes verification results to the LLM judge. */ export async function judgePlan( client: LLMClient, @@ -180,8 +193,17 @@ export async function judgePlan( criteria.push(evaluateDeterministic(name, def, plan)); } - // Run LLM judge for overall quality - const llmVerdict = await evaluateWithLLMJudge(client, testCase.prompt, plan); + // Verify commands against the real binary + const verifications = await verifyPlannedCommands(plan.commands); + const verificationSummary = formatVerifications(verifications); + + // Run LLM judge with verification results + const llmVerdict = await evaluateWithLLMJudge( + client, + testCase.prompt, + plan, + verificationSummary + ); criteria.push(llmVerdict); // Compute score: fraction of criteria that passed diff --git a/test/skill-eval/helpers/verify.ts b/test/skill-eval/helpers/verify.ts new file mode 100644 index 000000000..b78e1c0cd --- /dev/null +++ b/test/skill-eval/helpers/verify.ts @@ -0,0 +1,105 @@ +/** + * Verify planned CLI commands against the real binary. + * + * Runs each command's subcommand route with `-h` to check it exists. + * A valid route returns exit code 0 (Stricli intercepts `-h` before + * command execution, so no auth/env setup is needed). + * An unknown route returns exit code 251 (Bun) or -5 (Node). + */ + +import { getCliCommand } from "../../fixture.js"; +import type { PlannedCommand } from "./types.js"; + +/** Result of verifying a single planned command against the real CLI binary */ +export type CommandVerification = { + command: string; + valid: boolean; + /** First few lines of help output (for valid commands) or error message */ + detail: string; +}; + +/** + * Extract the subcommand route from a full CLI command string. + * + * Stops at flags (`-*`), paths (`/`), quoted values, or assignments (`=`). + * E.g., `"sentry issue list --query '...'"` → `["issue", "list"]` + * E.g., `"sentry api /api/0/..."` → `["api"]` + */ +function extractRoute(command: string): string[] { + const tokens = command.trim().split(/\s+/); + const start = tokens[0] === "sentry" ? 1 : 0; + + const route: string[] = []; + for (let i = start; i < tokens.length; i++) { + const token = tokens[i]; + if ( + token.startsWith("-") || + token.includes("/") || + token.includes("=") || + token.startsWith('"') || + token.startsWith("'") + ) { + break; + } + route.push(token); + } + return route; +} + +/** + * Verify each planned command by running its route with `-h`. + * + * Uses `getCliCommand()` which respects `SENTRY_CLI_BINARY` env var + * (pre-built binary in e2e CI) or falls back to `bun run src/bin.ts`. + */ +export async function verifyPlannedCommands( + commands: PlannedCommand[] +): Promise { + const cliCmd = getCliCommand(); + const results: CommandVerification[] = []; + + for (const { command } of commands) { + const route = extractRoute(command); + if (route.length === 0) { + results.push({ + command, + valid: false, + detail: "Could not extract subcommand from command string", + }); + continue; + } + + const proc = Bun.spawn([...cliCmd, ...route, "-h"], { + stdout: "pipe", + stderr: "pipe", + env: { ...process.env, SENTRY_CLI_NO_TELEMETRY: "1" }, + }); + + const [stdout, stderr] = await Promise.all([ + new Response(proc.stdout).text(), + new Response(proc.stderr).text(), + ]); + const exitCode = await proc.exited; + + const valid = exitCode === 0; + const output = (stdout || stderr).trim(); + // Take first 3 lines as a summary + const detail = output.split("\n").slice(0, 3).join("\n").trim(); + + results.push({ command, valid, detail }); + } + + return results; +} + +/** Format verification results as a human-readable string for the judge prompt */ +export function formatVerifications( + verifications: CommandVerification[] +): string { + return verifications + .map((v, i) => { + const icon = v.valid ? "VALID" : "INVALID"; + return `${i + 1}. \`${v.command}\`: ${icon} — ${v.detail}`; + }) + .join("\n"); +}