From 593ce56906e18d0c824cddf91afe3031d93f6e77 Mon Sep 17 00:00:00 2001 From: Burak Yigit Kaya Date: Fri, 10 Apr 2026 10:17:46 +0000 Subject: [PATCH 1/5] fix(eval): ground LLM judge with command reference to prevent false negatives The skill eval judge (Haiku 4.5) had no context about the sentry CLI and was hallucinating that valid commands don't exist, confusing it with the legacy sentry-cli. This caused Opus 4.6 to fail 3/8 eval cases (62.5%, below the 75% threshold) on the overall-quality criterion. Extract the Command Reference section from SKILL.md and inject it into the judge prompt so it can verify planned commands against actual CLI capabilities. --- script/eval-skill.ts | 15 ++++++++++++++- test/skill-eval/helpers/judge.ts | 28 +++++++++++++++++++++++----- 2 files changed, 37 insertions(+), 6 deletions(-) diff --git a/script/eval-skill.ts b/script/eval-skill.ts index 47fcce906..efd36c53e 100644 --- a/script/eval-skill.ts +++ b/script/eval-skill.ts @@ -46,6 +46,18 @@ const VERSION_RE = /^version:\s*(.+)$/m; */ const DEFAULT_THRESHOLD = 0.75; +/** Extract the "## Command Reference" section from SKILL.md for the judge. */ +function extractCommandReference(skillContent: string): string { + const start = skillContent.indexOf("## Command Reference"); + if (start === -1) { + return ""; + } + const end = skillContent.indexOf("\n## ", start + 1); + return end === -1 + ? 
skillContent.slice(start) + : skillContent.slice(start, end); +} + /** Run all eval cases against a single model */ async function evalModel( client: Awaited>, @@ -53,6 +65,7 @@ async function evalModel( skillContent: string, testCases: TestCase[] ): Promise { + const commandReference = extractCommandReference(skillContent); console.log(`\nEvaluating: ${model}`); console.log("─".repeat(40)); @@ -67,7 +80,7 @@ async function evalModel( skillContent, testCase.prompt ); - const result = await judgePlan(client, testCase, plan); + const result = await judgePlan(client, testCase, plan, commandReference); results.push(result); const icon = result.passed ? "✓" : "✗"; diff --git a/test/skill-eval/helpers/judge.ts b/test/skill-eval/helpers/judge.ts index 3afe99912..33a4a7a99 100644 --- a/test/skill-eval/helpers/judge.ts +++ b/test/skill-eval/helpers/judge.ts @@ -74,12 +74,14 @@ function evaluateDeterministic( /** * Use the LLM judge to evaluate overall plan quality. - * Returns null if the judge call fails. + * The command reference (extracted from SKILL.md) grounds the judge so it + * doesn't hallucinate that valid `sentry` commands don't exist. */ async function evaluateWithLLMJudge( client: LLMClient, prompt: string, - plan: AgentPlan + plan: AgentPlan, + commandReference: string ): Promise { const commandList = plan.commands .map((c, i) => `${i + 1}. \`${c.command}\` — ${c.purpose}`) @@ -87,6 +89,11 @@ async function evaluateWithLLMJudge( const judgePrompt = `You are evaluating whether an AI agent's CLI command plan is good. +The agent was given a skill guide for the \`sentry\` CLI (not the legacy \`sentry-cli\`). +Here are the valid commands from that guide: + +${commandReference} + The user asked: "${prompt}" The agent's plan: @@ -96,11 +103,13 @@ ${commandList} Notes: ${plan.notes} Evaluate the plan on overall quality. 
A good plan: -- Uses the right Sentry CLI commands for the task +- Uses commands that exist in the reference above - Would actually work if executed - Is efficient (no unnecessary commands) - Directly addresses what the user asked for +Do NOT penalize commands that appear in the reference above. This is a real CLI tool. + Return ONLY valid JSON: {"pass": true, "reason": "Brief explanation"} @@ -146,11 +155,15 @@ or /** * Evaluate a test case's plan against all its criteria. * Runs deterministic checks first, then the LLM judge for overall quality. + * + * @param commandReference - The Command Reference section from SKILL.md, + * injected into the judge prompt so it can verify commands exist. */ export async function judgePlan( client: LLMClient, testCase: TestCase, - plan: AgentPlan | null + plan: AgentPlan | null, + commandReference: string ): Promise { // If the planner failed to produce a plan, fail all criteria if (!plan) { @@ -181,7 +194,12 @@ export async function judgePlan( } // Run LLM judge for overall quality - const llmVerdict = await evaluateWithLLMJudge(client, testCase.prompt, plan); + const llmVerdict = await evaluateWithLLMJudge( + client, + testCase.prompt, + plan, + commandReference + ); criteria.push(llmVerdict); // Compute score: fraction of criteria that passed From be0b68b55b4547377f267c881dd5f3bfa69b8db5 Mon Sep 17 00:00:00 2001 From: Burak Yigit Kaya Date: Fri, 10 Apr 2026 10:27:41 +0000 Subject: [PATCH 2/5] fix(eval): add CLI context to judge prompt for auto-detect and flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The judge was too strict — it read the compact command reference literally and rejected plans for omitting args (which are optional via auto-detection) and using standard flags like --json, --query, --limit that aren't listed in the compact reference. Also add a warning when the Command Reference section is missing from SKILL.md, per Bugbot feedback. 
--- script/eval-skill.ts | 5 +++++ test/skill-eval/helpers/judge.ts | 10 +++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/script/eval-skill.ts b/script/eval-skill.ts index efd36c53e..218ae6c42 100644 --- a/script/eval-skill.ts +++ b/script/eval-skill.ts @@ -66,6 +66,11 @@ async function evalModel( testCases: TestCase[] ): Promise { const commandReference = extractCommandReference(skillContent); + if (!commandReference) { + console.error( + 'Warning: "## Command Reference" section not found in SKILL.md — judge will lack command context' + ); + } console.log(`\nEvaluating: ${model}`); console.log("─".repeat(40)); diff --git a/test/skill-eval/helpers/judge.ts b/test/skill-eval/helpers/judge.ts index 33a4a7a99..39b832d38 100644 --- a/test/skill-eval/helpers/judge.ts +++ b/test/skill-eval/helpers/judge.ts @@ -94,6 +94,11 @@ Here are the valid commands from that guide: ${commandReference} +Important context about how this CLI works: +- Positional args like \`\` are OPTIONAL — the CLI auto-detects org and project from the local directory context (DSN detection). Omitting them is correct and expected. +- Each command supports additional flags (e.g., --json, --query, --limit, --period, --fields) documented in separate reference files. The compact listing above only shows command signatures, not all flags. +- --json is a global flag available on all list/view commands. + The user asked: "${prompt}" The agent's plan: @@ -108,7 +113,10 @@ Evaluate the plan on overall quality. A good plan: - Is efficient (no unnecessary commands) - Directly addresses what the user asked for -Do NOT penalize commands that appear in the reference above. This is a real CLI tool. 
+Do NOT penalize: +- Commands that appear in the reference above — this is a real CLI tool +- Omitting org/project args — auto-detection is a core feature +- Using flags like --json, --query, --limit, --fields, --period — they are real flags Return ONLY valid JSON: {"pass": true, "reason": "Brief explanation"} From fafffd5ef8dfe666cd1571f43b4ff651c36b7035 Mon Sep 17 00:00:00 2001 From: Burak Yigit Kaya Date: Fri, 10 Apr 2026 11:05:39 +0000 Subject: [PATCH 3/5] refactor(eval): verify commands against real binary instead of reference injection Replace the command reference injection approach with empirical verification: commands are now run with `-h` against the real CLI binary to check they exist. The judge receives verification results instead of a pre-built allowed-commands list, keeping it independent and honest. Move the eval into the e2e test suite where the pre-built binary is available, eliminating the standalone CI job. The e2e test auto-skips when ANTHROPIC_API_KEY is absent (non-skill PRs, fork PRs). The fork workflow continues using dev mode (bun run src/bin.ts) for verification. --- .github/workflows/ci.yml | 69 ++--------------- .github/workflows/eval-skill-fork.yml | 3 + AGENTS.md | 32 +++++--- script/eval-skill.ts | 24 +----- test/e2e/skill-eval.test.ts | 67 ++++++++++++++++ test/skill-eval/helpers/judge.ts | 48 ++++++------ test/skill-eval/helpers/verify.ts | 105 ++++++++++++++++++++++++++ 7 files changed, 228 insertions(+), 120 deletions(-) create mode 100644 test/e2e/skill-eval.test.ts create mode 100644 test/skill-eval/helpers/verify.ts diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e7d85fe3f..cb998f93d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -142,61 +142,6 @@ jobs: echo "::error::Skill files are out of date. Run 'bun run generate:docs' locally and commit the result." 
exit 1 - eval-skill: - name: Eval SKILL.md - needs: [changes] - if: needs.changes.outputs.skill == 'true' - runs-on: ubuntu-latest - steps: - # For fork PRs: check if eval has already passed via commit status - - name: Detect fork - id: detect-fork - run: | - if [[ "${{ github.event_name }}" == "pull_request" && "${{ github.event.pull_request.head.repo.full_name }}" != "${{ github.repository }}" ]]; then - echo "is_fork=true" >> "$GITHUB_OUTPUT" - fi - - name: Check fork eval status - if: steps.detect-fork.outputs.is_fork == 'true' - env: - GH_TOKEN: ${{ github.token }} - run: | - SHA="${{ github.event.pull_request.head.sha }}" - STATUS=$(gh api "repos/${{ github.repository }}/commits/$SHA/statuses" \ - --jq '[.[] | select(.context == "eval-skill/fork")] | first | .state // "none"') - if [[ "$STATUS" != "success" ]]; then - echo "::error::Fork PR modifies skill files but eval has not passed for commit $SHA." - echo "::error::A maintainer must review the code and add the 'eval-skill' label." 
- exit 1 - fi - echo "Fork eval passed for $SHA" - # For internal PRs: run the eval directly - - uses: actions/checkout@v6 - if: steps.detect-fork.outputs.is_fork != 'true' - - uses: oven-sh/setup-bun@v2 - if: steps.detect-fork.outputs.is_fork != 'true' - - uses: actions/cache@v5 - if: steps.detect-fork.outputs.is_fork != 'true' - id: cache - with: - path: node_modules - key: node-modules-${{ hashFiles('bun.lock', 'patches/**') }} - - if: steps.detect-fork.outputs.is_fork != 'true' && steps.cache.outputs.cache-hit != 'true' - run: bun install --frozen-lockfile - - name: Generate docs and skill files - if: steps.detect-fork.outputs.is_fork != 'true' - run: bun run generate:schema && bun run generate:docs - - name: Eval SKILL.md - if: steps.detect-fork.outputs.is_fork != 'true' - run: bun run eval:skill - env: - ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} - - name: Upload eval results - if: always() && steps.detect-fork.outputs.is_fork != 'true' - uses: actions/upload-artifact@v7 - with: - name: skill-eval-results - path: test/skill-eval/results.json - lint: name: Lint & Typecheck needs: [changes] @@ -597,7 +542,7 @@ jobs: test-e2e: name: E2E Tests - needs: [build-binary] + needs: [build-binary, changes] runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 @@ -621,6 +566,9 @@ jobs: - name: E2E Tests env: SENTRY_CLI_BINARY: ${{ github.workspace }}/dist-bin/sentry-linux-x64 + # Pass API key only when skill files changed — the skill-eval e2e test + # auto-skips when the key is absent, so non-skill PRs aren't affected. 
+ ANTHROPIC_API_KEY: ${{ needs.changes.outputs.skill == 'true' && secrets.ANTHROPIC_API_KEY || '' }} run: bun run test:e2e build-npm: @@ -726,7 +674,7 @@ jobs: ci-status: name: CI Status if: always() - needs: [changes, check-generated, eval-skill, build-binary, build-npm, build-docs, test-e2e, generate-patches, publish-nightly] + needs: [changes, check-generated, build-binary, build-npm, build-docs, test-e2e, generate-patches, publish-nightly] runs-on: ubuntu-latest permissions: {} steps: @@ -734,7 +682,7 @@ jobs: run: | # Check for explicit failures or cancellations in all jobs # generate-patches and publish-nightly are skipped on PRs — that's expected - results="${{ needs.check-generated.result }} ${{ needs.eval-skill.result }} ${{ needs.build-binary.result }} ${{ needs.build-npm.result }} ${{ needs.build-docs.result }} ${{ needs.test-e2e.result }} ${{ needs.generate-patches.result }} ${{ needs.publish-nightly.result }}" + results="${{ needs.check-generated.result }} ${{ needs.build-binary.result }} ${{ needs.build-npm.result }} ${{ needs.build-docs.result }} ${{ needs.test-e2e.result }} ${{ needs.generate-patches.result }} ${{ needs.publish-nightly.result }}" for result in $results; do if [[ "$result" == "failure" || "$result" == "cancelled" ]]; then echo "::error::CI failed" @@ -752,9 +700,4 @@ jobs: echo "::error::CI failed - upstream job failed causing check-generated to be skipped" exit 1 fi - if [[ "${{ needs.changes.outputs.skill }}" == "true" && "${{ needs.eval-skill.result }}" == "skipped" ]]; then - echo "::error::CI failed - upstream job failed causing eval-skill to be skipped" - exit 1 - fi - echo "CI passed" diff --git a/.github/workflows/eval-skill-fork.yml b/.github/workflows/eval-skill-fork.yml index 5380bf24b..f8ff6a120 100644 --- a/.github/workflows/eval-skill-fork.yml +++ b/.github/workflows/eval-skill-fork.yml @@ -46,6 +46,9 @@ jobs: - if: steps.cache.outputs.cache-hit != 'true' run: bun install --frozen-lockfile + - name: Generate docs and 
skill files + run: bun run generate:schema && bun run generate:docs + - name: Eval SKILL.md id: eval run: bun run eval:skill diff --git a/AGENTS.md b/AGENTS.md index 4f40f7f87..d9006e9b1 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1021,11 +1021,17 @@ mock.module("./some-module", () => ({ * **Sentry trace-logs API is org-scoped, not project-scoped**: The Sentry trace-logs endpoint (\`/organizations/{org}/trace-logs/\`) is org-scoped, so \`trace logs\` uses \`resolveOrg()\` not \`resolveOrgAndProject()\`. The endpoint is PRIVATE in Sentry source, excluded from the public OpenAPI schema — \`@sentry/api\` has no generated types. The hand-written \`TraceLogSchema\` in \`src/types/sentry.ts\` is required until Sentry makes it public. -* **SKILL.md is fully generated — edit fragment files for custom content, not output**: The skill files under \`plugins/sentry-cli/skills/sentry-cli/\` (SKILL.md + references/\*.md) are fully generated by \`bun run generate:skill\` (script/generate-skill.ts). They are rebuilt automatically by \`bun run generate:docs\` which runs as part of \`dev\`, \`build\`, \`typecheck\`, and \`test\` scripts. To change skill content, edit the \*\*sources\*\*: (1) \`docs/src/content/docs/agent-guidance.md\` — embedded into SKILL.md's Agent Guidance section with heading levels bumped. (2) \`src/commands/\*/\` flag \`brief\` strings — generate the reference file flag descriptions. (3) \`docs/src/fragments/commands/\*.md\` — hand-written examples and guides appended to generated command docs. Command docs (\`docs/src/content/docs/commands/\*.md\`) are also gitignored and rebuilt from fragments + CLI metadata by \`generate:command-docs\`. \`bun run check:fragments\` validates fragment ↔ route consistency. +* **SKILL.md is fully generated — edit source files, not output**: The skill files under \`plugins/sentry-cli/skills/sentry-cli/\` (SKILL.md + references/\*.md) are fully generated by \`bun run generate:skill\` (script/generate-skill.ts). 
CI runs this after every push via a \`github-actions\[bot]\` commit, overwriting any manual edits. To change skill content, edit the \*\*sources\*\*: (1) \`docs/src/content/docs/agent-guidance.md\` — embedded into SKILL.md's Agent Guidance section with heading levels bumped. (2) \`src/commands/\*/\` flag \`brief\` strings — generate the reference file flag descriptions. (3) \`docs/src/content/docs/commands/\*.md\` — examples extracted per command via marked AST parsing. After editing sources, run \`bun run generate:skill\` locally and commit both source and generated files. CI's \`bun run check:skill\` fails if generated files are stale. * **Stricli route errors are uninterceptable — only post-run detection works**: Stricli route errors, exit codes, and OutputError — error propagation gaps: (1) Route failures are uninterceptable — Stricli writes to stderr and returns \`ExitCode.UnknownCommand\` internally. Only post-\`run()\` \`process.exitCode\` check works. \`exceptionWhileRunningCommand\` only fires for errors in command \`func()\`. (2) \`ExitCode.UnknownCommand\` is \`-5\`. Bun reads \`251\` (unsigned byte), Node reads \`-5\` — compare both. (3) \`OutputError\` in \`handleOutputError\` calls \`process.exit()\` immediately, bypassing telemetry and \`exceptionWhileRunningCommand\`. Top-level typos via \`defaultCommand:help\` → \`OutputError\` → \`process.exit(1)\` skip all error reporting. + +* **Three Sentry APIs for span custom attributes with different capabilities**: Three Sentry endpoints serve span data with different custom attribute support: (1) \`/trace/{traceId}/\` — hierarchical tree; supports \`additional\_attributes\` query param to request named attributes (must enumerate). Returns \`measurements\` (web vitals, always populated with zeros for non-browser spans). (2) \`/projects/{org}/{project}/trace-items/{itemId}/?trace\_id={id}\&item\_type=spans\` — single span full detail; returns ALL attributes automatically as \`{name, type, value}\[]\`. 
No enumeration needed. Frontend span detail sidebar uses this. (3) \`/events/?dataset=spans\&field=X\` — list/search; requires explicit \`field\` params. The CLI's \`span view\` calls endpoint 2 via \`getSpanDetails()\` with p-limit(5) concurrency. \`trace view\` passes \`--fields\` to endpoint 1 as \`additional\_attributes\`. + + +* **Trace view measurements filtering strips zero-valued web vitals from JSON**: The \`/trace/{traceId}/\` endpoint returns hardcoded web vitals measurements (LCP, FCP, CLS, INP, TTFB + score ratios) on ALL spans, even non-browser ones where they're all zero. \`filterSpanMeasurements()\` in \`trace/view.ts\` recursively strips zero-valued entries from the JSON output. Non-zero measurements on root pageload spans are preserved. Uses destructuring + spread (not \`delete\`) to satisfy Biome's \`noDelete\` rule. + * **withAuthGuard returns discriminated Result type, not fallback+onError**: \`withAuthGuard\(fn)\` in \`src/lib/errors.ts\` returns a discriminated Result: \`{ ok: true, value: T } | { ok: false, error: unknown }\`. AuthErrors always re-throw (triggers bin.ts auto-login). All other errors are captured. Callers inspect \`result.ok\` to degrade gracefully. Used across 12+ files. @@ -1043,13 +1049,26 @@ mock.module("./some-module", () => ({ * **CLI telemetry command tags use sentry. prefix with dots not bare names**: The \`buildCommand\` wrapper sets the \`command\` telemetry tag using the full Stricli command prefix joined with dots: \`sentry.issue.explain\`, \`sentry.issue.list\`, \`sentry.api\`, etc. — NOT bare names like \`issue.explain\`. When querying Sentry Discover or building dashboard widgets, always use the \`sentry.\` prefix. Verify actual tag values with a Discover query (\`field:command, count()\`, grouped by \`command\`) before assuming the format. 
-* **Dashboard tracemetrics dataset uses comma-separated aggregate format**: SDK v10+ custom metrics (`Sentry.metrics.distribution()`, `.gauge()`, `.count()`) emit `trace_metric` envelope items. Dashboard widgets for these MUST use `--dataset tracemetrics` with aggregate format `aggregation(value,metric_name,metric_type,unit)` — e.g., `p50(value,completion.duration_ms,distribution,none)`. The `unit` parameter must match the SDK emission exactly: `none` if no unit specified, `byte` for memory metrics, `second` for uptime. `tracemetrics` only supports `line`, `area`, `bar`, `big_number`, `categorical_bar` display types — no `table` or `stacked_area`. Widgets with `--group-by` always require `--limit`. Sort expressions must reference aggregates present in `--query`. + +* **Dashboard tracemetrics dataset uses comma-separated aggregate format**: SDK v10+ custom metrics (\`Sentry.metrics.distribution()\`, \`.gauge()\`, \`.count()\`) emit \`trace\_metric\` envelope items. Dashboard widgets for these MUST use \`--dataset tracemetrics\` with aggregate format \`aggregation(value,metric\_name,metric\_type,unit)\` — e.g., \`p50(value,completion.duration\_ms,distribution,none)\`. The \`unit\` parameter must match the SDK emission exactly: \`none\` if no unit specified, \`byte\` for memory metrics, \`second\` for uptime. \`tracemetrics\` only supports \`line\`, \`area\`, \`bar\`, \`big\_number\`, \`categorical\_bar\` display types — no \`table\` or \`stacked\_area\`. Widgets with \`--group-by\` always require \`--limit\`. Sort expressions must reference aggregates present in \`--query\`. + + +* **Skill eval judge needs explicit CLI context about auto-detect and flags**: The skill eval judge (Haiku) reads the compact Command Reference literally. Without explicit context, it rejects valid plans because: (1) it treats \`\<org>\` positional args as mandatory when they're optional via auto-detection, and (2) it rejects standard flags (\`--json\`, \`--query\`, \`--limit\`, \`--fields\`, \`--period\`) not shown in the compact reference. Fix: the judge prompt in \`test/skill-eval/helpers/judge.ts\` must include notes that positional args like \`\<org>\` are optional (CLI auto-detects from DSN/config), and that commands support additional flags documented in separate reference files.
Also guard against empty \`commandReference\` extraction in \`script/eval-skill.ts\` — log a warning instead of silently producing an adversarial prompt where no commands can satisfy the judge. + + +* **spansIndexed is not a valid Sentry dataset — use spans**: The Sentry Events/Explore API accepts 5 dataset values: \`spans\`, \`transactions\`, \`logs\`, \`errors\`, \`discover\`. The name \`spansIndexed\` is invalid and returns a generic HTTP 500 "Internal error" with no helpful validation message. This trips up AI agents and users. Valid datasets are documented in \`src/lib/api/datasets.ts\` (\`EVENTS\_API\_DATASETS\` constant) and in \`docs/commands/api.md\`. * **Use toMatchObject not toEqual when testing resolution results with optional fields**: When \`resolveProjectBySlug()\` or \`resolveOrgProjectTarget()\` adds optional fields (like \`projectData\`) to the return type, tests using \`expect(result).toEqual({ org, project })\` fail because \`toEqual\` requires exact match. Use \`toMatchObject({ org, project })\` instead — it checks the specified subset without failing on extra properties. This affects tests across \`event/view\`, \`log/view\`, \`trace/view\`, and \`trace/list\` test files. ### Pattern + +* **--fields dual role: output filtering + API field selection for span list**: In \`span list\`, \`--fields\` serves dual purpose: filters JSON output AND requests extra API fields. \`extractExtraApiFields()\` in \`span/list.ts\` checks each field name against \`OUTPUT\_TO\_API\_FIELD\` mapping (e.g., \`span\_id\`→\`id\`, \`op\`→\`span.op\`). Unknown names are treated as custom attributes and added to the \`field\` param in the events API request. \`FIELD\_GROUP\_ALIASES\` supports shorthand expansion (e.g., \`gen\_ai\` → 4 gen\_ai.\* fields). Extra fields survive Zod via \`SpanListItemSchema.passthrough()\` and are forwarded by \`spanListItemToFlatSpan(item, extraFieldNames)\` onto \`FlatSpan\`'s index signature. 
\`formatSpanTable()\` dynamically adds columns for extra attributes. + + +* **--since is an alias for --period via shared PERIOD\_ALIASES**: \`PERIOD\_ALIASES\` in \`src/lib/list-command.ts\` maps both \`t\` and \`since\` to \`period\`. All commands using \`LIST\_PERIOD\_FLAG\` get \`--since\` as an alias for \`--period\` automatically via spread \`...PERIOD\_ALIASES\`. This was added because AI agents and humans naturally try \`--since 1h\` instead of \`--period 1h\`. + * **Branch naming and commit message conventions for Sentry CLI**: Branch naming: \`feat/\\` or \`fix/\-\\` (e.g., \`feat/ghcr-nightly-distribution\`, \`fix/268-limit-auto-pagination\`). Commit message format: \`type(scope): description (#issue)\` (e.g., \`fix(issue-list): auto-paginate --limit beyond 100 (#268)\`, \`feat(nightly): distribute via GHCR instead of GitHub Releases\`). Types seen: fix, refactor, meta, release, feat. PRs are created as drafts via \`gh pr create --draft\`. Implementation plans are attached to commits via \`git notes add\` rather than in PR body or commit message. @@ -1062,12 +1081,6 @@ mock.module("./some-module", () => ({ * **Help-as-positional recovery uses error-path, not pre-execution interception**: Three layers of help-as-positional recovery exist: (1) \*\*Leaf commands\*\* (\`sentry issue list help\`): \`maybeRecoverWithHelp\` in \`buildCommand\` wrapper catches \`CliError\` (excluding \`OutputError\`) in the error handler. If any positional arg was \`"help"\`, shows help via \`introspectCommand()\` using \`commandPrefix\` stored on \`SentryContext\`. Only \`-h\` is NOT checked — Stricli intercepts it natively during route scanning. (2) \*\*Route groups\*\* (\`sentry dashboard help\`): Post-run check in \`bin.ts\` detects \`ExitCode.UnknownCommand\` + last arg \`"help"\`, rewrites argv to \`\["help", ...rest]\` and re-runs through the custom help command. (3) Both require \`commandPrefix\` on \`SentryContext\` (set in \`forCommand\`). 
Dynamic-imports \`help.js\` to avoid circular deps. - -* **Sentry SDK tree-shaking patches must be regenerated via bun patch workflow**: The CLI uses \`patchedDependencies\` in \`package.json\` to tree-shake unused exports from \`@sentry/core\` and \`@sentry/node-core\` (AI integrations, feature flags, profiler, etc.). When bumping SDK versions: (1) remove old patches and \`patchedDependencies\` entries, (2) \`rm -rf ~/.bun/install/cache/@sentry\` to clear bun's cache (edits persist in cache otherwise), (3) \`bun install\` fresh, (4) \`bun patch @sentry/core\` then edit files and \`bun patch --commit\`, repeat for node-core. Key preserved exports: \`\_INTERNAL\_safeUnref\`, \`\_INTERNAL\_safeDateNow\` (core), \`nodeRuntimeMetricsIntegration\` (node-core). Manually generating patch files with \`git diff\` may fail — bun expects specific hash formats. Always use \`bun patch --commit\` to generate patches. - - -* **Shared pagination infrastructure: buildPaginationContextKey and parseCursorFlag**: Schema v12 replaced \`pagination\_cursors.cursor TEXT\` with \`cursor\_stack TEXT\` (JSON array) + \`page\_index INTEGER\`. Stack-based API in \`src/lib/db/pagination.ts\`: \`resolveCursor(flag, key, contextKey)\` maps keywords (next/prev/previous/first/last) to \`{cursor, direction}\`. \`advancePaginationState(key, contextKey, direction, nextCursor)\` pushes/pops the stack — back-then-forward truncates stale entries. \`hasPreviousPage(key, contextKey)\` checks \`page\_index > 0\`. \`clearPaginationState(key)\` removes state. \`parseCursorFlag\` in \`list-command.ts\` accepts next/prev/previous/first/last keywords. \`paginationHint()\` in \`org-list.ts\` builds bidirectional hints (\`-c prev | -c next\`). JSON envelope includes \`hasPrev\` boolean. All 7 list commands (trace, span, issue, project, team, repo, dashboard) use this stack API. \`resolveCursor()\` must be called inside \`org-all\` override closures. 
- * **Pagination contextKey must include all query-varying parameters with escaping**: Pagination \`contextKey\` must encode every query-varying parameter (sort, query, period) with \`escapeContextKeyValue()\` (replaces \`|\` with \`%7C\`). Always provide a fallback before escaping since \`flags.period\` may be \`undefined\` in tests despite having a default: \`flags.period ? escapeContextKeyValue(flags.period) : "90d"\`. @@ -1085,7 +1098,4 @@ mock.module("./some-module", () => ({ * **validateWidgetEnums skipDeprecatedCheck for edit-path inherited datasets**: When editing a widget, \`effectiveDataset = flags.dataset ?? existing.widgetType\` may inherit a deprecated type (e.g., \`discover\`). The \`validateWidgetEnums\` deprecation check must be skipped for inherited values — only fire when the user explicitly passes \`--dataset\`. Solution: \`validateWidgetEnums(effectiveDisplay, effectiveDataset, { skipDeprecatedCheck: true })\` in \`edit.ts\`. The cross-validation between display type and dataset still runs on effective values, catching incompatible combos. The deprecation rejection helper \`rejectInvalidDataset()\` is extracted to keep \`validateWidgetEnums\` under Biome's complexity limit of 15. - - -* **set-commits default mode makes speculative --auto API call by design**: When \`release set-commits\` is called without \`--auto\` or \`--local\`, it tries auto-discovery first and falls back to local git on 400 error. This matches the reference sentry-cli behavior (parity-correct). A per-org negative cache in the \`metadata\` table (\`repos_configured.\` = \`"false"\`, 1-hour TTL) skips the speculative auto call on subsequent runs when no repo integration is configured. The cache clears on successful auto-discovery. 
diff --git a/script/eval-skill.ts b/script/eval-skill.ts index 218ae6c42..3679a808e 100644 --- a/script/eval-skill.ts +++ b/script/eval-skill.ts @@ -4,9 +4,10 @@ * * Sends test prompts to agent models (Opus 4.6 + Sonnet 4.6) with SKILL.md * as context, then grades the planned commands on efficiency criteria. + * Commands are verified against the real CLI binary (via `-h`) to ground + * the LLM judge with empirical results. * * Requires ANTHROPIC_API_KEY env var for Anthropic API access. - * In CI, the key is stored in the "skill-eval" environment (protected). * * Usage: * bun run eval:skill @@ -17,6 +18,7 @@ * EVAL_AGENT_MODELS - Comma-separated model IDs (default: sonnet-4-6, opus-4-6) * EVAL_JUDGE_MODEL - Judge model ID (default: haiku-4-5) * EVAL_THRESHOLD - Minimum pass rate 0-1 (default: 0.75) + * SENTRY_CLI_BINARY - Path to pre-built binary (falls back to bun run src/bin.ts) */ import cases from "../test/skill-eval/cases.json"; @@ -46,18 +48,6 @@ const VERSION_RE = /^version:\s*(.+)$/m; */ const DEFAULT_THRESHOLD = 0.75; -/** Extract the "## Command Reference" section from SKILL.md for the judge. */ -function extractCommandReference(skillContent: string): string { - const start = skillContent.indexOf("## Command Reference"); - if (start === -1) { - return ""; - } - const end = skillContent.indexOf("\n## ", start + 1); - return end === -1 - ? 
skillContent.slice(start) - : skillContent.slice(start, end); -} - /** Run all eval cases against a single model */ async function evalModel( client: Awaited>, @@ -65,12 +55,6 @@ async function evalModel( skillContent: string, testCases: TestCase[] ): Promise { - const commandReference = extractCommandReference(skillContent); - if (!commandReference) { - console.error( - 'Warning: "## Command Reference" section not found in SKILL.md — judge will lack command context' - ); - } console.log(`\nEvaluating: ${model}`); console.log("─".repeat(40)); @@ -85,7 +69,7 @@ async function evalModel( skillContent, testCase.prompt ); - const result = await judgePlan(client, testCase, plan, commandReference); + const result = await judgePlan(client, testCase, plan); results.push(result); const icon = result.passed ? "✓" : "✗"; diff --git a/test/e2e/skill-eval.test.ts b/test/e2e/skill-eval.test.ts new file mode 100644 index 000000000..de3e719f1 --- /dev/null +++ b/test/e2e/skill-eval.test.ts @@ -0,0 +1,67 @@ +/** + * SKILL.md Effectiveness Evaluation (E2E) + * + * Tests whether SKILL.md effectively guides LLMs to plan correct CLI commands. + * Uses the real CLI binary (via SENTRY_CLI_BINARY or dev mode) to verify + * that planned commands actually exist. + * + * Skips automatically when ANTHROPIC_API_KEY is not set. + * In CI, the key is only passed when skill-related files change. 
+ */ + +import { describe, expect, test } from "bun:test"; +import cases from "../skill-eval/cases.json"; +import { judgePlan } from "../skill-eval/helpers/judge.js"; +import { createClient } from "../skill-eval/helpers/llm-client.js"; +import { generatePlan } from "../skill-eval/helpers/planner.js"; +import type { CaseResult, TestCase } from "../skill-eval/helpers/types.js"; + +const SKILL_PATH = "plugins/sentry-cli/skills/sentry-cli/SKILL.md"; +const DEFAULT_THRESHOLD = 0.75; + +const apiKey = process.env.ANTHROPIC_API_KEY; + +describe.skipIf(!apiKey)("skill eval", () => { + const testCases = cases as unknown as TestCase[]; + const threshold = process.env.EVAL_THRESHOLD + ? Number.parseFloat(process.env.EVAL_THRESHOLD) + : DEFAULT_THRESHOLD; + + /** + * Run the full eval for a single model and assert it meets the threshold. + * Each model gets its own test so failures are attributed clearly. + */ + async function runEvalForModel(model: string): Promise { + const client = await createClient(apiKey as string); + const skillContent = await Bun.file(SKILL_PATH).text(); + + const results: CaseResult[] = []; + for (const testCase of testCases) { + const plan = await generatePlan( + client, + model, + skillContent, + testCase.prompt + ); + const result = await judgePlan(client, testCase, plan); + results.push(result); + } + + const passed = results.filter((r) => r.passed).length; + const score = passed / testCases.length; + // biome-ignore lint/suspicious/noMisplacedAssertion: called from test() via helper + expect(score).toBeGreaterThanOrEqual(threshold); + } + + test( + "claude-sonnet-4-6 meets threshold", + () => runEvalForModel("claude-sonnet-4-6"), + { timeout: 120_000 } + ); + + test( + "claude-opus-4-6 meets threshold", + () => runEvalForModel("claude-opus-4-6"), + { timeout: 120_000 } + ); +}); diff --git a/test/skill-eval/helpers/judge.ts b/test/skill-eval/helpers/judge.ts index 39b832d38..a5557c984 100644 --- a/test/skill-eval/helpers/judge.ts +++ 
b/test/skill-eval/helpers/judge.ts @@ -1,9 +1,10 @@ /** * Phase 2: Grade the agent's plan against test case criteria. * - * Two passes: + * Three passes: * 1. Deterministic — string matching for anti-patterns, expected-patterns, max-commands - * 2. LLM judge — coherence/quality check using a cheap model (Haiku 4.6) + * 2. Command verification — run each planned command with `-h` against the real binary + * 3. LLM judge — coherence/quality check using a cheap model (Haiku 4.5) */ import type { LLMClient } from "./llm-client.js"; @@ -15,6 +16,7 @@ import type { CriterionResult, TestCase, } from "./types.js"; +import { formatVerifications, verifyPlannedCommands } from "./verify.js"; /** * Evaluate a single deterministic criterion against the plan's commands. @@ -74,14 +76,16 @@ function evaluateDeterministic( /** * Use the LLM judge to evaluate overall plan quality. - * The command reference (extracted from SKILL.md) grounds the judge so it - * doesn't hallucinate that valid `sentry` commands don't exist. + * + * The judge receives empirical verification results from running each + * planned command with `-h` against the real binary — no command reference + * or "allowed list" is provided, keeping the judge independent. */ async function evaluateWithLLMJudge( client: LLMClient, prompt: string, plan: AgentPlan, - commandReference: string + verificationSummary: string ): Promise { const commandList = plan.commands .map((c, i) => `${i + 1}. \`${c.command}\` — ${c.purpose}`) @@ -89,15 +93,10 @@ async function evaluateWithLLMJudge( const judgePrompt = `You are evaluating whether an AI agent's CLI command plan is good. -The agent was given a skill guide for the \`sentry\` CLI (not the legacy \`sentry-cli\`). -Here are the valid commands from that guide: - -${commandReference} +The agent planned commands for the \`sentry\` CLI — a modern command-line tool (distinct from the legacy \`sentry-cli\`). 
-Important context about how this CLI works: -- Positional args like \`\` are OPTIONAL — the CLI auto-detects org and project from the local directory context (DSN detection). Omitting them is correct and expected. -- Each command supports additional flags (e.g., --json, --query, --limit, --period, --fields) documented in separate reference files. The compact listing above only shows command signatures, not all flags. -- --json is a global flag available on all list/view commands. +We verified each planned command against the real CLI binary by running it with \`-h\`: +${verificationSummary} The user asked: "${prompt}" @@ -108,16 +107,11 @@ ${commandList} Notes: ${plan.notes} Evaluate the plan on overall quality. A good plan: -- Uses commands that exist in the reference above +- Uses commands verified as VALID above - Would actually work if executed - Is efficient (no unnecessary commands) - Directly addresses what the user asked for -Do NOT penalize: -- Commands that appear in the reference above — this is a real CLI tool -- Omitting org/project args — auto-detection is a core feature -- Using flags like --json, --query, --limit, --fields, --period — they are real flags - Return ONLY valid JSON: {"pass": true, "reason": "Brief explanation"} @@ -162,16 +156,14 @@ or /** * Evaluate a test case's plan against all its criteria. - * Runs deterministic checks first, then the LLM judge for overall quality. * - * @param commandReference - The Command Reference section from SKILL.md, - * injected into the judge prompt so it can verify commands exist. + * Runs deterministic checks first, then verifies commands against the + * real binary, then passes verification results to the LLM judge. 
*/ export async function judgePlan( client: LLMClient, testCase: TestCase, - plan: AgentPlan | null, - commandReference: string + plan: AgentPlan | null ): Promise { // If the planner failed to produce a plan, fail all criteria if (!plan) { @@ -201,12 +193,16 @@ export async function judgePlan( criteria.push(evaluateDeterministic(name, def, plan)); } - // Run LLM judge for overall quality + // Verify commands against the real binary + const verifications = await verifyPlannedCommands(plan.commands); + const verificationSummary = formatVerifications(verifications); + + // Run LLM judge with verification results const llmVerdict = await evaluateWithLLMJudge( client, testCase.prompt, plan, - commandReference + verificationSummary ); criteria.push(llmVerdict); diff --git a/test/skill-eval/helpers/verify.ts b/test/skill-eval/helpers/verify.ts new file mode 100644 index 000000000..b78e1c0cd --- /dev/null +++ b/test/skill-eval/helpers/verify.ts @@ -0,0 +1,105 @@ +/** + * Verify planned CLI commands against the real binary. + * + * Runs each command's subcommand route with `-h` to check it exists. + * A valid route returns exit code 0 (Stricli intercepts `-h` before + * command execution, so no auth/env setup is needed). + * An unknown route returns exit code 251 (Bun) or -5 (Node). + */ + +import { getCliCommand } from "../../fixture.js"; +import type { PlannedCommand } from "./types.js"; + +/** Result of verifying a single planned command against the real CLI binary */ +export type CommandVerification = { + command: string; + valid: boolean; + /** First few lines of help output (for valid commands) or error message */ + detail: string; +}; + +/** + * Extract the subcommand route from a full CLI command string. + * + * Stops at flags (`-*`), paths (`/`), quoted values, or assignments (`=`). 
+ * E.g., `"sentry issue list --query '...'"` → `["issue", "list"]` + * E.g., `"sentry api /api/0/..."` → `["api"]` + */ +function extractRoute(command: string): string[] { + const tokens = command.trim().split(/\s+/); + const start = tokens[0] === "sentry" ? 1 : 0; + + const route: string[] = []; + for (let i = start; i < tokens.length; i++) { + const token = tokens[i]; + if ( + token.startsWith("-") || + token.includes("/") || + token.includes("=") || + token.startsWith('"') || + token.startsWith("'") + ) { + break; + } + route.push(token); + } + return route; +} + +/** + * Verify each planned command by running its route with `-h`. + * + * Uses `getCliCommand()` which respects `SENTRY_CLI_BINARY` env var + * (pre-built binary in e2e CI) or falls back to `bun run src/bin.ts`. + */ +export async function verifyPlannedCommands( + commands: PlannedCommand[] +): Promise { + const cliCmd = getCliCommand(); + const results: CommandVerification[] = []; + + for (const { command } of commands) { + const route = extractRoute(command); + if (route.length === 0) { + results.push({ + command, + valid: false, + detail: "Could not extract subcommand from command string", + }); + continue; + } + + const proc = Bun.spawn([...cliCmd, ...route, "-h"], { + stdout: "pipe", + stderr: "pipe", + env: { ...process.env, SENTRY_CLI_NO_TELEMETRY: "1" }, + }); + + const [stdout, stderr] = await Promise.all([ + new Response(proc.stdout).text(), + new Response(proc.stderr).text(), + ]); + const exitCode = await proc.exited; + + const valid = exitCode === 0; + const output = (stdout || stderr).trim(); + // Take first 3 lines as a summary + const detail = output.split("\n").slice(0, 3).join("\n").trim(); + + results.push({ command, valid, detail }); + } + + return results; +} + +/** Format verification results as a human-readable string for the judge prompt */ +export function formatVerifications( + verifications: CommandVerification[] +): string { + return verifications + .map((v, i) => { + 
const icon = v.valid ? "VALID" : "INVALID"; + return `${i + 1}. \`${v.command}\`: ${icon} — ${v.detail}`; + }) + .join("\n"); +} From fb4cfabc05aa1dbddf4fd1844e663b4ac6c14988 Mon Sep 17 00:00:00 2001 From: Burak Yigit Kaya Date: Fri, 10 Apr 2026 11:14:08 +0000 Subject: [PATCH 4/5] ci: re-trigger CI pipeline From 5897259f14325a4465ba12e6bba28a5feeb1d068 Mon Sep 17 00:00:00 2001 From: Burak Yigit Kaya Date: Fri, 10 Apr 2026 11:30:13 +0000 Subject: [PATCH 5/5] fix(eval): restore real fetch for Anthropic API calls in e2e test The test preload mocks globalThis.fetch to block external network calls. The skill eval test needs real fetch for Anthropic API, so restore __originalFetch during the describe block. --- test/e2e/skill-eval.test.ts | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/test/e2e/skill-eval.test.ts b/test/e2e/skill-eval.test.ts index de3e719f1..ddb2c31b4 100644 --- a/test/e2e/skill-eval.test.ts +++ b/test/e2e/skill-eval.test.ts @@ -9,7 +9,7 @@ * In CI, the key is only passed when skill-related files change. */ -import { describe, expect, test } from "bun:test"; +import { afterAll, beforeAll, describe, expect, test } from "bun:test"; import cases from "../skill-eval/cases.json"; import { judgePlan } from "../skill-eval/helpers/judge.js"; import { createClient } from "../skill-eval/helpers/llm-client.js"; @@ -21,7 +21,26 @@ const DEFAULT_THRESHOLD = 0.75; const apiKey = process.env.ANTHROPIC_API_KEY; +/** + * The test preload mocks globalThis.fetch to block external network calls. + * This test needs real fetch for Anthropic API calls, so we restore it + * during the describe block and put the mock back when done. 
+ */ +const originalFetch = (globalThis as { __originalFetch?: typeof fetch }) + .__originalFetch; + describe.skipIf(!apiKey)("skill eval", () => { + const savedFetch = globalThis.fetch; + + beforeAll(() => { + if (originalFetch) { + globalThis.fetch = originalFetch; + } + }); + + afterAll(() => { + globalThis.fetch = savedFetch; + }); const testCases = cases as unknown as TestCase[]; const threshold = process.env.EVAL_THRESHOLD ? Number.parseFloat(process.env.EVAL_THRESHOLD)