diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index e7d85fe3f..cb998f93d 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -142,61 +142,6 @@ jobs:
           echo "::error::Skill files are out of date. Run 'bun run generate:docs' locally and commit the result."
           exit 1
 
-  eval-skill:
-    name: Eval SKILL.md
-    needs: [changes]
-    if: needs.changes.outputs.skill == 'true'
-    runs-on: ubuntu-latest
-    steps:
-      # For fork PRs: check if eval has already passed via commit status
-      - name: Detect fork
-        id: detect-fork
-        run: |
-          if [[ "${{ github.event_name }}" == "pull_request" && "${{ github.event.pull_request.head.repo.full_name }}" != "${{ github.repository }}" ]]; then
-            echo "is_fork=true" >> "$GITHUB_OUTPUT"
-          fi
-      - name: Check fork eval status
-        if: steps.detect-fork.outputs.is_fork == 'true'
-        env:
-          GH_TOKEN: ${{ github.token }}
-        run: |
-          SHA="${{ github.event.pull_request.head.sha }}"
-          STATUS=$(gh api "repos/${{ github.repository }}/commits/$SHA/statuses" \
-            --jq '[.[] | select(.context == "eval-skill/fork")] | first | .state // "none"')
-          if [[ "$STATUS" != "success" ]]; then
-            echo "::error::Fork PR modifies skill files but eval has not passed for commit $SHA."
-            echo "::error::A maintainer must review the code and add the 'eval-skill' label."
-            exit 1
-          fi
-          echo "Fork eval passed for $SHA"
-      # For internal PRs: run the eval directly
-      - uses: actions/checkout@v6
-        if: steps.detect-fork.outputs.is_fork != 'true'
-      - uses: oven-sh/setup-bun@v2
-        if: steps.detect-fork.outputs.is_fork != 'true'
-      - uses: actions/cache@v5
-        if: steps.detect-fork.outputs.is_fork != 'true'
-        id: cache
-        with:
-          path: node_modules
-          key: node-modules-${{ hashFiles('bun.lock', 'patches/**') }}
-      - if: steps.detect-fork.outputs.is_fork != 'true' && steps.cache.outputs.cache-hit != 'true'
-        run: bun install --frozen-lockfile
-      - name: Generate docs and skill files
-        if: steps.detect-fork.outputs.is_fork != 'true'
-        run: bun run generate:schema && bun run generate:docs
-      - name: Eval SKILL.md
-        if: steps.detect-fork.outputs.is_fork != 'true'
-        run: bun run eval:skill
-        env:
-          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
-      - name: Upload eval results
-        if: always() && steps.detect-fork.outputs.is_fork != 'true'
-        uses: actions/upload-artifact@v7
-        with:
-          name: skill-eval-results
-          path: test/skill-eval/results.json
-
   lint:
     name: Lint & Typecheck
     needs: [changes]
@@ -597,7 +542,7 @@ jobs:
 
   test-e2e:
     name: E2E Tests
-    needs: [build-binary]
+    needs: [build-binary, changes]
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v6
@@ -621,6 +566,9 @@ jobs:
       - name: E2E Tests
         env:
           SENTRY_CLI_BINARY: ${{ github.workspace }}/dist-bin/sentry-linux-x64
+          # Pass API key only when skill files changed — the skill-eval e2e test
+          # auto-skips when the key is absent, so non-skill PRs aren't affected.
+          ANTHROPIC_API_KEY: ${{ needs.changes.outputs.skill == 'true' && secrets.ANTHROPIC_API_KEY || '' }}
         run: bun run test:e2e
 
   build-npm:
@@ -726,7 +674,7 @@ jobs:
   ci-status:
     name: CI Status
     if: always()
-    needs: [changes, check-generated, eval-skill, build-binary, build-npm, build-docs, test-e2e, generate-patches, publish-nightly]
+    needs: [changes, check-generated, build-binary, build-npm, build-docs, test-e2e, generate-patches, publish-nightly]
     runs-on: ubuntu-latest
     permissions: {}
     steps:
@@ -734,7 +682,7 @@ jobs:
         run: |
           # Check for explicit failures or cancellations in all jobs
           # generate-patches and publish-nightly are skipped on PRs — that's expected
-          results="${{ needs.check-generated.result }} ${{ needs.eval-skill.result }} ${{ needs.build-binary.result }} ${{ needs.build-npm.result }} ${{ needs.build-docs.result }} ${{ needs.test-e2e.result }} ${{ needs.generate-patches.result }} ${{ needs.publish-nightly.result }}"
+          results="${{ needs.check-generated.result }} ${{ needs.build-binary.result }} ${{ needs.build-npm.result }} ${{ needs.build-docs.result }} ${{ needs.test-e2e.result }} ${{ needs.generate-patches.result }} ${{ needs.publish-nightly.result }}"
           for result in $results; do
             if [[ "$result" == "failure" || "$result" == "cancelled" ]]; then
               echo "::error::CI failed"
@@ -752,9 +700,4 @@ jobs:
             echo "::error::CI failed - upstream job failed causing check-generated to be skipped"
             exit 1
           fi
-          if [[ "${{ needs.changes.outputs.skill }}" == "true" && "${{ needs.eval-skill.result }}" == "skipped" ]]; then
-            echo "::error::CI failed - upstream job failed causing eval-skill to be skipped"
-            exit 1
-          fi
-
           echo "CI passed"
diff --git a/.github/workflows/eval-skill-fork.yml b/.github/workflows/eval-skill-fork.yml
index 5380bf24b..f8ff6a120 100644
--- a/.github/workflows/eval-skill-fork.yml
+++ b/.github/workflows/eval-skill-fork.yml
@@ -46,6 +46,9 @@ jobs:
       - if: steps.cache.outputs.cache-hit != 'true'
         run: bun install --frozen-lockfile
 
+      - name: Generate docs and skill files
+        run: bun run generate:schema && bun run generate:docs
+
       - name: Eval SKILL.md
         id: eval
         run: bun run eval:skill
diff --git a/AGENTS.md b/AGENTS.md
index 8d2560227..f53de10b3 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -1007,6 +1007,33 @@ mock.module("./some-module", () => ({
 <!-- lore:019cc2ef-9be5-722d-bc9f-b07a8197eeed -->
 * **All view subcommands should use \<target> \<id> positional pattern**: All \`\* view\` subcommands should follow a consistent \`\<target> \<id>\` positional argument pattern where target is the optional \`org/project\` specifier. During migration, use opportunistic argument swapping with a stderr warning when args are in wrong order. This is an instance of the broader CLI UX auto-correction pattern: safe when input is already invalid, correction is unambiguous, warning goes to stderr. Normalize at command level, keep parsers pure. Model after \`gh\` CLI conventions.
 
+<!-- lore:019ca9c3-989c-7c8d-bcd0-9f308fd2c3d7 -->
+* **Sentry CLI markdown-first formatting pipeline replaces ad-hoc ANSI**: Formatters build CommonMark strings; \`renderMarkdown()\` renders to ANSI for TTY or raw markdown for non-TTY. Key helpers: \`colorTag()\`, \`mdKvTable()\`, \`mdRow()\`, \`mdTableHeader()\` (\`:\` suffix = right-aligned), \`renderTextTable()\`. \`isPlainOutput()\` checks \`SENTRY\_PLAIN\_OUTPUT\` > \`NO\_COLOR\` > \`!isTTY\`. Batch path: \`formatXxxTable()\`. Streaming path: \`StreamingTable\` (TTY) or raw markdown rows (plain). Both share \`buildXxxRowCells()\`.
+
+<!-- lore:019d3e86-a74e-7e27-92e1-8a59e43fd37f -->
+* **Sentry dashboard API rejects discover/transaction-like widget types — use spans**: The Sentry Dashboard API rejects \`widgetType: 'discover'\` and \`widgetType: 'transaction-like'\` as deprecated. Use \`widgetType: 'spans'\` for new widgets. The codebase splits types into \`WIDGET\_TYPES\` (active, for creation) and \`ALL\_WIDGET\_TYPES\` (including deprecated, for parsing server responses). \`DashboardWidgetInputSchema\` must use \`ALL\_WIDGET\_TYPES\` so editing existing widgets with deprecated types passes Zod validation. \`validateWidgetEnums()\` in \`resolve.ts\` rejects deprecated types for new widget creation — but accepts \`skipDeprecatedCheck: true\` for the edit path, where \`effectiveDataset\` may inherit a deprecated type from the existing widget. Cross-validation (display vs dataset compatibility) still runs on effective values. Tests must use \`error-events\` instead of \`discover\`; it shares \`DISCOVER\_AGGREGATE\_FUNCTIONS\` including \`failure\_rate\`.
+
+<!-- lore:019cd2b7-bb98-730e-a0d3-ec25bfa6cf4c -->
+* **Sentry issue stats field: time-series controlled by groupStatsPeriod**: Sentry issue stats and list table layout: \`stats\` key depends on \`groupStatsPeriod\` (\`""\`, \`"14d"\`, \`"24h"\`, \`"auto"\`); \`statsPeriod\` controls window. \*\*Critical\*\*: \`count\` is period-scoped — use \`lifetime.count\` for true total. Issue list uses \`groupStatsPeriod: 'auto'\` for sparklines. Columns: SHORT ID, ISSUE, SEEN, AGE, TREND, EVENTS, USERS, TRIAGE. TREND hidden < 100 cols. \`--compact\` tri-state: explicit overrides; \`undefined\` triggers \`shouldAutoCompact(rowCount)\` — compact if \`3N + 3 > termHeight\`. Height formula \`3N + 3\` (last row has no trailing separator).
+
+<!-- lore:019ca9c3-98a2-7a81-9db7-d36c2e71237c -->
+* **Sentry trace-logs API is org-scoped, not project-scoped**: The Sentry trace-logs endpoint (\`/organizations/{org}/trace-logs/\`) is org-scoped, so \`trace logs\` uses \`resolveOrg()\` not \`resolveOrgAndProject()\`. The endpoint is PRIVATE in Sentry source, excluded from the public OpenAPI schema — \`@sentry/api\` has no generated types. The hand-written \`TraceLogSchema\` in \`src/types/sentry.ts\` is required until Sentry makes it public.
+
+<!-- lore:019d3ea5-fd4a-75b0-8dab-0b1f1cb96b0e -->
+* **SKILL.md is fully generated — edit source files, not output**: The skill files under \`plugins/sentry-cli/skills/sentry-cli/\` (SKILL.md + references/\*.md) are fully generated by \`bun run generate:skill\` (script/generate-skill.ts). CI runs this after every push via a \`github-actions\[bot]\` commit, overwriting any manual edits. To change skill content, edit the \*\*sources\*\*: (1) \`docs/src/content/docs/agent-guidance.md\` — embedded into SKILL.md's Agent Guidance section with heading levels bumped. (2) \`src/commands/\*/\` flag \`brief\` strings — generate the reference file flag descriptions. (3) \`docs/src/content/docs/commands/\*.md\` — examples extracted per command via marked AST parsing. After editing sources, run \`bun run generate:skill\` locally and commit both source and generated files. CI's \`bun run check:skill\` fails if generated files are stale.
+
+<!-- lore:019d275c-7cf0-7e13-bdc8-10cbbdbda933 -->
+* **Stricli route errors are uninterceptable — only post-run detection works**: Stricli route errors, exit codes, and OutputError — error propagation gaps: (1) Route failures are uninterceptable — Stricli writes to stderr and returns \`ExitCode.UnknownCommand\` internally. Only post-\`run()\` \`process.exitCode\` check works. \`exceptionWhileRunningCommand\` only fires for errors in command \`func()\`. (2) \`ExitCode.UnknownCommand\` is \`-5\`. Bun reads \`251\` (unsigned byte), Node reads \`-5\` — compare both. (3) \`OutputError\` in \`handleOutputError\` calls \`process.exit()\` immediately, bypassing telemetry and \`exceptionWhileRunningCommand\`. Top-level typos via \`defaultCommand:help\` → \`OutputError\` → \`process.exit(1)\` skip all error reporting.
+
+<!-- lore:019d4a08-22c3-765b-ba12-d91b29e9d497 -->
+* **Three Sentry APIs for span custom attributes with different capabilities**: Three Sentry endpoints serve span data with different custom attribute support: (1) \`/trace/{traceId}/\` — hierarchical tree; supports \`additional\_attributes\` query param to request named attributes (must enumerate). Returns \`measurements\` (web vitals, always populated with zeros for non-browser spans). (2) \`/projects/{org}/{project}/trace-items/{itemId}/?trace\_id={id}\&item\_type=spans\` — single span full detail; returns ALL attributes automatically as \`{name, type, value}\[]\`. No enumeration needed. Frontend span detail sidebar uses this. (3) \`/events/?dataset=spans\&field=X\` — list/search; requires explicit \`field\` params. The CLI's \`span view\` calls endpoint 2 via \`getSpanDetails()\` with p-limit(5) concurrency. \`trace view\` passes \`--fields\` to endpoint 1 as \`additional\_attributes\`.
+
+<!-- lore:019d4a08-22db-7d73-923f-57f51b3cb203 -->
+* **Trace view measurements filtering strips zero-valued web vitals from JSON**: The \`/trace/{traceId}/\` endpoint returns hardcoded web vitals measurements (LCP, FCP, CLS, INP, TTFB + score ratios) on ALL spans, even non-browser ones where they're all zero. \`filterSpanMeasurements()\` in \`trace/view.ts\` recursively strips zero-valued entries from the JSON output. Non-zero measurements on root pageload spans are preserved. Uses destructuring + spread (not \`delete\`) to satisfy Biome's \`noDelete\` rule.
+
+<!-- lore:019cbf3f-6dc2-727d-8dca-228555e9603f -->
+* **withAuthGuard returns discriminated Result type, not fallback+onError**: \`withAuthGuard\<T>(fn)\` in \`src/lib/errors.ts\` returns a discriminated Result: \`{ ok: true, value: T } | { ok: false, error: unknown }\`. AuthErrors always re-throw (triggers bin.ts auto-login). All other errors are captured. Callers inspect \`result.ok\` to degrade gracefully. Used across 12+ files.
+
 <!-- lore:019d20f7-717f-77a2-a71a-9fdbf1c48dea -->
 * **Sentry-derived terminal color palette tuned for dual-background contrast**: The CLI's chart/dashboard palette uses 10 colors derived from Sentry's categorical chart hues (\`static/app/utils/theme/scraps/tokens/color.tsx\` in getsentry/sentry), each adjusted to mid-luminance to achieve ≥3:1 contrast on both dark (#1e1e1e) and light (#f0f0f0) backgrounds. Key adjustments: orange darkened from #FF9838→#C06F20, green #67C800→#3D8F09, yellow #FFD00E→#9E8B18, purple lightened #5D3EB2→#8B6AC8, indigo #50219C→#7B50D0. Blurple (#7553FF), pink (#F0369A), magenta (#B82D90) used as-is. Teal (#228A83) added to fill a hue gap. ANSI 16-color codes were considered but rejected in favor of hex since the mid-luminance hex values provide guaranteed contrast regardless of terminal theme configuration.
 
@@ -1024,6 +1051,15 @@ mock.module("./some-module", () => ({
 <!-- lore:019d2f4c-cd7a-77a6-8d4b-f9911aa45f19 -->
 * **Git worktree blocks branch checkout of branches used in other worktrees**: \`git checkout main\` fails with "already used by worktree at ..." when another worktree has that branch checked out. In this repo's worktree setup, use \`git checkout origin/main --detach\` or create feature branches from \`origin/main\` directly: \`git checkout -b fix/foo origin/main\`. This is a standard git worktree constraint but catches people off guard in the CLI repo which uses worktrees for parallel development.
 
+<!-- lore:019d738f-bf14-7c12-940b-e027ad0db225 -->
+* **Dashboard tracemetrics dataset uses comma-separated aggregate format**: SDK v10+ custom metrics (, , ) emit  envelope items. Dashboard widgets for these MUST use  with aggregate format  — e.g., . The  parameter must match the SDK emission exactly:  if no unit specified,  for memory metrics,  for uptime.  only supports , , , ,  display types — no  or . Widgets with  always require . Sort expressions must reference aggregates present in .
+
+<!-- lore:019d76f5-94c0-7ba6-83fb-f6888b7fb197 -->
+* **Skill eval judge needs explicit CLI context about auto-detect and flags**: The skill eval judge (Haiku) reads the compact Command Reference literally. Without explicit context, it rejects valid plans because: (1) it treats \`\<org/project>\` positional args as mandatory when they're optional via auto-detection, and (2) it rejects standard flags (\`--json\`, \`--query\`, \`--limit\`, \`--fields\`, \`--period\`) not shown in the compact reference. Fix: the judge prompt in \`test/skill-eval/helpers/judge.ts\` must include notes that positional args like \`\<org/project>\` are optional (CLI auto-detects from DSN/config), and that commands support additional flags documented in separate reference files. Also guard against empty \`commandReference\` extraction in \`script/eval-skill.ts\` — log a warning instead of silently producing an adversarial prompt where no commands can satisfy the judge.
+
+<!-- lore:019d4a08-22d7-78d8-ab00-cdfb3ea05373 -->
+* **spansIndexed is not a valid Sentry dataset — use spans**: The Sentry Events/Explore API accepts 5 dataset values: \`spans\`, \`transactions\`, \`logs\`, \`errors\`, \`discover\`. The name \`spansIndexed\` is invalid and returns a generic HTTP 500 "Internal error" with no helpful validation message. This trips up AI agents and users. Valid datasets are documented in \`src/lib/api/datasets.ts\` (\`EVENTS\_API\_DATASETS\` constant) and in \`docs/commands/api.md\`.
+
 <!-- lore:019d2f4c-cd76-7f74-b75c-b279dd12014c -->
 * **Spinner stdout/stderr collision: log messages inside withProgress appear on spinner line**: The \`withProgress\` spinner in \`src/lib/polling.ts\` writes to stdout using \`\r\x1b\[K\` (no trailing newline). Consola logger writes to stderr. On a shared terminal, any \`log.info()\` called \*\*inside\*\* the \`withProgress\` callback appears on the same line as the spinner text because stderr doesn't know about stdout's carriage-return positioning. Fix pattern: propagate data out of the callback via return value, then call \`log.info()\` \*\*after\*\* \`withProgress\` completes (when the \`finally\` block has already cleared the spinner line). This affected \`downloadBinaryToTemp\` in \`upgrade.ts\` where \`log.info('Applied delta patch...')\` fired inside the spinner callback.
 
@@ -1032,6 +1068,15 @@ mock.module("./some-module", () => ({
 
 ### Pattern
 
+<!-- lore:019d4a08-22d0-71b0-87ae-20bf5d94e018 -->
+* **--fields dual role: output filtering + API field selection for span list**: In \`span list\`, \`--fields\` serves dual purpose: filters JSON output AND requests extra API fields. \`extractExtraApiFields()\` in \`span/list.ts\` checks each field name against \`OUTPUT\_TO\_API\_FIELD\` mapping (e.g., \`span\_id\`→\`id\`, \`op\`→\`span.op\`). Unknown names are treated as custom attributes and added to the \`field\` param in the events API request. \`FIELD\_GROUP\_ALIASES\` supports shorthand expansion (e.g., \`gen\_ai\` → 4 gen\_ai.\* fields). Extra fields survive Zod via \`SpanListItemSchema.passthrough()\` and are forwarded by \`spanListItemToFlatSpan(item, extraFieldNames)\` onto \`FlatSpan\`'s index signature. \`formatSpanTable()\` dynamically adds columns for extra attributes.
+
+<!-- lore:019d4a08-22d4-7738-b1a2-9e09ef55daa1 -->
+* **--since is an alias for --period via shared PERIOD\_ALIASES**: \`PERIOD\_ALIASES\` in \`src/lib/list-command.ts\` maps both \`t\` and \`since\` to \`period\`. All commands using \`LIST\_PERIOD\_FLAG\` get \`--since\` as an alias for \`--period\` automatically via spread \`...PERIOD\_ALIASES\`. This was added because AI agents and humans naturally try \`--since 1h\` instead of \`--period 1h\`.
+
+<!-- lore:dbd63348-2049-42b3-bb99-d6a3d64369c7 -->
+* **Branch naming and commit message conventions for Sentry CLI**: Branch naming: \`feat/\<short-description>\` or \`fix/\<issue-number>-\<short-description>\` (e.g., \`feat/ghcr-nightly-distribution\`, \`fix/268-limit-auto-pagination\`). Commit message format: \`type(scope): description (#issue)\` (e.g., \`fix(issue-list): auto-paginate --limit beyond 100 (#268)\`, \`feat(nightly): distribute via GHCR instead of GitHub Releases\`). Types seen: fix, refactor, meta, release, feat. PRs are created as drafts via \`gh pr create --draft\`. Implementation plans are attached to commits via \`git notes add\` rather than in PR body or commit message.
+
 <!-- lore:019d2105-6caa-7077-a28e-126ecc9d0dbc -->
 * **ClearScreen yield token for in-place terminal refresh in buildCommand wrapper**: Commands needing in-place refresh yield a \`ClearScreen\` token from \`src/lib/formatters/output.ts\`. The \`handleYieldedValue\` function in \`buildCommand\` sets a \`pendingClear\` flag; when the next \`CommandOutput\` is rendered, \`renderCommandOutput\` prepends \`\x1b\[H\x1b\[J\` and writes everything in a \*\*single \`stdout.write()\` call\*\* — no flicker. In JSON/plain modes the clear is silently ignored. Pattern: \`yield ClearScreen()\` then \`yield CommandOutput(data)\`. Critical: never split clear and content into separate writes. Also: never add a redundant clear-screen inside a \`HumanRenderer.render()\` method — the \`ClearScreen\` token is the sole mechanism. The dashboard renderer originally had its own \`\x1b\[2J\x1b\[H\` prepend on re-renders, causing double clears; this was removed.
 
@@ -1044,6 +1089,24 @@ mock.module("./some-module", () => ({
 <!-- lore:019cd379-4c71-7477-9cc6-3c0dfc7fb597 -->
 * **Shared flag constants in list-command.ts for cross-command consistency**: \`src/lib/list-command.ts\` exports shared Stricli flag definitions (\`FIELDS\_FLAG\`, \`FRESH\_FLAG\`, \`FRESH\_ALIASES\`) reused across all commands. When adding a new global-ish flag to multiple commands, define it once here as a const satisfying Stricli's flag shape, then spread into each command's \`flags\` object. The \`--fields\` flag is \`{ kind: 'parsed', parse: String, brief: '...', optional: true }\`. \`parseFieldsList()\` in \`formatters/json.ts\` handles comma-separated parsing with trim/dedup. \`writeJson()\` accepts an optional \`fields\` array and calls \`filterFields()\` before serialization.
 
+<!-- lore:019c90f5-913b-7995-8bac-84289cf5d6d9 -->
+* **Pagination contextKey must include all query-varying parameters with escaping**: Pagination \`contextKey\` must encode every query-varying parameter (sort, query, period) with \`escapeContextKeyValue()\` (replaces \`|\` with \`%7C\`). Always provide a fallback before escaping since \`flags.period\` may be \`undefined\` in tests despite having a default: \`flags.period ? escapeContextKeyValue(flags.period) : "90d"\`.
+
+<!-- lore:019c8a8a-64ee-703c-8c1e-ed32ae8a90a7 -->
+* **PR review workflow: reply, resolve, amend, force-push**: PR review workflow: (1) Read unresolved threads via GraphQL, (2) make code changes, (3) run lint+typecheck+tests, (4) create a SEPARATE commit per review round (not amend) for incremental review, (5) push normally, (6) reply to comments via REST API, (7) resolve threads via GraphQL \`resolveReviewThread\`. Only amend+force-push when user explicitly asks or pre-commit hook modified files.
+
+<!-- lore:019d275c-7cfb-7ecb-a8cf-8ff845eb946f -->
+* **Redact sensitive flags in raw argv before sending to telemetry**: Telemetry context and argv redaction patterns: \`withTelemetry\` calls \`initTelemetryContext()\` BEFORE the callback — user ID, email, instance ID, runtime, and is\_self\_hosted tags are automatically set. For org context, read \`getDefaultOrganization()\` from SQLite (no API call). When sending raw argv, redact sensitive flags: \`SENSITIVE\_FLAGS\` in \`telemetry.ts\` (currently \`token\`). Scan for \`--token\`/\`-token\`, replace following value with \`\[REDACTED]\`. Handle both \`--flag value\` and \`--flag=value\` forms. \`setFlagContext\` handles parsed flags separately.
+
+<!-- lore:019cdd9b-330a-784f-9487-0abf7b80be3c -->
+* **Stricli optional boolean flags produce tri-state (true/false/undefined)**: Stricli boolean flags with \`optional: true\` (no \`default\`) produce \`boolean | undefined\` in the flags type. \`--flag\` → \`true\`, \`--no-flag\` → \`false\`, omitted → \`undefined\`. This enables auto-detect patterns: explicit user choice overrides, \`undefined\` triggers heuristic. Used by \`--compact\` on issue list. The flag type must be \`readonly field?: boolean\` (not \`readonly field: boolean\`). This differs from \`default: false\` which always produces a defined boolean.
+
+<!-- lore:019cc325-d322-7e6e-86cc-93010b71abee -->
+* **Testing Stricli command func() bodies via spyOn mocking**: Stricli/Bun test patterns: (1) Command func tests: \`const func = await cmd.loader()\`, then \`func.call(mockContext, flags, ...args)\`. \`loader()\` return type union causes LSP errors — false positives that pass \`tsc\`. File naming: \`\*.func.test.ts\`. (2) ESM prevents \`vi.spyOn\` on Node built-in exports. Workaround: test subclass that overrides the method calling the built-in. (3) Follow-mode uses \`setTimeout\`-based scheduling; test with \`interceptSigint()\` helper. \`Bun.sleep()\` has no AbortSignal so \`setTimeout\`/\`clearTimeout\` required.
+
+<!-- lore:019d3f09-d39b-7795-b8a0-4a3e58a996f9 -->
+* **validateWidgetEnums skipDeprecatedCheck for edit-path inherited datasets**: When editing a widget, \`effectiveDataset = flags.dataset ?? existing.widgetType\` may inherit a deprecated type (e.g., \`discover\`). The \`validateWidgetEnums\` deprecation check must be skipped for inherited values — only fire when the user explicitly passes \`--dataset\`. Solution: \`validateWidgetEnums(effectiveDisplay, effectiveDataset, { skipDeprecatedCheck: true })\` in \`edit.ts\`. The cross-validation between display type and dataset still runs on effective values, catching incompatible combos. The deprecation rejection helper \`rejectInvalidDataset()\` is extracted to keep \`validateWidgetEnums\` under Biome's complexity limit of 15.
+
 <!-- lore:019cbe44-7687-7288-81a2-662feefc28ea -->
 * **SKILL.md generator must filter hidden Stricli flags**: \`script/generate-skill.ts\` introspects Stricli's route tree to auto-generate SKILL.md. \`FlagDef\` must include \`hidden?: boolean\`; \`extractFlags\` propagates it so \`generateCommandDoc\` filters out hidden flags alongside \`help\`/\`helpAll\`. Hidden flags from \`buildCommand\` (\`--log-level\`, \`--verbose\`) appear globally in \`docs/src/content/docs/commands/index.md\` Global Options section, pulled into SKILL.md via \`loadCommandsOverview\`. When \`cmd.jsonFields\` is present (from Zod schema registration), \`generateFullCommandDoc\` renders a markdown "JSON Fields" table with field name, type, and description columns in reference docs.
 <!-- End lore-managed section -->
diff --git a/script/eval-skill.ts b/script/eval-skill.ts
index 47fcce906..3679a808e 100644
--- a/script/eval-skill.ts
+++ b/script/eval-skill.ts
@@ -4,9 +4,10 @@
  *
  * Sends test prompts to agent models (Opus 4.6 + Sonnet 4.6) with SKILL.md
  * as context, then grades the planned commands on efficiency criteria.
+ * Commands are verified against the real CLI binary (via `-h`) to ground
+ * the LLM judge with empirical results.
  *
  * Requires ANTHROPIC_API_KEY env var for Anthropic API access.
- * In CI, the key is stored in the "skill-eval" environment (protected).
  *
  * Usage:
  *   bun run eval:skill
@@ -17,6 +18,7 @@
  *   EVAL_AGENT_MODELS   - Comma-separated model IDs (default: sonnet-4-6, opus-4-6)
  *   EVAL_JUDGE_MODEL    - Judge model ID (default: haiku-4-5)
  *   EVAL_THRESHOLD      - Minimum pass rate 0-1 (default: 0.75)
+ *   SENTRY_CLI_BINARY   - Path to pre-built binary (falls back to bun run src/bin.ts)
  */
 
 import cases from "../test/skill-eval/cases.json";
diff --git a/test/e2e/skill-eval.test.ts b/test/e2e/skill-eval.test.ts
new file mode 100644
index 000000000..ddb2c31b4
--- /dev/null
+++ b/test/e2e/skill-eval.test.ts
@@ -0,0 +1,86 @@
+/**
+ * SKILL.md Effectiveness Evaluation (E2E)
+ *
+ * Tests whether SKILL.md effectively guides LLMs to plan correct CLI commands.
+ * Uses the real CLI binary (via SENTRY_CLI_BINARY or dev mode) to verify
+ * that planned commands actually exist.
+ *
+ * Skips automatically when ANTHROPIC_API_KEY is not set.
+ * In CI, the key is only passed when skill-related files change.
+ */
+
+import { afterAll, beforeAll, describe, expect, test } from "bun:test";
+import cases from "../skill-eval/cases.json";
+import { judgePlan } from "../skill-eval/helpers/judge.js";
+import { createClient } from "../skill-eval/helpers/llm-client.js";
+import { generatePlan } from "../skill-eval/helpers/planner.js";
+import type { CaseResult, TestCase } from "../skill-eval/helpers/types.js";
+
+const SKILL_PATH = "plugins/sentry-cli/skills/sentry-cli/SKILL.md";
+const DEFAULT_THRESHOLD = 0.75;
+
+const apiKey = process.env.ANTHROPIC_API_KEY;
+
+/**
+ * The test preload mocks globalThis.fetch to block external network calls.
+ * This test needs real fetch for Anthropic API calls, so we restore it
+ * during the describe block and put the mock back when done.
+ */
+const originalFetch = (globalThis as { __originalFetch?: typeof fetch })
+  .__originalFetch;
+
+describe.skipIf(!apiKey)("skill eval", () => {
+  const savedFetch = globalThis.fetch;
+
+  beforeAll(() => {
+    if (originalFetch) {
+      globalThis.fetch = originalFetch;
+    }
+  });
+
+  afterAll(() => {
+    globalThis.fetch = savedFetch;
+  });
+  const testCases = cases as unknown as TestCase[];
+  const threshold = process.env.EVAL_THRESHOLD
+    ? Number.parseFloat(process.env.EVAL_THRESHOLD)
+    : DEFAULT_THRESHOLD;
+
+  /**
+   * Run the full eval for a single model and assert it meets the threshold.
+   * Each model gets its own test so failures are attributed clearly.
+   */
+  async function runEvalForModel(model: string): Promise<void> {
+    const client = await createClient(apiKey as string);
+    const skillContent = await Bun.file(SKILL_PATH).text();
+
+    const results: CaseResult[] = [];
+    for (const testCase of testCases) {
+      const plan = await generatePlan(
+        client,
+        model,
+        skillContent,
+        testCase.prompt
+      );
+      const result = await judgePlan(client, testCase, plan);
+      results.push(result);
+    }
+
+    const passed = results.filter((r) => r.passed).length;
+    const score = passed / testCases.length;
+    // biome-ignore lint/suspicious/noMisplacedAssertion: called from test() via helper
+    expect(score).toBeGreaterThanOrEqual(threshold);
+  }
+
+  test(
+    "claude-sonnet-4-6 meets threshold",
+    () => runEvalForModel("claude-sonnet-4-6"),
+    { timeout: 120_000 }
+  );
+
+  test(
+    "claude-opus-4-6 meets threshold",
+    () => runEvalForModel("claude-opus-4-6"),
+    { timeout: 120_000 }
+  );
+});
diff --git a/test/skill-eval/helpers/judge.ts b/test/skill-eval/helpers/judge.ts
index 3afe99912..a5557c984 100644
--- a/test/skill-eval/helpers/judge.ts
+++ b/test/skill-eval/helpers/judge.ts
@@ -1,9 +1,10 @@
 /**
  * Phase 2: Grade the agent's plan against test case criteria.
  *
- * Two passes:
+ * Three passes:
  * 1. Deterministic — string matching for anti-patterns, expected-patterns, max-commands
- * 2. LLM judge — coherence/quality check using a cheap model (Haiku 4.6)
+ * 2. Command verification — run each planned command with `-h` against the real binary
+ * 3. LLM judge — coherence/quality check using a cheap model (Haiku 4.5)
  */
 
 import type { LLMClient } from "./llm-client.js";
@@ -15,6 +16,7 @@ import type {
   CriterionResult,
   TestCase,
 } from "./types.js";
+import { formatVerifications, verifyPlannedCommands } from "./verify.js";
 
 /**
  * Evaluate a single deterministic criterion against the plan's commands.
@@ -74,12 +76,16 @@ function evaluateDeterministic(
 
 /**
  * Use the LLM judge to evaluate overall plan quality.
- * Returns null if the judge call fails.
+ *
+ * The judge receives empirical verification results from running each
+ * planned command with `-h` against the real binary — no command reference
+ * or "allowed list" is provided, keeping the judge independent.
  */
 async function evaluateWithLLMJudge(
   client: LLMClient,
   prompt: string,
-  plan: AgentPlan
+  plan: AgentPlan,
+  verificationSummary: string
 ): Promise<CriterionResult> {
   const commandList = plan.commands
     .map((c, i) => `${i + 1}. \`${c.command}\` — ${c.purpose}`)
@@ -87,6 +93,11 @@ async function evaluateWithLLMJudge(
 
   const judgePrompt = `You are evaluating whether an AI agent's CLI command plan is good.
 
+The agent planned commands for the \`sentry\` CLI — a modern command-line tool (distinct from the legacy \`sentry-cli\`).
+
+We verified each planned command against the real CLI binary by running it with \`-h\`:
+${verificationSummary}
+
 The user asked: "${prompt}"
 
 The agent's plan:
@@ -96,7 +107,7 @@ ${commandList}
 Notes: ${plan.notes}
 
 Evaluate the plan on overall quality. A good plan:
-- Uses the right Sentry CLI commands for the task
+- Uses commands verified as VALID above
 - Would actually work if executed
 - Is efficient (no unnecessary commands)
 - Directly addresses what the user asked for
@@ -145,7 +156,9 @@ or
 
 /**
  * Evaluate a test case's plan against all its criteria.
- * Runs deterministic checks first, then the LLM judge for overall quality.
+ *
+ * Runs deterministic checks first, then verifies commands against the
+ * real binary, then passes verification results to the LLM judge.
  */
 export async function judgePlan(
   client: LLMClient,
@@ -180,8 +193,17 @@ export async function judgePlan(
     criteria.push(evaluateDeterministic(name, def, plan));
   }
 
-  // Run LLM judge for overall quality
-  const llmVerdict = await evaluateWithLLMJudge(client, testCase.prompt, plan);
+  // Verify commands against the real binary
+  const verifications = await verifyPlannedCommands(plan.commands);
+  const verificationSummary = formatVerifications(verifications);
+
+  // Run LLM judge with verification results
+  const llmVerdict = await evaluateWithLLMJudge(
+    client,
+    testCase.prompt,
+    plan,
+    verificationSummary
+  );
   criteria.push(llmVerdict);
 
   // Compute score: fraction of criteria that passed
diff --git a/test/skill-eval/helpers/verify.ts b/test/skill-eval/helpers/verify.ts
new file mode 100644
index 000000000..b78e1c0cd
--- /dev/null
+++ b/test/skill-eval/helpers/verify.ts
@@ -0,0 +1,105 @@
+/**
+ * Verify planned CLI commands against the real binary.
+ *
+ * Runs each command's subcommand route with `-h` to check it exists.
+ * A valid route returns exit code 0 (Stricli intercepts `-h` before
+ * command execution, so no auth/env setup is needed).
+ * An unknown route returns exit code 251 (Bun) or -5 (Node).
+ */
+
+import { getCliCommand } from "../../fixture.js";
+import type { PlannedCommand } from "./types.js";
+
+/** Result of verifying a single planned command against the real CLI binary */
+export type CommandVerification = {
+  command: string;
+  valid: boolean;
+  /** First few lines of help output (for valid commands) or error message */
+  detail: string;
+};
+
+/**
+ * Extract the subcommand route from a full CLI command string.
+ *
+ * Stops at flags (`-*`), paths (`/`), quoted values, or assignments (`=`).
+ * E.g., `"sentry issue list --query '...'"` → `["issue", "list"]`
+ * E.g., `"sentry api /api/0/..."` → `["api"]`
+ */
+function extractRoute(command: string): string[] {
+  const tokens = command.trim().split(/\s+/);
+  const start = tokens[0] === "sentry" ? 1 : 0;
+
+  const route: string[] = [];
+  for (let i = start; i < tokens.length; i++) {
+    const token = tokens[i];
+    if (
+      token.startsWith("-") ||
+      token.includes("/") ||
+      token.includes("=") ||
+      token.startsWith('"') ||
+      token.startsWith("'")
+    ) {
+      break;
+    }
+    route.push(token);
+  }
+  return route;
+}
+
+/**
+ * Verify each planned command by running its route with `-h`.
+ *
+ * Uses `getCliCommand()` which respects `SENTRY_CLI_BINARY` env var
+ * (pre-built binary in e2e CI) or falls back to `bun run src/bin.ts`.
+ */
+export async function verifyPlannedCommands(
+  commands: PlannedCommand[]
+): Promise<CommandVerification[]> {
+  const cliCmd = getCliCommand();
+  const results: CommandVerification[] = [];
+
+  for (const { command } of commands) {
+    const route = extractRoute(command);
+    if (route.length === 0) {
+      results.push({
+        command,
+        valid: false,
+        detail: "Could not extract subcommand from command string",
+      });
+      continue;
+    }
+
+    const proc = Bun.spawn([...cliCmd, ...route, "-h"], {
+      stdout: "pipe",
+      stderr: "pipe",
+      env: { ...process.env, SENTRY_CLI_NO_TELEMETRY: "1" },
+    });
+
+    const [stdout, stderr] = await Promise.all([
+      new Response(proc.stdout).text(),
+      new Response(proc.stderr).text(),
+    ]);
+    const exitCode = await proc.exited;
+
+    const valid = exitCode === 0;
+    const output = (stdout || stderr).trim();
+    // Take first 3 lines as a summary
+    const detail = output.split("\n").slice(0, 3).join("\n").trim();
+
+    results.push({ command, valid, detail });
+  }
+
+  return results;
+}
+
+/** Format verification results as a human-readable string for the judge prompt */
+export function formatVerifications(
+  verifications: CommandVerification[]
+): string {
+  return verifications
+    .map((v, i) => {
+      const icon = v.valid ? "VALID" : "INVALID";
+      return `${i + 1}. \`${v.command}\`: ${icon} — ${v.detail}`;
+    })
+    .join("\n");
+}