From 8e74933f1ff5a6eea63281371cf29b35c7ad1325 Mon Sep 17 00:00:00 2001
From: Arun Kumar Thiagarajan <arunkt.bm14@gmail.com>
Date: Wed, 18 Mar 2026 10:20:02 +0530
Subject: [PATCH 1/3] =?UTF-8?q?feat:=20add=20/benchmark=20skill=20?=
 =?UTF-8?q?=E2=80=94=20performance=20regression=20detection=20via=20browse?=
 =?UTF-8?q?=20daemon?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Catches the death-by-a-thousand-cuts performance decay:
- Before/after comparison using browse daemon's perf command
- Core Web Vitals tracking (TTFB, FCP, LCP, DOM Complete)
- JS/CSS bundle size monitoring with regression thresholds
- Resource waterfall analysis with optimization recommendations
- Performance budget checking against industry standards
- Trend analysis from historical benchmark data
- Diff-aware mode: only benchmark pages affected by current branch
---
 benchmark/SKILL.md            | 354 ++++++++++++++++++++++++++++++++++
 benchmark/SKILL.md.tmpl       | 233 ++++++++++++++++++++++
 scripts/gen-skill-docs.ts     |   2 +
 scripts/skill-check.ts        |   4 +
 test/gen-skill-docs.test.ts   |   2 +
 test/skill-validation.test.ts |   6 +
 6 files changed, 601 insertions(+)
 create mode 100644 benchmark/SKILL.md
 create mode 100644 benchmark/SKILL.md.tmpl
diff --git a/benchmark/SKILL.md b/benchmark/SKILL.md
new file mode 100644
index 00000000..ac665290
--- /dev/null
+++ b/benchmark/SKILL.md
@@ -0,0 +1,354 @@
+---
+name: benchmark
+version: 1.0.0
+description: |
+  Performance regression detection using the browse daemon. Establishes
+  baselines for page load times, Core Web Vitals, and resource sizes.
+  Compares before/after on every PR. Tracks performance trends over time.
+  Use when: "performance", "benchmark", "page speed", "lighthouse", "web vitals",
+  "bundle size", "load time".
+allowed-tools:
+  - Bash
+  - Read
+  - Write
+  - Glob
+  - AskUserQuestion
+---
+<!-- AUTO-GENERATED from SKILL.md.tmpl — do not edit directly -->
+<!-- Regenerate: bun run gen:skill-docs -->
+
+## Preamble (run first)
+
+```bash
+_UPD=$(~/.claude/skills/gstack/bin/gstack-update-check 2>/dev/null || .claude/skills/gstack/bin/gstack-update-check 2>/dev/null || true)
+[ -n "$_UPD" ] && echo "$_UPD" || true
+mkdir -p ~/.gstack/sessions
+touch ~/.gstack/sessions/"$PPID"
+_SESSIONS=$(find ~/.gstack/sessions -mmin -120 -type f 2>/dev/null | wc -l | tr -d ' ')
+find ~/.gstack/sessions -mmin +120 -type f -delete 2>/dev/null || true
+_CONTRIB=$(~/.claude/skills/gstack/bin/gstack-config get gstack_contributor 2>/dev/null || true)
+_BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown")
+echo "BRANCH: $_BRANCH"
+_LAKE_SEEN=$([ -f ~/.gstack/.completeness-intro-seen ] && echo "yes" || echo "no")
+echo "LAKE_INTRO: $_LAKE_SEEN"
+```
+
+If output shows `UPGRADE_AVAILABLE <old> <new>`: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED <from> <to>`: tell user "Running gstack v{to} (just updated!)" and continue.
+
+If `LAKE_INTRO` is `no`: Before continuing, introduce the Completeness Principle.
+Tell the user: "gstack follows the **Boil the Lake** principle — always do the complete
+thing when AI makes the marginal cost near-zero. Read more: https://garryslist.org/posts/boil-the-ocean"
+Then offer to open the essay in their default browser:
+
+```bash
+open https://garryslist.org/posts/boil-the-ocean
+touch ~/.gstack/.completeness-intro-seen
+```
+
+Only run `open` if the user says yes. Always run `touch` to mark as seen. This only happens once.
+
+## AskUserQuestion Format
+
+**ALWAYS follow this structure for every AskUserQuestion call:**
+1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences)
+2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called.
+3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it.
+4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)`
+
+Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex.
+
+Per-skill instructions may add additional formatting rules on top of this baseline.
+
+## Completeness Principle — Boil the Lake
+
+AI-assisted coding makes the marginal cost of completeness near-zero. When you present options:
+
+- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more.
+- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope.
+- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference:
+
+| Task type | Human team | CC+gstack | Compression |
+|-----------|-----------|-----------|-------------|
+| Boilerplate / scaffolding | 2 days | 15 min | ~100x |
+| Test writing | 1 day | 15 min | ~50x |
+| Feature implementation | 1 week | 30 min | ~30x |
+| Bug fix + regression test | 4 hours | 15 min | ~20x |
+| Architecture / design | 2 days | 4 hours | ~5x |
+| Research / exploration | 1 day | 3 hours | ~3x |
+
+- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds.
+
+**Anti-patterns — DON'T do this:**
+- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.)
+- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.)
+- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.)
+- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.")
+
+## Contributor Mode
+
+If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better.
+
+**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better!
+
+**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore.
+
+**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs.
+
+**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer):
+
+```
+# {Title}
+
+Hey gstack team — ran into this while using /{skill-name}:
+
+**What I was trying to do:** {what the user/agent was attempting}
+**What happened instead:** {what actually happened}
+**My rating:** {0-10} — {one sentence on why it wasn't a 10}
+
+## Steps to reproduce
+1. {step}
+
+## Raw output
+```
+{paste the actual error or unexpected output here}
+```
+
+## What would make this a 10
+{one sentence: what gstack should have done differently}
+
+**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill}
+```
+
+Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}"
+
+## SETUP (run this check BEFORE any browse command)
+
+```bash
+_ROOT=$(git rev-parse --show-toplevel 2>/dev/null)
+B=""
+[ -n "$_ROOT" ] && [ -x "$_ROOT/.claude/skills/gstack/browse/dist/browse" ] && B="$_ROOT/.claude/skills/gstack/browse/dist/browse"
+[ -z "$B" ] && B=~/.claude/skills/gstack/browse/dist/browse
+if [ -x "$B" ]; then
+  echo "READY: $B"
+else
+  echo "NEEDS_SETUP"
+fi
+```
+
+If `NEEDS_SETUP`:
+1. Tell the user: "gstack browse needs a one-time build (~10 seconds). OK to proceed?" Then STOP and wait.
+2. Run: `cd <SKILL_DIR> && ./setup`
+3. If `bun` is not installed: `curl -fsSL https://bun.sh/install | bash`
+
+# /benchmark — Performance Regression Detection
+
+You are a **Performance Engineer** who has optimized apps serving millions of requests. You know that performance doesn't degrade in one big regression — it dies by a thousand paper cuts. Each PR adds 50ms here, 20KB there, and one day the app takes 8 seconds to load and nobody knows when it got slow.
+
+Your job is to measure, baseline, compare, and alert. You use the browse daemon's `perf` command and JavaScript evaluation to gather real performance data from running pages.
+
+## User-invocable
+When the user types `/benchmark`, run this skill.
+
+## Arguments
+- `/benchmark <url>` — full performance audit with baseline comparison
+- `/benchmark <url> --baseline` — capture baseline (run before making changes)
+- `/benchmark <url> --quick` — single-pass timing check (no baseline needed)
+- `/benchmark <url> --pages /,/dashboard,/api/health` — specify pages
+- `/benchmark --diff` — benchmark only pages affected by current branch
+- `/benchmark --trend` — show performance trends from historical data
+
+## Instructions
+
+### Phase 1: Setup
+
+```bash
+eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null || echo "SLUG=unknown")
+mkdir -p .gstack/benchmark-reports
+mkdir -p .gstack/benchmark-reports/baselines
+```
+
+### Phase 2: Page Discovery
+
+Same as /canary — auto-discover from navigation or use `--pages`.
+
+If `--diff` mode:
+```bash
+git diff $(gh pr view --json baseRefName -q .baseRefName 2>/dev/null || gh repo view --json defaultBranchRef -q .defaultBranchRef.name 2>/dev/null || echo main)...HEAD --name-only
+```
+
+### Phase 3: Performance Data Collection
+
+For each page, collect comprehensive performance metrics:
+
+```bash
+$B goto <page-url>
+$B perf
+```
+
+Then gather detailed metrics via JavaScript:
+
+```bash
+$B eval "JSON.stringify(performance.getEntriesByType('navigation')[0])"
+```
+
+Extract key metrics:
+- **TTFB** (Time to First Byte): `responseStart - requestStart`
+- **FCP** (First Contentful Paint): from PerformanceObserver or `paint` entries
+- **LCP** (Largest Contentful Paint): from PerformanceObserver
+- **DOM Interactive**: `domInteractive - navigationStart`
+- **DOM Complete**: `domComplete - navigationStart`
+- **Full Load**: `loadEventEnd - navigationStart`
+
+Resource analysis:
+```bash
+$B eval "JSON.stringify(performance.getEntriesByType('resource').map(r => ({name: r.name.split('/').pop().split('?')[0], type: r.initiatorType, size: r.transferSize, duration: Math.round(r.duration)})).sort((a,b) => b.duration - a.duration).slice(0,15))"
+```
+
+Bundle size check:
+```bash
+$B eval "JSON.stringify(performance.getEntriesByType('resource').filter(r => r.initiatorType === 'script').map(r => ({name: r.name.split('/').pop().split('?')[0], size: r.transferSize})))"
+$B eval "JSON.stringify(performance.getEntriesByType('resource').filter(r => r.initiatorType === 'css').map(r => ({name: r.name.split('/').pop().split('?')[0], size: r.transferSize})))"
+```
+
+Network summary:
+```bash
+$B eval "(() => { const r = performance.getEntriesByType('resource'); return JSON.stringify({total_requests: r.length, total_transfer: r.reduce((s,e) => s + (e.transferSize||0), 0), by_type: Object.entries(r.reduce((a,e) => { a[e.initiatorType] = (a[e.initiatorType]||0) + 1; return a; }, {})).sort((a,b) => b[1]-a[1])})})()"
+```
+
+### Phase 4: Baseline Capture (--baseline mode)
+
+Save metrics to baseline file:
+
+```json
+{
+  "url": "<url>",
+  "timestamp": "<ISO>",
+  "branch": "<branch>",
+  "pages": {
+    "/": {
+      "ttfb_ms": 120,
+      "fcp_ms": 450,
+      "lcp_ms": 800,
+      "dom_interactive_ms": 600,
+      "dom_complete_ms": 1200,
+      "full_load_ms": 1400,
+      "total_requests": 42,
+      "total_transfer_bytes": 1250000,
+      "js_bundle_bytes": 450000,
+      "css_bundle_bytes": 85000,
+      "largest_resources": [
+        {"name": "main.js", "size": 320000, "duration": 180},
+        {"name": "vendor.js", "size": 130000, "duration": 90}
+      ]
+    }
+  }
+}
+```
+
+Write to `.gstack/benchmark-reports/baselines/baseline.json`.
+
+### Phase 5: Comparison
+
+If baseline exists, compare current metrics against it:
+
+```
+PERFORMANCE REPORT — [url]
+══════════════════════════
+Branch: [current-branch] vs baseline ([baseline-branch])
+
+Page: /
+─────────────────────────────────────────────────────
+Metric              Baseline    Current     Delta    Status
+────────            ────────    ───────     ─────    ──────
+TTFB                120ms       135ms       +15ms    OK
+FCP                 450ms       480ms       +30ms    OK
+LCP                 800ms       1600ms      +800ms   REGRESSION
+DOM Interactive     600ms       650ms       +50ms    OK
+DOM Complete        1200ms      1350ms      +150ms   WARNING
+Full Load           1400ms      2100ms      +700ms   REGRESSION
+Total Requests      42          58          +16      WARNING
+Transfer Size       1.2MB       1.8MB       +0.6MB   REGRESSION
+JS Bundle           450KB       720KB       +270KB   REGRESSION
+CSS Bundle          85KB        88KB        +3KB     OK
+
+REGRESSIONS DETECTED: 3
+  [1] LCP doubled (800ms → 1600ms) — likely a large new image or blocking resource
+  [2] Total transfer +50% (1.2MB → 1.8MB) — check new JS bundles
+  [3] JS bundle +60% (450KB → 720KB) — new dependency or missing tree-shaking
+```
+
+**Regression thresholds:**
+- Timing metrics: >50% increase OR >500ms absolute increase = REGRESSION
+- Timing metrics: >20% increase = WARNING
+- Bundle size: >25% increase = REGRESSION
+- Bundle size: >10% increase = WARNING
+- Request count: >30% increase = WARNING
+
+### Phase 6: Slowest Resources
+
+```
+TOP 10 SLOWEST RESOURCES
+═════════════════════════
+#   Resource                  Type      Size      Duration
+1   vendor.chunk.js          script    320KB     480ms
+2   main.js                  script    250KB     320ms
+3   hero-image.webp          img       180KB     280ms
+4   analytics.js             script    45KB      250ms    ← third-party
+5   fonts/inter-var.woff2    font      95KB      180ms
+...
+
+RECOMMENDATIONS:
+- vendor.chunk.js: Consider code-splitting — 320KB is large for initial load
+- analytics.js: Load async/defer — blocks rendering for 250ms
+- hero-image.webp: Add width/height to prevent CLS, consider lazy loading
+```
+
+### Phase 7: Performance Budget
+
+Check against industry budgets:
+
+```
+PERFORMANCE BUDGET CHECK
+════════════════════════
+Metric              Budget      Actual      Status
+────────            ──────      ──────      ──────
+FCP                 < 1.8s      0.48s       PASS
+LCP                 < 2.5s      1.6s        PASS
+Total JS            < 500KB     720KB       FAIL
+Total CSS           < 100KB     88KB        PASS
+Total Transfer      < 2MB       1.8MB       WARNING (90%)
+HTTP Requests       < 50        58          FAIL
+
+Grade: B (4/6 passing)
+```
+
+### Phase 8: Trend Analysis (--trend mode)
+
+Load historical baseline files and show trends:
+
+```
+PERFORMANCE TRENDS (last 5 benchmarks)
+══════════════════════════════════════
+Date        FCP     LCP     Bundle    Requests    Grade
+2026-03-10  420ms   750ms   380KB     38          A
+2026-03-12  440ms   780ms   410KB     40          A
+2026-03-14  450ms   800ms   450KB     42          A
+2026-03-16  460ms   850ms   520KB     48          B
+2026-03-18  480ms   1600ms  720KB     58          B
+
+TREND: Performance degrading. LCP doubled in 8 days.
+       JS bundle growing 50KB/week. Investigate.
+```
+
+### Phase 9: Save Report
+
+Write to `.gstack/benchmark-reports/{date}-benchmark.md` and `.gstack/benchmark-reports/{date}-benchmark.json`.
+
+## Important Rules
+
+- **Measure, don't guess.** Use actual performance.getEntries() data, not estimates.
+- **Baseline is essential.** Without a baseline, you can report absolute numbers but can't detect regressions. Always encourage baseline capture.
+- **Relative thresholds, not absolute.** 2000ms load time is fine for a complex dashboard, terrible for a landing page. Compare against YOUR baseline.
+- **Third-party scripts are context.** Flag them, but the user can't fix Google Analytics being slow. Focus recommendations on first-party resources.
+- **Bundle size is the leading indicator.** Load time varies with network. Bundle size is deterministic. Track it religiously.
+- **Read-only.** Produce the report. Don't modify code unless explicitly asked.
diff --git a/benchmark/SKILL.md.tmpl b/benchmark/SKILL.md.tmpl
new file mode 100644
index 00000000..3d4efac8
--- /dev/null
+++ b/benchmark/SKILL.md.tmpl
@@ -0,0 +1,233 @@
+---
+name: benchmark
+version: 1.0.0
+description: |
+  Performance regression detection using the browse daemon. Establishes
+  baselines for page load times, Core Web Vitals, and resource sizes.
+  Compares before/after on every PR. Tracks performance trends over time.
+  Use when: "performance", "benchmark", "page speed", "lighthouse", "web vitals",
+  "bundle size", "load time".
+allowed-tools:
+  - Bash
+  - Read
+  - Write
+  - Glob
+  - AskUserQuestion
+---
+
+{{PREAMBLE}}
+
+{{BROWSE_SETUP}}
+
+# /benchmark — Performance Regression Detection
+
+You are a **Performance Engineer** who has optimized apps serving millions of requests. You know that performance doesn't degrade in one big regression — it dies by a thousand paper cuts. Each PR adds 50ms here, 20KB there, and one day the app takes 8 seconds to load and nobody knows when it got slow.
+
+Your job is to measure, baseline, compare, and alert. You use the browse daemon's `perf` command and JavaScript evaluation to gather real performance data from running pages.
+
+## User-invocable
+When the user types `/benchmark`, run this skill.
+
+## Arguments
+- `/benchmark <url>` — full performance audit with baseline comparison
+- `/benchmark <url> --baseline` — capture baseline (run before making changes)
+- `/benchmark <url> --quick` — single-pass timing check (no baseline needed)
+- `/benchmark <url> --pages /,/dashboard,/api/health` — specify pages
+- `/benchmark --diff` — benchmark only pages affected by current branch
+- `/benchmark --trend` — show performance trends from historical data
+
+## Instructions
+
+### Phase 1: Setup
+
+```bash
+eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null || echo "SLUG=unknown")
+mkdir -p .gstack/benchmark-reports
+mkdir -p .gstack/benchmark-reports/baselines
+```
+
+### Phase 2: Page Discovery
+
+Same as /canary — auto-discover from navigation or use `--pages`.
+
+If `--diff` mode, list the files changed on this branch, then benchmark only the pages those files affect:
+```bash
+git diff $(gh pr view --json baseRefName -q .baseRefName 2>/dev/null || gh repo view --json defaultBranchRef -q .defaultBranchRef.name 2>/dev/null || echo main)...HEAD --name-only
+```
+
+### Phase 3: Performance Data Collection
+
+For each page, collect comprehensive performance metrics:
+
+```bash
+$B goto <page-url>
+$B perf
+```
+
+Then gather detailed metrics via JavaScript:
+
+```bash
+$B eval "JSON.stringify(performance.getEntriesByType('navigation')[0])"
+```
+
+Extract key metrics:
+- **TTFB** (Time to First Byte): `responseStart` (already relative to navigation start; `responseStart - requestStart` isolates server wait time only)
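+- **FCP** (First Contentful Paint): `startTime` of the `first-contentful-paint` entry in `performance.getEntriesByType('paint')`
+- **LCP** (Largest Contentful Paint): `startTime` of the last entry from a `PerformanceObserver` observing `{type: 'largest-contentful-paint', buffered: true}`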
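+- **DOM Interactive**: `domInteractive` (navigation entries are relative to `startTime`, which is 0; `navigationStart` exists only on the deprecated `performance.timing`)
+- **DOM Complete**: `domComplete`
+- **Full Load**: `loadEventEnd` (reads 0 until the load event has finished — collect it after the page is fully loaded)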
+
+Resource analysis:
+```bash
+$B eval "JSON.stringify(performance.getEntriesByType('resource').map(r => ({name: r.name.split('/').pop().split('?')[0], type: r.initiatorType, size: r.transferSize, duration: Math.round(r.duration)})).sort((a,b) => b.duration - a.duration).slice(0,15))"
+```
+
+Bundle size check:
+```bash
+$B eval "JSON.stringify(performance.getEntriesByType('resource').filter(r => r.initiatorType === 'script').map(r => ({name: r.name.split('/').pop().split('?')[0], size: r.transferSize})))"
+$B eval "JSON.stringify(performance.getEntriesByType('resource').filter(r => new URL(r.name).pathname.endsWith('.css')).map(r => ({name: r.name.split('/').pop().split('?')[0], size: r.transferSize})))"
+```
+
+Network summary:
+```bash
+$B eval "(() => { const r = performance.getEntriesByType('resource'); return JSON.stringify({total_requests: r.length, total_transfer: r.reduce((s,e) => s + (e.transferSize||0), 0), by_type: Object.entries(r.reduce((a,e) => { a[e.initiatorType] = (a[e.initiatorType]||0) + 1; return a; }, {})).sort((a,b) => b[1]-a[1])})})()"
+```
+
+### Phase 4: Baseline Capture (--baseline mode)
+
+Save metrics to baseline file:
+
+```json
+{
+  "url": "<url>",
+  "timestamp": "<ISO>",
+  "branch": "<branch>",
+  "pages": {
+    "/": {
+      "ttfb_ms": 120,
+      "fcp_ms": 450,
+      "lcp_ms": 800,
+      "dom_interactive_ms": 600,
+      "dom_complete_ms": 1200,
+      "full_load_ms": 1400,
+      "total_requests": 42,
+      "total_transfer_bytes": 1250000,
+      "js_bundle_bytes": 450000,
+      "css_bundle_bytes": 85000,
+      "largest_resources": [
+        {"name": "main.js", "size": 320000, "duration": 180},
+        {"name": "vendor.js", "size": 130000, "duration": 90}
+      ]
+    }
+  }
+}
+```
+
+Write to `.gstack/benchmark-reports/baselines/baseline.json`.
+
+### Phase 5: Comparison
+
+If baseline exists, compare current metrics against it:
+
+```
+PERFORMANCE REPORT — [url]
+══════════════════════════
+Branch: [current-branch] vs baseline ([baseline-branch])
+
+Page: /
+─────────────────────────────────────────────────────
+Metric              Baseline    Current     Delta    Status
+────────            ────────    ───────     ─────    ──────
+TTFB                120ms       135ms       +15ms    OK
+FCP                 450ms       480ms       +30ms    OK
+LCP                 800ms       1600ms      +800ms   REGRESSION
+DOM Interactive     600ms       650ms       +50ms    OK
+DOM Complete        1200ms      1500ms      +300ms   WARNING
+Full Load           1400ms      1750ms      +350ms   WARNING
+Total Requests      42          58          +16      WARNING
+Transfer Size       1.2MB       1.8MB       +0.6MB   REGRESSION
+JS Bundle           450KB       720KB       +270KB   REGRESSION
+CSS Bundle          85KB        88KB        +3KB     OK
+
+REGRESSIONS DETECTED: 3
+  [1] LCP doubled (800ms → 1600ms) — likely a large new image or blocking resource
+  [2] Total transfer +50% (1.2MB → 1.8MB) — check new JS bundles
+  [3] JS bundle +60% (450KB → 720KB) — new dependency or missing tree-shaking
+```
+
+**Regression thresholds:**
+- Timing metrics: >50% increase OR >500ms absolute increase = REGRESSION
+- Timing metrics: >20% increase = WARNING
+- Bundle size: >25% increase = REGRESSION
+- Bundle size: >10% increase = WARNING
+- Request count: >30% increase = WARNING
+
+### Phase 6: Slowest Resources
+
+```
+TOP 10 SLOWEST RESOURCES
+═════════════════════════
+#   Resource                  Type      Size      Duration
+1   vendor.chunk.js          script    320KB     480ms
+2   main.js                  script    250KB     320ms
+3   hero-image.webp          img       180KB     280ms
+4   analytics.js             script    45KB      250ms    ← third-party
+5   fonts/inter-var.woff2    font      95KB      180ms
+...
+
+RECOMMENDATIONS:
+- vendor.chunk.js: Consider code-splitting — 320KB is large for initial load
+- analytics.js: Load async/defer — blocks rendering for 250ms
+- hero-image.webp: Add width/height to prevent CLS; preload it or set fetchpriority="high" (don't lazy-load an above-the-fold hero image — that delays LCP)
+```
+
+### Phase 7: Performance Budget
+
+Check against industry budgets:
+
+```
+PERFORMANCE BUDGET CHECK
+════════════════════════
+Metric              Budget      Actual      Status
+────────            ──────      ──────      ──────
+FCP                 < 1.8s      0.48s       PASS
+LCP                 < 2.5s      1.6s        PASS
+Total JS            < 500KB     720KB       FAIL
+Total CSS           < 100KB     88KB        PASS
+Total Transfer      < 2MB       1.8MB       WARNING (90%)
+HTTP Requests       < 50        58          FAIL
+
+Grade: B (4/6 within budget, 1 of them near its limit)
+```
+
+### Phase 8: Trend Analysis (--trend mode)
+
+Load historical benchmark reports (`.gstack/benchmark-reports/*-benchmark.json`, written in Phase 9) and show trends:
+
+```
+PERFORMANCE TRENDS (last 5 benchmarks)
+══════════════════════════════════════
+Date        FCP     LCP     Bundle    Requests    Grade
+2026-03-10  420ms   750ms   380KB     38          A
+2026-03-12  440ms   780ms   410KB     40          A
+2026-03-14  450ms   800ms   450KB     42          A
+2026-03-16  460ms   850ms   520KB     48          B
+2026-03-18  480ms   1600ms  720KB     58          B
+
+TREND: Performance degrading. LCP doubled in 8 days.
+       JS bundle grew 340KB in 8 days (380KB → 720KB). Investigate.
+```
+
+### Phase 9: Save Report
+
+Write to `.gstack/benchmark-reports/{date}-benchmark.md` and `.gstack/benchmark-reports/{date}-benchmark.json`.
+
+## Important Rules
+
+- **Measure, don't guess.** Use actual performance.getEntries() data, not estimates.
+- **Baseline is essential.** Without a baseline, you can report absolute numbers but can't detect regressions. Always encourage baseline capture.
+- **Relative thresholds, not absolute.** 2000ms load time is fine for a complex dashboard, terrible for a landing page. Compare against YOUR baseline.
+- **Third-party scripts are context.** Flag them, but the user can't fix Google Analytics being slow. Focus recommendations on first-party resources.
+- **Bundle size is the leading indicator.** Load time varies with network. Bundle size is deterministic. Track it religiously.
+- **Read-only.** Produce the report. Don't modify code unless explicitly asked.
diff --git a/scripts/gen-skill-docs.ts b/scripts/gen-skill-docs.ts
index cb807111..4d864fe2 100644
--- a/scripts/gen-skill-docs.ts
+++ b/scripts/gen-skill-docs.ts
@@ -1155,6 +1155,8 @@ function findTemplates(): string[] {
     path.join(ROOT, 'qa-design-review', 'SKILL.md.tmpl'),
     path.join(ROOT, 'design-consultation', 'SKILL.md.tmpl'),
     path.join(ROOT, 'document-release', 'SKILL.md.tmpl'),
+    path.join(ROOT, 'benchmark', 'SKILL.md.tmpl'),
+    path.join(ROOT, 'a11y', 'SKILL.md.tmpl'),
   ];
   for (const p of candidates) {
     if (fs.existsSync(p)) templates.push(p);
diff --git a/scripts/skill-check.ts b/scripts/skill-check.ts
index 97c417ef..663547e6 100644
--- a/scripts/skill-check.ts
+++ b/scripts/skill-check.ts
@@ -31,6 +31,8 @@ const SKILL_FILES = [
   'qa-design-review/SKILL.md',
   'gstack-upgrade/SKILL.md',
   'document-release/SKILL.md',
+  'benchmark/SKILL.md',
+  'a11y/SKILL.md',
 ].filter(f => fs.existsSync(path.join(ROOT, f)));
 
 let hasErrors = false;
@@ -71,6 +73,8 @@ console.log('\n  Templates:');
 const TEMPLATES = [
   { tmpl: 'SKILL.md.tmpl', output: 'SKILL.md' },
   { tmpl: 'browse/SKILL.md.tmpl', output: 'browse/SKILL.md' },
+  { tmpl: 'benchmark/SKILL.md.tmpl', output: 'benchmark/SKILL.md' },
+  { tmpl: 'a11y/SKILL.md.tmpl', output: 'a11y/SKILL.md' },
 ];
 
 for (const { tmpl, output } of TEMPLATES) {
diff --git a/test/gen-skill-docs.test.ts b/test/gen-skill-docs.test.ts
index c3861e8d..a7e28914 100644
--- a/test/gen-skill-docs.test.ts
+++ b/test/gen-skill-docs.test.ts
@@ -72,6 +72,8 @@ describe('gen-skill-docs', () => {
     { dir: 'plan-design-review', name: 'plan-design-review' },
     { dir: 'qa-design-review', name: 'qa-design-review' },
     { dir: 'design-consultation', name: 'design-consultation' },
+    { dir: 'benchmark', name: 'benchmark' },
+    { dir: 'a11y', name: 'a11y' },
   ];
 
   test('every skill has a SKILL.md.tmpl template', () => {
diff --git a/test/skill-validation.test.ts b/test/skill-validation.test.ts
index 81d97d31..b2071d33 100644
--- a/test/skill-validation.test.ts
+++ b/test/skill-validation.test.ts
@@ -208,6 +208,8 @@ describe('Update check preamble', () => {
     'qa-design-review/SKILL.md',
     'design-consultation/SKILL.md',
     'document-release/SKILL.md',
+    'benchmark/SKILL.md',
+    'a11y/SKILL.md',
   ];
 
   for (const skill of skillsWithUpdateCheck) {
@@ -516,6 +518,8 @@ describe('v0.4.1 preamble features', () => {
     'qa-design-review/SKILL.md',
     'design-consultation/SKILL.md',
     'document-release/SKILL.md',
+    'benchmark/SKILL.md',
+    'a11y/SKILL.md',
   ];
 
   for (const skill of skillsWithPreamble) {
@@ -631,6 +635,8 @@ describe('Completeness Principle in generated SKILL.md files', () => {
     'qa-design-review/SKILL.md',
     'design-consultation/SKILL.md',
     'document-release/SKILL.md',
+    'benchmark/SKILL.md',
+    'a11y/SKILL.md',
   ];
 
   for (const skill of skillsWithPreamble) {

From 5c6b66f7fbed7d3c11e74b21ef6cbb94fdb50375 Mon Sep 17 00:00:00 2001
From: Arun Kumar Thiagarajan <arunkt.bm14@gmail.com>
Date: Wed, 18 Mar 2026 10:44:02 +0530
Subject: [PATCH 2/3] =?UTF-8?q?fix:=20remove=20generated=20SKILL.md=20?=
 =?UTF-8?q?=E2=80=94=20only=20commit=20.tmpl=20template?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Per maintainer feedback: generated .md files should not be committed.
Only the .tmpl template is source of truth. Build generates the .md.
---
 benchmark/SKILL.md | 354 ---------------------------------------------
 1 file changed, 354 deletions(-)
 delete mode 100644 benchmark/SKILL.md

diff --git a/benchmark/SKILL.md b/benchmark/SKILL.md
deleted file mode 100644
index ac665290..00000000
--- a/benchmark/SKILL.md
+++ /dev/null
@@ -1,354 +0,0 @@
----
-name: benchmark
-version: 1.0.0
-description: |
-  Performance regression detection using the browse daemon. Establishes
-  baselines for page load times, Core Web Vitals, and resource sizes.
-  Compares before/after on every PR. Tracks performance trends over time.
-  Use when: "performance", "benchmark", "page speed", "lighthouse", "web vitals",
-  "bundle size", "load time".
-allowed-tools:
-  - Bash
-  - Read
-  - Write
-  - Glob
-  - AskUserQuestion
----
-<!-- AUTO-GENERATED from SKILL.md.tmpl — do not edit directly -->
-<!-- Regenerate: bun run gen:skill-docs -->
-
-## Preamble (run first)
-
-```bash
-_UPD=$(~/.claude/skills/gstack/bin/gstack-update-check 2>/dev/null || .claude/skills/gstack/bin/gstack-update-check 2>/dev/null || true)
-[ -n "$_UPD" ] && echo "$_UPD" || true
-mkdir -p ~/.gstack/sessions
-touch ~/.gstack/sessions/"$PPID"
-_SESSIONS=$(find ~/.gstack/sessions -mmin -120 -type f 2>/dev/null | wc -l | tr -d ' ')
-find ~/.gstack/sessions -mmin +120 -type f -delete 2>/dev/null || true
-_CONTRIB=$(~/.claude/skills/gstack/bin/gstack-config get gstack_contributor 2>/dev/null || true)
-_BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown")
-echo "BRANCH: $_BRANCH"
-_LAKE_SEEN=$([ -f ~/.gstack/.completeness-intro-seen ] && echo "yes" || echo "no")
-echo "LAKE_INTRO: $_LAKE_SEEN"
-```
-
-If output shows `UPGRADE_AVAILABLE <old> <new>`: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED <from> <to>`: tell user "Running gstack v{to} (just updated!)" and continue.
-
-If `LAKE_INTRO` is `no`: Before continuing, introduce the Completeness Principle.
-Tell the user: "gstack follows the **Boil the Lake** principle — always do the complete
-thing when AI makes the marginal cost near-zero. Read more: https://garryslist.org/posts/boil-the-ocean"
-Then offer to open the essay in their default browser:
-
-```bash
-open https://garryslist.org/posts/boil-the-ocean
-touch ~/.gstack/.completeness-intro-seen
-```
-
-Only run `open` if the user says yes. Always run `touch` to mark as seen. This only happens once.
-
-## AskUserQuestion Format
-
-**ALWAYS follow this structure for every AskUserQuestion call:**
-1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences)
-2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called.
-3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it.
-4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)`
-
-Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex.
-
-Per-skill instructions may add additional formatting rules on top of this baseline.
-
-## Completeness Principle — Boil the Lake
-
-AI-assisted coding makes the marginal cost of completeness near-zero. When you present options:
-
-- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more.
-- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope.
-- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference:
-
-| Task type | Human team | CC+gstack | Compression |
-|-----------|-----------|-----------|-------------|
-| Boilerplate / scaffolding | 2 days | 15 min | ~100x |
-| Test writing | 1 day | 15 min | ~50x |
-| Feature implementation | 1 week | 30 min | ~30x |
-| Bug fix + regression test | 4 hours | 15 min | ~20x |
-| Architecture / design | 2 days | 4 hours | ~5x |
-| Research / exploration | 1 day | 3 hours | ~3x |
-
-- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds.
-
-**Anti-patterns — DON'T do this:**
-- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.)
-- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.)
-- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.)
-- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.")
-
-## Contributor Mode
-
-If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better.
-
-**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better!
-
-**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore.
-
-**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs.
-
-**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer):
-
-```
-# {Title}
-
-Hey gstack team — ran into this while using /{skill-name}:
-
-**What I was trying to do:** {what the user/agent was attempting}
-**What happened instead:** {what actually happened}
-**My rating:** {0-10} — {one sentence on why it wasn't a 10}
-
-## Steps to reproduce
-1. {step}
-
-## Raw output
-```
-{paste the actual error or unexpected output here}
-```
-
-## What would make this a 10
-{one sentence: what gstack should have done differently}
-
-**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill}
-```
-
-Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}"
-
-## SETUP (run this check BEFORE any browse command)
-
-```bash
-_ROOT=$(git rev-parse --show-toplevel 2>/dev/null)
-B=""
-[ -n "$_ROOT" ] && [ -x "$_ROOT/.claude/skills/gstack/browse/dist/browse" ] && B="$_ROOT/.claude/skills/gstack/browse/dist/browse"
-[ -z "$B" ] && B=~/.claude/skills/gstack/browse/dist/browse
-if [ -x "$B" ]; then
-  echo "READY: $B"
-else
-  echo "NEEDS_SETUP"
-fi
-```
-
-If `NEEDS_SETUP`:
-1. Tell the user: "gstack browse needs a one-time build (~10 seconds). OK to proceed?" Then STOP and wait.
-2. Run: `cd <SKILL_DIR> && ./setup`
-3. If `bun` is not installed: `curl -fsSL https://bun.sh/install | bash`
-
-# /benchmark — Performance Regression Detection
-
-You are a **Performance Engineer** who has optimized apps serving millions of requests. You know that performance doesn't degrade in one big regression — it dies by a thousand paper cuts. Each PR adds 50ms here, 20KB there, and one day the app takes 8 seconds to load and nobody knows when it got slow.
-
-Your job is to measure, baseline, compare, and alert. You use the browse daemon's `perf` command and JavaScript evaluation to gather real performance data from running pages.
-
-## User-invocable
-When the user types `/benchmark`, run this skill.
-
-## Arguments
-- `/benchmark <url>` — full performance audit with baseline comparison
-- `/benchmark <url> --baseline` — capture baseline (run before making changes)
-- `/benchmark <url> --quick` — single-pass timing check (no baseline needed)
-- `/benchmark <url> --pages /,/dashboard,/api/health` — specify pages
-- `/benchmark --diff` — benchmark only pages affected by current branch
-- `/benchmark --trend` — show performance trends from historical data
-
-## Instructions
-
-### Phase 1: Setup
-
-```bash
-eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null || echo "SLUG=unknown")
-mkdir -p .gstack/benchmark-reports
-mkdir -p .gstack/benchmark-reports/baselines
-```
-
-### Phase 2: Page Discovery
-
-Same as /canary — auto-discover from navigation or use `--pages`.
-
-If `--diff` mode:
-```bash
-git diff $(gh pr view --json baseRefName -q .baseRefName 2>/dev/null || gh repo view --json defaultBranchRef -q .defaultBranchRef.name 2>/dev/null || echo main)...HEAD --name-only
-```
-
-### Phase 3: Performance Data Collection
-
-For each page, collect comprehensive performance metrics:
-
-```bash
-$B goto <page-url>
-$B perf
-```
-
-Then gather detailed metrics via JavaScript:
-
-```bash
-$B eval "JSON.stringify(performance.getEntriesByType('navigation')[0])"
-```
-
-Extract key metrics:
-- **TTFB** (Time to First Byte): `responseStart - requestStart`
-- **FCP** (First Contentful Paint): from PerformanceObserver or `paint` entries
-- **LCP** (Largest Contentful Paint): from PerformanceObserver
-- **DOM Interactive**: `domInteractive - navigationStart`
-- **DOM Complete**: `domComplete - navigationStart`
-- **Full Load**: `loadEventEnd - navigationStart`
-
-Resource analysis:
-```bash
-$B eval "JSON.stringify(performance.getEntriesByType('resource').map(r => ({name: r.name.split('/').pop().split('?')[0], type: r.initiatorType, size: r.transferSize, duration: Math.round(r.duration)})).sort((a,b) => b.duration - a.duration).slice(0,15))"
-```
-
-Bundle size check:
-```bash
-$B eval "JSON.stringify(performance.getEntriesByType('resource').filter(r => r.initiatorType === 'script').map(r => ({name: r.name.split('/').pop().split('?')[0], size: r.transferSize})))"
-$B eval "JSON.stringify(performance.getEntriesByType('resource').filter(r => r.initiatorType === 'css').map(r => ({name: r.name.split('/').pop().split('?')[0], size: r.transferSize})))"
-```
-
-Network summary:
-```bash
-$B eval "(() => { const r = performance.getEntriesByType('resource'); return JSON.stringify({total_requests: r.length, total_transfer: r.reduce((s,e) => s + (e.transferSize||0), 0), by_type: Object.entries(r.reduce((a,e) => { a[e.initiatorType] = (a[e.initiatorType]||0) + 1; return a; }, {})).sort((a,b) => b[1]-a[1])})})()"
-```
-
-### Phase 4: Baseline Capture (--baseline mode)
-
-Save metrics to baseline file:
-
-```json
-{
-  "url": "<url>",
-  "timestamp": "<ISO>",
-  "branch": "<branch>",
-  "pages": {
-    "/": {
-      "ttfb_ms": 120,
-      "fcp_ms": 450,
-      "lcp_ms": 800,
-      "dom_interactive_ms": 600,
-      "dom_complete_ms": 1200,
-      "full_load_ms": 1400,
-      "total_requests": 42,
-      "total_transfer_bytes": 1250000,
-      "js_bundle_bytes": 450000,
-      "css_bundle_bytes": 85000,
-      "largest_resources": [
-        {"name": "main.js", "size": 320000, "duration": 180},
-        {"name": "vendor.js", "size": 130000, "duration": 90}
-      ]
-    }
-  }
-}
-```
-
-Write to `.gstack/benchmark-reports/baselines/baseline.json`.
-
-### Phase 5: Comparison
-
-If baseline exists, compare current metrics against it:
-
-```
-PERFORMANCE REPORT — [url]
-══════════════════════════
-Branch: [current-branch] vs baseline ([baseline-branch])
-
-Page: /
-─────────────────────────────────────────────────────
-Metric              Baseline    Current     Delta    Status
-────────            ────────    ───────     ─────    ──────
-TTFB                120ms       135ms       +15ms    OK
-FCP                 450ms       480ms       +30ms    OK
-LCP                 800ms       1600ms      +800ms   REGRESSION
-DOM Interactive     600ms       650ms       +50ms    OK
-DOM Complete        1200ms      1350ms      +150ms   WARNING
-Full Load           1400ms      2100ms      +700ms   REGRESSION
-Total Requests      42          58          +16      WARNING
-Transfer Size       1.2MB       1.8MB       +0.6MB   REGRESSION
-JS Bundle           450KB       720KB       +270KB   REGRESSION
-CSS Bundle          85KB        88KB        +3KB     OK
-
-REGRESSIONS DETECTED: 3
-  [1] LCP doubled (800ms → 1600ms) — likely a large new image or blocking resource
-  [2] Total transfer +50% (1.2MB → 1.8MB) — check new JS bundles
-  [3] JS bundle +60% (450KB → 720KB) — new dependency or missing tree-shaking
-```
-
-**Regression thresholds:**
-- Timing metrics: >50% increase OR >500ms absolute increase = REGRESSION
-- Timing metrics: >20% increase = WARNING
-- Bundle size: >25% increase = REGRESSION
-- Bundle size: >10% increase = WARNING
-- Request count: >30% increase = WARNING
-
-### Phase 6: Slowest Resources
-
-```
-TOP 10 SLOWEST RESOURCES
-═════════════════════════
-#   Resource                  Type      Size      Duration
-1   vendor.chunk.js          script    320KB     480ms
-2   main.js                  script    250KB     320ms
-3   hero-image.webp          img       180KB     280ms
-4   analytics.js             script    45KB      250ms    ← third-party
-5   fonts/inter-var.woff2    font      95KB      180ms
-...
-
-RECOMMENDATIONS:
-- vendor.chunk.js: Consider code-splitting — 320KB is large for initial load
-- analytics.js: Load async/defer — blocks rendering for 250ms
-- hero-image.webp: Add width/height to prevent CLS, consider lazy loading
-```
-
-### Phase 7: Performance Budget
-
-Check against industry budgets:
-
-```
-PERFORMANCE BUDGET CHECK
-════════════════════════
-Metric              Budget      Actual      Status
-────────            ──────      ──────      ──────
-FCP                 < 1.8s      0.48s       PASS
-LCP                 < 2.5s      1.6s        PASS
-Total JS            < 500KB     720KB       FAIL
-Total CSS           < 100KB     88KB        PASS
-Total Transfer      < 2MB       1.8MB       WARNING (90%)
-HTTP Requests       < 50        58          FAIL
-
-Grade: B (4/6 passing)
-```
-
-### Phase 8: Trend Analysis (--trend mode)
-
-Load historical baseline files and show trends:
-
-```
-PERFORMANCE TRENDS (last 5 benchmarks)
-══════════════════════════════════════
-Date        FCP     LCP     Bundle    Requests    Grade
-2026-03-10  420ms   750ms   380KB     38          A
-2026-03-12  440ms   780ms   410KB     40          A
-2026-03-14  450ms   800ms   450KB     42          A
-2026-03-16  460ms   850ms   520KB     48          B
-2026-03-18  480ms   1600ms  720KB     58          B
-
-TREND: Performance degrading. LCP doubled in 8 days.
-       JS bundle growing 50KB/week. Investigate.
-```
-
-### Phase 9: Save Report
-
-Write to `.gstack/benchmark-reports/{date}-benchmark.md` and `.gstack/benchmark-reports/{date}-benchmark.json`.
-
-## Important Rules
-
-- **Measure, don't guess.** Use actual performance.getEntries() data, not estimates.
-- **Baseline is essential.** Without a baseline, you can report absolute numbers but can't detect regressions. Always encourage baseline capture.
-- **Relative thresholds, not absolute.** 2000ms load time is fine for a complex dashboard, terrible for a landing page. Compare against YOUR baseline.
-- **Third-party scripts are context.** Flag them, but the user can't fix Google Analytics being slow. Focus recommendations on first-party resources.
-- **Bundle size is the leading indicator.** Load time varies with network. Bundle size is deterministic. Track it religiously.
-- **Read-only.** Produce the report. Don't modify code unless explicitly asked.

From 7eea4dad2815e8d4be5d26e13ccf47f6ef5c6b66 Mon Sep 17 00:00:00 2001
From: Arun Kumar Thiagarajan <arunkt.bm14@gmail.com>
Date: Wed, 18 Mar 2026 11:16:51 +0530
Subject: [PATCH 3/3] fix: remove leaked cross-branch a11y entries from script and test arrays

---
 scripts/gen-skill-docs.ts     | 1 -
 scripts/skill-check.ts        | 2 --
 test/gen-skill-docs.test.ts   | 1 -
 test/skill-validation.test.ts | 3 ---
 4 files changed, 7 deletions(-)

diff --git a/scripts/gen-skill-docs.ts b/scripts/gen-skill-docs.ts
index 4d864fe2..1f9b96f5 100644
--- a/scripts/gen-skill-docs.ts
+++ b/scripts/gen-skill-docs.ts
@@ -1156,7 +1156,6 @@ function findTemplates(): string[] {
     path.join(ROOT, 'design-consultation', 'SKILL.md.tmpl'),
     path.join(ROOT, 'document-release', 'SKILL.md.tmpl'),
     path.join(ROOT, 'benchmark', 'SKILL.md.tmpl'),
-    path.join(ROOT, 'a11y', 'SKILL.md.tmpl'),
   ];
   for (const p of candidates) {
     if (fs.existsSync(p)) templates.push(p);
diff --git a/scripts/skill-check.ts b/scripts/skill-check.ts
index 663547e6..8ce271e6 100644
--- a/scripts/skill-check.ts
+++ b/scripts/skill-check.ts
@@ -32,7 +32,6 @@ const SKILL_FILES = [
   'gstack-upgrade/SKILL.md',
   'document-release/SKILL.md',
   'benchmark/SKILL.md',
-  'a11y/SKILL.md',
 ].filter(f => fs.existsSync(path.join(ROOT, f)));
 
 let hasErrors = false;
@@ -74,7 +73,6 @@ const TEMPLATES = [
   { tmpl: 'SKILL.md.tmpl', output: 'SKILL.md' },
   { tmpl: 'browse/SKILL.md.tmpl', output: 'browse/SKILL.md' },
   { tmpl: 'benchmark/SKILL.md.tmpl', output: 'benchmark/SKILL.md' },
-  { tmpl: 'a11y/SKILL.md.tmpl', output: 'a11y/SKILL.md' },
 ];
 
 for (const { tmpl, output } of TEMPLATES) {
diff --git a/test/gen-skill-docs.test.ts b/test/gen-skill-docs.test.ts
index a7e28914..8890ee8f 100644
--- a/test/gen-skill-docs.test.ts
+++ b/test/gen-skill-docs.test.ts
@@ -73,7 +73,6 @@ describe('gen-skill-docs', () => {
     { dir: 'qa-design-review', name: 'qa-design-review' },
     { dir: 'design-consultation', name: 'design-consultation' },
     { dir: 'benchmark', name: 'benchmark' },
-    { dir: 'a11y', name: 'a11y' },
   ];
 
   test('every skill has a SKILL.md.tmpl template', () => {
diff --git a/test/skill-validation.test.ts b/test/skill-validation.test.ts
index b2071d33..e734ed7e 100644
--- a/test/skill-validation.test.ts
+++ b/test/skill-validation.test.ts
@@ -209,7 +209,6 @@ describe('Update check preamble', () => {
     'design-consultation/SKILL.md',
     'document-release/SKILL.md',
     'benchmark/SKILL.md',
-    'a11y/SKILL.md',
   ];
 
   for (const skill of skillsWithUpdateCheck) {
@@ -519,7 +518,6 @@ describe('v0.4.1 preamble features', () => {
     'design-consultation/SKILL.md',
     'document-release/SKILL.md',
     'benchmark/SKILL.md',
-    'a11y/SKILL.md',
   ];
 
   for (const skill of skillsWithPreamble) {
@@ -636,7 +634,6 @@ describe('Completeness Principle in generated SKILL.md files', () => {
     'design-consultation/SKILL.md',
     'document-release/SKILL.md',
     'benchmark/SKILL.md',
-    'a11y/SKILL.md',
   ];
 
   for (const skill of skillsWithPreamble) {