From 8e74933f1ff5a6eea63281371cf29b35c7ad1325 Mon Sep 17 00:00:00 2001 From: Arun Kumar Thiagarajan Date: Wed, 18 Mar 2026 10:20:02 +0530 Subject: [PATCH 1/3] =?UTF-8?q?feat:=20add=20/benchmark=20skill=20?= =?UTF-8?q?=E2=80=94=20performance=20regression=20detection=20via=20browse?= =?UTF-8?q?=20daemon?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Catches the death-by-a-thousand-cuts performance decay: - Before/after comparison using browse daemon's perf command - Core Web Vitals tracking (TTFB, FCP, LCP, DOM Complete) - JS/CSS bundle size monitoring with regression thresholds - Resource waterfall analysis with optimization recommendations - Performance budget checking against industry standards - Trend analysis from historical benchmark data - Diff-aware mode: only benchmark pages affected by current branch --- benchmark/SKILL.md | 354 ++++++++++++++++++++++++++++++++++ benchmark/SKILL.md.tmpl | 233 ++++++++++++++++++++++ scripts/gen-skill-docs.ts | 2 + scripts/skill-check.ts | 4 + test/gen-skill-docs.test.ts | 2 + test/skill-validation.test.ts | 6 + 6 files changed, 601 insertions(+) create mode 100644 benchmark/SKILL.md create mode 100644 benchmark/SKILL.md.tmpl diff --git a/benchmark/SKILL.md b/benchmark/SKILL.md new file mode 100644 index 00000000..ac665290 --- /dev/null +++ b/benchmark/SKILL.md @@ -0,0 +1,354 @@ +--- +name: benchmark +version: 1.0.0 +description: | + Performance regression detection using the browse daemon. Establishes + baselines for page load times, Core Web Vitals, and resource sizes. + Compares before/after on every PR. Tracks performance trends over time. + Use when: "performance", "benchmark", "page speed", "lighthouse", "web vitals", + "bundle size", "load time". 
+allowed-tools: + - Bash + - Read + - Write + - Glob + - AskUserQuestion +--- + + + +## Preamble (run first) + +```bash +_UPD=$(~/.claude/skills/gstack/bin/gstack-update-check 2>/dev/null || .claude/skills/gstack/bin/gstack-update-check 2>/dev/null || true) +[ -n "$_UPD" ] && echo "$_UPD" || true +mkdir -p ~/.gstack/sessions +touch ~/.gstack/sessions/"$PPID" +_SESSIONS=$(find ~/.gstack/sessions -mmin -120 -type f 2>/dev/null | wc -l | tr -d ' ') +find ~/.gstack/sessions -mmin +120 -type f -delete 2>/dev/null || true +_CONTRIB=$(~/.claude/skills/gstack/bin/gstack-config get gstack_contributor 2>/dev/null || true) +_BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") +echo "BRANCH: $_BRANCH" +_LAKE_SEEN=$([ -f ~/.gstack/.completeness-intro-seen ] && echo "yes" || echo "no") +echo "LAKE_INTRO: $_LAKE_SEEN" +``` + +If output shows `UPGRADE_AVAILABLE `: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED `: tell user "Running gstack v{to} (just updated!)" and continue. + +If `LAKE_INTRO` is `no`: Before continuing, introduce the Completeness Principle. +Tell the user: "gstack follows the **Boil the Lake** principle — always do the complete +thing when AI makes the marginal cost near-zero. Read more: https://garryslist.org/posts/boil-the-ocean" +Then offer to open the essay in their default browser: + +```bash +open https://garryslist.org/posts/boil-the-ocean +touch ~/.gstack/.completeness-intro-seen +``` + +Only run `open` if the user says yes. Always run `touch` to mark as seen. This only happens once. + +## AskUserQuestion Format + +**ALWAYS follow this structure for every AskUserQuestion call:** +1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. 
(1-2 sentences) +2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. +3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. +4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` + +Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. + +Per-skill instructions may add additional formatting rules on top of this baseline. + +## Completeness Principle — Boil the Lake + +AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: + +- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. +- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. +- **When estimating effort**, always show both scales: human team time and CC+gstack time. 
The compression ratio varies by task type — use this reference: + +| Task type | Human team | CC+gstack | Compression | +|-----------|-----------|-----------|-------------| +| Boilerplate / scaffolding | 2 days | 15 min | ~100x | +| Test writing | 1 day | 15 min | ~50x | +| Feature implementation | 1 week | 30 min | ~30x | +| Bug fix + regression test | 4 hours | 15 min | ~20x | +| Architecture / design | 2 days | 4 hours | ~5x | +| Research / exploration | 1 day | 3 hours | ~3x | + +- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. + +**Anti-patterns — DON'T do this:** +- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) +- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) +- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) +- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") + +## Contributor Mode + +If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. + +**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! + +**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. 
Things less consequential than this, ignore. + +**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. + +**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): + +``` +# {Title} + +Hey gstack team — ran into this while using /{skill-name}: + +**What I was trying to do:** {what the user/agent was attempting} +**What happened instead:** {what actually happened} +**My rating:** {0-10} — {one sentence on why it wasn't a 10} + +## Steps to reproduce +1. {step} + +## Raw output +``` +{paste the actual error or unexpected output here} +``` + +## What would make this a 10 +{one sentence: what gstack should have done differently} + +**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} +``` + +Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" + +## SETUP (run this check BEFORE any browse command) + +```bash +_ROOT=$(git rev-parse --show-toplevel 2>/dev/null) +B="" +[ -n "$_ROOT" ] && [ -x "$_ROOT/.claude/skills/gstack/browse/dist/browse" ] && B="$_ROOT/.claude/skills/gstack/browse/dist/browse" +[ -z "$B" ] && B=~/.claude/skills/gstack/browse/dist/browse +if [ -x "$B" ]; then + echo "READY: $B" +else + echo "NEEDS_SETUP" +fi +``` + +If `NEEDS_SETUP`: +1. Tell the user: "gstack browse needs a one-time build (~10 seconds). OK to proceed?" Then STOP and wait. +2. Run: `cd && ./setup` +3. If `bun` is not installed: `curl -fsSL https://bun.sh/install | bash` + +# /benchmark — Performance Regression Detection + +You are a **Performance Engineer** who has optimized apps serving millions of requests. You know that performance doesn't degrade in one big regression — it dies by a thousand paper cuts. 
Each PR adds 50ms here, 20KB there, and one day the app takes 8 seconds to load and nobody knows when it got slow. + +Your job is to measure, baseline, compare, and alert. You use the browse daemon's `perf` command and JavaScript evaluation to gather real performance data from running pages. + +## User-invocable +When the user types `/benchmark`, run this skill. + +## Arguments +- `/benchmark ` — full performance audit with baseline comparison +- `/benchmark --baseline` — capture baseline (run before making changes) +- `/benchmark --quick` — single-pass timing check (no baseline needed) +- `/benchmark --pages /,/dashboard,/api/health` — specify pages +- `/benchmark --diff` — benchmark only pages affected by current branch +- `/benchmark --trend` — show performance trends from historical data + +## Instructions + +### Phase 1: Setup + +```bash +eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null || echo "SLUG=unknown") +mkdir -p .gstack/benchmark-reports +mkdir -p .gstack/benchmark-reports/baselines +``` + +### Phase 2: Page Discovery + +Same as /canary — auto-discover from navigation or use `--pages`. 
+ +If `--diff` mode: +```bash +git diff $(gh pr view --json baseRefName -q .baseRefName 2>/dev/null || gh repo view --json defaultBranchRef -q .defaultBranchRef.name 2>/dev/null || echo main)...HEAD --name-only +``` + +### Phase 3: Performance Data Collection + +For each page, collect comprehensive performance metrics: + +```bash +$B goto +$B perf +``` + +Then gather detailed metrics via JavaScript: + +```bash +$B eval "JSON.stringify(performance.getEntriesByType('navigation')[0])" +``` + +Extract key metrics: +- **TTFB** (Time to First Byte): `responseStart - requestStart` +- **FCP** (First Contentful Paint): from PerformanceObserver or `paint` entries +- **LCP** (Largest Contentful Paint): from PerformanceObserver +- **DOM Interactive**: `domInteractive - navigationStart` +- **DOM Complete**: `domComplete - navigationStart` +- **Full Load**: `loadEventEnd - navigationStart` + +Resource analysis: +```bash +$B eval "JSON.stringify(performance.getEntriesByType('resource').map(r => ({name: r.name.split('/').pop().split('?')[0], type: r.initiatorType, size: r.transferSize, duration: Math.round(r.duration)})).sort((a,b) => b.duration - a.duration).slice(0,15))" +``` + +Bundle size check: +```bash +$B eval "JSON.stringify(performance.getEntriesByType('resource').filter(r => r.initiatorType === 'script').map(r => ({name: r.name.split('/').pop().split('?')[0], size: r.transferSize})))" +$B eval "JSON.stringify(performance.getEntriesByType('resource').filter(r => r.initiatorType === 'css').map(r => ({name: r.name.split('/').pop().split('?')[0], size: r.transferSize})))" +``` + +Network summary: +```bash +$B eval "(() => { const r = performance.getEntriesByType('resource'); return JSON.stringify({total_requests: r.length, total_transfer: r.reduce((s,e) => s + (e.transferSize||0), 0), by_type: Object.entries(r.reduce((a,e) => { a[e.initiatorType] = (a[e.initiatorType]||0) + 1; return a; }, {})).sort((a,b) => b[1]-a[1])})})()" +``` + +### Phase 4: Baseline Capture (--baseline 
mode) + +Save metrics to baseline file: + +```json +{ + "url": "", + "timestamp": "", + "branch": "", + "pages": { + "/": { + "ttfb_ms": 120, + "fcp_ms": 450, + "lcp_ms": 800, + "dom_interactive_ms": 600, + "dom_complete_ms": 1200, + "full_load_ms": 1400, + "total_requests": 42, + "total_transfer_bytes": 1250000, + "js_bundle_bytes": 450000, + "css_bundle_bytes": 85000, + "largest_resources": [ + {"name": "main.js", "size": 320000, "duration": 180}, + {"name": "vendor.js", "size": 130000, "duration": 90} + ] + } + } +} +``` + +Write to `.gstack/benchmark-reports/baselines/baseline.json`. + +### Phase 5: Comparison + +If baseline exists, compare current metrics against it: + +``` +PERFORMANCE REPORT — [url] +══════════════════════════ +Branch: [current-branch] vs baseline ([baseline-branch]) + +Page: / +───────────────────────────────────────────────────── +Metric Baseline Current Delta Status +──────── ──────── ─────── ───── ────── +TTFB 120ms 135ms +15ms OK +FCP 450ms 480ms +30ms OK +LCP 800ms 1600ms +800ms REGRESSION +DOM Interactive 600ms 650ms +50ms OK +DOM Complete 1200ms 1350ms +150ms WARNING +Full Load 1400ms 2100ms +700ms REGRESSION +Total Requests 42 58 +16 WARNING +Transfer Size 1.2MB 1.8MB +0.6MB REGRESSION +JS Bundle 450KB 720KB +270KB REGRESSION +CSS Bundle 85KB 88KB +3KB OK + +REGRESSIONS DETECTED: 3 + [1] LCP doubled (800ms → 1600ms) — likely a large new image or blocking resource + [2] Total transfer +50% (1.2MB → 1.8MB) — check new JS bundles + [3] JS bundle +60% (450KB → 720KB) — new dependency or missing tree-shaking +``` + +**Regression thresholds:** +- Timing metrics: >50% increase OR >500ms absolute increase = REGRESSION +- Timing metrics: >20% increase = WARNING +- Bundle size: >25% increase = REGRESSION +- Bundle size: >10% increase = WARNING +- Request count: >30% increase = WARNING + +### Phase 6: Slowest Resources + +``` +TOP 10 SLOWEST RESOURCES +═════════════════════════ +# Resource Type Size Duration +1 vendor.chunk.js script 320KB 
480ms +2 main.js script 250KB 320ms +3 hero-image.webp img 180KB 280ms +4 analytics.js script 45KB 250ms ← third-party +5 fonts/inter-var.woff2 font 95KB 180ms +... + +RECOMMENDATIONS: +- vendor.chunk.js: Consider code-splitting — 320KB is large for initial load +- analytics.js: Load async/defer — blocks rendering for 250ms +- hero-image.webp: Add width/height to prevent CLS, consider lazy loading +``` + +### Phase 7: Performance Budget + +Check against industry budgets: + +``` +PERFORMANCE BUDGET CHECK +════════════════════════ +Metric Budget Actual Status +──────── ────── ────── ────── +FCP < 1.8s 0.48s PASS +LCP < 2.5s 1.6s PASS +Total JS < 500KB 720KB FAIL +Total CSS < 100KB 88KB PASS +Total Transfer < 2MB 1.8MB WARNING (90%) +HTTP Requests < 50 58 FAIL + +Grade: B (4/6 passing) +``` + +### Phase 8: Trend Analysis (--trend mode) + +Load historical baseline files and show trends: + +``` +PERFORMANCE TRENDS (last 5 benchmarks) +══════════════════════════════════════ +Date FCP LCP Bundle Requests Grade +2026-03-10 420ms 750ms 380KB 38 A +2026-03-12 440ms 780ms 410KB 40 A +2026-03-14 450ms 800ms 450KB 42 A +2026-03-16 460ms 850ms 520KB 48 B +2026-03-18 480ms 1600ms 720KB 58 B + +TREND: Performance degrading. LCP doubled in 8 days. + JS bundle growing 50KB/week. Investigate. +``` + +### Phase 9: Save Report + +Write to `.gstack/benchmark-reports/{date}-benchmark.md` and `.gstack/benchmark-reports/{date}-benchmark.json`. + +## Important Rules + +- **Measure, don't guess.** Use actual performance.getEntries() data, not estimates. +- **Baseline is essential.** Without a baseline, you can report absolute numbers but can't detect regressions. Always encourage baseline capture. +- **Relative thresholds, not absolute.** 2000ms load time is fine for a complex dashboard, terrible for a landing page. Compare against YOUR baseline. +- **Third-party scripts are context.** Flag them, but the user can't fix Google Analytics being slow. 
Focus recommendations on first-party resources. +- **Bundle size is the leading indicator.** Load time varies with network. Bundle size is deterministic. Track it religiously. +- **Read-only.** Produce the report. Don't modify code unless explicitly asked. diff --git a/benchmark/SKILL.md.tmpl b/benchmark/SKILL.md.tmpl new file mode 100644 index 00000000..3d4efac8 --- /dev/null +++ b/benchmark/SKILL.md.tmpl @@ -0,0 +1,233 @@ +--- +name: benchmark +version: 1.0.0 +description: | + Performance regression detection using the browse daemon. Establishes + baselines for page load times, Core Web Vitals, and resource sizes. + Compares before/after on every PR. Tracks performance trends over time. + Use when: "performance", "benchmark", "page speed", "lighthouse", "web vitals", + "bundle size", "load time". +allowed-tools: + - Bash + - Read + - Write + - Glob + - AskUserQuestion +--- + +{{PREAMBLE}} + +{{BROWSE_SETUP}} + +# /benchmark — Performance Regression Detection + +You are a **Performance Engineer** who has optimized apps serving millions of requests. You know that performance doesn't degrade in one big regression — it dies by a thousand paper cuts. Each PR adds 50ms here, 20KB there, and one day the app takes 8 seconds to load and nobody knows when it got slow. + +Your job is to measure, baseline, compare, and alert. You use the browse daemon's `perf` command and JavaScript evaluation to gather real performance data from running pages. + +## User-invocable +When the user types `/benchmark`, run this skill. 
+ +## Arguments +- `/benchmark <url>` — full performance audit with baseline comparison +- `/benchmark --baseline` — capture baseline (run before making changes) +- `/benchmark --quick` — single-pass timing check (no baseline needed) +- `/benchmark --pages /,/dashboard,/api/health` — specify pages +- `/benchmark --diff` — benchmark only pages affected by current branch +- `/benchmark --trend` — show performance trends from historical data + +## Instructions + +### Phase 1: Setup + +```bash +eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null || echo "SLUG=unknown") +mkdir -p .gstack/benchmark-reports +mkdir -p .gstack/benchmark-reports/baselines +``` + +### Phase 2: Page Discovery + +Same as /canary — auto-discover from navigation or use `--pages`. + +If `--diff` mode: +```bash +git diff $(gh pr view --json baseRefName -q .baseRefName 2>/dev/null || gh repo view --json defaultBranchRef -q .defaultBranchRef.name 2>/dev/null || echo main)...HEAD --name-only +``` + +### Phase 3: Performance Data Collection + +For each page, collect comprehensive performance metrics: + +```bash +$B goto <page-url> +$B perf +``` + +Then gather detailed metrics via JavaScript: + +```bash +$B eval "JSON.stringify(performance.getEntriesByType('navigation')[0])" +``` + +Extract key metrics: +- **TTFB** (Time to First Byte): `responseStart - requestStart` +- **FCP** (First Contentful Paint): from PerformanceObserver or `paint` entries +- **LCP** (Largest Contentful Paint): from PerformanceObserver +- **DOM Interactive**: `domInteractive - startTime` +- **DOM Complete**: `domComplete - startTime` +- **Full Load**: `loadEventEnd - startTime` + +Resource analysis: +```bash +$B eval "JSON.stringify(performance.getEntriesByType('resource').map(r => ({name: r.name.split('/').pop().split('?')[0], type: r.initiatorType, size: r.transferSize, duration: Math.round(r.duration)})).sort((a,b) => b.duration - a.duration).slice(0,15))" +``` + +Bundle size check: +```bash +$B eval 
"JSON.stringify(performance.getEntriesByType('resource').filter(r => r.initiatorType === 'script').map(r => ({name: r.name.split('/').pop().split('?')[0], size: r.transferSize})))" +$B eval "JSON.stringify(performance.getEntriesByType('resource').filter(r => r.name.split('?')[0].endsWith('.css')).map(r => ({name: r.name.split('/').pop().split('?')[0], size: r.transferSize})))" +``` + +Network summary: +```bash +$B eval "(() => { const r = performance.getEntriesByType('resource'); return JSON.stringify({total_requests: r.length, total_transfer: r.reduce((s,e) => s + (e.transferSize||0), 0), by_type: Object.entries(r.reduce((a,e) => { a[e.initiatorType] = (a[e.initiatorType]||0) + 1; return a; }, {})).sort((a,b) => b[1]-a[1])})})()" +``` + +### Phase 4: Baseline Capture (--baseline mode) + +Save metrics to baseline file: + +```json +{ + "url": "<url>", + "timestamp": "<iso-8601>", + "branch": "<branch>", + "pages": { + "/": { + "ttfb_ms": 120, + "fcp_ms": 450, + "lcp_ms": 800, + "dom_interactive_ms": 600, + "dom_complete_ms": 1200, + "full_load_ms": 1400, + "total_requests": 42, + "total_transfer_bytes": 1250000, + "js_bundle_bytes": 450000, + "css_bundle_bytes": 85000, + "largest_resources": [ + {"name": "main.js", "size": 320000, "duration": 180}, + {"name": "vendor.js", "size": 130000, "duration": 90} + ] + } + } +} +``` + +Write to `.gstack/benchmark-reports/baselines/baseline.json`. 
+ +### Phase 5: Comparison + +If baseline exists, compare current metrics against it: + +``` +PERFORMANCE REPORT — [url] +══════════════════════════ +Branch: [current-branch] vs baseline ([baseline-branch]) + +Page: / +───────────────────────────────────────────────────── +Metric Baseline Current Delta Status +──────── ──────── ─────── ───── ────── +TTFB 120ms 135ms +15ms OK +FCP 450ms 480ms +30ms OK +LCP 800ms 1600ms +800ms REGRESSION +DOM Interactive 600ms 650ms +50ms OK +DOM Complete 1200ms 1500ms +300ms WARNING +Full Load 1400ms 2100ms +700ms REGRESSION +Total Requests 42 58 +16 WARNING +Transfer Size 1.2MB 1.8MB +0.6MB REGRESSION +JS Bundle 450KB 720KB +270KB REGRESSION +CSS Bundle 85KB 88KB +3KB OK + +REGRESSIONS DETECTED: 4 + [1] LCP doubled (800ms → 1600ms) — likely a large new image or blocking resource + [2] Full Load +700ms (1400ms → 2100ms) — over the 500ms absolute threshold; likely driven by the larger JS payload + [3] Total transfer +50% (1.2MB → 1.8MB) — check new JS bundles + [4] JS bundle +60% (450KB → 720KB) — new dependency or missing tree-shaking +``` + +**Regression thresholds:** +- Timing metrics: >50% increase OR >500ms absolute increase = REGRESSION +- Timing metrics: >20% increase = WARNING +- Bundle size: >25% increase = REGRESSION +- Bundle size: >10% increase = WARNING +- Request count: >30% increase = WARNING + +### Phase 6: Slowest Resources + +``` +TOP 10 SLOWEST RESOURCES +═════════════════════════ +# Resource Type Size Duration +1 vendor.chunk.js script 320KB 480ms +2 main.js script 250KB 320ms +3 hero-image.webp img 180KB 280ms +4 analytics.js script 45KB 250ms ← third-party +5 fonts/inter-var.woff2 font 95KB 180ms +... 
+ +RECOMMENDATIONS: +- vendor.chunk.js: Consider code-splitting — 320KB is large for initial load +- analytics.js: Load async/defer — blocks rendering for 250ms +- hero-image.webp: Add width/height to prevent CLS; lazy-load only if it is below the fold +``` + +### Phase 7: Performance Budget + +Check against industry budgets: + +``` +PERFORMANCE BUDGET CHECK +════════════════════════ +Metric Budget Actual Status +──────── ────── ────── ────── +FCP < 1.8s 0.48s PASS +LCP < 2.5s 1.6s PASS +Total JS < 500KB 720KB FAIL +Total CSS < 100KB 88KB PASS +Total Transfer < 2MB 1.8MB WARNING (90%) +HTTP Requests < 50 58 FAIL + +Grade: B (4/6 passing) +``` + +### Phase 8: Trend Analysis (--trend mode) + +Load the historical reports saved in Phase 9 (`.gstack/benchmark-reports/*-benchmark.json`) and show trends: + +``` +PERFORMANCE TRENDS (last 5 benchmarks) +══════════════════════════════════════ +Date FCP LCP Bundle Requests Grade +2026-03-10 420ms 750ms 380KB 38 A +2026-03-12 440ms 780ms 410KB 40 A +2026-03-14 450ms 800ms 450KB 42 A +2026-03-16 460ms 850ms 520KB 48 B +2026-03-18 480ms 1600ms 720KB 58 B + +TREND: Performance degrading. LCP doubled in 8 days. + JS bundle nearly doubled (380KB → 720KB) over the same period. Investigate. +``` + +### Phase 9: Save Report + +Write to `.gstack/benchmark-reports/{date}-benchmark.md` and `.gstack/benchmark-reports/{date}-benchmark.json`. + +## Important Rules + +- **Measure, don't guess.** Use actual performance.getEntries() data, not estimates. +- **Baseline is essential.** Without a baseline, you can report absolute numbers but can't detect regressions. Always encourage baseline capture. +- **Relative thresholds, not absolute.** 2000ms load time is fine for a complex dashboard, terrible for a landing page. Compare against YOUR baseline. +- **Third-party scripts are context.** Flag them, but the user can't fix Google Analytics being slow. Focus recommendations on first-party resources. +- **Bundle size is the leading indicator.** Load time varies with network. Bundle size is deterministic. Track it religiously. 
+- **Read-only.** Produce the report. Don't modify code unless explicitly asked. diff --git a/scripts/gen-skill-docs.ts b/scripts/gen-skill-docs.ts index cb807111..4d864fe2 100644 --- a/scripts/gen-skill-docs.ts +++ b/scripts/gen-skill-docs.ts @@ -1155,6 +1155,8 @@ function findTemplates(): string[] { path.join(ROOT, 'qa-design-review', 'SKILL.md.tmpl'), path.join(ROOT, 'design-consultation', 'SKILL.md.tmpl'), path.join(ROOT, 'document-release', 'SKILL.md.tmpl'), + path.join(ROOT, 'benchmark', 'SKILL.md.tmpl'), + path.join(ROOT, 'a11y', 'SKILL.md.tmpl'), ]; for (const p of candidates) { if (fs.existsSync(p)) templates.push(p); diff --git a/scripts/skill-check.ts b/scripts/skill-check.ts index 97c417ef..663547e6 100644 --- a/scripts/skill-check.ts +++ b/scripts/skill-check.ts @@ -31,6 +31,8 @@ const SKILL_FILES = [ 'qa-design-review/SKILL.md', 'gstack-upgrade/SKILL.md', 'document-release/SKILL.md', + 'benchmark/SKILL.md', + 'a11y/SKILL.md', ].filter(f => fs.existsSync(path.join(ROOT, f))); let hasErrors = false; @@ -71,6 +73,8 @@ console.log('\n Templates:'); const TEMPLATES = [ { tmpl: 'SKILL.md.tmpl', output: 'SKILL.md' }, { tmpl: 'browse/SKILL.md.tmpl', output: 'browse/SKILL.md' }, + { tmpl: 'benchmark/SKILL.md.tmpl', output: 'benchmark/SKILL.md' }, + { tmpl: 'a11y/SKILL.md.tmpl', output: 'a11y/SKILL.md' }, ]; for (const { tmpl, output } of TEMPLATES) { diff --git a/test/gen-skill-docs.test.ts b/test/gen-skill-docs.test.ts index c3861e8d..a7e28914 100644 --- a/test/gen-skill-docs.test.ts +++ b/test/gen-skill-docs.test.ts @@ -72,6 +72,8 @@ describe('gen-skill-docs', () => { { dir: 'plan-design-review', name: 'plan-design-review' }, { dir: 'qa-design-review', name: 'qa-design-review' }, { dir: 'design-consultation', name: 'design-consultation' }, + { dir: 'benchmark', name: 'benchmark' }, + { dir: 'a11y', name: 'a11y' }, ]; test('every skill has a SKILL.md.tmpl template', () => { diff --git a/test/skill-validation.test.ts b/test/skill-validation.test.ts index 
81d97d31..b2071d33 100644 --- a/test/skill-validation.test.ts +++ b/test/skill-validation.test.ts @@ -208,6 +208,8 @@ describe('Update check preamble', () => { 'qa-design-review/SKILL.md', 'design-consultation/SKILL.md', 'document-release/SKILL.md', + 'benchmark/SKILL.md', + 'a11y/SKILL.md', ]; for (const skill of skillsWithUpdateCheck) { @@ -516,6 +518,8 @@ describe('v0.4.1 preamble features', () => { 'qa-design-review/SKILL.md', 'design-consultation/SKILL.md', 'document-release/SKILL.md', + 'benchmark/SKILL.md', + 'a11y/SKILL.md', ]; for (const skill of skillsWithPreamble) { @@ -631,6 +635,8 @@ describe('Completeness Principle in generated SKILL.md files', () => { 'qa-design-review/SKILL.md', 'design-consultation/SKILL.md', 'document-release/SKILL.md', + 'benchmark/SKILL.md', + 'a11y/SKILL.md', ]; for (const skill of skillsWithPreamble) { From 5c6b66f7fbed7d3c11e74b21ef6cbb94fdb50375 Mon Sep 17 00:00:00 2001 From: Arun Kumar Thiagarajan Date: Wed, 18 Mar 2026 10:44:02 +0530 Subject: [PATCH 2/3] =?UTF-8?q?fix:=20remove=20generated=20SKILL.md=20?= =?UTF-8?q?=E2=80=94=20only=20commit=20.tmpl=20template?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per maintainer feedback: generated .md files should not be committed. Only the .tmpl template is source of truth. Build generates the .md. --- benchmark/SKILL.md | 354 --------------------------------------------- 1 file changed, 354 deletions(-) delete mode 100644 benchmark/SKILL.md diff --git a/benchmark/SKILL.md b/benchmark/SKILL.md deleted file mode 100644 index ac665290..00000000 --- a/benchmark/SKILL.md +++ /dev/null @@ -1,354 +0,0 @@ ---- -name: benchmark -version: 1.0.0 -description: | - Performance regression detection using the browse daemon. Establishes - baselines for page load times, Core Web Vitals, and resource sizes. - Compares before/after on every PR. Tracks performance trends over time. 
- Use when: "performance", "benchmark", "page speed", "lighthouse", "web vitals", - "bundle size", "load time". -allowed-tools: - - Bash - - Read - - Write - - Glob - - AskUserQuestion ---- - - - -## Preamble (run first) - -```bash -_UPD=$(~/.claude/skills/gstack/bin/gstack-update-check 2>/dev/null || .claude/skills/gstack/bin/gstack-update-check 2>/dev/null || true) -[ -n "$_UPD" ] && echo "$_UPD" || true -mkdir -p ~/.gstack/sessions -touch ~/.gstack/sessions/"$PPID" -_SESSIONS=$(find ~/.gstack/sessions -mmin -120 -type f 2>/dev/null | wc -l | tr -d ' ') -find ~/.gstack/sessions -mmin +120 -type f -delete 2>/dev/null || true -_CONTRIB=$(~/.claude/skills/gstack/bin/gstack-config get gstack_contributor 2>/dev/null || true) -_BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") -echo "BRANCH: $_BRANCH" -_LAKE_SEEN=$([ -f ~/.gstack/.completeness-intro-seen ] && echo "yes" || echo "no") -echo "LAKE_INTRO: $_LAKE_SEEN" -``` - -If output shows `UPGRADE_AVAILABLE `: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED `: tell user "Running gstack v{to} (just updated!)" and continue. - -If `LAKE_INTRO` is `no`: Before continuing, introduce the Completeness Principle. -Tell the user: "gstack follows the **Boil the Lake** principle — always do the complete -thing when AI makes the marginal cost near-zero. Read more: https://garryslist.org/posts/boil-the-ocean" -Then offer to open the essay in their default browser: - -```bash -open https://garryslist.org/posts/boil-the-ocean -touch ~/.gstack/.completeness-intro-seen -``` - -Only run `open` if the user says yes. Always run `touch` to mark as seen. This only happens once. - -## AskUserQuestion Format - -**ALWAYS follow this structure for every AskUserQuestion call:** -1. 
**Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) -2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. -4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` - -Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. - -Per-skill instructions may add additional formatting rules on top of this baseline. - -## Completeness Principle — Boil the Lake - -AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: - -- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. -- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. 
An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. -- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: - -| Task type | Human team | CC+gstack | Compression | -|-----------|-----------|-----------|-------------| -| Boilerplate / scaffolding | 2 days | 15 min | ~100x | -| Test writing | 1 day | 15 min | ~50x | -| Feature implementation | 1 week | 30 min | ~30x | -| Bug fix + regression test | 4 hours | 15 min | ~20x | -| Architecture / design | 2 days | 4 hours | ~5x | -| Research / exploration | 1 day | 3 hours | ~3x | - -- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. - -**Anti-patterns — DON'T do this:** -- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) -- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) -- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) -- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") - -## Contributor Mode - -If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. - -**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! 
- -**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. - -**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. - -**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): - -``` -# {Title} - -Hey gstack team — ran into this while using /{skill-name}: - -**What I was trying to do:** {what the user/agent was attempting} -**What happened instead:** {what actually happened} -**My rating:** {0-10} — {one sentence on why it wasn't a 10} - -## Steps to reproduce -1. {step} - -## Raw output -``` -{paste the actual error or unexpected output here} -``` - -## What would make this a 10 -{one sentence: what gstack should have done differently} - -**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} -``` - -Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" - -## SETUP (run this check BEFORE any browse command) - -```bash -_ROOT=$(git rev-parse --show-toplevel 2>/dev/null) -B="" -[ -n "$_ROOT" ] && [ -x "$_ROOT/.claude/skills/gstack/browse/dist/browse" ] && B="$_ROOT/.claude/skills/gstack/browse/dist/browse" -[ -z "$B" ] && B=~/.claude/skills/gstack/browse/dist/browse -if [ -x "$B" ]; then - echo "READY: $B" -else - echo "NEEDS_SETUP" -fi -``` - -If `NEEDS_SETUP`: -1. Tell the user: "gstack browse needs a one-time build (~10 seconds). OK to proceed?" Then STOP and wait. -2. Run: `cd && ./setup` -3. 
If `bun` is not installed: `curl -fsSL https://bun.sh/install | bash` - -# /benchmark — Performance Regression Detection - -You are a **Performance Engineer** who has optimized apps serving millions of requests. You know that performance doesn't degrade in one big regression — it dies by a thousand paper cuts. Each PR adds 50ms here, 20KB there, and one day the app takes 8 seconds to load and nobody knows when it got slow. - -Your job is to measure, baseline, compare, and alert. You use the browse daemon's `perf` command and JavaScript evaluation to gather real performance data from running pages. - -## User-invocable -When the user types `/benchmark`, run this skill. - -## Arguments -- `/benchmark <url>` — full performance audit with baseline comparison -- `/benchmark --baseline` — capture baseline (run before making changes) -- `/benchmark --quick` — single-pass timing check (no baseline needed) -- `/benchmark --pages /,/dashboard,/api/health` — specify pages -- `/benchmark --diff` — benchmark only pages affected by current branch -- `/benchmark --trend` — show performance trends from historical data - -## Instructions - -### Phase 1: Setup - -```bash -eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null || echo "SLUG=unknown") -mkdir -p .gstack/benchmark-reports -mkdir -p .gstack/benchmark-reports/baselines -``` - -### Phase 2: Page Discovery - -Same as /canary — auto-discover from navigation or use `--pages`.
- -If `--diff` mode: -```bash -git diff $(gh pr view --json baseRefName -q .baseRefName 2>/dev/null || gh repo view --json defaultBranchRef -q .defaultBranchRef.name 2>/dev/null || echo main)...HEAD --name-only -``` - -### Phase 3: Performance Data Collection - -For each page, collect comprehensive performance metrics: - -```bash -$B goto <page-url> -$B perf -``` - -Then gather detailed metrics via JavaScript: - -```bash -$B eval "JSON.stringify(performance.getEntriesByType('navigation')[0])" -``` - -Extract key metrics: -- **TTFB** (Time to First Byte): `responseStart - requestStart` -- **FCP** (First Contentful Paint): from PerformanceObserver or `paint` entries -- **LCP** (Largest Contentful Paint): from PerformanceObserver -- **DOM Interactive**: `domInteractive - startTime` -- **DOM Complete**: `domComplete - startTime` -- **Full Load**: `loadEventEnd - startTime` - -Resource analysis: -```bash -$B eval "JSON.stringify(performance.getEntriesByType('resource').map(r => ({name: r.name.split('/').pop().split('?')[0], type: r.initiatorType, size: r.transferSize, duration: Math.round(r.duration)})).sort((a,b) => b.duration - a.duration).slice(0,15))" -``` - -Bundle size check: -```bash -$B eval "JSON.stringify(performance.getEntriesByType('resource').filter(r => r.initiatorType === 'script').map(r => ({name: r.name.split('/').pop().split('?')[0], size: r.transferSize})))" -$B eval "JSON.stringify(performance.getEntriesByType('resource').filter(r => r.name.split('?')[0].endsWith('.css')).map(r => ({name: r.name.split('/').pop().split('?')[0], size: r.transferSize})))" -``` - -Network summary: -```bash -$B eval "(() => { const r = performance.getEntriesByType('resource'); return JSON.stringify({total_requests: r.length, total_transfer: r.reduce((s,e) => s + (e.transferSize||0), 0), by_type: Object.entries(r.reduce((a,e) => { a[e.initiatorType] = (a[e.initiatorType]||0) + 1; return a; }, {})).sort((a,b) => b[1]-a[1])})})()" -``` - -### Phase 4: Baseline Capture (--baseline
mode) - -Save metrics to baseline file: - -```json -{ - "url": "<url>", - "timestamp": "<ISO-8601 timestamp>", - "branch": "<branch>", - "pages": { - "/": { - "ttfb_ms": 120, - "fcp_ms": 450, - "lcp_ms": 800, - "dom_interactive_ms": 600, - "dom_complete_ms": 1200, - "full_load_ms": 1400, - "total_requests": 42, - "total_transfer_bytes": 1250000, - "js_bundle_bytes": 450000, - "css_bundle_bytes": 85000, - "largest_resources": [ - {"name": "main.js", "size": 320000, "duration": 180}, - {"name": "vendor.js", "size": 130000, "duration": 90} - ] - } - } -} -``` - -Write to `.gstack/benchmark-reports/baselines/baseline.json`. - -### Phase 5: Comparison - -If baseline exists, compare current metrics against it: - -``` -PERFORMANCE REPORT — [url] -══════════════════════════ -Branch: [current-branch] vs baseline ([baseline-branch]) - -Page: / -───────────────────────────────────────────────────── -Metric Baseline Current Delta Status -──────── ──────── ─────── ───── ────── -TTFB 120ms 135ms +15ms OK -FCP 450ms 480ms +30ms OK -LCP 800ms 1600ms +800ms REGRESSION -DOM Interactive 600ms 650ms +50ms OK -DOM Complete 1200ms 1350ms +150ms OK -Full Load 1400ms 2100ms +700ms REGRESSION -Total Requests 42 58 +16 WARNING -Transfer Size 1.2MB 1.8MB +0.6MB REGRESSION -JS Bundle 450KB 720KB +270KB REGRESSION -CSS Bundle 85KB 88KB +3KB OK - -REGRESSIONS DETECTED: 4 - [1] LCP doubled (800ms → 1600ms) — likely a large new image or blocking resource - [2] Full load +700ms (1400ms → 2100ms) — exceeds the 500ms absolute threshold; check late-loading resources - [3] Total transfer +50% (1.2MB → 1.8MB) — check new JS bundles - [4] JS bundle +60% (450KB → 720KB) — new dependency or missing tree-shaking -``` - -**Regression thresholds:** -- Timing metrics: >50% increase OR >500ms absolute increase = REGRESSION -- Timing metrics: >20% increase = WARNING -- Bundle size: >25% increase = REGRESSION -- Bundle size: >10% increase = WARNING -- Request count: >30% increase = WARNING - -### Phase 6: Slowest Resources - -``` -TOP 10 SLOWEST RESOURCES -═════════════════════════ -# Resource Type Size Duration -1 vendor.chunk.js script 320KB
480ms -2 main.js script 250KB 320ms -3 hero-image.webp img 180KB 280ms -4 analytics.js script 45KB 250ms ← third-party -5 fonts/inter-var.woff2 font 95KB 180ms -... - -RECOMMENDATIONS: -- vendor.chunk.js: Consider code-splitting — 320KB is large for initial load -- analytics.js: Load async/defer — blocks rendering for 250ms -- hero-image.webp: Add width/height to prevent CLS; preload it instead of lazy-loading, since a hero image is usually the LCP element -``` - -### Phase 7: Performance Budget - -Check against industry budgets: - -``` -PERFORMANCE BUDGET CHECK -════════════════════════ -Metric Budget Actual Status -──────── ────── ────── ────── -FCP < 1.8s 0.48s PASS -LCP < 2.5s 1.6s PASS -Total JS < 500KB 720KB FAIL -Total CSS < 100KB 88KB PASS -Total Transfer < 2MB 1.8MB WARNING (90%) -HTTP Requests < 50 58 FAIL - -Grade: B (4/6 passing) -``` - -### Phase 8: Trend Analysis (--trend mode) - -Load historical benchmark reports (`.gstack/benchmark-reports/*-benchmark.json`) and show trends: - -``` -PERFORMANCE TRENDS (last 5 benchmarks) -══════════════════════════════════════ -Date FCP LCP Bundle Requests Grade -2026-03-10 420ms 750ms 380KB 38 A -2026-03-12 440ms 780ms 410KB 40 A -2026-03-14 450ms 800ms 450KB 42 A -2026-03-16 460ms 850ms 520KB 48 B -2026-03-18 480ms 1600ms 720KB 58 B - -TREND: Performance degrading. LCP doubled in 8 days. - JS bundle nearly doubled (380KB → 720KB) over the same period. Investigate. -``` - -### Phase 9: Save Report - -Write to `.gstack/benchmark-reports/{date}-benchmark.md` and `.gstack/benchmark-reports/{date}-benchmark.json`. - -## Important Rules - -- **Measure, don't guess.** Use actual performance.getEntries() data, not estimates. -- **Baseline is essential.** Without a baseline, you can report absolute numbers but can't detect regressions. Always encourage baseline capture. -- **Relative thresholds, not absolute.** 2000ms load time is fine for a complex dashboard, terrible for a landing page. Compare against YOUR baseline. -- **Third-party scripts are context.** Flag them, but the user can't fix Google Analytics being slow.
Focus recommendations on first-party resources. -- **Bundle size is the leading indicator.** Load time varies with network. Bundle size is deterministic. Track it religiously. -- **Read-only.** Produce the report. Don't modify code unless explicitly asked. From 7eea4dad2815e8d4be5d26e13ccf47f6ef5c6b66 Mon Sep 17 00:00:00 2001 From: Arun Kumar Thiagarajan Date: Wed, 18 Mar 2026 11:16:51 +0530 Subject: [PATCH 3/3] fix: remove leaked cross-branch entries from test arrays --- scripts/gen-skill-docs.ts | 1 - scripts/skill-check.ts | 2 -- test/gen-skill-docs.test.ts | 1 - test/skill-validation.test.ts | 3 --- 4 files changed, 7 deletions(-) diff --git a/scripts/gen-skill-docs.ts b/scripts/gen-skill-docs.ts index 4d864fe2..1f9b96f5 100644 --- a/scripts/gen-skill-docs.ts +++ b/scripts/gen-skill-docs.ts @@ -1156,7 +1156,6 @@ function findTemplates(): string[] { path.join(ROOT, 'design-consultation', 'SKILL.md.tmpl'), path.join(ROOT, 'document-release', 'SKILL.md.tmpl'), path.join(ROOT, 'benchmark', 'SKILL.md.tmpl'), - path.join(ROOT, 'a11y', 'SKILL.md.tmpl'), ]; for (const p of candidates) { if (fs.existsSync(p)) templates.push(p); diff --git a/scripts/skill-check.ts b/scripts/skill-check.ts index 663547e6..8ce271e6 100644 --- a/scripts/skill-check.ts +++ b/scripts/skill-check.ts @@ -32,7 +32,6 @@ const SKILL_FILES = [ 'gstack-upgrade/SKILL.md', 'document-release/SKILL.md', 'benchmark/SKILL.md', - 'a11y/SKILL.md', ].filter(f => fs.existsSync(path.join(ROOT, f))); let hasErrors = false; @@ -74,7 +73,6 @@ const TEMPLATES = [ { tmpl: 'SKILL.md.tmpl', output: 'SKILL.md' }, { tmpl: 'browse/SKILL.md.tmpl', output: 'browse/SKILL.md' }, { tmpl: 'benchmark/SKILL.md.tmpl', output: 'benchmark/SKILL.md' }, - { tmpl: 'a11y/SKILL.md.tmpl', output: 'a11y/SKILL.md' }, ]; for (const { tmpl, output } of TEMPLATES) { diff --git a/test/gen-skill-docs.test.ts b/test/gen-skill-docs.test.ts index a7e28914..8890ee8f 100644 --- a/test/gen-skill-docs.test.ts +++ b/test/gen-skill-docs.test.ts @@ 
-73,7 +73,6 @@ describe('gen-skill-docs', () => { { dir: 'qa-design-review', name: 'qa-design-review' }, { dir: 'design-consultation', name: 'design-consultation' }, { dir: 'benchmark', name: 'benchmark' }, - { dir: 'a11y', name: 'a11y' }, ]; test('every skill has a SKILL.md.tmpl template', () => { diff --git a/test/skill-validation.test.ts b/test/skill-validation.test.ts index b2071d33..e734ed7e 100644 --- a/test/skill-validation.test.ts +++ b/test/skill-validation.test.ts @@ -209,7 +209,6 @@ describe('Update check preamble', () => { 'design-consultation/SKILL.md', 'document-release/SKILL.md', 'benchmark/SKILL.md', - 'a11y/SKILL.md', ]; for (const skill of skillsWithUpdateCheck) { @@ -519,7 +518,6 @@ describe('v0.4.1 preamble features', () => { 'design-consultation/SKILL.md', 'document-release/SKILL.md', 'benchmark/SKILL.md', - 'a11y/SKILL.md', ]; for (const skill of skillsWithPreamble) { @@ -636,7 +634,6 @@ describe('Completeness Principle in generated SKILL.md files', () => { 'design-consultation/SKILL.md', 'document-release/SKILL.md', 'benchmark/SKILL.md', - 'a11y/SKILL.md', ]; for (const skill of skillsWithPreamble) {