diff --git a/CHANGELOG.md b/CHANGELOG.md index 5df40ae8..04f690a3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,14 @@ # Changelog +## [0.6.3.0] - 2026-03-17 + +### Added + +- **Every PR touching frontend code now gets a design review automatically.** `/review` and `/ship` apply a 20-item design checklist against changed CSS, HTML, JSX, and view files. Catches AI slop patterns (purple gradients, 3-column icon grids, generic hero copy), typography issues (body text < 16px, blacklisted fonts), accessibility gaps (`outline: none`), and `!important` abuse. Mechanical CSS fixes are auto-applied; design judgment calls ask you first. +- **`gstack-diff-scope` categorizes what changed in your branch.** Run `eval $(gstack-diff-scope main)` and get `SCOPE_FRONTEND=true/false`, `SCOPE_BACKEND`, `SCOPE_PROMPTS`, `SCOPE_TESTS`, `SCOPE_DOCS`, `SCOPE_CONFIG`. Design review uses it to skip silently on backend-only PRs. Ship pre-flight uses it to recommend design review when frontend files are touched. +- **Design review shows up in the Review Readiness Dashboard.** The dashboard now distinguishes between "LITE" (code-level, runs automatically in /review and /ship) and "FULL" (visual audit via /plan-design-review with browse binary). Both show up as Design Review entries. +- **E2E eval for design review detection.** Planted CSS/HTML fixtures with 7 known anti-patterns (Papyrus font, 14px body text, `outline: none`, `!important`, purple gradient, generic hero copy, 3-column feature grid). The eval verifies `/review` catches at least 4 of 7. + ## [0.6.2.0] - 2026-03-17 ### Added diff --git a/TODOS.md b/TODOS.md index 8616f906..5f771de7 100644 --- a/TODOS.md +++ b/TODOS.md @@ -444,17 +444,17 @@ Shipped as `/design-consultation` on garrytan/design branch. Renamed from `/setu ## Ship Confidence Dashboard -### Smart review relevance detection +### Smart review relevance detection — PARTIALLY SHIPPED -**What:** Auto-detect which of the 4 reviews are relevant based on branch changes (skip Design Review if no CSS/view changes, skip Code Review if plan-only). +~~**What:** Auto-detect which of the 4 reviews are relevant based on branch changes (skip Design Review if no CSS/view changes, skip Code Review if plan-only).~~ -**Why:** Currently dashboard always shows 4 rows. On docs-only changes, "Design Review: NOT YET RUN" is noise. +`bin/gstack-diff-scope` shipped — categorizes diff into SCOPE_FRONTEND, SCOPE_BACKEND, SCOPE_PROMPTS, SCOPE_TESTS, SCOPE_DOCS, SCOPE_CONFIG. Used by design-review-lite to skip when no frontend files changed. Dashboard integration for conditional row display is a follow-up. -**Context:** /plan-design-review and /qa already do file-type detection in diff-aware mode. Could reuse that heuristic. Would require a `gstack-diff-scope` helper or enriching `gstack-slug` to also output change categories. +**Remaining:** Dashboard conditional row display (hide "Design Review: NOT YET RUN" when SCOPE_FRONTEND=false). Extend to Eng Review (skip for docs-only) and CEO Review (skip for config-only). -**Effort:** M +**Effort:** S **Priority:** P3 -**Depends on:** Ship Confidence Dashboard (shipped) +**Depends on:** gstack-diff-scope (shipped) ### /merge skill — review-gated PR merge diff --git a/VERSION b/VERSION index e1e48733..1e40a508 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.6.2.0 +0.6.3.0 diff --git a/bin/gstack-diff-scope b/bin/gstack-diff-scope new file mode 100755 index 00000000..ada66c0a --- /dev/null +++ b/bin/gstack-diff-scope @@ -0,0 +1,71 @@ +#!/usr/bin/env bash +# gstack-diff-scope — categorize what changed in the diff against a base branch +# Usage: eval $(gstack-diff-scope main) → sets SCOPE_FRONTEND=true SCOPE_BACKEND=false ... +# Or: gstack-diff-scope main → prints SCOPE_*=... lines +set -euo pipefail + +BASE="${1:-main}" + +# Get changed file list +FILES=$(git diff "${BASE}...HEAD" --name-only 2>/dev/null || git diff "${BASE}" --name-only 2>/dev/null || echo "") + +if [ -z "$FILES" ]; then + echo "SCOPE_FRONTEND=false" + echo "SCOPE_BACKEND=false" + echo "SCOPE_PROMPTS=false" + echo "SCOPE_TESTS=false" + echo "SCOPE_DOCS=false" + echo "SCOPE_CONFIG=false" + exit 0 +fi + +FRONTEND=false +BACKEND=false +PROMPTS=false +TESTS=false +DOCS=false +CONFIG=false + +while IFS= read -r f; do + case "$f" in + # Frontend: CSS, views, components, templates + *.css|*.scss|*.less|*.sass|*.pcss|*.module.css|*.module.scss) FRONTEND=true ;; + *.tsx|*.jsx|*.vue|*.svelte|*.astro) FRONTEND=true ;; + *.erb|*.haml|*.slim|*.hbs|*.ejs) FRONTEND=true ;; + *.html) FRONTEND=true ;; + tailwind.config.*|postcss.config.*) FRONTEND=true ;; + app/views/*|*/components/*|styles/*|css/*|app/assets/stylesheets/*) FRONTEND=true ;; + + # Prompts: prompt builders, system prompts, generation services + *prompt_builder*|*generation_service*|*writer_service*|*designer_service*) PROMPTS=true ;; + *evaluator*|*scorer*|*classifier_service*|*analyzer*) PROMPTS=true ;; + *voice*.rb|*writing*.rb|*prompt*.rb|*token*.rb) PROMPTS=true ;; + app/services/chat_tools/*|app/services/x_thread_tools/*) PROMPTS=true ;; + config/system_prompts/*) PROMPTS=true ;; + + # Tests + *.test.*|*.spec.*|*_test.*|*_spec.*) TESTS=true ;; + test/*|tests/*|spec/*|__tests__/*|cypress/*|e2e/*) TESTS=true ;; + + # Docs + *.md) DOCS=true ;; + + # Config + package.json|package-lock.json|yarn.lock|bun.lockb) CONFIG=true ;; + Gemfile|Gemfile.lock) CONFIG=true ;; + *.yml|*.yaml) CONFIG=true ;; + .github/*) CONFIG=true ;; + requirements.txt|pyproject.toml|go.mod|Cargo.toml|composer.json) CONFIG=true ;; + + # Backend: everything else that's code (excluding views/components already matched) + *.rb|*.py|*.go|*.rs|*.java|*.php|*.ex|*.exs) BACKEND=true ;; + *.ts|*.js) BACKEND=true ;; # Non-component TS/JS is backend + esac +done <<< "$FILES" + +echo "SCOPE_FRONTEND=$FRONTEND" +echo "SCOPE_BACKEND=$BACKEND" +echo "SCOPE_PROMPTS=$PROMPTS" +echo "SCOPE_TESTS=$TESTS" +echo "SCOPE_DOCS=$DOCS" +echo "SCOPE_CONFIG=$CONFIG" diff --git a/plan-ceo-review/SKILL.md b/plan-ceo-review/SKILL.md index b44dff6d..55238e3f 100644 --- a/plan-ceo-review/SKILL.md +++ b/plan-ceo-review/SKILL.md @@ -702,7 +702,7 @@ echo "---CONFIG---" ~/.claude/skills/gstack/bin/gstack-config get skip_eng_review 2>/dev/null || echo "false" ``` -Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, plan-design-review). Ignore entries with timestamps older than 7 days. Display: +Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, plan-design-review, design-review-lite). Ignore entries with timestamps older than 7 days. For Design Review, show whichever is more recent between `plan-design-review` (full visual audit) and `design-review-lite` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. Display: ``` +====================================================================+ diff --git a/plan-design-review/SKILL.md b/plan-design-review/SKILL.md index a8f3498e..1d821bf2 100644 --- a/plan-design-review/SKILL.md +++ b/plan-design-review/SKILL.md @@ -645,7 +645,7 @@ echo "---CONFIG---" ~/.claude/skills/gstack/bin/gstack-config get skip_eng_review 2>/dev/null || echo "false" ``` -Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, plan-design-review). Ignore entries with timestamps older than 7 days. Display: +Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, plan-design-review, design-review-lite). Ignore entries with timestamps older than 7 days. For Design Review, show whichever is more recent between `plan-design-review` (full visual audit) and `design-review-lite` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. Display: ``` +====================================================================+ diff --git a/plan-eng-review/SKILL.md b/plan-eng-review/SKILL.md index 05c74ba7..48fe7230 100644 --- a/plan-eng-review/SKILL.md +++ b/plan-eng-review/SKILL.md @@ -348,7 +348,7 @@ echo "---CONFIG---" ~/.claude/skills/gstack/bin/gstack-config get skip_eng_review 2>/dev/null || echo "false" ``` -Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, plan-design-review). Ignore entries with timestamps older than 7 days. Display: +Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, plan-design-review, design-review-lite). Ignore entries with timestamps older than 7 days. For Design Review, show whichever is more recent between `plan-design-review` (full visual audit) and `design-review-lite` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. Display: ``` +====================================================================+ diff --git a/review/SKILL.md b/review/SKILL.md index 186978ef..5d734594 100644 --- a/review/SKILL.md +++ b/review/SKILL.md @@ -196,6 +196,47 @@ Follow the output format specified in the checklist. Respect the suppressions --- +## Step 4.5: Design Review (conditional) + +## Design Review (conditional, diff-scoped) + +Check if the diff touches frontend files using `gstack-diff-scope`: + +```bash +eval $(~/.claude/skills/gstack/bin/gstack-diff-scope 2>/dev/null) +``` + +**If `SCOPE_FRONTEND=false`:** Skip design review silently. No output. + +**If `SCOPE_FRONTEND=true`:** + +1. **Check for DESIGN.md.** If `DESIGN.md` or `design-system.md` exists in the repo root, read it. All design findings are calibrated against it — patterns blessed in DESIGN.md are not flagged. If not found, use universal design principles. + +2. **Read `.claude/skills/review/design-checklist.md`.** If the file cannot be read, skip design review with a note: "Design checklist not found — skipping design review." + +3. **Read each changed frontend file** (full file, not just diff hunks). Frontend files are identified by the patterns listed in the checklist. + +4. **Apply the design checklist** against the changed files. For each item: + - **[HIGH] mechanical CSS fix** (`outline: none`, `!important`, `font-size < 16px`): classify as AUTO-FIX + - **[HIGH/MEDIUM] design judgment needed**: classify as ASK + - **[LOW] intent-based detection**: present as "Possible — verify visually or run /qa-design-review" + +5. **Include findings** in the review output under a "Design Review" header, following the output format in the checklist. Design findings merge with code review findings into the same Fix-First flow. + +6. **Log the result** for the Review Readiness Dashboard: + +```bash +eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) +mkdir -p ~/.gstack/projects/$SLUG +echo '{"skill":"design-review-lite","timestamp":"TIMESTAMP","status":"STATUS","findings":N,"auto_fixed":M}' >> ~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl +``` + +Substitute: TIMESTAMP = ISO 8601 datetime, STATUS = "clean" if 0 findings or "issues_found", N = total findings, M = auto-fixed count. + +Include any design findings alongside the findings from Step 4. They follow the same Fix-First flow in Step 5 — AUTO-FIX for mechanical CSS fixes, ASK for everything else. + +--- + ## Step 5: Fix-First Review **Every finding gets action — not just critical ones.** diff --git a/review/SKILL.md.tmpl b/review/SKILL.md.tmpl index c122ada1..c1d3fae6 100644 --- a/review/SKILL.md.tmpl +++ b/review/SKILL.md.tmpl @@ -75,6 +75,14 @@ Follow the output format specified in the checklist. Respect the suppressions --- +## Step 4.5: Design Review (conditional) + +{{DESIGN_REVIEW_LITE}} + +Include any design findings alongside the findings from Step 4. They follow the same Fix-First flow in Step 5 — AUTO-FIX for mechanical CSS fixes, ASK for everything else. + +--- + ## Step 5: Fix-First Review **Every finding gets action — not just critical ones.** diff --git a/review/design-checklist.md b/review/design-checklist.md new file mode 100644 index 00000000..bbe49885 --- /dev/null +++ b/review/design-checklist.md @@ -0,0 +1,132 @@ +# Design Review Checklist (Lite) + +> **Subset of DESIGN_METHODOLOGY** — when adding items here, also update `generateDesignMethodology()` in `scripts/gen-skill-docs.ts`, and vice versa. + +## Instructions + +This checklist applies to **source code in the diff** — not rendered output. Read each changed frontend file (full file, not just diff hunks) and flag anti-patterns. + +**Trigger:** Only run this checklist if the diff touches frontend files. Use `gstack-diff-scope` to detect: + +```bash +eval $(~/.claude/skills/gstack/bin/gstack-diff-scope 2>/dev/null) +``` + +If `SCOPE_FRONTEND=false`, skip the entire design review silently. + +**DESIGN.md calibration:** If `DESIGN.md` or `design-system.md` exists in the repo root, read it first. All findings are calibrated against the project's stated design system. Patterns explicitly blessed in DESIGN.md are NOT flagged. If no DESIGN.md exists, use universal design principles. + +--- + +## Confidence Tiers + +Each item is tagged with a detection confidence level: + +- **[HIGH]** — Reliably detectable via grep/pattern match. Definitive findings. +- **[MEDIUM]** — Detectable via pattern aggregation or heuristic. Flag as findings but expect some noise. +- **[LOW]** — Requires understanding visual intent. Present as: "Possible issue — verify visually or run /qa-design-review." + +--- + +## Classification + +**AUTO-FIX** (mechanical CSS fixes only — HIGH confidence, no design judgment needed): +- `outline: none` without replacement → add `outline: revert` or `&:focus-visible { outline: 2px solid currentColor; }` +- `!important` in new CSS → remove and fix specificity +- `font-size` < 16px on body text → bump to 16px + +**ASK** (everything else — requires design judgment): +- All AI slop findings, typography structure, spacing choices, interaction state gaps, DESIGN.md violations + +**LOW confidence items** → present as "Possible: [description]. Verify visually or run /qa-design-review." Never AUTO-FIX. + +--- + +## Output Format + +``` +Design Review: N issues (X auto-fixable, Y need input, Z possible) + +**AUTO-FIXED:** +- [file:line] Problem → fix applied + +**NEEDS INPUT:** +- [file:line] Problem description + Recommended fix: suggested fix + +**POSSIBLE (verify visually):** +- [file:line] Possible issue — verify with /qa-design-review +``` + +If no issues found: `Design Review: No issues found.` + +If no frontend files changed: skip silently, no output. + +--- + +## Categories + +### 1. AI Slop Detection (6 items) — highest priority + +These are the telltale signs of AI-generated UI that no designer at a respected studio would ship. + +- **[MEDIUM]** Purple/violet/indigo gradient backgrounds or blue-to-purple color schemes. Look for `linear-gradient` with values in the `#6366f1`–`#8b5cf6` range, or CSS custom properties resolving to purple/violet. + +- **[LOW]** The 3-column feature grid: icon-in-colored-circle + bold title + 2-line description, repeated 3x symmetrically. Look for a grid/flex container with exactly 3 children that each contain a circular element + heading + paragraph. + +- **[LOW]** Icons in colored circles as section decoration. Look for elements with `border-radius: 50%` + a background color used as decorative containers for icons. + +- **[HIGH]** Centered everything: `text-align: center` on all headings, descriptions, and cards. Grep for `text-align: center` density — if >60% of text containers use center alignment, flag it. + +- **[MEDIUM]** Uniform bubbly border-radius on every element: same large radius (16px+) applied to cards, buttons, inputs, containers uniformly. Aggregate `border-radius` values — if >80% use the same value ≥16px, flag it. + +- **[MEDIUM]** Generic hero copy: "Welcome to [X]", "Unlock the power of...", "Your all-in-one solution for...", "Revolutionize your...", "Streamline your workflow". Grep HTML/JSX content for these patterns. + +### 2. Typography (4 items) + +- **[HIGH]** Body text `font-size` < 16px. Grep for `font-size` declarations on `body`, `p`, `.text`, or base styles. Values below 16px (or 1rem when base is 16px) are flagged. + +- **[HIGH]** More than 3 font families introduced in the diff. Count distinct `font-family` declarations. Flag if >3 unique families appear across changed files. + +- **[HIGH]** Heading hierarchy skipping levels: `h1` followed by `h3` without an `h2` in the same file/component. Check HTML/JSX for heading tags. + +- **[HIGH]** Blacklisted fonts: Papyrus, Comic Sans, Lobster, Impact, Jokerman. Grep `font-family` for these names. + +### 3. Spacing & Layout (4 items) + +- **[MEDIUM]** Arbitrary spacing values not on a 4px or 8px scale, when DESIGN.md specifies a spacing scale. Check `margin`, `padding`, `gap` values against the stated scale. Only flag when DESIGN.md defines a scale. + +- **[MEDIUM]** Fixed widths without responsive handling: `width: NNNpx` on containers without `max-width` or `@media` breakpoints. Risk of horizontal scroll on mobile. + +- **[MEDIUM]** Missing `max-width` on text containers: body text or paragraph containers with no `max-width` set, allowing lines >75 characters. Check for `max-width` on text wrappers. + +- **[HIGH]** `!important` in new CSS rules. Grep for `!important` in added lines. Almost always a specificity escape hatch that should be fixed properly. + +### 4. Interaction States (3 items) + +- **[MEDIUM]** Interactive elements (buttons, links, inputs) missing hover/focus states. Check if `:hover` and `:focus` or `:focus-visible` pseudo-classes exist for new interactive element styles. + +- **[HIGH]** `outline: none` or `outline: 0` without a replacement focus indicator. Grep for `outline:\s*none` or `outline:\s*0`. This removes keyboard accessibility. + +- **[LOW]** Touch targets < 44px on interactive elements. Check `min-height`/`min-width`/`padding` on buttons and links. Requires computing effective size from multiple properties — low confidence from code alone. + +### 5. DESIGN.md Violations (3 items, conditional) + +Only apply if `DESIGN.md` or `design-system.md` exists: + +- **[MEDIUM]** Colors not in the stated palette. Compare color values in changed CSS against the palette defined in DESIGN.md. + +- **[MEDIUM]** Fonts not in the stated typography section. Compare `font-family` values against DESIGN.md's font list. + +- **[MEDIUM]** Spacing values outside the stated scale. Compare `margin`/`padding`/`gap` values against DESIGN.md's spacing scale. + +--- + +## Suppressions + +Do NOT flag: +- Patterns explicitly documented in DESIGN.md as intentional choices +- Third-party/vendor CSS files (node_modules, vendor directories) +- CSS resets or normalize stylesheets +- Test fixture files +- Generated/minified CSS diff --git a/scripts/gen-skill-docs.ts b/scripts/gen-skill-docs.ts index d2e86ecf..cb807111 100644 --- a/scripts/gen-skill-docs.ts +++ b/scripts/gen-skill-docs.ts @@ -519,6 +519,45 @@ Minimum 0 per category. 11. **Show screenshots to the user.** After every \`$B screenshot\`, \`$B snapshot -a -o\`, or \`$B responsive\` command, use the Read tool on the output file(s) so the user can see them inline. For \`responsive\` (3 files), Read all three. This is critical — without it, screenshots are invisible to the user.`; } +function generateDesignReviewLite(): string { + return `## Design Review (conditional, diff-scoped) + +Check if the diff touches frontend files using \`gstack-diff-scope\`: + +\`\`\`bash +eval $(~/.claude/skills/gstack/bin/gstack-diff-scope 2>/dev/null) +\`\`\` + +**If \`SCOPE_FRONTEND=false\`:** Skip design review silently. No output. + +**If \`SCOPE_FRONTEND=true\`:** + +1. **Check for DESIGN.md.** If \`DESIGN.md\` or \`design-system.md\` exists in the repo root, read it. All design findings are calibrated against it — patterns blessed in DESIGN.md are not flagged. If not found, use universal design principles. + +2. **Read \`.claude/skills/review/design-checklist.md\`.** If the file cannot be read, skip design review with a note: "Design checklist not found — skipping design review." + +3. **Read each changed frontend file** (full file, not just diff hunks). Frontend files are identified by the patterns listed in the checklist. + +4. **Apply the design checklist** against the changed files. For each item: + - **[HIGH] mechanical CSS fix** (\`outline: none\`, \`!important\`, \`font-size < 16px\`): classify as AUTO-FIX + - **[HIGH/MEDIUM] design judgment needed**: classify as ASK + - **[LOW] intent-based detection**: present as "Possible — verify visually or run /qa-design-review" + +5. **Include findings** in the review output under a "Design Review" header, following the output format in the checklist. Design findings merge with code review findings into the same Fix-First flow. + +6. **Log the result** for the Review Readiness Dashboard: + +\`\`\`bash +eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) +mkdir -p ~/.gstack/projects/$SLUG +echo '{"skill":"design-review-lite","timestamp":"TIMESTAMP","status":"STATUS","findings":N,"auto_fixed":M}' >> ~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl +\`\`\` + +Substitute: TIMESTAMP = ISO 8601 datetime, STATUS = "clean" if 0 findings or "issues_found", N = total findings, M = auto-fixed count.`; +} + +// NOTE: design-checklist.md is a subset of this methodology for code-level detection. +// When adding items here, also update review/design-checklist.md, and vice versa. function generateDesignMethodology(): string { return `## Modes @@ -865,7 +904,7 @@ echo "---CONFIG---" ~/.claude/skills/gstack/bin/gstack-config get skip_eng_review 2>/dev/null || echo "false" \`\`\` -Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, plan-design-review). Ignore entries with timestamps older than 7 days. Display: +Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, plan-design-review, design-review-lite). Ignore entries with timestamps older than 7 days. For Design Review, show whichever is more recent between \`plan-design-review\` (full visual audit) and \`design-review-lite\` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. Display: \`\`\` +====================================================================+ @@ -1056,6 +1095,7 @@ const RESOLVERS: Record string> = { BASE_BRANCH_DETECT: generateBaseBranchDetect, QA_METHODOLOGY: generateQAMethodology, DESIGN_METHODOLOGY: generateDesignMethodology, + DESIGN_REVIEW_LITE: generateDesignReviewLite, REVIEW_DASHBOARD: generateReviewDashboard, TEST_BOOTSTRAP: generateTestBootstrap, }; diff --git a/ship/SKILL.md b/ship/SKILL.md index e2b524d9..486f32bf 100644 --- a/ship/SKILL.md +++ b/ship/SKILL.md @@ -186,7 +186,7 @@ echo "---CONFIG---" ~/.claude/skills/gstack/bin/gstack-config get skip_eng_review 2>/dev/null || echo "false" ``` -Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, plan-design-review). Ignore entries with timestamps older than 7 days. Display: +Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, plan-design-review, design-review-lite). Ignore entries with timestamps older than 7 days. For Design Review, show whichever is more recent between `plan-design-review` (full visual audit) and `design-review-lite` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. Display: ``` +====================================================================+ @@ -226,7 +226,8 @@ If the Eng Review is NOT "CLEAR": - Show that Eng Review is missing or has open issues - RECOMMENDATION: Choose C if the change is obviously trivial (< 20 lines, typo fix, config-only); Choose B for larger changes - Options: A) Ship anyway B) Abort — run /plan-eng-review first C) Change is too small to need eng review - - If CEO/Design reviews are missing, mention them as informational ("CEO Review not run — recommended for product changes") but do NOT block or recommend aborting for them + - If CEO Review is missing, mention as informational ("CEO Review not run — recommended for product changes") but do NOT block + - For Design Review: run `eval $(~/.claude/skills/gstack/bin/gstack-diff-scope 2>/dev/null)`. If `SCOPE_FRONTEND=true` and no design review (plan-design-review or design-review-lite) exists in the dashboard, mention: "Design Review not run — this PR changes frontend code. The lite design check will run automatically in Step 3.5, but consider running /plan-design-review for a full visual audit." Still never block. 3. **If the user chooses A or C,** persist the decision so future `/ship` runs on this branch skip the gate: ```bash @@ -642,6 +643,43 @@ Review the diff for structural issues that tests don't catch. - **Pass 1 (CRITICAL):** SQL & Data Safety, LLM Output Trust Boundary - **Pass 2 (INFORMATIONAL):** All remaining categories +## Design Review (conditional, diff-scoped) + +Check if the diff touches frontend files using `gstack-diff-scope`: + +```bash +eval $(~/.claude/skills/gstack/bin/gstack-diff-scope 2>/dev/null) +``` + +**If `SCOPE_FRONTEND=false`:** Skip design review silently. No output. + +**If `SCOPE_FRONTEND=true`:** + +1. **Check for DESIGN.md.** If `DESIGN.md` or `design-system.md` exists in the repo root, read it. All design findings are calibrated against it — patterns blessed in DESIGN.md are not flagged. If not found, use universal design principles. + +2. **Read `.claude/skills/review/design-checklist.md`.** If the file cannot be read, skip design review with a note: "Design checklist not found — skipping design review." + +3. **Read each changed frontend file** (full file, not just diff hunks). Frontend files are identified by the patterns listed in the checklist. + +4. **Apply the design checklist** against the changed files. For each item: + - **[HIGH] mechanical CSS fix** (`outline: none`, `!important`, `font-size < 16px`): classify as AUTO-FIX + - **[HIGH/MEDIUM] design judgment needed**: classify as ASK + - **[LOW] intent-based detection**: present as "Possible — verify visually or run /qa-design-review" + +5. **Include findings** in the review output under a "Design Review" header, following the output format in the checklist. Design findings merge with code review findings into the same Fix-First flow. + +6. **Log the result** for the Review Readiness Dashboard: + +```bash +eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) +mkdir -p ~/.gstack/projects/$SLUG +echo '{"skill":"design-review-lite","timestamp":"TIMESTAMP","status":"STATUS","findings":N,"auto_fixed":M}' >> ~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl +``` + +Substitute: TIMESTAMP = ISO 8601 datetime, STATUS = "clean" if 0 findings or "issues_found", N = total findings, M = auto-fixed count. + + Include any design findings alongside the code review findings. They follow the same Fix-First flow below. + 4. **Classify each finding as AUTO-FIX or ASK** per the Fix-First Heuristic in checklist.md. Critical findings lean toward ASK; informational lean toward AUTO-FIX. @@ -863,7 +901,11 @@ gh pr create --base --title ": " --body "$(cat <<'EOF' ## Pre-Landing Review - + + +## Design Review + + ## Eval Results diff --git a/ship/SKILL.md.tmpl b/ship/SKILL.md.tmpl index e059fc6a..f9922147 100644 --- a/ship/SKILL.md.tmpl +++ b/ship/SKILL.md.tmpl @@ -69,7 +69,8 @@ If the Eng Review is NOT "CLEAR": - Show that Eng Review is missing or has open issues - RECOMMENDATION: Choose C if the change is obviously trivial (< 20 lines, typo fix, config-only); Choose B for larger changes - Options: A) Ship anyway B) Abort — run /plan-eng-review first C) Change is too small to need eng review - - If CEO/Design reviews are missing, mention them as informational ("CEO Review not run — recommended for product changes") but do NOT block or recommend aborting for them + - If CEO Review is missing, mention as informational ("CEO Review not run — recommended for product changes") but do NOT block + - For Design Review: run `eval $(~/.claude/skills/gstack/bin/gstack-diff-scope 2>/dev/null)`. If `SCOPE_FRONTEND=true` and no design review (plan-design-review or design-review-lite) exists in the dashboard, mention: "Design Review not run — this PR changes frontend code. The lite design check will run automatically in Step 3.5, but consider running /plan-design-review for a full visual audit." Still never block. 3. **If the user chooses A or C,** persist the decision so future `/ship` runs on this branch skip the gate: ```bash @@ -334,6 +335,10 @@ Review the diff for structural issues that tests don't catch. - **Pass 1 (CRITICAL):** SQL & Data Safety, LLM Output Trust Boundary - **Pass 2 (INFORMATIONAL):** All remaining categories +{{DESIGN_REVIEW_LITE}} + + Include any design findings alongside the code review findings. They follow the same Fix-First flow below. + 4. **Classify each finding as AUTO-FIX or ASK** per the Fix-First Heuristic in checklist.md. Critical findings lean toward ASK; informational lean toward AUTO-FIX. @@ -555,7 +560,11 @@ gh pr create --base --title ": " --body "$(cat <<'EOF' ## Pre-Landing Review - + + +## Design Review + + ## Eval Results diff --git a/test/fixtures/review-eval-design-slop.css b/test/fixtures/review-eval-design-slop.css new file mode 100644 index 00000000..40e055fb --- /dev/null +++ b/test/fixtures/review-eval-design-slop.css @@ -0,0 +1,86 @@ +/* Planted design anti-patterns for E2E eval — 7 issues */ + +/* Issue 1: [HIGH] Blacklisted font (Papyrus) */ +/* Issue 2: [HIGH] Body text < 16px (14px) */ +body { + font-family: 'Papyrus', sans-serif; + font-size: 14px; + margin: 0; + padding: 0; +} + +/* Issue 5: [MEDIUM] Purple/violet gradient background */ +.hero { + background: linear-gradient(135deg, #6366f1, #8b5cf6); + text-align: center; + padding: 80px 20px; + color: white; +} + +.hero h1 { + text-align: center; + font-size: 48px; +} + +.hero p { + text-align: center; + font-size: 20px; +} + +/* Issue 7: [LOW] 3-column feature grid with icon circles */ +.features { + display: grid; + grid-template-columns: repeat(3, 1fr); + gap: 24px; + padding: 60px 40px; + text-align: center; +} + +.feature-card { + border-radius: 24px; + padding: 32px; + text-align: center; + background: #f9fafb; +} + +/* Icon in colored circle — AI slop pattern */ +.icon-circle { + width: 60px; + height: 60px; + border-radius: 50%; + background: #ede9fe; + display: flex; + align-items: center; + justify-content: center; + margin: 0 auto 16px; + font-size: 24px; +} + +/* Issue 3: [HIGH] outline: none without replacement */ +button { + outline: none; + background: #6366f1; + color: white; + border: none; + padding: 12px 24px; + border-radius: 24px; + cursor: pointer; +} + +.small-link { + font-size: 11px; + padding: 4px 8px; +} + +/* Issue 4: [HIGH] !important usage */ +.override { + color: red !important; + margin-left: 10px !important; +} + +.footer { + text-align: center; + padding: 40px; + background: #1e1b4b; + color: white; +} diff --git a/test/fixtures/review-eval-design-slop.html b/test/fixtures/review-eval-design-slop.html new file mode 100644 index 00000000..f05affd1 --- /dev/null +++ b/test/fixtures/review-eval-design-slop.html @@ -0,0 +1,41 @@ + + + + + + + Our Platform + + + +
+

Welcome to Our Platform

+

Your all-in-one solution for everything you need

+ +
+ + +
+
+
+

Feature One

+

A short description of this amazing feature that will change your life.

+
+
+
+

Feature Two

+

Another incredible capability that sets us apart from the competition.

+
+
+
+

Feature Three

+

Yet another powerful tool to streamline your workflow effortlessly.

+
+
+ + + + diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts index 30a15579..84a11da2 100644 --- a/test/helpers/touchfiles.ts +++ b/test/helpers/touchfiles.ts @@ -55,6 +55,7 @@ export const E2E_TOUCHFILES: Record = { 'review-sql-injection': ['review/**', 'test/fixtures/review-eval-vuln.rb'], 'review-enum-completeness': ['review/**', 'test/fixtures/review-eval-enum*.rb'], 'review-base-branch': ['review/**'], + 'review-design-lite': ['review/**', 'test/fixtures/review-eval-design-slop.*'], // Plan reviews 'plan-ceo-review': ['plan-ceo-review/**'], diff --git a/test/skill-e2e.test.ts b/test/skill-e2e.test.ts index 338ec2f1..6a66311b 100644 --- a/test/skill-e2e.test.ts +++ b/test/skill-e2e.test.ts @@ -636,6 +636,97 @@ The diff adds a new "returned" status to the Order model. Your job is to check i }, 120_000); }); +// --- Review: Design review lite E2E --- + +describeE2E('Review design lite E2E', () => { + let designDir: string; + + beforeAll(() => { + designDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-design-lite-')); + + const run = (cmd: string, args: string[]) => + spawnSync(cmd, args, { cwd: designDir, stdio: 'pipe', timeout: 5000 }); + + run('git', ['init']); + run('git', ['config', 'user.email', 'test@test.com']); + run('git', ['config', 'user.name', 'Test']); + + // Commit clean base on main + fs.writeFileSync(path.join(designDir, 'index.html'), '

Clean

\n'); + fs.writeFileSync(path.join(designDir, 'styles.css'), 'body { font-size: 16px; }\n'); + run('git', ['add', '.']); + run('git', ['commit', '-m', 'initial']); + + // Feature branch adds AI slop CSS + HTML + run('git', ['checkout', '-b', 'feature/add-landing-page']); + const slopCss = fs.readFileSync(path.join(ROOT, 'test', 'fixtures', 'review-eval-design-slop.css'), 'utf-8'); + const slopHtml = fs.readFileSync(path.join(ROOT, 'test', 'fixtures', 'review-eval-design-slop.html'), 'utf-8'); + fs.writeFileSync(path.join(designDir, 'styles.css'), slopCss); + fs.writeFileSync(path.join(designDir, 'landing.html'), slopHtml); + run('git', ['add', '.']); + run('git', ['commit', '-m', 'add landing page']); + + // Copy review skill files + fs.copyFileSync(path.join(ROOT, 'review', 'SKILL.md'), path.join(designDir, 'review-SKILL.md')); + fs.copyFileSync(path.join(ROOT, 'review', 'checklist.md'), path.join(designDir, 'review-checklist.md')); + fs.copyFileSync(path.join(ROOT, 'review', 'design-checklist.md'), path.join(designDir, 'review-design-checklist.md')); + fs.copyFileSync(path.join(ROOT, 'review', 'greptile-triage.md'), path.join(designDir, 'review-greptile-triage.md')); + }); + + afterAll(() => { + try { fs.rmSync(designDir, { recursive: true, force: true }); } catch {} + }); + + test('/review catches design anti-patterns in CSS/HTML diff', async () => { + const result = await runSkillTest({ + prompt: `You are in a git repo on branch feature/add-landing-page with changes against main. +Read review-SKILL.md for the review workflow instructions. +Read review-checklist.md for the code review checklist. +Read review-design-checklist.md for the design review checklist. +Run /review on the current diff (git diff main...HEAD). + +The diff adds a landing page with CSS and HTML. Check for both code issues AND design anti-patterns. +Write your review findings to ${designDir}/review-output.md + +Important: The design checklist should catch issues like blacklisted fonts, small font sizes, outline:none, !important, AI slop patterns (purple gradients, generic hero copy, 3-column feature grid), etc.`, + workingDirectory: designDir, + maxTurns: 15, + timeout: 120_000, + testName: 'review-design-lite', + runId, + }); + + logCost('/review design lite', result); + recordE2E('/review design lite', 'Review design lite E2E', result); + expect(result.exitReason).toBe('success'); + + // Verify the review caught at least 4 of 7 planted design issues + const reviewPath = path.join(designDir, 'review-output.md'); + if (fs.existsSync(reviewPath)) { + const review = fs.readFileSync(reviewPath, 'utf-8').toLowerCase(); + let detected = 0; + + // Issue 1: Blacklisted font (Papyrus) — HIGH + if (review.includes('papyrus') || review.includes('blacklisted font') || review.includes('font family')) detected++; + // Issue 2: Body text < 16px — HIGH + if (review.includes('14px') || review.includes('font-size') || review.includes('font size') || review.includes('body text')) detected++; + // Issue 3: outline: none — HIGH + if (review.includes('outline') || review.includes('focus')) detected++; + // Issue 4: !important — HIGH + if (review.includes('!important') || review.includes('important')) detected++; + // Issue 5: Purple gradient — MEDIUM + if (review.includes('gradient') || review.includes('purple') || review.includes('violet') || review.includes('#6366f1') || review.includes('#8b5cf6')) detected++; + // Issue 6: Generic hero copy — MEDIUM + if (review.includes('welcome to') || review.includes('all-in-one') || review.includes('generic') || review.includes('hero copy') || review.includes('ai slop')) detected++; + // Issue 7: 3-column feature grid — LOW + if (review.includes('3-column') || review.includes('three-column') || review.includes('feature grid') || review.includes('icon') || review.includes('circle')) detected++; + + console.log(`Design review detected ${detected}/7 planted issues`); + expect(detected).toBeGreaterThanOrEqual(4); + } + }, 150_000); +}); + // --- B6/B7/B8: Planted-bug outcome evals --- // Outcome evals also need ANTHROPIC_API_KEY for the LLM judge