diff --git a/CHANGELOG.md b/CHANGELOG.md index 49d9183..a464a27 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,18 @@ # Changelog +## [0.6.5.0] - 2026-03-18 + +### Fixed + +- **Review log no longer breaks on branch names with `/`.** Branch names like `garrytan/design-system` caused review log writes to fail because Claude Code runs multi-line bash blocks as separate shell invocations, losing the sanitized `$BRANCH` variable between commands. New `gstack-review-log` and `gstack-review-read` atomic helpers encapsulate the entire operation in a single command — slug detection, directory creation, and file I/O all happen in one shell. +- **All `eval $(gstack-slug)` blocks across 12 skill templates now use `&&` chaining** to prevent variable loss, even in templates that only need `mkdir`. + +### For contributors + +- Added `bin/gstack-review-log` and `bin/gstack-review-read` helpers with `GSTACK_HOME` env var override for testability. +- Full integration tests: temp-dir file creation, append behavior, slash sanitization, `NO_REVIEWS` fallback, `---CONFIG---` separator. +- Regression tests verify generated SKILL.md files use the new helpers. + ## [0.6.4.0] - 2026-03-17 ### Added diff --git a/CLAUDE.md b/CLAUDE.md index 0979c95..5c5a550 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -30,6 +30,17 @@ on `git diff` against the base branch. Each test declares its file dependencies llm-judge, gen-skill-docs) trigger all tests. Use `EVALS_ALL=1` or the `:all` script variants to force all tests. Run `eval:select` to preview which tests would run. +## Testing + +```bash +bun test # run before every commit — free, <2s +bun run test:evals # run before shipping — paid, diff-based (~$4/run max) +``` + +`bun test` runs skill validation, gen-skill-docs quality checks, and browse +integration tests. `bun run test:evals` runs LLM-judge quality evals and E2E +tests via `claude -p`. Both must pass before creating a PR. 
+ ## Project structure ``` @@ -94,6 +105,18 @@ Rules: - **Express conditionals as English.** Instead of nested `if/elif/else` in bash, write numbered decision steps: "1. If X, do Y. 2. Otherwise, do Z." +## Platform-agnostic design + +Skills must NEVER hardcode framework-specific commands, file patterns, or directory +structures. Instead: + +1. **Read CLAUDE.md** for project-specific config (test commands, eval commands, etc.) +2. **If missing, AskUserQuestion** — let the user tell you or let gstack search the repo +3. **Persist the answer to CLAUDE.md** so we never have to ask again + +This applies to test commands, eval commands, deploy commands, and any other +project-specific behavior. The project owns its config; gstack reads it. + ## Browser interaction When you need to interact with a browser (QA, dogfooding, cookie setup), use the diff --git a/VERSION b/VERSION index 31d34d2..9de863a 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.6.4.0 +0.6.5.0 diff --git a/bin/gstack-review-log b/bin/gstack-review-log new file mode 100755 index 0000000..ad29c17 --- /dev/null +++ b/bin/gstack-review-log @@ -0,0 +1,9 @@ +#!/usr/bin/env bash +# gstack-review-log — atomically log a review result +# Usage: gstack-review-log '{"skill":"...","timestamp":"...","status":"..."}' +set -euo pipefail +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +eval $("$SCRIPT_DIR/gstack-slug" 2>/dev/null) +GSTACK_HOME="${GSTACK_HOME:-$HOME/.gstack}" +mkdir -p "$GSTACK_HOME/projects/$SLUG" +echo "$1" >> "$GSTACK_HOME/projects/$SLUG/$BRANCH-reviews.jsonl" diff --git a/bin/gstack-review-read b/bin/gstack-review-read new file mode 100755 index 0000000..8c4650d --- /dev/null +++ b/bin/gstack-review-read @@ -0,0 +1,10 @@ +#!/usr/bin/env bash +# gstack-review-read — read review log and config for dashboard +# Usage: gstack-review-read +set -euo pipefail +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +eval $("$SCRIPT_DIR/gstack-slug" 2>/dev/null) +GSTACK_HOME="${GSTACK_HOME:-$HOME/.gstack}" +cat 
"$GSTACK_HOME/projects/$SLUG/$BRANCH-reviews.jsonl" 2>/dev/null || echo "NO_REVIEWS" +echo "---CONFIG---" +"$SCRIPT_DIR/gstack-config" get skip_eng_review 2>/dev/null || echo "false" diff --git a/design-consultation/SKILL.md b/design-consultation/SKILL.md index 8fd9cb4..e007233 100644 --- a/design-consultation/SKILL.md +++ b/design-consultation/SKILL.md @@ -153,8 +153,7 @@ ls src/ app/ pages/ components/ 2>/dev/null | head -30 Look for brainstorm output: ```bash -eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) -ls ~/.gstack/projects/$SLUG/*brainstorm* 2>/dev/null | head -5 +eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) && ls ~/.gstack/projects/$SLUG/*brainstorm* 2>/dev/null | head -5 ls .context/*brainstorm* .context/attachments/*brainstorm* 2>/dev/null | head -5 ``` diff --git a/design-consultation/SKILL.md.tmpl b/design-consultation/SKILL.md.tmpl index 141a919..435326b 100644 --- a/design-consultation/SKILL.md.tmpl +++ b/design-consultation/SKILL.md.tmpl @@ -49,8 +49,7 @@ ls src/ app/ pages/ components/ 2>/dev/null | head -30 Look for brainstorm output: ```bash -eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) -ls ~/.gstack/projects/$SLUG/*brainstorm* 2>/dev/null | head -5 +eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) && ls ~/.gstack/projects/$SLUG/*brainstorm* 2>/dev/null | head -5 ls .context/*brainstorm* .context/attachments/*brainstorm* 2>/dev/null | head -5 ``` diff --git a/design-review/SKILL.md b/design-review/SKILL.md index b06e082..4ed1769 100644 --- a/design-review/SKILL.md +++ b/design-review/SKILL.md @@ -810,8 +810,7 @@ Write the report to both local and project-scoped locations: **Project-scoped:** ```bash -eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) -mkdir -p ~/.gstack/projects/$SLUG +eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) && mkdir -p ~/.gstack/projects/$SLUG ``` Write to `~/.gstack/projects/{slug}/{user}-{branch}-design-audit-{datetime}.md` diff --git 
a/design-review/SKILL.md.tmpl b/design-review/SKILL.md.tmpl index eb8dd6b..f8f3ff4 100644 --- a/design-review/SKILL.md.tmpl +++ b/design-review/SKILL.md.tmpl @@ -208,8 +208,7 @@ Write the report to both local and project-scoped locations: **Project-scoped:** ```bash -eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) -mkdir -p ~/.gstack/projects/$SLUG +eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) && mkdir -p ~/.gstack/projects/$SLUG ``` Write to `~/.gstack/projects/{slug}/{user}-{branch}-design-audit-{datetime}.md` diff --git a/plan-ceo-review/SKILL.md b/plan-ceo-review/SKILL.md index ce799fe..50af907 100644 --- a/plan-ceo-review/SKILL.md +++ b/plan-ceo-review/SKILL.md @@ -212,8 +212,8 @@ Run the following commands: git log --oneline -30 # Recent history git diff --stat # What's already changed git stash list # Any stashed work -grep -r "TODO\|FIXME\|HACK\|XXX" --include="*.rb" --include="*.js" -l -find . -name "*.rb" -newer Gemfile.lock | head -20 # Recently touched files +grep -r "TODO\|FIXME\|HACK\|XXX" -l --exclude-dir=node_modules --exclude-dir=vendor --exclude-dir=.git | head -30 +git log --since=30.days --name-only --format="" | sort | uniq -c | sort -rn | head -20 # Recently touched files ``` Then read CLAUDE.md, TODOS.md, and any existing architecture docs. When reading TODOS.md, specifically: * Note any TODOs this plan touches, blocks, or unlocks @@ -284,8 +284,7 @@ Describe the ideal end state of this system 12 months from now. Does this plan m After the opt-in/cherry-pick ceremony, write the plan to disk so the vision and decisions survive beyond this conversation. Only run this step for EXPANSION and SELECTIVE EXPANSION modes. ```bash -eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) -mkdir -p ~/.gstack/projects/$SLUG/ceo-plans +eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) && mkdir -p ~/.gstack/projects/$SLUG/ceo-plans ``` Before writing, check for existing CEO plans in the ceo-plans/ directory. 
If any are >30 days old or their branch has been merged/deleted, offer to archive them: @@ -398,24 +397,24 @@ For every new method, service, or codepath that can fail, fill in this table: ``` METHOD/CODEPATH | WHAT CAN GO WRONG | EXCEPTION CLASS -------------------------|-----------------------------|----------------- - ExampleService#call | API timeout | Faraday::TimeoutError + ExampleService.call | API timeout | TimeoutError | API returns 429 | RateLimitError - | API returns malformed JSON | JSON::ParserError - | DB connection pool exhausted| ActiveRecord::ConnectionTimeoutError - | Record not found | ActiveRecord::RecordNotFound + | API returns malformed JSON | JSONParseError + | DB connection pool exhausted| ConnectionPoolExhausted + | Record not found | RecordNotFound -------------------------|-----------------------------|----------------- EXCEPTION CLASS | RESCUED? | RESCUE ACTION | USER SEES -----------------------------|-----------|------------------------|------------------ - Faraday::TimeoutError | Y | Retry 2x, then raise | "Service temporarily unavailable" + TimeoutError | Y | Retry 2x, then raise | "Service temporarily unavailable" RateLimitError | Y | Backoff + retry | Nothing (transparent) - JSON::ParserError | N ← GAP | — | 500 error ← BAD - ConnectionTimeoutError | N ← GAP | — | 500 error ← BAD - ActiveRecord::RecordNotFound | Y | Return nil, log warning | "Not found" message + JSONParseError | N ← GAP | — | 500 error ← BAD + ConnectionPoolExhausted | N ← GAP | — | 500 error ← BAD + RecordNotFound | Y | Return nil, log warning | "Not found" message ``` Rules for this section: -* `rescue StandardError` is ALWAYS a smell. Name the specific exceptions. -* `rescue => e` with only `Rails.logger.error(e.message)` is insufficient. Log the full context: what was being attempted, with what arguments, for what user/request. +* Catching all errors generically (`rescue StandardError`, `catch (Exception e)`, `except Exception`) is ALWAYS a smell. 
Name the specific exceptions. +* Logging only the error message is insufficient. Log the full context: what was being attempted, with what arguments, for what user/request. * Every rescued error must either: retry with backoff, degrade gracefully with a user-visible message, or re-raise with added context. "Swallow and continue" is almost never acceptable. * For each GAP (unrescued error that should be rescued): specify the rescue action and what the user should see. * For LLM/AI service calls specifically: what happens when the response is malformed? When it's empty? When it hallucinates invalid JSON? When the model returns a refusal? Each of these is a distinct failure mode. @@ -712,9 +711,7 @@ If any AskUserQuestion goes unanswered, note it here. Never silently default. After producing the Completion Summary above, persist the review result: ```bash -eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) -mkdir -p ~/.gstack/projects/$SLUG -echo '{"skill":"plan-ceo-review","timestamp":"TIMESTAMP","status":"STATUS","unresolved":N,"critical_gaps":N,"mode":"MODE"}' >> ~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl +~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"plan-ceo-review","timestamp":"TIMESTAMP","status":"STATUS","unresolved":N,"critical_gaps":N,"mode":"MODE"}' ``` Before running this command, substitute the placeholder values from the Completion Summary you just produced: @@ -729,10 +726,7 @@ Before running this command, substitute the placeholder values from the Completi After completing the review, read the review log and config to display the dashboard. ```bash -eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) -cat ~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl 2>/dev/null || echo "NO_REVIEWS" -echo "---CONFIG---" -~/.claude/skills/gstack/bin/gstack-config get skip_eng_review 2>/dev/null || echo "false" +~/.claude/skills/gstack/bin/gstack-review-read ``` Parse the output. 
Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, plan-design-review, design-review-lite). Ignore entries with timestamps older than 7 days. For Design Review, show whichever is more recent between `plan-design-review` (full visual audit) and `design-review-lite` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. Display: diff --git a/plan-ceo-review/SKILL.md.tmpl b/plan-ceo-review/SKILL.md.tmpl index 1a8b065..adf7b92 100644 --- a/plan-ceo-review/SKILL.md.tmpl +++ b/plan-ceo-review/SKILL.md.tmpl @@ -91,8 +91,8 @@ Run the following commands: git log --oneline -30 # Recent history git diff --stat # What's already changed git stash list # Any stashed work -grep -r "TODO\|FIXME\|HACK\|XXX" --include="*.rb" --include="*.js" -l -find . -name "*.rb" -newer Gemfile.lock | head -20 # Recently touched files +grep -r "TODO\|FIXME\|HACK\|XXX" -l --exclude-dir=node_modules --exclude-dir=vendor --exclude-dir=.git | head -30 +git log --since=30.days --name-only --format="" | sort | uniq -c | sort -rn | head -20 # Recently touched files ``` Then read CLAUDE.md, TODOS.md, and any existing architecture docs. When reading TODOS.md, specifically: * Note any TODOs this plan touches, blocks, or unlocks @@ -163,8 +163,7 @@ Describe the ideal end state of this system 12 months from now. Does this plan m After the opt-in/cherry-pick ceremony, write the plan to disk so the vision and decisions survive beyond this conversation. Only run this step for EXPANSION and SELECTIVE EXPANSION modes. ```bash -eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) -mkdir -p ~/.gstack/projects/$SLUG/ceo-plans +eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) && mkdir -p ~/.gstack/projects/$SLUG/ceo-plans ``` Before writing, check for existing CEO plans in the ceo-plans/ directory. 
If any are >30 days old or their branch has been merged/deleted, offer to archive them: @@ -277,24 +276,24 @@ For every new method, service, or codepath that can fail, fill in this table: ``` METHOD/CODEPATH | WHAT CAN GO WRONG | EXCEPTION CLASS -------------------------|-----------------------------|----------------- - ExampleService#call | API timeout | Faraday::TimeoutError + ExampleService.call | API timeout | TimeoutError | API returns 429 | RateLimitError - | API returns malformed JSON | JSON::ParserError - | DB connection pool exhausted| ActiveRecord::ConnectionTimeoutError - | Record not found | ActiveRecord::RecordNotFound + | API returns malformed JSON | JSONParseError + | DB connection pool exhausted| ConnectionPoolExhausted + | Record not found | RecordNotFound -------------------------|-----------------------------|----------------- EXCEPTION CLASS | RESCUED? | RESCUE ACTION | USER SEES -----------------------------|-----------|------------------------|------------------ - Faraday::TimeoutError | Y | Retry 2x, then raise | "Service temporarily unavailable" + TimeoutError | Y | Retry 2x, then raise | "Service temporarily unavailable" RateLimitError | Y | Backoff + retry | Nothing (transparent) - JSON::ParserError | N ← GAP | — | 500 error ← BAD - ConnectionTimeoutError | N ← GAP | — | 500 error ← BAD - ActiveRecord::RecordNotFound | Y | Return nil, log warning | "Not found" message + JSONParseError | N ← GAP | — | 500 error ← BAD + ConnectionPoolExhausted | N ← GAP | — | 500 error ← BAD + RecordNotFound | Y | Return nil, log warning | "Not found" message ``` Rules for this section: -* `rescue StandardError` is ALWAYS a smell. Name the specific exceptions. -* `rescue => e` with only `Rails.logger.error(e.message)` is insufficient. Log the full context: what was being attempted, with what arguments, for what user/request. +* Catching all errors generically (`rescue StandardError`, `catch (Exception e)`, `except Exception`) is ALWAYS a smell. 
Name the specific exceptions. +* Logging only the error message is insufficient. Log the full context: what was being attempted, with what arguments, for what user/request. * Every rescued error must either: retry with backoff, degrade gracefully with a user-visible message, or re-raise with added context. "Swallow and continue" is almost never acceptable. * For each GAP (unrescued error that should be rescued): specify the rescue action and what the user should see. * For LLM/AI service calls specifically: what happens when the response is malformed? When it's empty? When it hallucinates invalid JSON? When the model returns a refusal? Each of these is a distinct failure mode. @@ -591,9 +590,7 @@ If any AskUserQuestion goes unanswered, note it here. Never silently default. After producing the Completion Summary above, persist the review result: ```bash -eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) -mkdir -p ~/.gstack/projects/$SLUG -echo '{"skill":"plan-ceo-review","timestamp":"TIMESTAMP","status":"STATUS","unresolved":N,"critical_gaps":N,"mode":"MODE"}' >> ~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl +~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"plan-ceo-review","timestamp":"TIMESTAMP","status":"STATUS","unresolved":N,"critical_gaps":N,"mode":"MODE"}' ``` Before running this command, substitute the placeholder values from the Completion Summary you just produced: diff --git a/plan-design-review/SKILL.md b/plan-design-review/SKILL.md index 507952c..7c94a07 100644 --- a/plan-design-review/SKILL.md +++ b/plan-design-review/SKILL.md @@ -387,9 +387,7 @@ If any AskUserQuestion goes unanswered, note it here. 
Never silently default to After producing the Completion Summary above, persist the review result: ```bash -eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) -mkdir -p ~/.gstack/projects/$SLUG -echo '{"skill":"plan-design-review","timestamp":"TIMESTAMP","status":"STATUS","overall_score":N,"unresolved":N,"decisions_made":N}' >> ~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl +~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"plan-design-review","timestamp":"TIMESTAMP","status":"STATUS","overall_score":N,"unresolved":N,"decisions_made":N}' ``` Substitute values from the Completion Summary: @@ -404,10 +402,7 @@ Substitute values from the Completion Summary: After completing the review, read the review log and config to display the dashboard. ```bash -eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) -cat ~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl 2>/dev/null || echo "NO_REVIEWS" -echo "---CONFIG---" -~/.claude/skills/gstack/bin/gstack-config get skip_eng_review 2>/dev/null || echo "false" +~/.claude/skills/gstack/bin/gstack-review-read ``` Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, plan-design-review, design-review-lite). Ignore entries with timestamps older than 7 days. For Design Review, show whichever is more recent between `plan-design-review` (full visual audit) and `design-review-lite` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. Display: diff --git a/plan-design-review/SKILL.md.tmpl b/plan-design-review/SKILL.md.tmpl index f8f5221..043a2b6 100644 --- a/plan-design-review/SKILL.md.tmpl +++ b/plan-design-review/SKILL.md.tmpl @@ -266,9 +266,7 @@ If any AskUserQuestion goes unanswered, note it here. 
Never silently default to After producing the Completion Summary above, persist the review result: ```bash -eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) -mkdir -p ~/.gstack/projects/$SLUG -echo '{"skill":"plan-design-review","timestamp":"TIMESTAMP","status":"STATUS","overall_score":N,"unresolved":N,"decisions_made":N}' >> ~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl +~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"plan-design-review","timestamp":"TIMESTAMP","status":"STATUS","overall_score":N,"unresolved":N,"decisions_made":N}' ``` Substitute values from the Completion Summary: diff --git a/plan-eng-review/SKILL.md b/plan-eng-review/SKILL.md index 48fe723..896e43f 100644 --- a/plan-eng-review/SKILL.md +++ b/plan-eng-review/SKILL.md @@ -205,7 +205,7 @@ Evaluate: **STOP.** For each issue found in this section, call AskUserQuestion individually. One issue per call. Present options, state your recommendation, explain WHY. Do NOT batch multiple issues into one AskUserQuestion. Only proceed to the next section after ALL issues in this section are resolved. ### 3. Test review -Make a diagram of all new UX, new data flow, new codepaths, and new branching if statements or outcomes. For each, note what is new about the features discussed in this branch and plan. Then, for each new item in the diagram, make sure there is a JS or Rails test. +Make a diagram of all new UX, new data flow, new codepaths, and new branching if statements or outcomes. For each, note what is new about the features discussed in this branch and plan. Then, for each new item in the diagram, make sure there is a corresponding test. For LLM/prompt changes: check the "Prompt/LLM changes" file patterns listed in CLAUDE.md. If this plan touches ANY of those patterns, state which eval suites must be run, which cases should be added, and what baselines to compare against. Then use AskUserQuestion to confirm the eval scope with the user. 
@@ -216,10 +216,9 @@ For LLM/prompt changes: check the "Prompt/LLM changes" file patterns listed in C After producing the test diagram, write a test plan artifact to the project directory so `/qa` and `/qa-only` can consume it as primary test input (replacing the lossy git-diff heuristic): ```bash -eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) +eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) && mkdir -p ~/.gstack/projects/$SLUG USER=$(whoami) DATETIME=$(date +%Y%m%d-%H%M%S) -mkdir -p ~/.gstack/projects/$SLUG ``` Write to `~/.gstack/projects/{slug}/{user}-{branch}-test-plan-{datetime}.md`: @@ -325,9 +324,7 @@ Check the git log for this branch. If there are prior commits suggesting a previ After producing the Completion Summary above, persist the review result: ```bash -eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) -mkdir -p ~/.gstack/projects/$SLUG -echo '{"skill":"plan-eng-review","timestamp":"TIMESTAMP","status":"STATUS","unresolved":N,"critical_gaps":N,"mode":"MODE"}' >> ~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl +~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"plan-eng-review","timestamp":"TIMESTAMP","status":"STATUS","unresolved":N,"critical_gaps":N,"mode":"MODE"}' ``` Substitute values from the Completion Summary: @@ -342,10 +339,7 @@ Substitute values from the Completion Summary: After completing the review, read the review log and config to display the dashboard. ```bash -eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) -cat ~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl 2>/dev/null || echo "NO_REVIEWS" -echo "---CONFIG---" -~/.claude/skills/gstack/bin/gstack-config get skip_eng_review 2>/dev/null || echo "false" +~/.claude/skills/gstack/bin/gstack-review-read ``` Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, plan-design-review, design-review-lite). Ignore entries with timestamps older than 7 days. 
For Design Review, show whichever is more recent between `plan-design-review` (full visual audit) and `design-review-lite` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. Display: diff --git a/plan-eng-review/SKILL.md.tmpl b/plan-eng-review/SKILL.md.tmpl index 91f2471..e5eeec5 100644 --- a/plan-eng-review/SKILL.md.tmpl +++ b/plan-eng-review/SKILL.md.tmpl @@ -101,7 +101,7 @@ Evaluate: **STOP.** For each issue found in this section, call AskUserQuestion individually. One issue per call. Present options, state your recommendation, explain WHY. Do NOT batch multiple issues into one AskUserQuestion. Only proceed to the next section after ALL issues in this section are resolved. ### 3. Test review -Make a diagram of all new UX, new data flow, new codepaths, and new branching if statements or outcomes. For each, note what is new about the features discussed in this branch and plan. Then, for each new item in the diagram, make sure there is a JS or Rails test. +Make a diagram of all new UX, new data flow, new codepaths, and new branching if statements or outcomes. For each, note what is new about the features discussed in this branch and plan. Then, for each new item in the diagram, make sure there is a corresponding test. For LLM/prompt changes: check the "Prompt/LLM changes" file patterns listed in CLAUDE.md. If this plan touches ANY of those patterns, state which eval suites must be run, which cases should be added, and what baselines to compare against. Then use AskUserQuestion to confirm the eval scope with the user. 
@@ -112,10 +112,9 @@ For LLM/prompt changes: check the "Prompt/LLM changes" file patterns listed in C After producing the test diagram, write a test plan artifact to the project directory so `/qa` and `/qa-only` can consume it as primary test input (replacing the lossy git-diff heuristic): ```bash -eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) +eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) && mkdir -p ~/.gstack/projects/$SLUG USER=$(whoami) DATETIME=$(date +%Y%m%d-%H%M%S) -mkdir -p ~/.gstack/projects/$SLUG ``` Write to `~/.gstack/projects/{slug}/{user}-{branch}-test-plan-{datetime}.md`: @@ -221,9 +220,7 @@ Check the git log for this branch. If there are prior commits suggesting a previ After producing the Completion Summary above, persist the review result: ```bash -eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) -mkdir -p ~/.gstack/projects/$SLUG -echo '{"skill":"plan-eng-review","timestamp":"TIMESTAMP","status":"STATUS","unresolved":N,"critical_gaps":N,"mode":"MODE"}' >> ~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl +~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"plan-eng-review","timestamp":"TIMESTAMP","status":"STATUS","unresolved":N,"critical_gaps":N,"mode":"MODE"}' ``` Substitute values from the Completion Summary: diff --git a/qa-only/SKILL.md b/qa-only/SKILL.md index 594979b..df16bd6 100644 --- a/qa-only/SKILL.md +++ b/qa-only/SKILL.md @@ -173,8 +173,7 @@ Before falling back to git diff heuristics, check for richer test plan sources: 1. **Project-scoped test plans:** Check `~/.gstack/projects/` for recent `*-test-plan-*.md` files for this repo ```bash - eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) - ls -t ~/.gstack/projects/$SLUG/*-test-plan-*.md 2>/dev/null | head -1 + eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) && ls -t ~/.gstack/projects/$SLUG/*-test-plan-*.md 2>/dev/null | head -1 ``` 2. 
**Conversation context:** Check if a prior `/plan-eng-review` or `/plan-ceo-review` produced test plan output in this conversation 3. **Use whichever source is richer.** Fall back to git diff analysis only if neither is available. @@ -466,8 +465,7 @@ Write the report to both local and project-scoped locations: **Project-scoped:** Write test outcome artifact for cross-session context: ```bash -eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) -mkdir -p ~/.gstack/projects/$SLUG +eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) && mkdir -p ~/.gstack/projects/$SLUG ``` Write to `~/.gstack/projects/{slug}/{user}-{branch}-test-outcome-{datetime}.md` diff --git a/qa-only/SKILL.md.tmpl b/qa-only/SKILL.md.tmpl index 831e71e..9679ef8 100644 --- a/qa-only/SKILL.md.tmpl +++ b/qa-only/SKILL.md.tmpl @@ -52,8 +52,7 @@ Before falling back to git diff heuristics, check for richer test plan sources: 1. **Project-scoped test plans:** Check `~/.gstack/projects/` for recent `*-test-plan-*.md` files for this repo ```bash - eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) - ls -t ~/.gstack/projects/$SLUG/*-test-plan-*.md 2>/dev/null | head -1 + eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) && ls -t ~/.gstack/projects/$SLUG/*-test-plan-*.md 2>/dev/null | head -1 ``` 2. **Conversation context:** Check if a prior `/plan-eng-review` or `/plan-ceo-review` produced test plan output in this conversation 3. **Use whichever source is richer.** Fall back to git diff analysis only if neither is available. 
@@ -72,8 +71,7 @@ Write the report to both local and project-scoped locations: **Project-scoped:** Write test outcome artifact for cross-session context: ```bash -eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) -mkdir -p ~/.gstack/projects/$SLUG +eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) && mkdir -p ~/.gstack/projects/$SLUG ``` Write to `~/.gstack/projects/{slug}/{user}-{branch}-test-outcome-{datetime}.md` diff --git a/qa/SKILL.md b/qa/SKILL.md index 10e5071..d094fa6 100644 --- a/qa/SKILL.md +++ b/qa/SKILL.md @@ -366,8 +366,7 @@ Before falling back to git diff heuristics, check for richer test plan sources: 1. **Project-scoped test plans:** Check `~/.gstack/projects/` for recent `*-test-plan-*.md` files for this repo ```bash - eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) - ls -t ~/.gstack/projects/$SLUG/*-test-plan-*.md 2>/dev/null | head -1 + eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) && ls -t ~/.gstack/projects/$SLUG/*-test-plan-*.md 2>/dev/null | head -1 ``` 2. **Conversation context:** Check if a prior `/plan-eng-review` or `/plan-ceo-review` produced test plan output in this conversation 3. **Use whichever source is richer.** Fall back to git diff analysis only if neither is available. @@ -827,8 +826,7 @@ Write the report to both local and project-scoped locations: **Project-scoped:** Write test outcome artifact for cross-session context: ```bash -eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) -mkdir -p ~/.gstack/projects/$SLUG +eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) && mkdir -p ~/.gstack/projects/$SLUG ``` Write to `~/.gstack/projects/{slug}/{user}-{branch}-test-outcome-{datetime}.md` diff --git a/qa/SKILL.md.tmpl b/qa/SKILL.md.tmpl index bd94deb..ca8e96d 100644 --- a/qa/SKILL.md.tmpl +++ b/qa/SKILL.md.tmpl @@ -77,8 +77,7 @@ Before falling back to git diff heuristics, check for richer test plan sources: 1. 
**Project-scoped test plans:** Check `~/.gstack/projects/` for recent `*-test-plan-*.md` files for this repo ```bash - eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) - ls -t ~/.gstack/projects/$SLUG/*-test-plan-*.md 2>/dev/null | head -1 + eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) && ls -t ~/.gstack/projects/$SLUG/*-test-plan-*.md 2>/dev/null | head -1 ``` 2. **Conversation context:** Check if a prior `/plan-eng-review` or `/plan-ceo-review` produced test plan output in this conversation 3. **Use whichever source is richer.** Fall back to git diff analysis only if neither is available. @@ -265,8 +264,7 @@ Write the report to both local and project-scoped locations: **Project-scoped:** Write test outcome artifact for cross-session context: ```bash -eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) -mkdir -p ~/.gstack/projects/$SLUG +eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) && mkdir -p ~/.gstack/projects/$SLUG ``` Write to `~/.gstack/projects/{slug}/{user}-{branch}-test-outcome-{datetime}.md` diff --git a/retro/SKILL.md b/retro/SKILL.md index 71eab98..36c7af1 100644 --- a/retro/SKILL.md +++ b/retro/SKILL.md @@ -240,7 +240,7 @@ Then show a **per-author leaderboard** immediately below: ``` Contributor Commits +/- Top area You (garry) 32 +2400/-300 browse/ -alice 12 +800/-150 app/services/ +alice 12 +800/-150 src/services/ bob 3 +120/-40 tests/ ``` @@ -325,7 +325,7 @@ From commit diffs, estimate PR sizes and bucket them: ### Step 8: Focus Score + Ship of the Week -**Focus score:** Calculate the percentage of commits touching the single most-changed top-level directory (e.g., `app/services/`, `app/views/`). Higher score = deeper focused work. Lower score = scattered context-switching. Report as: "Focus score: 62% (app/services/)" +**Focus score:** Calculate the percentage of commits touching the single most-changed top-level directory (e.g., `src/`, `lib/`, `components/`). Higher score = deeper focused work. 
Lower score = scattered context-switching. Report as: "Focus score: 62% (src/services/)" **Ship of the week:** Auto-identify the single highest-LOC PR in the window. Highlight it: - PR number and title @@ -443,7 +443,7 @@ Use the Write tool to save the JSON file with this schema: }, "authors": { "Garry Tan": { "commits": 32, "insertions": 2400, "deletions": 300, "test_ratio": 0.41, "top_area": "browse/" }, - "Alice": { "commits": 12, "insertions": 800, "deletions": 150, "test_ratio": 0.35, "top_area": "app/services/" } + "Alice": { "commits": 12, "insertions": 800, "deletions": 150, "test_ratio": 0.35, "top_area": "src/services/" } }, "version_range": ["1.16.0.0", "1.16.1.0"], "streak_days": 47, diff --git a/retro/SKILL.md.tmpl b/retro/SKILL.md.tmpl index bfbc200..487a8ac 100644 --- a/retro/SKILL.md.tmpl +++ b/retro/SKILL.md.tmpl @@ -136,7 +136,7 @@ Then show a **per-author leaderboard** immediately below: ``` Contributor Commits +/- Top area You (garry) 32 +2400/-300 browse/ -alice 12 +800/-150 app/services/ +alice 12 +800/-150 src/services/ bob 3 +120/-40 tests/ ``` @@ -221,7 +221,7 @@ From commit diffs, estimate PR sizes and bucket them: ### Step 8: Focus Score + Ship of the Week -**Focus score:** Calculate the percentage of commits touching the single most-changed top-level directory (e.g., `app/services/`, `app/views/`). Higher score = deeper focused work. Lower score = scattered context-switching. Report as: "Focus score: 62% (app/services/)" +**Focus score:** Calculate the percentage of commits touching the single most-changed top-level directory (e.g., `src/`, `lib/`, `components/`). Higher score = deeper focused work. Lower score = scattered context-switching. Report as: "Focus score: 62% (src/services/)" **Ship of the week:** Auto-identify the single highest-LOC PR in the window. 
Highlight it: - PR number and title @@ -339,7 +339,7 @@ Use the Write tool to save the JSON file with this schema: }, "authors": { "Garry Tan": { "commits": 32, "insertions": 2400, "deletions": 300, "test_ratio": 0.41, "top_area": "browse/" }, - "Alice": { "commits": 12, "insertions": 800, "deletions": 150, "test_ratio": 0.35, "top_area": "app/services/" } + "Alice": { "commits": 12, "insertions": 800, "deletions": 150, "test_ratio": 0.35, "top_area": "src/services/" } }, "version_range": ["1.16.0.0", "1.16.1.0"], "streak_days": 47, diff --git a/review/SKILL.md b/review/SKILL.md index 3a14a9d..31a44f9 100644 --- a/review/SKILL.md +++ b/review/SKILL.md @@ -226,9 +226,7 @@ eval $(~/.claude/skills/gstack/bin/gstack-diff-scope 2>/dev/null) 6. **Log the result** for the Review Readiness Dashboard: ```bash -eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) -mkdir -p ~/.gstack/projects/$SLUG -echo '{"skill":"design-review-lite","timestamp":"TIMESTAMP","status":"STATUS","findings":N,"auto_fixed":M}' >> ~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl +~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"design-review-lite","timestamp":"TIMESTAMP","status":"STATUS","findings":N,"auto_fixed":M}' ``` Substitute: TIMESTAMP = ISO 8601 datetime, STATUS = "clean" if 0 findings or "issues_found", N = total findings, M = auto-fixed count. @@ -266,11 +264,11 @@ Example format: ``` I auto-fixed 5 issues. 2 need your input: -1. [CRITICAL] app/models/post.rb:42 — Race condition in status transition +1. [CRITICAL] src/models/post.ts:42 — Race condition in status transition Fix: Add `WHERE status = 'draft'` to the UPDATE → A) Fix B) Skip -2. [INFORMATIONAL] app/services/generator.rb:88 — LLM output not type-checked before DB write +2. 
[INFORMATIONAL] src/services/generator.py:88 — LLM output not type-checked before DB write Fix: Add JSON schema validation → A) Fix B) Skip diff --git a/review/SKILL.md.tmpl b/review/SKILL.md.tmpl index c1d3fae..07063bb 100644 --- a/review/SKILL.md.tmpl +++ b/review/SKILL.md.tmpl @@ -112,11 +112,11 @@ Example format: ``` I auto-fixed 5 issues. 2 need your input: -1. [CRITICAL] app/models/post.rb:42 — Race condition in status transition +1. [CRITICAL] src/models/post.ts:42 — Race condition in status transition Fix: Add `WHERE status = 'draft'` to the UPDATE → A) Fix B) Skip -2. [INFORMATIONAL] app/services/generator.rb:88 — LLM output not type-checked before DB write +2. [INFORMATIONAL] src/services/generator.py:88 — LLM output not type-checked before DB write Fix: Add JSON schema validation → A) Fix B) Skip diff --git a/review/checklist.md b/review/checklist.md index 282c994..2a758b4 100644 --- a/review/checklist.md +++ b/review/checklist.md @@ -35,16 +35,16 @@ Be terse. For each issue: one line describing the problem, one line with the fix ### Pass 1 — CRITICAL #### SQL & Data Safety -- String interpolation in SQL (even if values are `.to_i`/`.to_f` — use `sanitize_sql_array` or Arel) -- TOCTOU races: check-then-set patterns that should be atomic `WHERE` + `update_all` -- `update_column`/`update_columns` bypassing validations on fields that have or should have constraints -- N+1 queries: `.includes()` missing for associations used in loops/views (especially avatar, attachments) +- String interpolation in SQL — use parameterized queries (Rails: `sanitize_sql_array`/Arel; Node: prepared statements; Python: parameterized queries; Go: `db.Query` with `?` placeholders) +- TOCTOU races: check-then-set patterns that should be atomic `WHERE` + `UPDATE` +- Bypassing model validations for direct DB writes (Rails: `update_column`/`update_columns`; Django: `QuerySet.update()`; Prisma: raw queries) +- N+1 queries: missing eager loading for associations used in loops/views 
(Rails: `.includes()`; SQLAlchemy: `joinedload()`; Prisma: `include`; Go: manual batch query) #### Race Conditions & Concurrency -- Read-check-write without uniqueness constraint or `rescue RecordNotUnique; retry` (e.g., `where(hash:).first` then `save!` without handling concurrent insert) -- `find_or_create_by` on columns without unique DB index — concurrent calls can create duplicates +- Read-check-write without uniqueness constraint or catch-duplicate-key-and-retry (e.g., find-then-insert without handling concurrent insert) +- Find-or-create without unique DB index — concurrent calls can create duplicates - Status transitions that don't use atomic `WHERE old_status = ? UPDATE SET new_status` — concurrent updates can skip or double-apply transitions -- `html_safe` on user-controlled data (XSS) — check any `.html_safe`, `raw()`, or string interpolation into `html_safe` output +- Unsafe HTML rendering of user-controlled data (XSS) — check for unescaped output (Rails: `.html_safe`/`raw()`; React: `dangerouslySetInnerHTML`; Vue: `v-html`; Django: `|safe`/`mark_safe`) #### LLM Output Trust Boundary - LLM-generated values (emails, URLs, names) written to DB or passed to mailers without format validation. Add lightweight guards (`EMAIL_REGEXP`, `URI.parse`, `.strip`) before persisting. @@ -141,7 +141,7 @@ the agent auto-fixes a finding or asks the user. 
``` AUTO-FIX (agent fixes without asking): ASK (needs human judgment): ├─ Dead code / unused variables ├─ Security (auth, XSS, injection) -├─ N+1 queries (missing .includes()) ├─ Race conditions +├─ N+1 queries (missing eager loading) ├─ Race conditions ├─ Stale comments contradicting code ├─ Design decisions ├─ Magic numbers → named constants ├─ Large fixes (>20 lines) ├─ Missing LLM output validation ├─ Enum completeness diff --git a/scripts/gen-skill-docs.ts b/scripts/gen-skill-docs.ts index 687143c..07d2240 100644 --- a/scripts/gen-skill-docs.ts +++ b/scripts/gen-skill-docs.ts @@ -548,9 +548,7 @@ eval $(~/.claude/skills/gstack/bin/gstack-diff-scope 2>/dev/null) 6. **Log the result** for the Review Readiness Dashboard: \`\`\`bash -eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) -mkdir -p ~/.gstack/projects/$SLUG -echo '{"skill":"design-review-lite","timestamp":"TIMESTAMP","status":"STATUS","findings":N,"auto_fixed":M}' >> ~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl +~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"design-review-lite","timestamp":"TIMESTAMP","status":"STATUS","findings":N,"auto_fixed":M}' \`\`\` Substitute: TIMESTAMP = ISO 8601 datetime, STATUS = "clean" if 0 findings or "issues_found", N = total findings, M = auto-fixed count.`; @@ -898,10 +896,7 @@ function generateReviewDashboard(): string { After completing the review, read the review log and config to display the dashboard. \`\`\`bash -eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) -cat ~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl 2>/dev/null || echo "NO_REVIEWS" -echo "---CONFIG---" -~/.claude/skills/gstack/bin/gstack-config get skip_eng_review 2>/dev/null || echo "false" +~/.claude/skills/gstack/bin/gstack-review-read \`\`\` Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, plan-design-review, design-review-lite). Ignore entries with timestamps older than 7 days. 
For Design Review, show whichever is more recent between \`plan-design-review\` (full visual audit) and \`design-review-lite\` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. Display: diff --git a/ship/SKILL.md b/ship/SKILL.md index 875845d..56c4a19 100644 --- a/ship/SKILL.md +++ b/ship/SKILL.md @@ -180,10 +180,7 @@ You are running the `/ship` workflow. This is a **non-interactive, fully automat After completing the review, read the review log and config to display the dashboard. ```bash -eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) -cat ~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl 2>/dev/null || echo "NO_REVIEWS" -echo "---CONFIG---" -~/.claude/skills/gstack/bin/gstack-config get skip_eng_review 2>/dev/null || echo "false" +~/.claude/skills/gstack/bin/gstack-review-read ``` Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, plan-design-review, design-review-lite). Ignore entries with timestamps older than 7 days. For Design Review, show whichever is more recent between `plan-design-review` (full visual audit) and `design-review-lite` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. Display: @@ -217,8 +214,7 @@ If the Eng Review is NOT "CLEAR": 1. **Check for a prior override on this branch:** ```bash - eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) - grep '"skill":"ship-review-override"' ~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl 2>/dev/null || echo "NO_OVERRIDE" + ~/.claude/skills/gstack/bin/gstack-review-read | grep '"skill":"ship-review-override"' || echo "NO_OVERRIDE" ``` If an override exists, display the dashboard and note "Review gate previously accepted — continuing." Do NOT ask again. @@ -231,8 +227,7 @@ If the Eng Review is NOT "CLEAR": 3. 
**If the user chooses A or C,** persist the decision so future `/ship` runs on this branch skip the gate: ```bash - eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) - echo '{"skill":"ship-review-override","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'","decision":"USER_CHOICE"}' >> ~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl + ~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"ship-review-override","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'","decision":"USER_CHOICE"}' ``` Substitute USER_CHOICE with "ship_anyway" or "not_relevant". @@ -246,7 +241,7 @@ Fetch and merge the base branch into the feature branch so tests run against the git fetch origin && git merge origin/ --no-edit ``` -**If there are merge conflicts:** Try to auto-resolve if they are simple (VERSION, schema.rb, CHANGELOG ordering). If conflicts are complex or ambiguous, **STOP** and show them. +**If there are merge conflicts:** Try to auto-resolve if they are simple (VERSION, CHANGELOG ordering). If conflicts are complex or ambiguous, **STOP** and show them. **If already up to date:** Continue silently. @@ -411,85 +406,64 @@ Only commit if there are changes. Stage all bootstrap files (config, test direct ## Step 3: Run tests (on merged code) -**Do NOT run `RAILS_ENV=test bin/rails db:migrate`** — `bin/test-lane` already calls -`db:test:prepare` internally, which loads the schema into the correct lane database. -Running bare test migrations without INSTANCE hits an orphan DB and corrupts structure.sql. +Read CLAUDE.md. Look for a section about testing — `## Testing`, `## Test Commands`, +`## Tests`, or test commands in `## Commands`. Extract all test commands (lines in +bash code blocks or command tables that run tests — e.g., `bun test`, `npm test`, `pytest`, `go test ./...`, `cargo test`, `bin/rails test`). 
-Run both test suites in parallel: +Run all discovered test commands in parallel, each piped to a unique /tmp file: ```bash -bin/test-lane 2>&1 | tee /tmp/ship_tests.txt & -npm run test 2>&1 | tee /tmp/ship_vitest.txt & +{test_command_1} 2>&1 | tee /tmp/ship_tests_1.txt & +{test_command_2} 2>&1 | tee /tmp/ship_tests_2.txt & wait ``` -After both complete, read the output files and check pass/fail. +After all complete, read the output files and check pass/fail. **If any test fails:** Show the failures and **STOP**. Do not proceed. **If all pass:** Continue silently — just note the counts briefly. ---- - -## Step 3.25: Eval Suites (conditional) - -Evals are mandatory when prompt-related files change. Skip this step entirely if no prompt files are in the diff. - -**1. Check if the diff touches prompt-related files:** - -```bash -git diff origin/ --name-only -``` - -Match against these patterns (from CLAUDE.md): -- `app/services/*_prompt_builder.rb` -- `app/services/*_generation_service.rb`, `*_writer_service.rb`, `*_designer_service.rb` -- `app/services/*_evaluator.rb`, `*_scorer.rb`, `*_classifier_service.rb`, `*_analyzer.rb` -- `app/services/concerns/*voice*.rb`, `*writing*.rb`, `*prompt*.rb`, `*token*.rb` -- `app/services/chat_tools/*.rb`, `app/services/x_thread_tools/*.rb` -- `config/system_prompts/*.txt` -- `test/evals/**/*` (eval infrastructure changes affect all suites) +**If CLAUDE.md has no test commands:** Use AskUserQuestion: -**If no matches:** Print "No prompt-related files changed — skipping evals." and continue to Step 3.5. +"I couldn't find test commands in CLAUDE.md. I need to know how to run tests before +I can ship. Options: +A) Let me search the repo — I'll look at package.json, Makefile, Gemfile, etc., + figure out the test commands, and add a ## Testing section to CLAUDE.md so we + never have to ask again. +B) Tell me the commands — type them and I'll add them to CLAUDE.md. +C) This project has no tests — skip testing and continue shipping. 
+RECOMMENDATION: Choose A because it's a one-time cost that prevents this question forever." -**2. Identify affected eval suites:** +If A: Search the repo for test infrastructure (package.json scripts, Makefile targets, +Gemfile test gems, pytest/pyproject.toml config, go.mod, Cargo.toml, CI workflow files). +Determine the correct test commands. Write a `## Testing` section to CLAUDE.md with the +discovered commands. Then re-run Step 3 with those commands. -Each eval runner (`test/evals/*_eval_runner.rb`) declares `PROMPT_SOURCE_FILES` listing which source files affect it. Grep these to find which suites match the changed files: +If B: User provides commands. Write them to CLAUDE.md `## Testing` section. Re-run Step 3. -```bash -grep -l "changed_file_basename" test/evals/*_eval_runner.rb -``` - -Map runner → test file: `post_generation_eval_runner.rb` → `post_generation_eval_test.rb`. - -**Special cases:** -- Changes to `test/evals/judges/*.rb`, `test/evals/support/*.rb`, or `test/evals/fixtures/` affect ALL suites that use those judges/support files. Check imports in the eval test files to determine which. -- Changes to `config/system_prompts/*.txt` — grep eval runners for the prompt filename to find affected suites. -- If unsure which suites are affected, run ALL suites that could plausibly be impacted. Over-testing is better than missing a regression. - -**3. Run affected suites at `EVAL_JUDGE_TIER=full`:** +If C: Skip tests with warning. Continue to Step 3.25. -`/ship` is a pre-merge gate, so always use full tier (Sonnet structural + Opus persona judges). +--- -```bash -EVAL_JUDGE_TIER=full EVAL_VERBOSE=1 bin/test-lane --eval test/evals/_eval_test.rb 2>&1 | tee /tmp/ship_evals.txt -``` +## Step 3.25: Eval Suites (conditional) -If multiple suites need to run, run them sequentially (each needs a test lane). If the first suite fails, stop immediately — don't burn API cost on remaining suites. +Read CLAUDE.md. 
Look for a `## Evals` section, or eval-related commands in `## Testing`, +`## Test Commands`, `## Tests`, or `## Commands` (identified by keywords: "eval", "evals", +"judge", "llm-judge"). -**4. Check results:** +If an eval command is found: + Run it. The project's eval system handles diff-based file selection internally. -- **If any eval fails:** Show the failures, the cost dashboard, and **STOP**. Do not proceed. -- **If all pass:** Note pass counts and cost. Continue to Step 3.5. + ```bash + {eval_command} 2>&1 | tee /tmp/ship_evals.txt + ``` -**5. Save eval output** — include eval results and cost dashboard in the PR body (Step 8). + If the eval fails → show failures and **STOP**. Do not proceed. + If it passes → note pass counts and cost. Continue to Step 3.5. -**Tier reference (for context — /ship always uses `full`):** -| Tier | When | Speed (cached) | Cost | -|------|------|----------------|------| -| `fast` (Haiku) | Dev iteration, smoke tests | ~5s (14x faster) | ~$0.07/run | -| `standard` (Sonnet) | Default dev, `bin/test-lane --eval` | ~17s (4x faster) | ~$0.37/run | -| `full` (Opus persona) | **`/ship` and pre-merge** | ~72s (baseline) | ~$1.27/run | +If no eval command found: + Skip silently — most projects don't have eval suites. Continue to Step 3.5. --- @@ -671,9 +645,7 @@ eval $(~/.claude/skills/gstack/bin/gstack-diff-scope 2>/dev/null) 6. **Log the result** for the Review Readiness Dashboard: ```bash -eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) -mkdir -p ~/.gstack/projects/$SLUG -echo '{"skill":"design-review-lite","timestamp":"TIMESTAMP","status":"STATUS","findings":N,"auto_fixed":M}' >> ~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl +~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"design-review-lite","timestamp":"TIMESTAMP","status":"STATUS","findings":N,"auto_fixed":M}' ``` Substitute: TIMESTAMP = ISO 8601 datetime, STATUS = "clean" if 0 findings or "issues_found", N = total findings, M = auto-fixed count. 
@@ -922,8 +894,7 @@ gh pr create --base --title ": " --body "$(cat <<'EOF' ## Test plan -- [x] All Rails tests pass (N runs, 0 failures) -- [x] All Vitest tests pass (N tests) +- [x] All tests pass (list each test suite: command name, pass/fail counts) 🤖 Generated with [Claude Code](https://claude.com/claude-code) EOF diff --git a/ship/SKILL.md.tmpl b/ship/SKILL.md.tmpl index bb077da..5540ee2 100644 --- a/ship/SKILL.md.tmpl +++ b/ship/SKILL.md.tmpl @@ -60,8 +60,7 @@ If the Eng Review is NOT "CLEAR": 1. **Check for a prior override on this branch:** ```bash - eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) - grep '"skill":"ship-review-override"' ~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl 2>/dev/null || echo "NO_OVERRIDE" + ~/.claude/skills/gstack/bin/gstack-review-read | grep '"skill":"ship-review-override"' || echo "NO_OVERRIDE" ``` If an override exists, display the dashboard and note "Review gate previously accepted — continuing." Do NOT ask again. @@ -74,8 +73,7 @@ If the Eng Review is NOT "CLEAR": 3. **If the user chooses A or C,** persist the decision so future `/ship` runs on this branch skip the gate: ```bash - eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) - echo '{"skill":"ship-review-override","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'","decision":"USER_CHOICE"}' >> ~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl + ~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"ship-review-override","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'","decision":"USER_CHOICE"}' ``` Substitute USER_CHOICE with "ship_anyway" or "not_relevant". @@ -89,7 +87,7 @@ Fetch and merge the base branch into the feature branch so tests run against the git fetch origin && git merge origin/ --no-edit ``` -**If there are merge conflicts:** Try to auto-resolve if they are simple (VERSION, schema.rb, CHANGELOG ordering). If conflicts are complex or ambiguous, **STOP** and show them. 
+**If there are merge conflicts:** Try to auto-resolve if they are simple (VERSION, CHANGELOG ordering). If conflicts are complex or ambiguous, **STOP** and show them. **If already up to date:** Continue silently. @@ -103,85 +101,64 @@ git fetch origin && git merge origin/ --no-edit ## Step 3: Run tests (on merged code) -**Do NOT run `RAILS_ENV=test bin/rails db:migrate`** — `bin/test-lane` already calls -`db:test:prepare` internally, which loads the schema into the correct lane database. -Running bare test migrations without INSTANCE hits an orphan DB and corrupts structure.sql. +Read CLAUDE.md. Look for a section about testing — `## Testing`, `## Test Commands`, +`## Tests`, or test commands in `## Commands`. Extract all test commands (lines in +bash code blocks or command tables that run tests — e.g., `bun test`, `npm test`, `pytest`, `go test ./...`, `cargo test`, `bin/rails test`). -Run both test suites in parallel: +Run all discovered test commands in parallel, each piped to a unique /tmp file: ```bash -bin/test-lane 2>&1 | tee /tmp/ship_tests.txt & -npm run test 2>&1 | tee /tmp/ship_vitest.txt & +{test_command_1} 2>&1 | tee /tmp/ship_tests_1.txt & +{test_command_2} 2>&1 | tee /tmp/ship_tests_2.txt & wait ``` -After both complete, read the output files and check pass/fail. +After all complete, read the output files and check pass/fail. **If any test fails:** Show the failures and **STOP**. Do not proceed. **If all pass:** Continue silently — just note the counts briefly. ---- - -## Step 3.25: Eval Suites (conditional) - -Evals are mandatory when prompt-related files change. Skip this step entirely if no prompt files are in the diff. - -**1. 
Check if the diff touches prompt-related files:** - -```bash -git diff origin/ --name-only -``` - -Match against these patterns (from CLAUDE.md): -- `app/services/*_prompt_builder.rb` -- `app/services/*_generation_service.rb`, `*_writer_service.rb`, `*_designer_service.rb` -- `app/services/*_evaluator.rb`, `*_scorer.rb`, `*_classifier_service.rb`, `*_analyzer.rb` -- `app/services/concerns/*voice*.rb`, `*writing*.rb`, `*prompt*.rb`, `*token*.rb` -- `app/services/chat_tools/*.rb`, `app/services/x_thread_tools/*.rb` -- `config/system_prompts/*.txt` -- `test/evals/**/*` (eval infrastructure changes affect all suites) +**If CLAUDE.md has no test commands:** Use AskUserQuestion: -**If no matches:** Print "No prompt-related files changed — skipping evals." and continue to Step 3.5. +"I couldn't find test commands in CLAUDE.md. I need to know how to run tests before +I can ship. Options: +A) Let me search the repo — I'll look at package.json, Makefile, Gemfile, etc., + figure out the test commands, and add a ## Testing section to CLAUDE.md so we + never have to ask again. +B) Tell me the commands — type them and I'll add them to CLAUDE.md. +C) This project has no tests — skip testing and continue shipping. +RECOMMENDATION: Choose A because it's a one-time cost that prevents this question forever." -**2. Identify affected eval suites:** +If A: Search the repo for test infrastructure (package.json scripts, Makefile targets, +Gemfile test gems, pytest/pyproject.toml config, go.mod, Cargo.toml, CI workflow files). +Determine the correct test commands. Write a `## Testing` section to CLAUDE.md with the +discovered commands. Then re-run Step 3 with those commands. -Each eval runner (`test/evals/*_eval_runner.rb`) declares `PROMPT_SOURCE_FILES` listing which source files affect it. Grep these to find which suites match the changed files: +If B: User provides commands. Write them to CLAUDE.md `## Testing` section. Re-run Step 3. 
-```bash -grep -l "changed_file_basename" test/evals/*_eval_runner.rb -``` - -Map runner → test file: `post_generation_eval_runner.rb` → `post_generation_eval_test.rb`. - -**Special cases:** -- Changes to `test/evals/judges/*.rb`, `test/evals/support/*.rb`, or `test/evals/fixtures/` affect ALL suites that use those judges/support files. Check imports in the eval test files to determine which. -- Changes to `config/system_prompts/*.txt` — grep eval runners for the prompt filename to find affected suites. -- If unsure which suites are affected, run ALL suites that could plausibly be impacted. Over-testing is better than missing a regression. - -**3. Run affected suites at `EVAL_JUDGE_TIER=full`:** +If C: Skip tests with warning. Continue to Step 3.25. -`/ship` is a pre-merge gate, so always use full tier (Sonnet structural + Opus persona judges). +--- -```bash -EVAL_JUDGE_TIER=full EVAL_VERBOSE=1 bin/test-lane --eval test/evals/_eval_test.rb 2>&1 | tee /tmp/ship_evals.txt -``` +## Step 3.25: Eval Suites (conditional) -If multiple suites need to run, run them sequentially (each needs a test lane). If the first suite fails, stop immediately — don't burn API cost on remaining suites. +Read CLAUDE.md. Look for a `## Evals` section, or eval-related commands in `## Testing`, +`## Test Commands`, `## Tests`, or `## Commands` (identified by keywords: "eval", "evals", +"judge", "llm-judge"). -**4. Check results:** +If an eval command is found: + Run it. The project's eval system handles diff-based file selection internally. -- **If any eval fails:** Show the failures, the cost dashboard, and **STOP**. Do not proceed. -- **If all pass:** Note pass counts and cost. Continue to Step 3.5. + ```bash + {eval_command} 2>&1 | tee /tmp/ship_evals.txt + ``` -**5. Save eval output** — include eval results and cost dashboard in the PR body (Step 8). + If the eval fails → show failures and **STOP**. Do not proceed. + If it passes → note pass counts and cost. Continue to Step 3.5. 
-**Tier reference (for context — /ship always uses `full`):** -| Tier | When | Speed (cached) | Cost | -|------|------|----------------|------| -| `fast` (Haiku) | Dev iteration, smoke tests | ~5s (14x faster) | ~$0.07/run | -| `standard` (Sonnet) | Default dev, `bin/test-lane --eval` | ~17s (4x faster) | ~$0.37/run | -| `full` (Opus persona) | **`/ship` and pre-merge** | ~72s (baseline) | ~$1.27/run | +If no eval command found: + Skip silently — most projects don't have eval suites. Continue to Step 3.5. --- @@ -581,8 +558,7 @@ gh pr create --base --title ": " --body "$(cat <<'EOF' ## Test plan -- [x] All Rails tests pass (N runs, 0 failures) -- [x] All Vitest tests pass (N tests) +- [x] All tests pass (list each test suite: command name, pass/fail counts) 🤖 Generated with [Claude Code](https://claude.com/claude-code) EOF diff --git a/test/gen-skill-docs.test.ts b/test/gen-skill-docs.test.ts index 9dfd1a1..0e1b88c 100644 --- a/test/gen-skill-docs.test.ts +++ b/test/gen-skill-docs.test.ts @@ -329,14 +329,14 @@ describe('REVIEW_DASHBOARD resolver', () => { for (const skill of REVIEW_SKILLS) { test(`review dashboard appears in ${skill} generated file`, () => { const content = fs.readFileSync(path.join(ROOT, skill, 'SKILL.md'), 'utf-8'); - expect(content).toContain('reviews.jsonl'); + expect(content).toContain('gstack-review-read'); expect(content).toContain('REVIEW READINESS DASHBOARD'); }); } test('review dashboard appears in ship generated file', () => { const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8'); - expect(content).toContain('reviews.jsonl'); + expect(content).toContain('gstack-review-read'); expect(content).toContain('REVIEW READINESS DASHBOARD'); }); @@ -349,4 +349,18 @@ describe('REVIEW_DASHBOARD resolver', () => { expect(content).toContain('Design Review'); expect(content).toContain('skip_eng_review'); }); + + test('review dashboard uses gstack-review-read (not multi-line eval+cat pattern)', () => { + for (const skill of
[...REVIEW_SKILLS, 'ship']) { + const content = fs.readFileSync(path.join(ROOT, skill, 'SKILL.md'), 'utf-8'); + expect(content).toContain('gstack-review-read'); + } + }); + + test('review log uses gstack-review-log (not multi-line eval+mkdir+echo pattern)', () => { + for (const skill of REVIEW_SKILLS) { + const content = fs.readFileSync(path.join(ROOT, skill, 'SKILL.md'), 'utf-8'); + expect(content).toContain('gstack-review-log'); + } + }); }); diff --git a/test/skill-validation.test.ts b/test/skill-validation.test.ts index bd0e205..a8a54e1 100644 --- a/test/skill-validation.test.ts +++ b/test/skill-validation.test.ts @@ -795,6 +795,132 @@ describe('gstack-slug', () => { }); }); +// --- gstack-review-log helper --- + +describe('gstack-review-log', () => { + const REVIEW_LOG_BIN = path.join(ROOT, 'bin', 'gstack-review-log'); + + test('binary exists and is executable', () => { + expect(fs.existsSync(REVIEW_LOG_BIN)).toBe(true); + const stat = fs.statSync(REVIEW_LOG_BIN); + expect(stat.mode & 0o111).toBeGreaterThan(0); + }); + + test('creates review log file and appends JSON payload', () => { + const tmpDir = fs.mkdtempSync(path.join(require('os').tmpdir(), 'gstack-review-log-')); + const payload = '{"skill":"test","timestamp":"2026-03-17T00:00:00Z","status":"clean"}'; + const result = Bun.spawnSync([REVIEW_LOG_BIN, payload], { + cwd: ROOT, + env: { ...process.env, GSTACK_HOME: tmpDir }, + stdout: 'pipe', + stderr: 'pipe', + }); + expect(result.exitCode).toBe(0); + + // Determine the written file path via gstack-slug + const slugResult = Bun.spawnSync([path.join(ROOT, 'bin', 'gstack-slug')], { cwd: ROOT, stdout: 'pipe', stderr: 'pipe' }); + const slug = slugResult.stdout.toString().match(/SLUG=(.*)/)?.[1] ?? ''; + const branch = slugResult.stdout.toString().match(/BRANCH=(.*)/)?.[1] ?? 
''; + const filePath = path.join(tmpDir, 'projects', slug, `${branch}-reviews.jsonl`); + + expect(fs.existsSync(filePath)).toBe(true); + expect(fs.readFileSync(filePath, 'utf-8').trim()).toBe(payload); + + // File path should not contain forward slashes in slug or branch segments + expect(slug).not.toContain('/'); + expect(branch).not.toContain('/'); + + fs.rmSync(tmpDir, { recursive: true }); + }); + + test('appending a second entry adds to same file (not overwrites)', () => { + const tmpDir = fs.mkdtempSync(path.join(require('os').tmpdir(), 'gstack-review-log-')); + const payload1 = '{"skill":"test","entry":1}'; + const payload2 = '{"skill":"test","entry":2}'; + + Bun.spawnSync([REVIEW_LOG_BIN, payload1], { + cwd: ROOT, + env: { ...process.env, GSTACK_HOME: tmpDir }, + stdout: 'pipe', + stderr: 'pipe', + }); + Bun.spawnSync([REVIEW_LOG_BIN, payload2], { + cwd: ROOT, + env: { ...process.env, GSTACK_HOME: tmpDir }, + stdout: 'pipe', + stderr: 'pipe', + }); + + const slugResult = Bun.spawnSync([path.join(ROOT, 'bin', 'gstack-slug')], { cwd: ROOT, stdout: 'pipe', stderr: 'pipe' }); + const slug = slugResult.stdout.toString().match(/SLUG=(.*)/)?.[1] ?? ''; + const branch = slugResult.stdout.toString().match(/BRANCH=(.*)/)?.[1] ?? 
''; + const filePath = path.join(tmpDir, 'projects', slug, `${branch}-reviews.jsonl`); + + const lines = fs.readFileSync(filePath, 'utf-8').trim().split('\n'); + expect(lines.length).toBe(2); + expect(lines[0]).toBe(payload1); + expect(lines[1]).toBe(payload2); + + fs.rmSync(tmpDir, { recursive: true }); + }); +}); + +// --- gstack-review-read helper --- + +describe('gstack-review-read', () => { + const REVIEW_READ_BIN = path.join(ROOT, 'bin', 'gstack-review-read'); + + test('binary exists and is executable', () => { + expect(fs.existsSync(REVIEW_READ_BIN)).toBe(true); + const stat = fs.statSync(REVIEW_READ_BIN); + expect(stat.mode & 0o111).toBeGreaterThan(0); + }); + + test('outputs NO_REVIEWS when no log file exists', () => { + const tmpDir = fs.mkdtempSync(path.join(require('os').tmpdir(), 'gstack-review-read-')); + const result = Bun.spawnSync([REVIEW_READ_BIN], { + cwd: ROOT, + env: { ...process.env, GSTACK_HOME: tmpDir }, + stdout: 'pipe', + stderr: 'pipe', + }); + expect(result.exitCode).toBe(0); + const output = result.stdout.toString(); + expect(output).toContain('NO_REVIEWS'); + expect(output).toContain('---CONFIG---'); + + fs.rmSync(tmpDir, { recursive: true }); + }); + + test('outputs review content when log file exists', () => { + const tmpDir = fs.mkdtempSync(path.join(require('os').tmpdir(), 'gstack-review-read-')); + const logBin = path.join(ROOT, 'bin', 'gstack-review-log'); + const payload = '{"skill":"plan-eng-review","status":"clean"}'; + + // Write a review first + Bun.spawnSync([logBin, payload], { + cwd: ROOT, + env: { ...process.env, GSTACK_HOME: tmpDir }, + stdout: 'pipe', + stderr: 'pipe', + }); + + // Now read + const result = Bun.spawnSync([REVIEW_READ_BIN], { + cwd: ROOT, + env: { ...process.env, GSTACK_HOME: tmpDir }, + stdout: 'pipe', + stderr: 'pipe', + }); + expect(result.exitCode).toBe(0); + const output = result.stdout.toString(); + expect(output).toContain(payload); + expect(output).toContain('---CONFIG---'); + + 
fs.rmSync(tmpDir, { recursive: true }); + }); +}); + // --- Test Bootstrap validation --- describe('Test Bootstrap ({{TEST_BOOTSTRAP}}) integration', () => { @@ -1016,3 +1142,51 @@ describe('QA report template', () => { expect(content).toContain('**Precondition:**'); }); }); + +// --- Platform-agnostic blocklist regression tests --- + +describe('Platform-agnostic: no Rails-isms in templates or generated files', () => { + const skillDirs = fs.readdirSync(ROOT).filter(d => { + const fullPath = path.join(ROOT, d); + return fs.statSync(fullPath).isDirectory() && fs.existsSync(path.join(fullPath, 'SKILL.md')); + }); + const allSkillFiles = [ + path.join(ROOT, 'SKILL.md'), + ...skillDirs.map(d => path.join(ROOT, d, 'SKILL.md')), + ]; + const allTemplateFiles = [ + ...fs.existsSync(path.join(ROOT, 'SKILL.md.tmpl')) ? [path.join(ROOT, 'SKILL.md.tmpl')] : [], + ...skillDirs + .filter(d => fs.existsSync(path.join(ROOT, d, 'SKILL.md.tmpl'))) + .map(d => path.join(ROOT, d, 'SKILL.md.tmpl')), + ]; + const allFiles = [...allSkillFiles, ...allTemplateFiles]; + + const BLOCKLIST = [ + { pattern: /bin\/test-lane/, label: 'bin/test-lane' }, + { pattern: /RAILS_ENV/, label: 'RAILS_ENV' }, + { pattern: /_prompt_builder\.rb/, label: '_prompt_builder.rb' }, + { pattern: /--include="\*\.rb"/, label: '--include="*.rb"' }, + ]; + + for (const { pattern, label } of BLOCKLIST) { + test(`no "${label}" in any generated SKILL.md or template`, () => { + const matches: string[] = []; + for (const file of allFiles) { + const content = fs.readFileSync(file, 'utf-8'); + if (pattern.test(content)) { + matches.push(path.relative(ROOT, file)); + } + } + expect(matches).toEqual([]); + }); + } + + test('review/checklist.md mentions multiple frameworks', () => { + const content = fs.readFileSync(path.join(ROOT, 'review', 'checklist.md'), 'utf-8'); + expect(content).toContain('Rails:'); + expect(content).toContain('React:'); + expect(content).toContain('Django:'); + expect(content).toContain('Prisma:'); + 
}); +});