truevox · truevox · Mar 24, 2026 · Mar 26, 2026 · gemini-code-assist · Mar 26, 2026
diff --git a/.claude/skills/gstack/.env.example b/.claude/skills/gstack/.env.example
@@ -0,0 +1,5 @@
+# Template: copy this file to .env and replace the placeholder values below.
+# bun auto-loads .env, so no dotenv package is needed.
+
+# Required for LLM-as-judge evals (bun run test:eval)
+ANTHROPIC_API_KEY=sk-ant-your-key-here
diff --git a/.claude/skills/gstack/.github/actionlint.yaml b/.claude/skills/gstack/.github/actionlint.yaml
@@ -0,0 +1,4 @@
+# Custom runner labels that this repo's workflows pass to `runs-on`;
+# declared here so actionlint accepts them.
+self-hosted-runner:
+  labels: [ubicloud-standard-2, ubicloud-standard-8]
diff --git a/.claude/skills/gstack/.github/docker/Dockerfile.ci b/.claude/skills/gstack/.github/docker/Dockerfile.ci
@@ -0,0 +1,68 @@
+# gstack CI eval runner — pre-baked toolchain + deps
+# Rebuild weekly via ci-image.yml, on Dockerfile changes, or on package.json changes
+FROM ubuntu:24.04
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Run every RUN under bash with pipefail so a failed download in a
+# `curl ... | bash` / `curl ... | gpg` pipeline fails the build instead of
+# being masked by the exit status of the last command in the pipe.
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
+# System deps
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    git curl unzip ca-certificates jq bc gpg \
+    && rm -rf /var/lib/apt/lists/*
+
+# GitHub CLI
+RUN curl -fsSL https://cli.github.com/packages/githubcli-archive-keyring.gpg \
+    | gpg --dearmor -o /usr/share/keyrings/githubcli-archive-keyring.gpg \
+    && echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" \
+    | tee /etc/apt/sources.list.d/github-cli.list > /dev/null \
+    && apt-get update && apt-get install -y --no-install-recommends gh \
+    && rm -rf /var/lib/apt/lists/*
+
+# Node.js 22 LTS (needed for claude CLI)
+RUN curl -fsSL https://deb.nodesource.com/setup_22.x | bash - \
+    && apt-get install -y --no-install-recommends nodejs \
+    && rm -rf /var/lib/apt/lists/*
+
+# Bun (install to /usr/local so non-root users can access it)
+ENV BUN_INSTALL="/usr/local"
+RUN curl -fsSL https://bun.sh/install | bash
+
+# Claude CLI
+RUN npm i -g @anthropic-ai/claude-code
+
+# Playwright system deps (Chromium) — needed for browse E2E tests
+RUN npx playwright install-deps chromium
+
+# Pre-install dependencies (cached layer — only rebuilds when package.json changes)
+COPY package.json /workspace/
+WORKDIR /workspace
+RUN bun install && rm -rf /tmp/*
+
+# Install Playwright Chromium to a shared location accessible by all users
+ENV PLAYWRIGHT_BROWSERS_PATH=/opt/playwright-browsers
+RUN npx playwright install chromium \
+    && chmod -R a+rX /opt/playwright-browsers
+
+# Verify everything works
+RUN bun --version && node --version && claude --version && jq --version && gh --version \
+    && npx playwright --version
+
+# At runtime: checkout overwrites /workspace, but node_modules persists
+# if we move it out of the way and symlink back
+# Save node_modules + package.json snapshot for cache validation at runtime
+RUN mv /workspace/node_modules /opt/node_modules_cache \
+    && cp /workspace/package.json /opt/node_modules_cache/.package.json
+
+# Claude CLI refuses --dangerously-skip-permissions as root.
+# Create a non-root user for eval runs (GH Actions overrides USER, so
+# the workflow must set options.user or use gosu/su-exec at runtime).
+# chmod -R on /tmp resets the top-level mode and everything earlier layers left inside it.
+RUN useradd -m -s /bin/bash runner \
+    && chmod -R a+rX /opt/node_modules_cache \
+    && mkdir -p /home/runner/.gstack && chown -R runner:runner /home/runner/.gstack \
+    && mkdir -p /home/runner/.bun && chown -R runner:runner /home/runner/.bun \
+    && chmod -R 1777 /tmp
diff --git a/.claude/skills/gstack/.github/workflows/actionlint.yml b/.claude/skills/gstack/.github/workflows/actionlint.yml
@@ -0,0 +1,12 @@
+# Lints the repository's GitHub Actions workflow files with actionlint.
+name: Workflow Lint
+on:
+  - push
+  - pull_request
+jobs:
+  actionlint:
+    runs-on: ubuntu-latest
+    steps:
+      # Check out the repository first so the workflow files are on disk for the linter.
+      - uses: actions/checkout@v4
+      - uses: rhysd/actionlint@v1.7.11
diff --git a/.claude/skills/gstack/.github/workflows/ci-image.yml b/.claude/skills/gstack/.github/workflows/ci-image.yml
@@ -0,0 +1,40 @@
+name: Build CI Image
+on:
+  # Rebuild weekly (Monday 6am UTC) to pick up CLI updates
+  schedule:
+    - cron: '0 6 * * 1'
+  # Rebuild on pushes to main that touch the Dockerfile or package.json
+  push:
+    branches: [main]
+    paths:
+      - '.github/docker/Dockerfile.ci'
+      - 'package.json'
+  # Manual trigger
+  workflow_dispatch:
+
+jobs:
+  build:
+    runs-on: ubicloud-standard-2
+    permissions:
+      contents: read
+      packages: write
+    steps:
+      - uses: actions/checkout@v4
+
+      # Copy package.json into the Docker build context (.github/docker)
+      - run: cp package.json .github/docker/
+
+      - uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - uses: docker/build-push-action@v6
+        with:
+          context: .github/docker
+          file: .github/docker/Dockerfile.ci
+          push: true
+          tags: |
+            ghcr.io/${{ github.repository }}/ci:latest
+            ghcr.io/${{ github.repository }}/ci:${{ github.sha }}
diff --git a/.claude/skills/gstack/.github/workflows/evals.yml b/.claude/skills/gstack/.github/workflows/evals.yml
@@ -0,0 +1,242 @@
+name: E2E Evals
+on:
+  pull_request:
+    branches: [main]
+  workflow_dispatch:
+
+concurrency:
+  group: evals-${{ github.head_ref || github.ref }}  # head_ref is empty on workflow_dispatch
+  cancel-in-progress: true
+
+env:
+  IMAGE: ghcr.io/${{ github.repository }}/ci
+
+jobs:
+  # Build the CI image with the pre-baked toolchain; the tag is a hash of Dockerfile.ci + package.json, so the build is skipped when that tag already exists
+  build-image:
+    runs-on: ubicloud-standard-2
+    permissions:
+      contents: read
+      packages: write
+    outputs:
+      image-tag: ${{ steps.meta.outputs.tag }}
+    steps:
+      - uses: actions/checkout@v4
+
+      - id: meta
+        run: echo "tag=${{ env.IMAGE }}:${{ hashFiles('.github/docker/Dockerfile.ci', 'package.json') }}" >> "$GITHUB_OUTPUT"
+
+      - uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Check if image exists
+        id: check
+        run: |
+          if docker manifest inspect ${{ steps.meta.outputs.tag }} > /dev/null 2>&1; then
+            echo "exists=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "exists=false" >> "$GITHUB_OUTPUT"
+          fi
+
+      - if: steps.check.outputs.exists == 'false'
+        run: cp package.json .github/docker/
+
+      - if: steps.check.outputs.exists == 'false'
+        uses: docker/build-push-action@v6
+        with:
+          context: .github/docker
+          file: .github/docker/Dockerfile.ci
+          push: true
+          tags: |
+            ${{ steps.meta.outputs.tag }}
+            ${{ env.IMAGE }}:latest
+
+  evals:
+    runs-on: ${{ matrix.suite.runner || 'ubicloud-standard-2' }}
+    needs: build-image
+    container:
+      image: ${{ needs.build-image.outputs.image-tag }}
+      credentials:
+        username: ${{ github.actor }}
+        password: ${{ secrets.GITHUB_TOKEN }}
+      options: --user runner
+    timeout-minutes: 25
+    strategy:
+      fail-fast: false
+      matrix:
+        suite:
+          - name: llm-judge
+            file: test/skill-llm-eval.test.ts
+          - name: e2e-browse
+            file: test/skill-e2e-bws.test.ts
+            runner: ubicloud-standard-8
+          - name: e2e-plan
+            file: test/skill-e2e-plan.test.ts
+          - name: e2e-deploy
+            file: test/skill-e2e-deploy.test.ts
+          - name: e2e-design
+            file: test/skill-e2e-design.test.ts
+          - name: e2e-qa-bugs
+            file: test/skill-e2e-qa-bugs.test.ts
+          - name: e2e-qa-workflow
+            file: test/skill-e2e-qa-workflow.test.ts
+          - name: e2e-review
+            file: test/skill-e2e-review.test.ts
+          - name: e2e-workflow
+            file: test/skill-e2e-workflow.test.ts
+            allow_failure: true  # /ship + /setup-browser-cookies are env-dependent
+          - name: e2e-routing
+            file: test/skill-routing-e2e.test.ts
+            allow_failure: true  # LLM routing is non-deterministic
+          - name: e2e-codex
+            file: test/codex-e2e.test.ts
+          - name: e2e-gemini
+            file: test/gemini-e2e.test.ts
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      # Bun creates root-owned temp dirs during Docker build, but this job runs as the
+      # non-root runner user (HOME=/github/home). Point bun's cache/tmp and TMPDIR at writable dirs.
+      - name: Fix bun temp
+        run: |
+          mkdir -p /home/runner/.cache/bun
+          {
+            echo "BUN_INSTALL_CACHE_DIR=/home/runner/.cache/bun"
+            echo "BUN_TMPDIR=/home/runner/.cache/bun"
+            echo "TMPDIR=/home/runner/.cache"
+          } >> "$GITHUB_ENV"
+
+      # Symlink the node_modules baked into the image when package.json matches its snapshot (~0s vs ~15s install); otherwise run bun install
+      - name: Restore deps
+        run: |
+          if [ -d /opt/node_modules_cache ] && diff -q /opt/node_modules_cache/.package.json package.json >/dev/null 2>&1; then
+            ln -s /opt/node_modules_cache node_modules
+          else
+            bun install
+          fi
+
+      - run: bun run build
+
+      # Browse suite only: verify Playwright can launch Chromium (fails fast if sandbox/deps are broken)
+      - name: Verify Chromium
+        if: matrix.suite.name == 'e2e-browse'
+        run: |
+          echo "whoami=$(whoami) HOME=$HOME TMPDIR=${TMPDIR:-unset}"
+          touch /tmp/.bun-test && rm /tmp/.bun-test && echo "/tmp writable"
+          bun -e "import {chromium} from 'playwright';const b=await chromium.launch({args:['--no-sandbox']});console.log('Chromium OK');await b.close()"
+
+      - name: Run ${{ matrix.suite.name }}
+        continue-on-error: ${{ matrix.suite.allow_failure || false }}
+        env:
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
+          EVALS_CONCURRENCY: "40"
+          PLAYWRIGHT_BROWSERS_PATH: /opt/playwright-browsers
+        run: EVALS=1 bun test --retry 2 --concurrent --max-concurrency 40 ${{ matrix.suite.file }}
+
+      - name: Upload eval results
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: eval-${{ matrix.suite.name }}
+          path: ~/.gstack-dev/evals/*.json
+          retention-days: 90
+
+  report:
+    runs-on: ubicloud-standard-2
+    needs: evals
+    if: always() && github.event_name == 'pull_request'
+    timeout-minutes: 5
+    permissions:
+      contents: read
+      pull-requests: write
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 1
+
+      - name: Download all eval artifacts
+        uses: actions/download-artifact@v4
+        with:
+          pattern: eval-*
+          path: /tmp/eval-results
+          merge-multiple: true
+
+      - name: Post PR comment
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          # shellcheck disable=SC2086,SC2059
+          RESULTS=$(find /tmp/eval-results -name '*.json' 2>/dev/null | sort)
+          if [ -z "$RESULTS" ]; then
+            echo "No eval results found"
+            exit 0
+          fi
+
+          TOTAL=0; PASSED=0; FAILED=0; COST="0"
+          SUITE_LINES=""
+          for f in $RESULTS; do
+            if ! jq -e '.total_tests' "$f" >/dev/null 2>&1; then
+              echo "Skipping malformed JSON: $f"
+              continue
+            fi
+            T=$(jq -r '.total_tests // 0' "$f")
+            P=$(jq -r '.passed // 0' "$f")
+            F=$(jq -r '.failed // 0' "$f")
+            C=$(jq -r '.total_cost_usd // 0' "$f")
+            TIER=$(jq -r '.tier // "unknown"' "$f")
+            [ "$T" -eq 0 ] && continue
+            TOTAL=$((TOTAL + T))
+            PASSED=$((PASSED + P))
+            FAILED=$((FAILED + F))
+            COST=$(echo "$COST + $C" | bc)
+            STATUS_ICON="✅"
+            [ "$F" -gt 0 ] && STATUS_ICON="❌"
+            SUITE_LINES="${SUITE_LINES}| ${TIER} | ${P}/${T} | ${STATUS_ICON} | \$${C} |\n"
+          done
+
+          STATUS="✅ PASS"
+          [ "$FAILED" -gt 0 ] && STATUS="❌ FAIL"
+
+          BODY="## E2E Evals: ${STATUS}
+
+          **${PASSED}/${TOTAL}** tests passed | **\$${COST}** total cost | **12 parallel runners**
+
+          | Suite | Result | Status | Cost |
+          |-------|--------|--------|------|
+          $(echo -e "$SUITE_LINES")
+
+          ---
+          *12x ubicloud-standard-2 (Docker: pre-baked toolchain + deps) | wall clock ≈ slowest suite*"
+
+          if [ "$FAILED" -gt 0 ]; then
+            FAILURES=""
+            for f in $RESULTS; do
+              if ! jq -e '.failed' "$f" >/dev/null 2>&1; then continue; fi
+              F=$(jq -r '.failed // 0' "$f")
+              [ "$F" -eq 0 ] && continue
+              FAILS=$(jq -r '.tests[] | select(.passed == false) | "- ❌ \(.name): \(.exit_reason // "unknown")"' "$f" 2>/dev/null || echo "- ⚠️ $(basename "$f"): parse error")
+              FAILURES="${FAILURES}${FAILS}\n"
+            done
+            BODY="${BODY}
+
+          ### Failures
+          $(echo -e "$FAILURES")"
+          fi
+
+          # Update existing comment or create new one (--paginate: the comments API returns one page at a time, so a busy PR would otherwise miss the earlier report)
+          COMMENT_ID=$(gh api --paginate repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments \
+            --jq '.[] | select(.body | startswith("## E2E Evals")) | .id' | tail -1)
+
+          if [ -n "$COMMENT_ID" ]; then
+            gh api "repos/${{ github.repository }}/issues/comments/${COMMENT_ID}" \
+              -X PATCH -f body="$BODY"
+          else
+            gh pr comment "${{ github.event.pull_request.number }}" --body "$BODY"
+          fi
diff --git a/.claude/skills/gstack/.github/workflows/skill-docs.yml b/.claude/skills/gstack/.github/workflows/skill-docs.yml
@@ -0,0 +1,30 @@
+# Fails when regenerating the skill docs changes files tracked in the repository.
+name: Skill Docs Freshness
+on:
+  - push
+  - pull_request
+jobs:
+  check-freshness:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: oven-sh/setup-bun@v2
+      - run: bun install
+      # Regenerate docs for the Claude host, then require a clean working tree.
+      - name: Check Claude host freshness
+        run: bun run gen:skill-docs
+      - name: Verify Claude skill docs are fresh
+        run: |
+          if ! git diff --exit-code; then
+            echo "Generated SKILL.md files are stale. Run: bun run gen:skill-docs"
+            exit 1
+          fi
+      # Regenerate docs for the Codex host; only .agents/ is checked for drift.
+      - name: Check Codex host freshness
+        run: bun run gen:skill-docs --host codex
+      - name: Verify Codex skill docs are fresh
+        run: |
+          if ! git diff --exit-code -- .agents/; then
+            echo "Generated Codex SKILL.md files are stale. Run: bun run gen:skill-docs --host codex"
+            exit 1
+          fi