-
Notifications
You must be signed in to change notification settings - Fork 0
Add bear race mechanic #36
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
| # Copy to .env and fill in values | ||
| # bun auto-loads .env — no dotenv needed | ||
|
|
||
| # Required for LLM-as-judge evals (bun run test:eval) | ||
| ANTHROPIC_API_KEY=sk-ant-your-key-here |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,4 @@ | ||
| self-hosted-runner: | ||
| labels: | ||
| - ubicloud-standard-2 | ||
| - ubicloud-standard-8 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,63 @@ | ||
| # gstack CI eval runner — pre-baked toolchain + deps | ||
| # Rebuild weekly via ci-image.yml, on Dockerfile changes, or on package.json changes | ||
| # (the image build workflow's push trigger watches Dockerfile.ci and package.json) | ||
| FROM ubuntu:24.04 | ||
|
|
||
| # Build-time only: keeps apt non-interactive for every RUN in this stage without | ||
| # baking DEBIAN_FRONTEND into the runtime environment of the image | ||
| ARG DEBIAN_FRONTEND=noninteractive | ||
|
|
||
| # System packages: download/verification tools, git, and jq/bc | ||
| RUN apt-get update \ | ||
| && apt-get install -y --no-install-recommends \ | ||
| bc \ | ||
| ca-certificates \ | ||
| curl \ | ||
| git \ | ||
| gpg \ | ||
| jq \ | ||
| unzip \ | ||
| && rm -rf /var/lib/apt/lists/* | ||
|
|
||
| # GitHub CLI (installed from the cli.github.com apt repository) | ||
| # The keyring is downloaded to a temp file first instead of being piped, so a | ||
| # failed or truncated download fails this step rather than being masked by the pipe | ||
| RUN curl -fsSL -o /tmp/githubcli-archive-keyring.gpg \ | ||
| https://cli.github.com/packages/githubcli-archive-keyring.gpg \ | ||
| && gpg --dearmor -o /usr/share/keyrings/githubcli-archive-keyring.gpg \ | ||
| < /tmp/githubcli-archive-keyring.gpg \ | ||
| && rm -f /tmp/githubcli-archive-keyring.gpg \ | ||
| && echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" \ | ||
| > /etc/apt/sources.list.d/github-cli.list \ | ||
| && apt-get update && apt-get install -y --no-install-recommends gh \ | ||
| && rm -rf /var/lib/apt/lists/* | ||
|
|
||
| # Node.js 22 LTS (needed for claude CLI) | ||
| # The NodeSource setup script is saved to a file and then executed, rather than | ||
| # piping curl into bash, so a failed or partial download aborts the build | ||
| RUN curl -fsSL -o /tmp/nodesource_setup.sh https://deb.nodesource.com/setup_22.x \ | ||
| && bash /tmp/nodesource_setup.sh \ | ||
| && rm -f /tmp/nodesource_setup.sh \ | ||
| && apt-get install -y --no-install-recommends nodejs \ | ||
| && rm -rf /var/lib/apt/lists/* | ||
|
|
||
| # Bun (install to /usr/local so non-root users can access it) | ||
| # The installer is saved to a file and then executed, rather than piping curl | ||
| # into bash, so a failed or partial download aborts the build | ||
| ENV BUN_INSTALL="/usr/local" | ||
| RUN curl -fsSL -o /tmp/bun-install.sh https://bun.sh/install \ | ||
| && bash /tmp/bun-install.sh \ | ||
| && rm -f /tmp/bun-install.sh | ||
|
|
||
| # Claude CLI — installed globally through npm | ||
| RUN npm install --global @anthropic-ai/claude-code | ||
|
|
||
| # Playwright system deps (Chromium) — needed for browse E2E tests | ||
| # install-deps installs OS packages through apt; remove the package lists in the | ||
| # same layer so they do not stay in the image | ||
| RUN npx playwright install-deps chromium \ | ||
| && rm -rf /var/lib/apt/lists/* | ||
|
|
||
| # Dependency layer: re-runs only when package.json changes | ||
| WORKDIR /workspace | ||
| COPY package.json ./ | ||
| RUN bun install \ | ||
| && rm -rf /tmp/* | ||
|
|
||
| # Chromium for Playwright goes into a shared directory readable by every user | ||
| ENV PLAYWRIGHT_BROWSERS_PATH=/opt/playwright-browsers | ||
| RUN npx playwright install chromium \ | ||
| && chmod -R a+rX "$PLAYWRIGHT_BROWSERS_PATH" | ||
|
|
||
| # Smoke test: every tool must be able to report its version | ||
| RUN bun --version \ | ||
| && node --version \ | ||
| && claude --version \ | ||
| && jq --version \ | ||
| && gh --version \ | ||
| && npx playwright --version | ||
|
|
||
| # At runtime the checkout overwrites /workspace, so the installed node_modules | ||
| # is moved out of the way to /opt/node_modules_cache and can be symlinked back. | ||
| # The package.json used for the install is stored next to it as .package.json, | ||
| # a snapshot for validating the cache against the checked-out package.json at runtime | ||
| RUN mv /workspace/node_modules /opt/node_modules_cache \ | ||
| && cp /workspace/package.json /opt/node_modules_cache/.package.json | ||
|
|
||
| # Claude CLI refuses --dangerously-skip-permissions as root. | ||
| # Create a non-root user for eval runs (GH Actions overrides USER, so | ||
| # the workflow must set options.user or use gosu/su-exec at runtime). | ||
| # /tmp is emptied of root-owned build leftovers and then given the standard | ||
| # sticky, world-writable mode (1777) once. A recursive chmod -R 1777 would make | ||
| # every leftover file and directory under /tmp world-writable and executable. | ||
| RUN useradd -m -s /bin/bash runner \ | ||
| && chmod -R a+rX /opt/node_modules_cache \ | ||
| && mkdir -p /home/runner/.gstack /home/runner/.bun \ | ||
| && chown -R runner:runner /home/runner/.gstack /home/runner/.bun \ | ||
| && find /tmp -mindepth 1 -delete \ | ||
| && chmod 1777 /tmp |
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The command |
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,8 @@ | ||
| name: Workflow Lint | ||
| on: [push, pull_request] | ||
| jobs: | ||
| actionlint: | ||
| runs-on: ubuntu-latest | ||
| steps: | ||
| - uses: actions/checkout@v4 | ||
| - uses: rhysd/actionlint@v1.7.11 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,40 @@ | ||
| name: Build CI Image | ||
| on: | ||
| # Rebuild weekly (Monday 6am UTC) to pick up CLI updates | ||
| schedule: | ||
| - cron: '0 6 * * 1' | ||
| # Rebuild on Dockerfile or lockfile changes | ||
| push: | ||
| branches: [main] | ||
| paths: | ||
| - '.github/docker/Dockerfile.ci' | ||
| - 'package.json' | ||
| # Manual trigger | ||
| workflow_dispatch: | ||
|
|
||
| jobs: | ||
| build: | ||
| runs-on: ubicloud-standard-2 | ||
| permissions: | ||
| contents: read | ||
| packages: write | ||
| steps: | ||
| - uses: actions/checkout@v4 | ||
|
|
||
| # Copy lockfile + package.json into Docker build context | ||
| - run: cp package.json .github/docker/ | ||
|
|
||
| - uses: docker/login-action@v3 | ||
| with: | ||
| registry: ghcr.io | ||
| username: ${{ github.actor }} | ||
| password: ${{ secrets.GITHUB_TOKEN }} | ||
|
|
||
| - uses: docker/build-push-action@v6 | ||
| with: | ||
| context: .github/docker | ||
| file: .github/docker/Dockerfile.ci | ||
| push: true | ||
| tags: | | ||
| ghcr.io/${{ github.repository }}/ci:latest | ||
| ghcr.io/${{ github.repository }}/ci:${{ github.sha }} |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,242 @@ | ||
| name: E2E Evals | ||
| on: | ||
| pull_request: | ||
| branches: [main] | ||
| workflow_dispatch: | ||
|
|
||
| concurrency: | ||
| # github.head_ref is empty on workflow_dispatch runs; fall back to the unique | ||
| # run_id so manually triggered runs do not share one group and cancel each other | ||
| group: evals-${{ github.head_ref || github.run_id }} | ||
| cancel-in-progress: true | ||
|
|
||
| env: | ||
| IMAGE: ghcr.io/${{ github.repository }}/ci | ||
|
|
||
| jobs: | ||
| # Build Docker image with pre-baked toolchain (cached — only rebuilds on Dockerfile/lockfile change) | ||
| build-image: | ||
| runs-on: ubicloud-standard-2 | ||
| permissions: | ||
| contents: read | ||
| packages: write | ||
| outputs: | ||
| image-tag: ${{ steps.meta.outputs.tag }} | ||
| steps: | ||
| - uses: actions/checkout@v4 | ||
|
|
||
| - id: meta | ||
| run: echo "tag=${{ env.IMAGE }}:${{ hashFiles('.github/docker/Dockerfile.ci', 'package.json') }}" >> "$GITHUB_OUTPUT" | ||
|
|
||
| - uses: docker/login-action@v3 | ||
| with: | ||
| registry: ghcr.io | ||
| username: ${{ github.actor }} | ||
| password: ${{ secrets.GITHUB_TOKEN }} | ||
|
|
||
| - name: Check if image exists | ||
| id: check | ||
| run: | | ||
| if docker manifest inspect ${{ steps.meta.outputs.tag }} > /dev/null 2>&1; then | ||
| echo "exists=true" >> "$GITHUB_OUTPUT" | ||
| else | ||
| echo "exists=false" >> "$GITHUB_OUTPUT" | ||
| fi | ||
|
|
||
| - if: steps.check.outputs.exists == 'false' | ||
| run: cp package.json .github/docker/ | ||
|
|
||
| - if: steps.check.outputs.exists == 'false' | ||
| uses: docker/build-push-action@v6 | ||
| with: | ||
| context: .github/docker | ||
| file: .github/docker/Dockerfile.ci | ||
| push: true | ||
| tags: | | ||
| ${{ steps.meta.outputs.tag }} | ||
| ${{ env.IMAGE }}:latest | ||
|
|
||
| evals: | ||
| runs-on: ${{ matrix.suite.runner || 'ubicloud-standard-2' }} | ||
| needs: build-image | ||
| container: | ||
| image: ${{ needs.build-image.outputs.image-tag }} | ||
| credentials: | ||
| username: ${{ github.actor }} | ||
| password: ${{ secrets.GITHUB_TOKEN }} | ||
| options: --user runner | ||
| timeout-minutes: 25 | ||
| strategy: | ||
| fail-fast: false | ||
| matrix: | ||
| suite: | ||
| - name: llm-judge | ||
| file: test/skill-llm-eval.test.ts | ||
| - name: e2e-browse | ||
| file: test/skill-e2e-bws.test.ts | ||
| runner: ubicloud-standard-8 | ||
| - name: e2e-plan | ||
| file: test/skill-e2e-plan.test.ts | ||
| - name: e2e-deploy | ||
| file: test/skill-e2e-deploy.test.ts | ||
| - name: e2e-design | ||
| file: test/skill-e2e-design.test.ts | ||
| - name: e2e-qa-bugs | ||
| file: test/skill-e2e-qa-bugs.test.ts | ||
| - name: e2e-qa-workflow | ||
| file: test/skill-e2e-qa-workflow.test.ts | ||
| - name: e2e-review | ||
| file: test/skill-e2e-review.test.ts | ||
| - name: e2e-workflow | ||
| file: test/skill-e2e-workflow.test.ts | ||
| allow_failure: true # /ship + /setup-browser-cookies are env-dependent | ||
| - name: e2e-routing | ||
| file: test/skill-routing-e2e.test.ts | ||
| allow_failure: true # LLM routing is non-deterministic | ||
| - name: e2e-codex | ||
| file: test/codex-e2e.test.ts | ||
| - name: e2e-gemini | ||
| file: test/gemini-e2e.test.ts | ||
| steps: | ||
| - uses: actions/checkout@v4 | ||
| with: | ||
| fetch-depth: 0 | ||
|
|
||
| # Bun creates root-owned temp dirs during Docker build. GH Actions runs as | ||
| # runner user with HOME=/github/home. Redirect bun's cache to a writable dir. | ||
| - name: Fix bun temp | ||
| run: | | ||
| mkdir -p /home/runner/.cache/bun | ||
| { | ||
| echo "BUN_INSTALL_CACHE_DIR=/home/runner/.cache/bun" | ||
| echo "BUN_TMPDIR=/home/runner/.cache/bun" | ||
| echo "TMPDIR=/home/runner/.cache" | ||
| } >> "$GITHUB_ENV" | ||
|
|
||
| # Restore pre-installed node_modules from Docker image via symlink (~0s vs ~15s install) | ||
| - name: Restore deps | ||
| run: | | ||
| if [ -d /opt/node_modules_cache ] && diff -q /opt/node_modules_cache/.package.json package.json >/dev/null 2>&1; then | ||
| ln -s /opt/node_modules_cache node_modules | ||
| else | ||
| bun install | ||
| fi | ||
|
|
||
| - run: bun run build | ||
|
|
||
| # Verify Playwright can launch Chromium (fails fast if sandbox/deps are broken) | ||
| - name: Verify Chromium | ||
| if: matrix.suite.name == 'e2e-browse' | ||
| run: | | ||
| echo "whoami=$(whoami) HOME=$HOME TMPDIR=${TMPDIR:-unset}" | ||
| touch /tmp/.bun-test && rm /tmp/.bun-test && echo "/tmp writable" | ||
| bun -e "import {chromium} from 'playwright';const b=await chromium.launch({args:['--no-sandbox']});console.log('Chromium OK');await b.close()" | ||
|
|
||
| - name: Run ${{ matrix.suite.name }} | ||
| continue-on-error: ${{ matrix.suite.allow_failure || false }} | ||
| env: | ||
| ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} | ||
| OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} | ||
| GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} | ||
| EVALS_CONCURRENCY: "40" | ||
| PLAYWRIGHT_BROWSERS_PATH: /opt/playwright-browsers | ||
| run: EVALS=1 bun test --retry 2 --concurrent --max-concurrency 40 ${{ matrix.suite.file }} | ||
|
|
||
| - name: Upload eval results | ||
| if: always() | ||
| uses: actions/upload-artifact@v4 | ||
| with: | ||
| name: eval-${{ matrix.suite.name }} | ||
| path: ~/.gstack-dev/evals/*.json | ||
| retention-days: 90 | ||
|
|
||
| report: | ||
| runs-on: ubicloud-standard-2 | ||
| needs: evals | ||
| if: always() && github.event_name == 'pull_request' | ||
| timeout-minutes: 5 | ||
| permissions: | ||
| contents: read | ||
| pull-requests: write | ||
| steps: | ||
| - uses: actions/checkout@v4 | ||
| with: | ||
| fetch-depth: 1 | ||
|
|
||
| - name: Download all eval artifacts | ||
| uses: actions/download-artifact@v4 | ||
| with: | ||
| pattern: eval-* | ||
| path: /tmp/eval-results | ||
| merge-multiple: true | ||
|
|
||
| - name: Post PR comment | ||
| env: | ||
| GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} | ||
| run: | | ||
| # shellcheck disable=SC2086,SC2059 | ||
| RESULTS=$(find /tmp/eval-results -name '*.json' 2>/dev/null | sort) | ||
| if [ -z "$RESULTS" ]; then | ||
| echo "No eval results found" | ||
| exit 0 | ||
| fi | ||
|
|
||
| TOTAL=0; PASSED=0; FAILED=0; COST="0" | ||
| SUITE_LINES="" | ||
| for f in $RESULTS; do | ||
| if ! jq -e '.total_tests' "$f" >/dev/null 2>&1; then | ||
| echo "Skipping malformed JSON: $f" | ||
| continue | ||
| fi | ||
| T=$(jq -r '.total_tests // 0' "$f") | ||
| P=$(jq -r '.passed // 0' "$f") | ||
| F=$(jq -r '.failed // 0' "$f") | ||
| C=$(jq -r '.total_cost_usd // 0' "$f") | ||
| TIER=$(jq -r '.tier // "unknown"' "$f") | ||
| [ "$T" -eq 0 ] && continue | ||
| TOTAL=$((TOTAL + T)) | ||
| PASSED=$((PASSED + P)) | ||
| FAILED=$((FAILED + F)) | ||
| COST=$(echo "$COST + $C" | bc) | ||
| STATUS_ICON="✅" | ||
| [ "$F" -gt 0 ] && STATUS_ICON="❌" | ||
| SUITE_LINES="${SUITE_LINES}| ${TIER} | ${P}/${T} | ${STATUS_ICON} | \$${C} |\n" | ||
| done | ||
|
|
||
| STATUS="✅ PASS" | ||
| [ "$FAILED" -gt 0 ] && STATUS="❌ FAIL" | ||
|
|
||
| BODY="## E2E Evals: ${STATUS} | ||
|
|
||
| **${PASSED}/${TOTAL}** tests passed | **\$${COST}** total cost | **12 parallel runners** | ||
|
|
||
| | Suite | Result | Status | Cost | | ||
| |-------|--------|--------|------| | ||
| $(printf '%b' "$SUITE_LINES") | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Using $(printf '%b' "$SUITE_LINES") |
||
|
|
||
| --- | ||
| *12x ubicloud-standard-2 (Docker: pre-baked toolchain + deps) | wall clock ≈ slowest suite*" | ||
|
|
||
| if [ "$FAILED" -gt 0 ]; then | ||
| # Join failure lines with real newlines instead of a literal "\n" expanded | ||
| # later by echo -e, which would also interpret any backslashes appearing in | ||
| # test names or exit reasons read from the result JSON. | ||
| NL=$'\n' | ||
| FAILURES="" | ||
| for f in $RESULTS; do | ||
| if ! jq -e '.failed' "$f" >/dev/null 2>&1; then continue; fi | ||
| F=$(jq -r '.failed // 0' "$f") | ||
| [ "$F" -eq 0 ] && continue | ||
| FAILS=$(jq -r '.tests[] | select(.passed == false) | "- ❌ \(.name): \(.exit_reason // "unknown")"' "$f" 2>/dev/null || echo "- ⚠️ $(basename "$f"): parse error") | ||
| FAILURES="${FAILURES:+${FAILURES}${NL}}${FAILS}" | ||
| done | ||
| BODY="${BODY} | ||
|
|
||
| ### Failures | ||
| ${FAILURES}" | ||
| fi | ||
|
|
||
| # Update existing comment or create new one | ||
| COMMENT_ID=$(gh api repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments \ | ||
| --jq '.[] | select(.body | startswith("## E2E Evals")) | .id' | tail -1) | ||
|
|
||
| if [ -n "$COMMENT_ID" ]; then | ||
| gh api "repos/${{ github.repository }}/issues/comments/${COMMENT_ID}" \ | ||
| -X PATCH -f body="$BODY" | ||
| else | ||
| gh pr comment "${{ github.event.pull_request.number }}" --body "$BODY" | ||
| fi | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,25 @@ | ||
| name: Skill Docs Freshness | ||
| on: [push, pull_request] | ||
| jobs: | ||
| check-freshness: | ||
| runs-on: ubuntu-latest | ||
| steps: | ||
| - uses: actions/checkout@v4 | ||
| - uses: oven-sh/setup-bun@v2 | ||
| - run: bun install | ||
| - name: Check Claude host freshness | ||
| run: bun run gen:skill-docs | ||
| - name: Verify Claude skill docs are fresh | ||
| run: | | ||
| git diff --exit-code || { | ||
| echo "Generated SKILL.md files are stale. Run: bun run gen:skill-docs" | ||
| exit 1 | ||
| } | ||
| - name: Check Codex host freshness | ||
| run: bun run gen:skill-docs --host codex | ||
| - name: Verify Codex skill docs are fresh | ||
| run: | | ||
| git diff --exit-code -- .agents/ || { | ||
| echo "Generated Codex SKILL.md files are stale. Run: bun run gen:skill-docs --host codex" | ||
| exit 1 | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The Dockerfile has multiple `RUN apt-get update && ... && rm -rf /var/lib/apt/lists/*` commands. This is inefficient as it creates multiple layers and runs `apt-get update` repeatedly. It's a best practice to combine these into a single `RUN` instruction to reduce image size and build time. Consider performing all package source setup first, then a single `apt-get update`, followed by a single `apt-get install` for all packages, and finally a single cleanup step.