sync

Frank · Frank · commit 42beb8f16c68 · 2025-12-26T14:29:43.000-05:00
diff --git a/.github/workflows/run-benchmark.yml b/.github/workflows/run-benchmark.yml
@@ -3,8 +3,8 @@ name: Run Benchmark
 on:
   workflow_dispatch:
     inputs:
-      run:
-        description: "Number of times to run the benchmark"
+      agent:
+        description: "Agent to use"
         required: true
         type: string
       model:
@@ -34,6 +34,7 @@ jobs:
     strategy:
       matrix:
         task: ${{ fromJson(needs.prepare.outputs.tasks) }}
+        run: [1, 2, 3]
     environment: production
     steps:
       - name: Checkout repository
@@ -52,16 +53,110 @@ jobs:
 
       - name: Print benchmark config
         env:
-          RUN_COUNT: ${{ inputs.run }}
           MODEL: ${{ inputs.model }}
           TASK: ${{ matrix.task }}
+          RUN: ${{ matrix.run }}
         run: |
-          echo "Run count: ${RUN_COUNT}"
           echo "Model: ${MODEL}"
           echo "Task: ${TASK}"
+          echo "Run: ${RUN}"
 
       - name: Run benchmark
         env:
           OPENCODE_API_KEY: ${{ secrets.OPENCODE_API_KEY }}
           DEBUG: true
-        run: bun dev opencode --task ${{ matrix.task }} --model ${{ inputs.model }}
+          TASK: ${{ matrix.task }}
+          MODEL: ${{ inputs.model }}
+          AGENT: ${{ inputs.agent }}
+          RESULT_PATH: result-${{ matrix.task }}-${{ inputs.model }}-${{ inputs.agent }}-run${{ matrix.run }}.json
+        run: bun github/run.ts
+
+      - name: Upload benchmark results
+        uses: actions/upload-artifact@v4
+        with:
+          name: result-${{ matrix.task }}-${{ inputs.model }}-${{ inputs.agent }}-run${{ matrix.run }}
+          path: result-${{ matrix.task }}-${{ inputs.model }}-${{ inputs.agent }}-run${{ matrix.run }}.json
+
+  summarize-runs:
+    needs: [prepare, benchmark] # the `needs` context only exposes direct dependencies, and the matrix below reads needs.prepare
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        task: ${{ fromJson(needs.prepare.outputs.tasks) }}
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Setup Bun
+        uses: oven-sh/setup-bun@v1
+        with:
+          bun-version: 1.2.21
+
+      - name: Install dependencies
+        run: bun install
+
+      - name: Download run 1 results # run numbers 1..3 mirror the benchmark job's `run: [1, 2, 3]` matrix
+        uses: actions/download-artifact@v4
+        with:
+          name: result-${{ matrix.task }}-${{ inputs.model }}-${{ inputs.agent }}-run1
+          path: results
+
+      - name: Download run 2 results
+        uses: actions/download-artifact@v4
+        with:
+          name: result-${{ matrix.task }}-${{ inputs.model }}-${{ inputs.agent }}-run2
+          path: results
+
+      - name: Download run 3 results
+        uses: actions/download-artifact@v4
+        with:
+          name: result-${{ matrix.task }}-${{ inputs.model }}-${{ inputs.agent }}-run3
+          path: results
+
+      - name: Summarize runs
+        env: # RESULT_PATHS is comma-separated; github/summarize-runs.ts splits it on ","
+          RESULT_PATHS: results/result-${{ matrix.task }}-${{ inputs.model }}-${{ inputs.agent }}-run1.json,results/result-${{ matrix.task }}-${{ inputs.model }}-${{ inputs.agent }}-run2.json,results/result-${{ matrix.task }}-${{ inputs.model }}-${{ inputs.agent }}-run3.json
+          RUNS_SUMMARY_PATH: runs-summary-${{ matrix.task }}-${{ inputs.model }}-${{ inputs.agent }}.json
+        run: bun github/summarize-runs.ts
+
+      - name: Upload runs summary
+        uses: actions/upload-artifact@v4
+        with: # NOTE(review): artifact names may not contain "/" - confirm task and model ids are slash-free
+          name: runs-summary-${{ matrix.task }}-${{ inputs.model }}-${{ inputs.agent }}
+          path: runs-summary-${{ matrix.task }}-${{ inputs.model }}-${{ inputs.agent }}.json
+
+  summarize-tasks:
+    needs: summarize-runs
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Setup Bun
+        uses: oven-sh/setup-bun@v1
+        with:
+          bun-version: 1.2.21
+
+      - name: Install dependencies
+        run: bun install
+
+      - name: Download all runs summaries
+        uses: actions/download-artifact@v4
+        with:
+          pattern: runs-summary-*
+          path: runs-summaries
+
+      - name: Summarize tasks
+        env:
+          TASKS_SUMMARY_PATH: tasks-summary.json
+        run: |
+          # Collect every downloaded summary file into a sorted, comma-separated list for the script.
+          RUNS_SUMMARY_PATHS=$(find runs-summaries -name 'runs-summary-*.json' | sort | tr '\n' ',' | sed 's/,$//')
+          export RUNS_SUMMARY_PATHS
+          bun github/summarize-tasks.ts
+
+      - name: Upload tasks summary
+        uses: actions/upload-artifact@v4
+        with:
+          name: tasks-summary
+          path: tasks-summary.json
diff --git a/cli.ts b/cli.ts
@@ -6,8 +6,6 @@ import { hideBin } from "yargs/helpers";
 import { Agent } from "~/src/agents/index.js";
 import { Task } from "~/src/tasks/index.js";
 import { Summarizer } from "~/src/summarizer.js";
-import { withRetries } from "~/src/util/retry.js";
-import { buildRadarChartUrl } from "~/src/util/charts.js";
 import { Logger } from "~/src/util/logger.js";
 import { Eval } from "./src/eval.js";
 
@@ -21,9 +19,6 @@ const cli = yargs(hideBin(process.argv))
     [
       "$0 opencode --model opencode/gpt-5-codex --task DataDog/datadog-lambda-python@93d4a07..d776378",
     ],
-    [
-      "$0 opencode --model opencode/claude-sonnet-4-5 --task DataDog/datadog-lambda-python@93d4a07..d776378 --output results.json",
-    ],
   ])
   .strict();
 
@@ -58,37 +53,14 @@ cli.command(
         type: "string",
         description: "task to use in the format of repo@from..to",
         required: true,
-      })
-      .option("timeout", {
-        type: "number",
-        description: "timeout in minutes for each episode",
-        default: 40,
-      })
-      .option("output", {
-        type: "string",
-        description: "output file to save the results to",
       }),
-  async ({
-    agent: agentName,
-    model: modelId,
-    task: taskId,
-    timeout: timeoutMins,
-    output: outputPath,
-  }) => {
+  async ({ agent: agentName, model: modelId, task: taskId }) => {
     if (!agentName) throw new Error("Agent name is required");
 
     const logger = Logger.create(`[model ${modelId}]`);
 
-    // Run episodes
-    logger.log(`Starting episode with ${timeoutMins}min timeout...`);
-    const result = await withRetries(
-      () => Eval.run(agentName, modelId, taskId, { logger }),
-      {
-        retries: 3,
-        timeoutMs: timeoutMins * 60 * 1000,
-        logger,
-      },
-    );
+    // Run eval
+    const result = await Eval.run(agentName, modelId, taskId, { logger });
 
     // Summary episodes
     const summary = await Summarizer.summarizeRuns([result]);
@@ -129,11 +101,6 @@ cli.command(
     //  datasetLabel: modelId,
     //});
     //logger.log(`Radar Chart: ${chartUrl}\n`);
-
-    // Store result
-    if (outputPath) {
-      await writeFile(outputPath, JSON.stringify(result));
-    }
   },
 );
 
diff --git a/github/run.ts b/github/run.ts
@@ -0,0 +1,17 @@
+#!/usr/bin/env bun
+import { Logger } from "../src/util/logger.js";
+import { Eval } from "../src/eval.js";
+import { writeFile } from "node:fs/promises";
+
+// Validate inputs up front: a missing variable must fail here, not after the eval has already run.
+const { TASK: task, MODEL: model, AGENT: agent, RESULT_PATH: resultPath } = process.env;
+if (!task || !model || !agent || !resultPath)
+  throw new Error("TASK, MODEL, AGENT and RESULT_PATH must all be set");
+
+// Run eval
+const result = await Eval.run(agent, model, task, {
+  logger: Logger.create(`[model ${model}]`),
+});
+
+// Store result as JSON at the path the workflow uploads as an artifact
+await writeFile(resultPath, JSON.stringify(result));
diff --git a/github/summarize-runs.ts b/github/summarize-runs.ts
@@ -0,0 +1,17 @@
+#!/usr/bin/env bun
+import { readFile, writeFile } from "node:fs/promises";
+import { Summarizer } from "../src/summarizer.js";
+
+// Fail with a clear message when an input is missing or empty.
+const { RESULT_PATHS: resultPaths, RUNS_SUMMARY_PATH: runsSummaryPath } = process.env;
+if (!resultPaths || !runsSummaryPath)
+  throw new Error("RESULT_PATHS and RUNS_SUMMARY_PATH must be set");
+
+// RESULT_PATHS is a comma-separated list of JSON files; read and parse them concurrently.
+const results = await Promise.all(
+  resultPaths.split(",").map(async (resultPath) => JSON.parse(await readFile(resultPath, "utf8"))),
+);
+
+const summary = await Summarizer.summarizeRuns(results);
+
+await writeFile(runsSummaryPath, JSON.stringify(summary));
diff --git a/github/summarize-tasks.ts b/github/summarize-tasks.ts
@@ -0,0 +1,17 @@
+#!/usr/bin/env bun
+import { readFile, writeFile } from "node:fs/promises";
+import { Summarizer } from "../src/summarizer.js";
+
+// Fail with a clear message when an input is missing or empty (e.g. no summary files were found).
+const { RUNS_SUMMARY_PATHS: runsSummaryPaths, TASKS_SUMMARY_PATH: tasksSummaryPath } = process.env;
+if (!runsSummaryPaths || !tasksSummaryPath)
+  throw new Error("RUNS_SUMMARY_PATHS and TASKS_SUMMARY_PATH must be set");
+
+// RUNS_SUMMARY_PATHS is a comma-separated list of JSON files; read and parse them concurrently.
+const runsSummaries = await Promise.all(
+  runsSummaryPaths.split(",").map(async (path) => JSON.parse(await readFile(path, "utf8"))),
+);
+
+const summary = await Summarizer.summarizeTasks(runsSummaries);
+
+await writeFile(tasksSummaryPath, JSON.stringify(summary));
diff --git a/src/eval.ts b/src/eval.ts
@@ -12,6 +12,7 @@ import { Metric } from "./metrics/index.js";
 import { average, variance, weightedSum } from "./util/math.js";
 import { Judge } from "./judges.js";
 import { getZenLanguageModel } from "./zenModels.js";
+import { withRetries } from "./util/retry.js";
 
 export namespace Eval {
   export const DISAGREEMENT_PENALTY = 0.5;
@@ -24,6 +25,26 @@ export namespace Eval {
     opts: {
       logger: Logger.Instance;
     },
+  ) {
+    const timeoutMins = 40;
+    opts.logger.log(`Starting episode with ${timeoutMins}min timeout...`);
+    return await withRetries(
+      () => runOnce(agentName, modelId, taskId, { logger: opts.logger }),
+      {
+        retries: 3,
+        timeoutMs: timeoutMins * 60 * 1000,
+        logger: opts.logger,
+      },
+    );
+  }
+
+  async function runOnce(
+    agentName: string,
+    modelId: string,
+    taskId: string,
+    opts: {
+      logger: Logger.Instance;
+    },
   ) {
     const agent = Agent.get(agentName);
     Agent.validateModel(agent, modelId);