Skip to content

Commit 42beb8f

Browse files
author
Frank
committed
sync
1 parent b8c07e6 commit 42beb8f

File tree

6 files changed

+175
-41
lines changed

6 files changed

+175
-41
lines changed

.github/workflows/run-benchmark.yml

Lines changed: 100 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@ name: Run Benchmark
33
on:
44
workflow_dispatch:
55
inputs:
6-
run:
7-
description: "Number of times to run the benchmark"
6+
agent:
7+
description: "Agent to use"
88
required: true
99
type: string
1010
model:
@@ -34,6 +34,7 @@ jobs:
3434
strategy:
3535
matrix:
3636
task: ${{ fromJson(needs.prepare.outputs.tasks) }}
37+
run: [1, 2, 3]
3738
environment: production
3839
steps:
3940
- name: Checkout repository
@@ -52,16 +53,110 @@ jobs:
5253

5354
- name: Print benchmark config
5455
env:
55-
RUN_COUNT: ${{ inputs.run }}
5656
MODEL: ${{ inputs.model }}
5757
TASK: ${{ matrix.task }}
58+
RUN: ${{ matrix.run }}
5859
run: |
59-
echo "Run count: ${RUN_COUNT}"
6060
echo "Model: ${MODEL}"
6161
echo "Task: ${TASK}"
62+
echo "Run: ${RUN}"
6263
6364
- name: Run benchmark
6465
env:
6566
OPENCODE_API_KEY: ${{ secrets.OPENCODE_API_KEY }}
6667
DEBUG: true
67-
run: bun dev opencode --task ${{ matrix.task }} --model ${{ inputs.model }}
68+
TASK: ${{ matrix.task }}
69+
MODEL: ${{ inputs.model }}
70+
AGENT: ${{ inputs.agent }}
71+
RESULT_PATH: result-${{ matrix.task }}-${{ inputs.model }}-${{ inputs.agent }}-run${{ matrix.run }}.json
72+
run: bun github/run.ts
73+
74+
- name: Upload benchmark results
75+
uses: actions/upload-artifact@v4
76+
with:
77+
name: result-${{ matrix.task }}-${{ inputs.model }}-${{ inputs.agent }}-run${{ matrix.run }}
78+
path: result-${{ matrix.task }}-${{ inputs.model }}-${{ inputs.agent }}-run${{ matrix.run }}.json
79+
80+
summarize-runs:
81+
needs: [prepare, benchmark] # prepare must be a direct dependency: the matrix below reads needs.prepare.outputs.tasks, and the needs context only exposes direct dependencies
82+
runs-on: ubuntu-latest
83+
strategy:
84+
matrix:
85+
task: ${{ fromJson(needs.prepare.outputs.tasks) }}
86+
steps:
87+
- name: Checkout repository
88+
uses: actions/checkout@v4
89+
90+
- name: Setup Bun
91+
uses: oven-sh/setup-bun@v1
92+
with:
93+
bun-version: 1.2.21
94+
95+
- name: Install dependencies
96+
run: bun install
97+
98+
- name: Download run 1 results
99+
uses: actions/download-artifact@v4
100+
with:
101+
name: result-${{ matrix.task }}-${{ inputs.model }}-${{ inputs.agent }}-run1
102+
path: results
103+
104+
- name: Download run 2 results
105+
uses: actions/download-artifact@v4
106+
with:
107+
name: result-${{ matrix.task }}-${{ inputs.model }}-${{ inputs.agent }}-run2
108+
path: results
109+
110+
- name: Download run 3 results
111+
uses: actions/download-artifact@v4
112+
with:
113+
name: result-${{ matrix.task }}-${{ inputs.model }}-${{ inputs.agent }}-run3
114+
path: results
115+
116+
- name: Summarize runs
117+
env:
118+
RESULT_PATHS: results/result-${{ matrix.task }}-${{ inputs.model }}-${{ inputs.agent }}-run1.json,results/result-${{ matrix.task }}-${{ inputs.model }}-${{ inputs.agent }}-run2.json,results/result-${{ matrix.task }}-${{ inputs.model }}-${{ inputs.agent }}-run3.json
119+
RUNS_SUMMARY_PATH: runs-summary-${{ matrix.task }}-${{ inputs.model }}-${{ inputs.agent }}.json
120+
run: bun github/summarize-runs.ts
121+
122+
- name: Upload runs summary
123+
uses: actions/upload-artifact@v4
124+
with:
125+
name: runs-summary-${{ matrix.task }}-${{ inputs.model }}-${{ inputs.agent }}
126+
path: runs-summary-${{ matrix.task }}-${{ inputs.model }}-${{ inputs.agent }}.json
127+
128+
summarize-tasks:
129+
needs: summarize-runs
130+
runs-on: ubuntu-latest
131+
steps:
132+
- name: Checkout repository
133+
uses: actions/checkout@v4
134+
135+
- name: Setup Bun
136+
uses: oven-sh/setup-bun@v1
137+
with:
138+
bun-version: 1.2.21
139+
140+
- name: Install dependencies
141+
run: bun install
142+
143+
- name: Download all runs summaries
144+
uses: actions/download-artifact@v4
145+
with:
146+
pattern: runs-summary-*
147+
path: runs-summaries
148+
149+
- name: Summarize tasks
150+
env:
151+
RUNS_SUMMARY_PATHS: runs-summaries/*/runs-summary-*.json
152+
run: |
153+
RUNS_SUMMARY_PATHS_COMMA=$(find runs-summaries -name 'runs-summary-*.json' | tr '\n' ',' | sed 's/,$//')
154+
export RUNS_SUMMARY_PATHS="$RUNS_SUMMARY_PATHS_COMMA"
155+
export TASKS_SUMMARY_PATH=tasks-summary.json
156+
bun github/summarize-tasks.ts
157+
158+
- name: Upload tasks summary
159+
uses: actions/upload-artifact@v4
160+
with:
161+
name: tasks-summary
162+
path: tasks-summary.json

cli.ts

Lines changed: 3 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,6 @@ import { hideBin } from "yargs/helpers";
66
import { Agent } from "~/src/agents/index.js";
77
import { Task } from "~/src/tasks/index.js";
88
import { Summarizer } from "~/src/summarizer.js";
9-
import { withRetries } from "~/src/util/retry.js";
10-
import { buildRadarChartUrl } from "~/src/util/charts.js";
119
import { Logger } from "~/src/util/logger.js";
1210
import { Eval } from "./src/eval.js";
1311

@@ -21,9 +19,6 @@ const cli = yargs(hideBin(process.argv))
2119
[
2220
"$0 opencode --model opencode/gpt-5-codex --task DataDog/datadog-lambda-python@93d4a07..d776378",
2321
],
24-
[
25-
"$0 opencode --model opencode/claude-sonnet-4-5 --task DataDog/datadog-lambda-python@93d4a07..d776378 --output results.json",
26-
],
2722
])
2823
.strict();
2924

@@ -58,37 +53,14 @@ cli.command(
5853
type: "string",
5954
description: "task to use in the format of repo@from..to",
6055
required: true,
61-
})
62-
.option("timeout", {
63-
type: "number",
64-
description: "timeout in minutes for each episode",
65-
default: 40,
66-
})
67-
.option("output", {
68-
type: "string",
69-
description: "output file to save the results to",
7056
}),
71-
async ({
72-
agent: agentName,
73-
model: modelId,
74-
task: taskId,
75-
timeout: timeoutMins,
76-
output: outputPath,
77-
}) => {
57+
async ({ agent: agentName, model: modelId, task: taskId }) => {
7858
if (!agentName) throw new Error("Agent name is required");
7959

8060
const logger = Logger.create(`[model ${modelId}]`);
8161

82-
// Run episodes
83-
logger.log(`Starting episode with ${timeoutMins}min timeout...`);
84-
const result = await withRetries(
85-
() => Eval.run(agentName, modelId, taskId, { logger }),
86-
{
87-
retries: 3,
88-
timeoutMs: timeoutMins * 60 * 1000,
89-
logger,
90-
},
91-
);
62+
// Run eval
63+
const result = await Eval.run(agentName, modelId, taskId, { logger });
9264

9365
// Summarize episodes
9466
const summary = await Summarizer.summarizeRuns([result]);
@@ -129,11 +101,6 @@ cli.command(
129101
// datasetLabel: modelId,
130102
//});
131103
//logger.log(`Radar Chart: ${chartUrl}\n`);
132-
133-
// Store result
134-
if (outputPath) {
135-
await writeFile(outputPath, JSON.stringify(result));
136-
}
137104
},
138105
);
139106

github/run.ts

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#!/usr/bin/env bun
2+
import { Logger } from "../src/util/logger.js";
3+
import { Eval } from "../src/eval.js";
4+
import { writeFile } from "node:fs/promises";
5+
6+
const task = process.env.TASK!;
7+
const model = process.env.MODEL!;
8+
const agent = process.env.AGENT!;
9+
const resultPath = process.env.RESULT_PATH!;
10+
11+
// Run eval
12+
const result = await Eval.run(agent, model, task, {
13+
logger: Logger.create(`[model ${model}]`),
14+
});
15+
16+
// Store result
17+
await writeFile(resultPath, JSON.stringify(result));

github/summarize-runs.ts

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#!/usr/bin/env bun
2+
import { readFile, writeFile } from "node:fs/promises";
3+
import { Summarizer } from "../src/summarizer.js";
4+
5+
const resultPaths = process.env.RESULT_PATHS!;
6+
const runsSummaryPath = process.env.RUNS_SUMMARY_PATH!;
7+
8+
const results = await Promise.all(
9+
resultPaths.split(",").map(async (resultPath) => {
10+
const result = await readFile(resultPath, "utf8");
11+
return JSON.parse(result);
12+
}),
13+
);
14+
15+
const summary = await Summarizer.summarizeRuns(results);
16+
17+
await writeFile(runsSummaryPath, JSON.stringify(summary));

github/summarize-tasks.ts

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#!/usr/bin/env bun
2+
import { readFile, writeFile } from "node:fs/promises";
3+
import { Summarizer } from "../src/summarizer.js";
4+
5+
const runsSummaryPaths = process.env.RUNS_SUMMARY_PATHS!;
6+
const tasksSummaryPath = process.env.TASKS_SUMMARY_PATH!;
7+
8+
const runsSummaries = await Promise.all(
9+
runsSummaryPaths.split(",").map(async (runsSummaryPath) => {
10+
const runsSummary = await readFile(runsSummaryPath, "utf8");
11+
return JSON.parse(runsSummary);
12+
}),
13+
);
14+
15+
const summary = await Summarizer.summarizeTasks(runsSummaries);
16+
17+
await writeFile(tasksSummaryPath, JSON.stringify(summary));

src/eval.ts

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import { Metric } from "./metrics/index.js";
1212
import { average, variance, weightedSum } from "./util/math.js";
1313
import { Judge } from "./judges.js";
1414
import { getZenLanguageModel } from "./zenModels.js";
15+
import { withRetries } from "./util/retry.js";
1516

1617
export namespace Eval {
1718
export const DISAGREEMENT_PENALTY = 0.5;
@@ -24,6 +25,26 @@ export namespace Eval {
2425
opts: {
2526
logger: Logger.Instance;
2627
},
28+
) {
29+
const timeoutMins = 40;
30+
opts.logger.log(`Starting episode with ${timeoutMins}min timeout...`);
31+
return await withRetries(
32+
() => runOnce(agentName, modelId, taskId, { logger: opts.logger }),
33+
{
34+
retries: 3,
35+
timeoutMs: timeoutMins * 60 * 1000,
36+
logger: opts.logger,
37+
},
38+
);
39+
}
40+
41+
async function runOnce(
42+
agentName: string,
43+
modelId: string,
44+
taskId: string,
45+
opts: {
46+
logger: Logger.Instance;
47+
},
2748
) {
2849
const agent = Agent.get(agentName);
2950
Agent.validateModel(agent, modelId);

0 commit comments

Comments
 (0)