This guide shows you how to create a benchmark for your customized workflow using the self-improving coding agent framework.
Create a new file in base_agent/src/benchmarks/ (e.g., my_benchmark.py):
from .base import BaseBenchmark, Problem


class MyBenchmark(BaseBenchmark):
    """Skeleton benchmark for the self-improving coding agent framework.

    Subclasses of BaseBenchmark provide a list of problems and a scoring
    routine; the framework handles running the agent against each problem.
    """

    name = "my_benchmark"  # Unique identifier used to select this benchmark

    def __init__(self, seed=None, subset_size=None):
        # seed / subset_size are forwarded to BaseBenchmark, which
        # presumably uses them to sample a reproducible problem subset
        # — confirm against base.py.
        super().__init__(seed, subset_size)
        self._problems = self._load_problems()

    @property
    def problems(self) -> list[Problem]:
        """The problems this benchmark poses to the agent."""
        return self._problems

    async def score_problem(self, problem, agent_workdir,
                            agent_answer_dir, container_name):
        # Your scoring logic here
        pass

# Each problem needs:
- problem_id: Unique identifier (string)
- statement: What you want the agent to do
- answer: Expected result
- answer_discussion: Optional explanation
Example:

Problem(
    problem_id="task_1",
    statement="Calculate 2+2 and submit the answer",
    answer=4,
    answer_discussion="Simple addition"
)

The score_problem method should:
- Read the agent's answer from agent_answer_dir/answer.txt
- Compare it to the expected answer
- Return a (score, error, discussion) tuple
async def score_problem(self, problem, agent_workdir,
                        agent_answer_dir, container_name):
    """Score a problem by exact string match against answer.txt.

    Reads the agent's submission from agent_answer_dir/answer.txt and
    compares it (stripped) to str(problem.answer).

    Returns:
        A (score, error, discussion) tuple: score is 1.0 on match and
        0.0 otherwise; error is None on success or a short message;
        discussion echoes problem.answer_discussion.
    """
    answer_path = Path(agent_answer_dir) / "answer.txt"
    # The agent may have failed to submit anything; treat a missing
    # file as a zero score rather than raising FileNotFoundError.
    if not answer_path.exists():
        return 0.0, "No answer file submitted", problem.answer_discussion
    submitted = answer_path.read_text().strip()
    if submitted == str(problem.answer):
        return 1.0, None, problem.answer_discussion
    return 0.0, "Answer mismatch", problem.answer_discussion

# If your workflow needs input files, implement setup_problem:
async def setup_problem(self, problem, problem_data_dir, container_name):
    """Create any input files the agent will need before it starts.

    problem_data_dir is a pathlib.Path; files written here are made
    available to the agent for this problem.
    """
    # Create files the agent will need
    (problem_data_dir / "input.txt").write_text("Input data here")

# The benchmark should be automatically discoverable. To use it:
from base_agent.src.benchmarks.my_benchmark import MyBenchmark

benchmark = MyBenchmark()
# Run with your agent

Example scoring strategies:

# Exact string match
score = 1.0 if submitted == expected else 0.0

# Numeric comparison with tolerance
if abs(float(submitted) - expected) < 1e-6:
    score = 1.0

# Inspect files the agent produced in its working directory
output_file = Path(agent_workdir) / "result.txt"
if output_file.exists():
    content = output_file.read_text()
    # Check content

# Execute code and check results
# (requires running commands in container)

# Partial credit across multiple conditions
score = 0.0
if condition1:
    score += 0.5
if condition2:
    score += 0.5
return score, None, discussion

Load problems from various sources:
From a JSONL file:

with open("problems.jsonl") as f:
    for line in f:
        data = json.loads(line)
        problems.append(Problem(...))

From a Hugging Face dataset:

from datasets import load_dataset
dataset = load_dataset("org/dataset")
problems = [Problem(...) for item in dataset["test"]]

From a CSV file:

import csv
with open("problems.csv") as f:
    reader = csv.DictReader(f)
    problems = [Problem(
        problem_id=row["id"],
        statement=row["question"],
        answer=row["answer"]
    ) for row in reader]

Generated programmatically:

problems = [
    Problem(
        problem_id=f"problem_{i}",
        statement=f"Solve task {i}",
        answer=compute_answer(i)
    ) for i in range(100)
]

Create a simple test:
# tests/benchmarks/test_my_benchmark.py
import pytest

from base_agent.src.benchmarks.my_benchmark import MyBenchmark


def test_benchmark_creation():
    """The benchmark loads at least one problem and reports its name."""
    benchmark = MyBenchmark()
    assert len(benchmark.problems) > 0
    assert benchmark.name == "my_benchmark"


@pytest.mark.asyncio
async def test_scoring():
    """Scoring a mock answer file yields a score in [0.0, 1.0]."""
    benchmark = MyBenchmark()
    problem = benchmark.problems[0]
    # Create mock answer file
    from pathlib import Path
    import tempfile

    with tempfile.TemporaryDirectory() as tmpdir:
        answer_dir = Path(tmpdir)
        (answer_dir / "answer.txt").write_text("expected_answer")
        score, error, discussion = await benchmark.score_problem(
            problem, tmpdir, str(answer_dir), "test_container"
        )
        # Chained comparison is the idiomatic range check
        assert 0.0 <= score <= 1.0

# Run from project root
python runner.py --benchmark my_benchmark --subset-size 5

Tips — override in your benchmark if needed:
Use Docker API to run commands in the agent container for verification
Check multiple output files in agent_workdir
For long-running tasks, implement intermediate checkpoints
Give the agent only partial information, test information gathering
Study these existing benchmarks for reference:
- gsm8k.py - Simple numeric answer evaluation
- humaneval.py - Code generation and execution testing
- file_editing.py - File manipulation tasks
- swebench_verified.py - Complex software engineering tasks
- Create your benchmark file in base_agent/src/benchmarks/
- Implement the required methods
- Create test problems/data
- Write unit tests
- Run with a small subset to validate
- Scale up to full benchmark
For more help, check:
- base.py - Base class definition
- custom_example.py - Detailed examples
- Existing benchmark implementations
Happy benchmarking!