Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 28 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ export PINCHBENCH_OFFICIAL_KEY=your_official_key
| Flag | Description |
| ------------------------ | ----------------------------------------------------------------------------- |
| `--model MODEL` | Model to test (e.g., `openrouter/anthropic/claude-sonnet-4`) |
| `--judge MODEL` | Judge model for LLM grading (default: `openrouter/anthropic/claude-opus-4.5`) |
| `--judge MODEL` | Judge model for LLM grading; uses direct API when set (see below) |
| `--suite SUITE` | `all`, `automated-only`, or comma-separated task IDs |
| `--runs N` | Number of runs per task for averaging |
| `--timeout-multiplier N` | Scale timeouts for slower models |
Expand All @@ -103,6 +103,29 @@ export PINCHBENCH_OFFICIAL_KEY=your_official_key
| `--upload FILE` | Upload a previous results JSON |
| `--official-key KEY` | Mark submission as official (or use `PINCHBENCH_OFFICIAL_KEY` env var) |

### Judge

By default (no `--judge` flag), the LLM judge runs as an OpenClaw agent session. When `--judge` is specified, it calls the model API directly instead, bypassing OpenClaw personality injection.

```bash
# Default: OpenClaw agent session (no --judge needed)
./scripts/run.sh --model openrouter/anthropic/claude-sonnet-4

# Direct API via OpenRouter
./scripts/run.sh --model openai/gpt-4o --judge openrouter/anthropic/claude-sonnet-4-5

# Direct API via Anthropic
./scripts/run.sh --model openai/gpt-4o --judge anthropic/claude-sonnet-4-5-20250514

# Direct API via OpenAI
./scripts/run.sh --model openai/gpt-4o --judge openai/gpt-4o

# Headless Claude CLI
./scripts/run.sh --model openai/gpt-4o --judge claude
```

Required env vars: `OPENROUTER_API_KEY`, `ANTHROPIC_API_KEY`, or `OPENAI_API_KEY` depending on the judge model prefix.

## Contributing Tasks

We welcome new tasks! Check out [`tasks/TASK_TEMPLATE.md`](tasks/TASK_TEMPLATE.md) for the format. Good tasks are:
Expand All @@ -112,6 +135,10 @@ We welcome new tasks! Check out [`tasks/TASK_TEMPLATE.md`](tasks/TASK_TEMPLATE.m
- **Reproducible** — Same task should produce consistent grading
- **Challenging** — Tests agent capabilities, not just LLM knowledge

### Transcript Archive

Session transcripts are automatically saved to `results/{run_id}_transcripts/` alongside the results JSON. Each task's full agent conversation is preserved as a JSONL file (e.g. `task_01_calendar.jsonl`) for post-run analysis.

## Links

- **Leaderboard:** [pinchbench.com](https://pinchbench.com)
Expand Down
111 changes: 82 additions & 29 deletions scripts/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,11 @@ def _parse_args() -> argparse.Namespace:
parser.add_argument(
"--judge",
default=None,
help="Judge model identifier (default: openrouter/anthropic/claude-opus-4.5)",
help=(
"Judge model or backend. Default (unset): OpenClaw agent session with "
"openrouter/anthropic/claude-opus-4.5. Set to a model ID to call its API "
"directly (e.g. openai/gpt-4o, anthropic/claude-sonnet-4-5-20250514, claude)"
),
)
parser.add_argument(
"--verbose",
Expand Down Expand Up @@ -582,6 +586,47 @@ def main():
tasks_by_id = {task.task_id: task for task in tasks_to_run}

runs_per_task = max(1, args.runs)

# Incremental result writer: builds partial result JSON from completed
# tasks so external tools can poll progress while the benchmark runs.
incremental_dir = Path(args.output_dir)
incremental_dir.mkdir(parents=True, exist_ok=True)
incremental_path = incremental_dir / f"{run_id}_{model_slug}.json"

    def _write_incremental_results():
        """Write a best-effort partial results JSON for the tasks graded so far.

        The payload mirrors the final aggregate result but adds
        ``in_progress``/``completed_tasks``/``total_tasks`` markers so external
        tools can poll ``incremental_path`` while the benchmark is running.
        Closes over ``results``, ``grades_by_task_id``, ``tasks_by_id`` and the
        CLI args from the enclosing ``main()``.
        """
        # One entry per completed task; a task whose grading has not landed yet
        # gets an empty dict via .get(..., {}) rather than a KeyError.
        task_entries = [
            {
                "task_id": r["task_id"],
                "status": r["status"],
                "timed_out": r["timed_out"],
                "execution_time": r["execution_time"],
                "transcript_length": len(r["transcript"]),
                "usage": r.get("usage", {}),
                "workspace": r["workspace"],
                "grading": grades_by_task_id.get(r["task_id"], {}),
                "frontmatter": tasks_by_id[r["task_id"]].frontmatter,
            }
            for r in results
        ]
        efficiency = _compute_efficiency_summary(task_entries, grades_by_task_id)
        partial = {
            "model": args.model,
            "benchmark_version": _get_git_version(skill_root),
            "run_id": run_id,
            "timestamp": time.time(),
            "suite": args.suite,
            "runs_per_task": runs_per_task,
            "tasks": task_entries,
            "efficiency": efficiency,
            # Progress markers distinguish this snapshot from the final file.
            "in_progress": True,
            "completed_tasks": len(grades_by_task_id),
            "total_tasks": len(tasks_to_run),
        }
        try:
            incremental_path.write_text(json.dumps(partial, indent=2), encoding="utf-8")
        except OSError:
            # Best-effort snapshot: a failed progress write must not abort the run.
            pass

for i, task in enumerate(tasks_to_run, 1):
task_grades = []
task_results = []
Expand All @@ -604,6 +649,7 @@ def main():
run_id=f"{run_id}-{run_index + 1}",
timeout_multiplier=args.timeout_multiplier,
skill_dir=skill_dir,
output_dir=Path(args.output_dir) / f"{run_id}_transcripts",
verbose=args.verbose,
)
except Exception as exc:
Expand All @@ -628,6 +674,7 @@ def main():
)
if args.judge:
grade_kwargs["judge_model"] = args.judge
grade_kwargs["judge_backend"] = "api"
grade = grade_task(**grade_kwargs)
except Exception as exc:
if execution_error:
Expand Down Expand Up @@ -693,39 +740,45 @@ def main():
"⚠️ Sanity check scored 0%% but transcripts were missing for all runs; skipping fail-fast as likely infrastructure/logging issue."
)

# Incremental write: update result JSON after each task so partial
# results are available while the benchmark is still running.
_write_incremental_results()

output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
output_path = output_dir / f"{run_id}_{model_slug}.json"

task_entries = [
{
"task_id": result["task_id"],
"status": result["status"],
"timed_out": result["timed_out"],
"execution_time": result["execution_time"],
"transcript_length": len(result["transcript"]),
"usage": result.get("usage", {}),
"workspace": result["workspace"],
"grading": grades_by_task_id[result["task_id"]],
"frontmatter": tasks_by_id[result["task_id"]].frontmatter,
def _build_and_write_results():
"""Build aggregate result from completed tasks and write to output_path."""
task_entries = [
{
"task_id": result["task_id"],
"status": result["status"],
"timed_out": result["timed_out"],
"execution_time": result["execution_time"],
"transcript_length": len(result["transcript"]),
"usage": result.get("usage", {}),
"workspace": result["workspace"],
"grading": grades_by_task_id[result["task_id"]],
"frontmatter": tasks_by_id[result["task_id"]].frontmatter,
}
for result in results
]
efficiency = _compute_efficiency_summary(task_entries, grades_by_task_id)
aggregate = {
"model": args.model,
"benchmark_version": _get_git_version(skill_root),
"run_id": run_id,
"timestamp": time.time(),
"suite": args.suite,
"runs_per_task": runs_per_task,
"tasks": task_entries,
"efficiency": efficiency,
}
for result in results
]

efficiency = _compute_efficiency_summary(task_entries, grades_by_task_id)

aggregate = {
"model": args.model,
"benchmark_version": _get_git_version(skill_root),
"run_id": run_id,
"timestamp": time.time(),
"suite": args.suite,
"runs_per_task": runs_per_task,
"tasks": task_entries,
"efficiency": efficiency,
}
output_path.write_text(json.dumps(aggregate, indent=2), encoding="utf-8")
return task_entries, efficiency

output_path = output_dir / f"{run_id}_{model_slug}.json"
output_path.write_text(json.dumps(aggregate, indent=2), encoding="utf-8")
task_entries, efficiency = _build_and_write_results()

# Calculate and log final score summary
total_score = sum(grades_by_task_id[tid]["mean"] for tid in grades_by_task_id)
Expand Down
Loading