Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 24 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ export PINCHBENCH_OFFICIAL_KEY=your_official_key
| Flag | Description |
| ------------------------ | ----------------------------------------------------------------------------- |
| `--model MODEL` | Model to test (e.g., `openrouter/anthropic/claude-sonnet-4`) |
| `--judge MODEL` | Judge model for LLM grading (default: `openrouter/anthropic/claude-opus-4.5`) |
| `--judge MODEL` | Judge model for LLM grading; uses direct API when set (see below) |
| `--suite SUITE` | `all`, `automated-only`, or comma-separated task IDs |
| `--runs N` | Number of runs per task for averaging |
| `--timeout-multiplier N` | Scale timeouts for slower models |
Expand All @@ -103,6 +103,29 @@ export PINCHBENCH_OFFICIAL_KEY=your_official_key
| `--upload FILE` | Upload a previous results JSON |
| `--official-key KEY` | Mark submission as official (or use `PINCHBENCH_OFFICIAL_KEY` env var) |

### Judge

By default (no `--judge` flag), the LLM judge runs as an OpenClaw agent session. When `--judge` is specified, it calls the model API directly instead, bypassing OpenClaw personality injection.

```bash
# Default: OpenClaw agent session (no --judge needed)
./scripts/run.sh --model openrouter/anthropic/claude-sonnet-4

# Direct API via OpenRouter
./scripts/run.sh --model openai/gpt-4o --judge openrouter/anthropic/claude-sonnet-4-5

# Direct API via Anthropic
./scripts/run.sh --model openai/gpt-4o --judge anthropic/claude-sonnet-4-5-20250514

# Direct API via OpenAI
./scripts/run.sh --model openai/gpt-4o --judge openai/gpt-4o

# Headless Claude CLI
./scripts/run.sh --model openai/gpt-4o --judge claude
```

Required env vars: `OPENROUTER_API_KEY`, `ANTHROPIC_API_KEY`, or `OPENAI_API_KEY` depending on the judge model prefix.

## Contributing Tasks

We welcome new tasks! Check out [`tasks/TASK_TEMPLATE.md`](tasks/TASK_TEMPLATE.md) for the format. Good tasks are:
Expand Down
7 changes: 6 additions & 1 deletion scripts/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,11 @@ def _parse_args() -> argparse.Namespace:
parser.add_argument(
"--judge",
default=None,
help="Judge model identifier (default: openrouter/anthropic/claude-opus-4.5)",
help=(
"Judge model or backend. Default (unset): OpenClaw agent session with "
"openrouter/anthropic/claude-opus-4.5. Set to a model ID to call its API "
"directly (e.g. openai/gpt-4o, anthropic/claude-sonnet-4-5-20250514, claude)"
),
)
parser.add_argument(
"--verbose",
Expand Down Expand Up @@ -628,6 +632,7 @@ def main():
)
if args.judge:
grade_kwargs["judge_model"] = args.judge
grade_kwargs["judge_backend"] = "api"
grade = grade_task(**grade_kwargs)
except Exception as exc:
if execution_error:
Expand Down
178 changes: 176 additions & 2 deletions scripts/lib_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,14 +366,12 @@ def prepare_task_workspace(skill_dir: Path, run_id: str, task: Task, agent_id: s

_BOOTSTRAP_FILES = ["SOUL.md", "BOOTSTRAP.md", "USER.md", "IDENTITY.md", "HEARTBEAT.md", "TOOLS.md"]

def _remove_readonly(func, path, _):
def _remove_readonly(func, path, _):
try:
os.chmod(path, stat.S_IWRITE)
func(path)
except OSError:
pass
func(path)

saved_bootstrap: dict[str, bytes] = {}
if workspace.exists():
Expand Down Expand Up @@ -972,3 +970,179 @@ def run_openclaw_prompt(
"stdout": stdout,
"stderr": stderr,
}


# System message shared by every judge backend. It pins the model to emitting a
# bare JSON object (no prose/fences) — presumably the caller parses the reply
# as JSON; verify against grade_task.
_JUDGE_SYSTEM_MSG = (
    "You are a strict grading function. "
    "Respond with ONLY a JSON object, no prose, no markdown fences, no extra text."
)


def call_judge_api(
    *,
    prompt: str,
    model: str,
    timeout_seconds: float = 120.0,
) -> Dict[str, Any]:
    """Grade via a direct model API call, skipping the OpenClaw agent layer.

    The backend is selected from the *model* identifier, checked in order:
      * ``claude`` or ``claude:<name>`` -> headless Claude CLI (``claude -p``)
      * ``anthropic/<name>``            -> Anthropic Messages API
      * ``openai/<name>``               -> OpenAI chat completions API
      * anything else                   -> OpenRouter (``openrouter/`` prefix optional)

    Returns {"status": str, "text": str, "error"?: str}.
    """
    if model == "claude" or model.startswith("claude:"):
        backend = _judge_via_claude_cli
    elif model.startswith("anthropic/"):
        backend = _judge_via_anthropic
    elif model.startswith("openai/"):
        backend = _judge_via_openai
    else:
        # Fallback: OpenRouter handles both "openrouter/<m>" and bare "<provider>/<m>".
        backend = _judge_via_openrouter
    return backend(prompt, model, timeout_seconds)


def _judge_via_openai_compat(
    prompt: str,
    api_model: str,
    endpoint: str,
    api_key: str,
    timeout_seconds: float,
    extra_headers: Optional[Dict[str, str]] = None,
) -> Dict[str, Any]:
    """Shared implementation for OpenAI-compatible chat completions APIs.

    Args:
        prompt: User-role content sent alongside the judge system message.
        api_model: Provider-native model id (routing prefix already stripped).
        endpoint: Full chat-completions URL.
        api_key: Bearer token for the Authorization header.
        timeout_seconds: Socket timeout for the HTTP request.
        extra_headers: Optional provider-specific headers (e.g. OpenRouter attribution).

    Returns:
        {"status": "success"|"error"|"timeout", "text": str, "error"?: str}.
    """
    payload = json.dumps({
        "model": api_model,
        "messages": [
            {"role": "system", "content": _JUDGE_SYSTEM_MSG},
            {"role": "user", "content": prompt},
        ],
        # Deterministic, bounded output: this is a grading call, not generation.
        "temperature": 0.0,
        "max_tokens": 2048,
    }).encode("utf-8")

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    if extra_headers:
        headers.update(extra_headers)

    req = request.Request(endpoint, data=payload, headers=headers, method="POST")
    try:
        with request.urlopen(req, timeout=timeout_seconds) as resp:
            raw = resp.read().decode("utf-8")
    except error.HTTPError as exc:
        body = ""
        try:
            body = exc.read().decode("utf-8", errors="replace")[:500]
        except Exception:
            pass
        logger.error("Judge API error (%s): %s", exc.code, body)
        return {"status": "error", "text": "", "error": f"HTTP {exc.code}: {body}"}
    except error.URLError as exc:
        # urllib wraps connect-phase timeouts in URLError; classify them the same
        # as read-phase TimeoutError so callers see a consistent "timeout" status.
        if isinstance(exc.reason, TimeoutError):
            return {"status": "timeout", "text": "", "error": "Request timed out"}
        logger.error("Judge network error: %s", exc)
        return {"status": "error", "text": "", "error": str(exc)}
    except TimeoutError:
        return {"status": "timeout", "text": "", "error": "Request timed out"}

    try:
        data = json.loads(raw)
    except json.JSONDecodeError as exc:
        # A 200 response with a non-JSON body must surface as an error dict,
        # not crash the grading run with an uncaught exception.
        logger.error("Judge API returned non-JSON body: %s", raw[:200])
        return {"status": "error", "text": "", "error": f"Invalid JSON response: {exc}"}

    choices = data.get("choices", [])
    if not choices:
        return {"status": "error", "text": "", "error": "No choices in response"}
    # Some providers send "message": null or "content": null; normalize to "".
    text = (choices[0].get("message") or {}).get("content") or ""
    return {"status": "success", "text": text}


def _judge_via_openrouter(prompt: str, model: str, timeout_seconds: float) -> Dict[str, Any]:
api_key = os.environ.get("OPENROUTER_API_KEY")
if not api_key:
return {"status": "error", "text": "", "error": "OPENROUTER_API_KEY not set"}
bare_model = model.removeprefix("openrouter/")
return _judge_via_openai_compat(
prompt, bare_model,
"https://openrouter.ai/api/v1/chat/completions",
api_key, timeout_seconds,
extra_headers={"HTTP-Referer": "https://pinchbench.com", "X-Title": "PinchBench-Judge"},
)


def _judge_via_openai(prompt: str, model: str, timeout_seconds: float) -> Dict[str, Any]:
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
return {"status": "error", "text": "", "error": "OPENAI_API_KEY not set"}
bare_model = model.removeprefix("openai/")
return _judge_via_openai_compat(
prompt, bare_model,
"https://api.openai.com/v1/chat/completions",
api_key, timeout_seconds,
)


def _judge_via_anthropic(prompt: str, model: str, timeout_seconds: float) -> Dict[str, Any]:
api_key = os.environ.get("ANTHROPIC_API_KEY")
if not api_key:
return {"status": "error", "text": "", "error": "ANTHROPIC_API_KEY not set"}
bare_model = model.removeprefix("anthropic/")
payload = json.dumps({
"model": bare_model,
"max_tokens": 2048,
"temperature": 0.0,
"system": _JUDGE_SYSTEM_MSG,
"messages": [{"role": "user", "content": prompt}],
}).encode("utf-8")
headers = {
"x-api-key": api_key,
"Content-Type": "application/json",
"anthropic-version": "2023-06-01",
}
req = request.Request(
"https://api.anthropic.com/v1/messages",
data=payload, headers=headers, method="POST",
)
try:
with request.urlopen(req, timeout=timeout_seconds) as resp:
data = json.loads(resp.read().decode("utf-8"))
except error.HTTPError as exc:
body = ""
try:
body = exc.read().decode("utf-8", errors="replace")[:500]
except Exception:
pass
logger.error("Anthropic judge API error (%s): %s", exc.code, body)
return {"status": "error", "text": "", "error": f"HTTP {exc.code}: {body}"}
except error.URLError as exc:
logger.error("Anthropic judge network error: %s", exc)
return {"status": "error", "text": "", "error": str(exc)}
except TimeoutError:
return {"status": "timeout", "text": "", "error": "Request timed out"}

content = data.get("content", [])
text = "".join(block.get("text", "") for block in content if block.get("type") == "text")
return {"status": "success", "text": text}


def _judge_via_claude_cli(prompt: str, model: str, timeout_seconds: float) -> Dict[str, Any]:
    """Run the judge prompt through the headless Claude CLI (``claude -p``).

    A ``claude:<model-name>`` identifier forwards ``<model-name>`` via ``--model``;
    a bare ``claude`` uses the CLI's default model.
    """
    args: List[str] = ["claude", "-p"]
    if ":" in model:
        args += ["--model", model.split(":", 1)[1]]
    combined_prompt = f"{_JUDGE_SYSTEM_MSG}\n\n{prompt}"
    try:
        proc = subprocess.run(
            args,
            input=combined_prompt,
            capture_output=True,
            text=True,
            timeout=timeout_seconds,
            check=False,
        )
    except subprocess.TimeoutExpired:
        return {"status": "timeout", "text": "", "error": "claude -p timed out"}
    except FileNotFoundError:
        return {"status": "error", "text": "", "error": "claude CLI not found"}
    if proc.returncode == 0:
        return {"status": "success", "text": proc.stdout}
    return {"status": "error", "text": "", "error": f"claude exit {proc.returncode}: {proc.stderr[:300]}"}
Loading