From 87bb258215f24b5bb8066ca25f4a621a1424c0d2 Mon Sep 17 00:00:00 2001 From: OpenClaw Agent Date: Sat, 7 Mar 2026 09:04:04 -0800 Subject: [PATCH 1/3] Add thinking levels support for benchmarking reasoning depth - Add --thinking CLI argument to specify comma-separated thinking levels - Pass thinking level to OpenClaw agent via --thinking flag - Run each task across all specified thinking levels - Include thinking_level in task results - Add thinking_aggregates section with per-level statistics - Support levels: off, minimal, low, medium, high - Update SKILL.md and README.md with documentation Closes #9 --- README.md | 17 ++++ SKILL.md | 21 +++++ scripts/benchmark.py | 213 +++++++++++++++++++++++++++++-------------- scripts/lib_agent.py | 55 +++++++---- 4 files changed, 220 insertions(+), 86 deletions(-) diff --git a/README.md b/README.md index 4009eb5..acd6afc 100644 --- a/README.md +++ b/README.md @@ -78,11 +78,28 @@ Skip uploading with `--no-upload` if you just want local results. | `--suite SUITE` | `all`, `automated-only`, or comma-separated task IDs | | `--runs N` | Number of runs per task for averaging | | `--timeout-multiplier N` | Scale timeouts for slower models | +| `--thinking LEVELS` | Comma-separated thinking levels (e.g., `low,medium,high`) | | `--output-dir DIR` | Where to save results (default: `results/`) | | `--no-upload` | Skip uploading to leaderboard | | `--register` | Request an API token for submissions | | `--upload FILE` | Upload a previous results JSON | +## Thinking Levels + +Many models support configurable thinking/reasoning levels. 
Test how different reasoning depths affect task performance: + +```bash +# Test multiple thinking levels +./scripts/run.sh --model anthropic/claude-sonnet-4 --thinking low,medium,high + +# Run with a single thinking level +./scripts/run.sh --model anthropic/claude-sonnet-4 --thinking high +``` + +Valid levels: `off`, `minimal`, `low`, `medium`, `high` + +Results include a `thinking_aggregates` section with per-level statistics, and each task result includes the `thinking_level` used. + ## Contributing Tasks We welcome new tasks! Check out [`tasks/TASK_TEMPLATE.md`](tasks/TASK_TEMPLATE.md) for the format. Good tasks are: diff --git a/SKILL.md b/SKILL.md index d6f0284..0cfbd98 100644 --- a/SKILL.md +++ b/SKILL.md @@ -73,6 +73,7 @@ uv run benchmark.py --model anthropic/claude-sonnet-4 --no-upload | `--output-dir` | Results directory (default: `results/`) | | `--timeout-multiplier` | Scale task timeouts for slower models | | `--runs` | Number of runs per task for averaging | +| `--thinking` | Comma-separated thinking levels (e.g., `low,medium,high`) | | `--no-upload` | Skip uploading to leaderboard | | `--register` | Request new API token for submissions | | `--upload FILE` | Upload previous results JSON | @@ -89,6 +90,26 @@ uv run benchmark.py --register uv run benchmark.py --model anthropic/claude-sonnet-4 ``` +## Thinking Levels + +Many models support different thinking/reasoning levels (e.g., Claude's extended thinking). 
PinchBench can run tasks across multiple thinking levels to measure how reasoning depth affects performance: + +```bash +# Run with multiple thinking levels +uv run benchmark.py --model anthropic/claude-sonnet-4 --thinking low,medium,high + +# Run with a single thinking level +uv run benchmark.py --model anthropic/claude-sonnet-4 --thinking high +``` + +Valid thinking levels: `off`, `minimal`, `low`, `medium`, `high` + +Results include per-level aggregates: +- `thinking_aggregates`: Summary statistics for each thinking level +- Per-task results include `thinking_level` field + +**Note:** Thinking levels are passed directly to OpenClaw's `--thinking` flag. Not all models support all levels. + ## Results Results are saved as JSON in the output directory: diff --git a/scripts/benchmark.py b/scripts/benchmark.py index 10fe48a..6dce252 100644 --- a/scripts/benchmark.py +++ b/scripts/benchmark.py @@ -28,6 +28,7 @@ ensure_agent_exists, execute_openclaw_task, slugify_model, + VALID_THINKING_LEVELS, ) from lib_grading import GradeResult, grade_task from lib_tasks import Task, TaskLoader @@ -212,6 +213,14 @@ def _parse_args() -> argparse.Namespace: default=1, help="Number of runs per task for averaging", ) + parser.add_argument( + "--thinking", + type=str, + default=None, + help="Comma-separated thinking levels to test (e.g., 'low,medium,high'). " + f"Valid levels: {', '.join(VALID_THINKING_LEVELS)}. " + "If not specified, runs without explicit thinking level.", + ) return parser.parse_args() @@ -223,6 +232,31 @@ def _select_task_ids(tasks: List[Task], suite: str) -> Optional[List[str]]: return [task_id.strip() for task_id in suite.split(",") if task_id.strip()] +def _parse_thinking_levels(thinking_arg: Optional[str]) -> List[Optional[str]]: + """ + Parse thinking levels from the argument. + + Returns a list of thinking levels to test. + Each element is either a valid thinking level string or None (no explicit level). 
+ """ + if thinking_arg is None: + return [None] # Run once without explicit thinking level + + levels = [] + for level in thinking_arg.split(","): + level = level.strip().lower() + if level in VALID_THINKING_LEVELS: + levels.append(level) + else: + logger.warning( + "Invalid thinking level '%s', skipping. Valid levels: %s", + level, + ", ".join(VALID_THINKING_LEVELS), + ) + + return levels if levels else [None] + + def _next_run_id(run_root: Path) -> str: run_root.mkdir(parents=True, exist_ok=True) existing = [] @@ -357,8 +391,9 @@ def main(): cleanup_agent_sessions(agent_id) task_ids = _select_task_ids(runner.tasks, args.suite) + thinking_levels = _parse_thinking_levels(args.thinking) results = [] - grades_by_task_id = {} + grades_by_task_and_thinking: Dict[str, Dict[str, Any]] = {} tasks_to_run = runner.tasks if task_ids is not None: @@ -366,72 +401,112 @@ def main(): tasks_by_id = {task.task_id: task for task in tasks_to_run} runs_per_task = max(1, args.runs) - for i, task in enumerate(tasks_to_run, 1): - task_grades = [] - for run_index in range(runs_per_task): - logger.info("\n%s", "=" * 80) - logger.info( - "📋 Task %s/%s (Run %s/%s)", - i, - len(tasks_to_run), - run_index + 1, - runs_per_task, - ) - logger.info("%s", "=" * 80) - execution_error = None - try: - result = execute_openclaw_task( - task=task, - agent_id=agent_id, - model_id=args.model, - run_id=f"{run_id}-{run_index + 1}", - timeout_multiplier=args.timeout_multiplier, - skill_dir=skill_dir, - ) - except Exception as exc: - execution_error = str(exc) - logger.warning( - "Task execution failed for %s, continuing: %s", task.task_id, exc - ) - result = { - "agent_id": agent_id, - "task_id": task.task_id, - "status": "error", - "transcript": [], - "usage": {}, - "workspace": "", - "exit_code": -1, - "timed_out": False, - "execution_time": 0.0, - "stdout": "", - "stderr": execution_error, - } - try: - grade = grade_task(task=task, execution_result=result, skill_dir=skill_dir) - except Exception as exc: 
- if execution_error: - note = f"Execution failed: {execution_error}; Grading failed: {exc}" - else: - note = f"Grading failed: {exc}" - logger.warning("Task grading failed for %s, continuing: %s", task.task_id, exc) - grade = GradeResult( - task_id=task.task_id, - score=0.0, - max_score=1.0, - grading_type=task.grading_type, - breakdown={}, - notes=note, + total_runs = len(tasks_to_run) * runs_per_task * len(thinking_levels) + run_counter = 0 + + for thinking_level in thinking_levels: + thinking_label = thinking_level or "default" + logger.info("\n%s", "=" * 80) + logger.info("🧠 Thinking Level: %s", thinking_label) + logger.info("%s", "=" * 80) + + for i, task in enumerate(tasks_to_run, 1): + task_key = f"{task.task_id}:{thinking_label}" if thinking_level else task.task_id + task_grades = [] + + for run_index in range(runs_per_task): + run_counter += 1 + logger.info("\n%s", "-" * 80) + logger.info( + "📋 Task %s/%s (Run %s/%s) [%s] — Overall progress: %s/%s", + i, + len(tasks_to_run), + run_index + 1, + runs_per_task, + thinking_label, + run_counter, + total_runs, ) - task_grades.append(grade) - results.append(result) + logger.info("%s", "-" * 80) + execution_error = None + try: + result = execute_openclaw_task( + task=task, + agent_id=agent_id, + model_id=args.model, + run_id=f"{run_id}-{run_index + 1}", + timeout_multiplier=args.timeout_multiplier, + skill_dir=skill_dir, + thinking_level=thinking_level, + ) + except Exception as exc: + execution_error = str(exc) + logger.warning( + "Task execution failed for %s, continuing: %s", task.task_id, exc + ) + result = { + "agent_id": agent_id, + "task_id": task.task_id, + "thinking_level": thinking_level, + "status": "error", + "transcript": [], + "usage": {}, + "workspace": "", + "exit_code": -1, + "timed_out": False, + "execution_time": 0.0, + "stdout": "", + "stderr": execution_error, + } + try: + grade = grade_task(task=task, execution_result=result, skill_dir=skill_dir) + except Exception as exc: + if 
execution_error: + note = f"Execution failed: {execution_error}; Grading failed: {exc}" + else: + note = f"Grading failed: {exc}" + logger.warning("Task grading failed for %s, continuing: %s", task.task_id, exc) + grade = GradeResult( + task_id=task.task_id, + score=0.0, + max_score=1.0, + grading_type=task.grading_type, + breakdown={}, + notes=note, + ) + task_grades.append(grade) + result["thinking_level"] = thinking_level + results.append(result) + + task_scores = [grade.score for grade in task_grades] + grades_by_task_and_thinking[task_key] = { + "task_id": task.task_id, + "thinking_level": thinking_level, + "runs": [grade.to_dict() for grade in task_grades], + "mean": statistics.mean(task_scores), + "std": statistics.stdev(task_scores) if len(task_scores) > 1 else 0.0, + "min": min(task_scores), + "max": max(task_scores), + } - task_scores = [grade.score for grade in task_grades] - grades_by_task_id[task.task_id] = { - "runs": [grade.to_dict() for grade in task_grades], - "mean": statistics.mean(task_scores), - "std": statistics.stdev(task_scores) if len(task_scores) > 1 else 0.0, - "min": min(task_scores), - "max": max(task_scores), + # Compute per-thinking-level aggregates + thinking_aggregates: Dict[str, Dict[str, Any]] = {} + for thinking_level in thinking_levels: + thinking_label = thinking_level or "default" + level_keys = [ + k for k, v in grades_by_task_and_thinking.items() + if v.get("thinking_level") == thinking_level + ] + if not level_keys: + continue + scores = [grades_by_task_and_thinking[k]["mean"] for k in level_keys] + thinking_aggregates[thinking_label] = { + "thinking_level": thinking_label, + "task_count": len(scores), + "mean_score": statistics.mean(scores) if scores else 0.0, + "std_score": statistics.stdev(scores) if len(scores) > 1 else 0.0, + "min_score": min(scores) if scores else 0.0, + "max_score": max(scores) if scores else 0.0, } output_dir = Path(args.output_dir) @@ -443,16 +518,22 @@ def main(): "timestamp": time.time(), 
"suite": args.suite, "runs_per_task": runs_per_task, + "thinking_levels": [tl or "default" for tl in thinking_levels], + "thinking_aggregates": thinking_aggregates, "tasks": [ { "task_id": result["task_id"], + "thinking_level": result.get("thinking_level"), "status": result["status"], "timed_out": result["timed_out"], "execution_time": result["execution_time"], "transcript_length": len(result["transcript"]), "usage": result.get("usage", {}), "workspace": result["workspace"], - "grading": grades_by_task_id[result["task_id"]], + "grading": grades_by_task_and_thinking.get( + f"{result['task_id']}:{result.get('thinking_level') or 'default'}", + grades_by_task_and_thinking.get(result["task_id"], {}), + ), "frontmatter": tasks_by_id[result["task_id"]].frontmatter, } for result in results diff --git a/scripts/lib_agent.py b/scripts/lib_agent.py index 46e3355..bfd1f63 100644 --- a/scripts/lib_agent.py +++ b/scripts/lib_agent.py @@ -386,6 +386,10 @@ def _extract_usage_from_transcript(transcript: List[Dict[str, Any]]) -> Dict[str return totals +# Valid thinking levels for OpenClaw agents +VALID_THINKING_LEVELS = ("off", "minimal", "low", "medium", "high") + + def execute_openclaw_task( *, task: Task, @@ -394,10 +398,13 @@ def execute_openclaw_task( run_id: str, timeout_multiplier: float, skill_dir: Path, + thinking_level: str | None = None, ) -> Dict[str, Any]: logger.info("🤖 Agent [%s] starting task: %s", agent_id, task.task_id) logger.info(" Task: %s", task.name) logger.info(" Category: %s", task.category) + if thinking_level: + logger.info(" Thinking: %s", thinking_level) # Clean up previous session transcripts so we can reliably find this task's # transcript (OpenClaw uses its own UUID-based naming, not our session ID). 
@@ -413,17 +420,20 @@ def execute_openclaw_task( timed_out = False try: + cmd = [ + "openclaw", + "agent", + "--agent", + agent_id, + "--session-id", + session_id, + "--message", + task.prompt, + ] + if thinking_level: + cmd.extend(["--thinking", thinking_level]) result = subprocess.run( - [ - "openclaw", - "agent", - "--agent", - agent_id, - "--session-id", - session_id, - "--message", - task.prompt, - ], + cmd, capture_output=True, text=True, cwd=str(workspace), @@ -457,6 +467,7 @@ def execute_openclaw_task( return { "agent_id": agent_id, "task_id": task.task_id, + "thinking_level": thinking_level, "status": status, "transcript": transcript, "usage": usage, @@ -475,6 +486,7 @@ def run_openclaw_prompt( prompt: str, workspace: Path, timeout_seconds: float, + thinking_level: str | None = None, ) -> Dict[str, Any]: """Run a single OpenClaw prompt for helper agents like the judge.""" # Clean up previous session transcripts so we can reliably find this @@ -519,17 +531,20 @@ def run_openclaw_prompt( timed_out = True break try: + cmd = [ + "openclaw", + "agent", + "--agent", + agent_id, + "--session-id", + session_id, + "--message", + chunk, + ] + if thinking_level: + cmd.extend(["--thinking", thinking_level]) result = subprocess.run( - [ - "openclaw", - "agent", - "--agent", - agent_id, - "--session-id", - session_id, - "--message", - chunk, - ], + cmd, capture_output=True, text=True, cwd=str(workspace), From 268dd538ebc288e9dd5db920a4f17f8004b063a3 Mon Sep 17 00:00:00 2001 From: OpenClaw Agent Date: Sat, 7 Mar 2026 10:00:49 -0800 Subject: [PATCH 2/3] Fix thinking levels: use off/minimal/low/medium/high/xhigh/adaptive - Add xhigh and adaptive to valid thinking levels (matching OpenClaw) - Add model-aware xhigh validation (only GPT-5.x models support it) - Validate thinking levels before passing to OpenClaw subprocess - Document model-specific restrictions in help text and docs - Follow existing code style (Optional[str] instead of str | None) - No unnecessary changes to 
existing code --- README.md | 7 ++++- SKILL.md | 9 ++++-- scripts/benchmark.py | 33 +++++++++++--------- scripts/lib_agent.py | 74 +++++++++++++++++++++++++++++++++++++++----- 4 files changed, 97 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index acd6afc..7248c44 100644 --- a/README.md +++ b/README.md @@ -96,7 +96,12 @@ Many models support configurable thinking/reasoning levels. Test how different r ./scripts/run.sh --model anthropic/claude-sonnet-4 --thinking high ``` -Valid levels: `off`, `minimal`, `low`, `medium`, `high` +Valid levels: `off`, `minimal`, `low`, `medium`, `high`, `xhigh`, `adaptive` + +**Model-specific notes:** +- `xhigh` is only available for GPT-5.x models (gpt-5.4, gpt-5.2, codex variants) +- `adaptive` is provider-managed reasoning budget (Anthropic Claude 4.6 family) +- Invalid levels for your model are warned and skipped Results include a `thinking_aggregates` section with per-level statistics, and each task result includes the `thinking_level` used. diff --git a/SKILL.md b/SKILL.md index 0cfbd98..a1fcba9 100644 --- a/SKILL.md +++ b/SKILL.md @@ -102,14 +102,17 @@ uv run benchmark.py --model anthropic/claude-sonnet-4 --thinking low,medium,high uv run benchmark.py --model anthropic/claude-sonnet-4 --thinking high ``` -Valid thinking levels: `off`, `minimal`, `low`, `medium`, `high` +Valid thinking levels: `off`, `minimal`, `low`, `medium`, `high`, `xhigh`, `adaptive` + +**Model-specific notes:** +- `xhigh` is only supported by GPT-5.x models (gpt-5.4, gpt-5.2, codex variants) +- `adaptive` is provider-managed reasoning (Anthropic Claude 4.6 family) +- Invalid levels for your model are warned and skipped Results include per-level aggregates: - `thinking_aggregates`: Summary statistics for each thinking level - Per-task results include `thinking_level` field -**Note:** Thinking levels are passed directly to OpenClaw's `--thinking` flag. Not all models support all levels. 
- ## Results Results are saved as JSON in the output directory: diff --git a/scripts/benchmark.py b/scripts/benchmark.py index 6dce252..b2dacf2 100644 --- a/scripts/benchmark.py +++ b/scripts/benchmark.py @@ -28,7 +28,8 @@ ensure_agent_exists, execute_openclaw_task, slugify_model, - VALID_THINKING_LEVELS, + THINKING_LEVELS, + validate_thinking_level, ) from lib_grading import GradeResult, grade_task from lib_tasks import Task, TaskLoader @@ -218,7 +219,8 @@ def _parse_args() -> argparse.Namespace: type=str, default=None, help="Comma-separated thinking levels to test (e.g., 'low,medium,high'). " - f"Valid levels: {', '.join(VALID_THINKING_LEVELS)}. " + f"Valid levels: {', '.join(THINKING_LEVELS)}. " + "Note: 'xhigh' is only supported by GPT-5.x models. " "If not specified, runs without explicit thinking level.", ) return parser.parse_args() @@ -232,27 +234,28 @@ def _select_task_ids(tasks: List[Task], suite: str) -> Optional[List[str]]: return [task_id.strip() for task_id in suite.split(",") if task_id.strip()] -def _parse_thinking_levels(thinking_arg: Optional[str]) -> List[Optional[str]]: +def _parse_thinking_levels( + thinking_arg: Optional[str], + model_id: Optional[str] = None, +) -> List[Optional[str]]: """ Parse thinking levels from the argument. - Returns a list of thinking levels to test. - Each element is either a valid thinking level string or None (no explicit level). + Args: + thinking_arg: Comma-separated thinking levels or None + model_id: Optional model ID to check xhigh compatibility + + Returns: + List of validated thinking levels (or [None] if no explicit level). """ if thinking_arg is None: return [None] # Run once without explicit thinking level levels = [] for level in thinking_arg.split(","): - level = level.strip().lower() - if level in VALID_THINKING_LEVELS: - levels.append(level) - else: - logger.warning( - "Invalid thinking level '%s', skipping. 
Valid levels: %s", - level, - ", ".join(VALID_THINKING_LEVELS), - ) + validated = validate_thinking_level(level.strip(), model_id) + if validated: + levels.append(validated) return levels if levels else [None] @@ -391,7 +394,7 @@ def main(): cleanup_agent_sessions(agent_id) task_ids = _select_task_ids(runner.tasks, args.suite) - thinking_levels = _parse_thinking_levels(args.thinking) + thinking_levels = _parse_thinking_levels(args.thinking, args.model) results = [] grades_by_task_and_thinking: Dict[str, Dict[str, Any]] = {} diff --git a/scripts/lib_agent.py b/scripts/lib_agent.py index bfd1f63..5ec99fe 100644 --- a/scripts/lib_agent.py +++ b/scripts/lib_agent.py @@ -9,7 +9,7 @@ import subprocess import time from pathlib import Path -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional from lib_tasks import Task @@ -17,6 +17,28 @@ logger = logging.getLogger(__name__) MAX_OPENCLAW_MESSAGE_CHARS = 4000 +# Thinking levels supported by OpenClaw +# See: https://docs.openclaw.ai/tools/thinking +THINKING_LEVELS = ("off", "minimal", "low", "medium", "high", "xhigh", "adaptive") + +# Models that support xhigh thinking level (high reasoning budget) +# Sourced from OpenClaw src/auto-reply/thinking.ts XHIGH_MODEL_REFS +XHIGH_MODELS = { + # OpenAI + "openai/gpt-5.4", + "openai/gpt-5.4-pro", + "openai/gpt-5.2", + # OpenAI Codex + "openai-codex/gpt-5.4", + "openai-codex/gpt-5.3-codex", + "openai-codex/gpt-5.3-codex-spark", + "openai-codex/gpt-5.2-codex", + "openai-codex/gpt-5.1-codex", + # GitHub Copilot + "github-copilot/gpt-5.2-codex", + "github-copilot/gpt-5.2", +} + def slugify_model(model_id: str) -> str: return model_id.replace("/", "-").replace(".", "-") @@ -31,6 +53,48 @@ def normalize_model_id(model_id: str) -> str: return f"openrouter/{model_id}" +def supports_xhigh_thinking(model_id: str) -> bool: + """Check if a model supports xhigh thinking level.""" + normalized = normalize_model_id(model_id).lower() + model_lower = model_id.lower() + # 
Check full provider/model form + if normalized in {m.lower() for m in XHIGH_MODELS}: + return True + # Check just model ID (without provider) + model_only = model_lower.split("/")[-1] if "/" in model_lower else model_lower + return model_only in {m.split("/")[-1].lower() for m in XHIGH_MODELS} + + +def validate_thinking_level(level: str, model_id: Optional[str] = None) -> Optional[str]: + """ + Validate a thinking level and check model compatibility. + + Args: + level: The thinking level to validate + model_id: Optional model ID to check xhigh compatibility + + Returns: + The validated level, or None if invalid + """ + level_lower = level.lower().strip() + if level_lower not in THINKING_LEVELS: + logger.warning( + "Invalid thinking level '%s'. Valid levels: %s", + level, + ", ".join(THINKING_LEVELS), + ) + return None + if level_lower == "xhigh" and model_id and not supports_xhigh_thinking(model_id): + logger.warning( + "Thinking level 'xhigh' not supported by model '%s'. " + "xhigh is only available for: %s", + model_id, + ", ".join(sorted(set(m.split("/")[1] for m in XHIGH_MODELS))), + ) + return None + return level_lower + + def _get_agent_workspace(agent_id: str) -> Path | None: """Get the workspace path for an agent from OpenClaw config.""" try: @@ -386,10 +450,6 @@ def _extract_usage_from_transcript(transcript: List[Dict[str, Any]]) -> Dict[str return totals -# Valid thinking levels for OpenClaw agents -VALID_THINKING_LEVELS = ("off", "minimal", "low", "medium", "high") - - def execute_openclaw_task( *, task: Task, @@ -398,7 +458,7 @@ def execute_openclaw_task( run_id: str, timeout_multiplier: float, skill_dir: Path, - thinking_level: str | None = None, + thinking_level: Optional[str] = None, ) -> Dict[str, Any]: logger.info("🤖 Agent [%s] starting task: %s", agent_id, task.task_id) logger.info(" Task: %s", task.name) @@ -486,7 +546,7 @@ def run_openclaw_prompt( prompt: str, workspace: Path, timeout_seconds: float, - thinking_level: str | None = None, + 
thinking_level: Optional[str] = None, ) -> Dict[str, Any]: """Run a single OpenClaw prompt for helper agents like the judge.""" # Clean up previous session transcripts so we can reliably find this From cbc34329ad48f990aef0c28ab3445f749c049f39 Mon Sep 17 00:00:00 2001 From: OpenClaw Agent Date: Sat, 7 Mar 2026 10:25:58 -0800 Subject: [PATCH 3/3] Harden thinking-level validation and model compatibility checks - Add strict xhigh model matching (provider-aware) - Add adaptive support detection (Anthropic Claude 4.6 family) - Deduplicate requested thinking levels while preserving order - Fail fast when --thinking is provided but no valid levels remain - Keep subprocess input constrained to validated levels --- scripts/benchmark.py | 27 ++++++++++++++----- scripts/lib_agent.py | 63 ++++++++++++++++++++++++++++++++++++++------ 2 files changed, 76 insertions(+), 14 deletions(-) diff --git a/scripts/benchmark.py b/scripts/benchmark.py index b2dacf2..9839671 100644 --- a/scripts/benchmark.py +++ b/scripts/benchmark.py @@ -220,7 +220,7 @@ def _parse_args() -> argparse.Namespace: default=None, help="Comma-separated thinking levels to test (e.g., 'low,medium,high'). " f"Valid levels: {', '.join(THINKING_LEVELS)}. " - "Note: 'xhigh' is only supported by GPT-5.x models. " + "Note: 'xhigh' requires GPT-5.x models; 'adaptive' is for Anthropic Claude 4.6. " "If not specified, runs without explicit thinking level.", ) return parser.parse_args() @@ -243,21 +243,32 @@ def _parse_thinking_levels( Args: thinking_arg: Comma-separated thinking levels or None - model_id: Optional model ID to check xhigh compatibility + model_id: Optional model ID to check level compatibility Returns: List of validated thinking levels (or [None] if no explicit level). + + Raises: + ValueError: If --thinking was provided but no levels are valid for the model. 
""" if thinking_arg is None: return [None] # Run once without explicit thinking level - levels = [] + levels: List[str] = [] + seen = set() for level in thinking_arg.split(","): validated = validate_thinking_level(level.strip(), model_id) - if validated: + if validated and validated not in seen: levels.append(validated) + seen.add(validated) + + if not levels: + raise ValueError( + "No valid thinking levels remain after validation. " + "Check your --thinking values for this model." + ) - return levels if levels else [None] + return levels def _next_run_id(run_root: Path) -> str: @@ -394,7 +405,11 @@ def main(): cleanup_agent_sessions(agent_id) task_ids = _select_task_ids(runner.tasks, args.suite) - thinking_levels = _parse_thinking_levels(args.thinking, args.model) + try: + thinking_levels = _parse_thinking_levels(args.thinking, args.model) + except ValueError as exc: + logger.error(str(exc)) + sys.exit(2) results = [] grades_by_task_and_thinking: Dict[str, Dict[str, Any]] = {} diff --git a/scripts/lib_agent.py b/scripts/lib_agent.py index 5ec99fe..b73c8e5 100644 --- a/scripts/lib_agent.py +++ b/scripts/lib_agent.py @@ -39,6 +39,18 @@ "github-copilot/gpt-5.2", } +XHIGH_MODELS_LOWER = {model.lower() for model in XHIGH_MODELS} +XHIGH_MODEL_IDS_LOWER = {model.split("/")[-1].lower() for model in XHIGH_MODELS} + +# Adaptive thinking is currently provider-managed for Anthropic Claude 4.6 models. 
+ADAPTIVE_PROVIDER = "anthropic" +ADAPTIVE_MODEL_PREFIXES = ( + "claude-opus-4-6", + "claude-opus-4.6", + "claude-sonnet-4-6", + "claude-sonnet-4.6", +) + def slugify_model(model_id: str) -> str: return model_id.replace("/", "-").replace(".", "-") @@ -56,13 +68,42 @@ def normalize_model_id(model_id: str) -> str: def supports_xhigh_thinking(model_id: str) -> bool: """Check if a model supports xhigh thinking level.""" normalized = normalize_model_id(model_id).lower() - model_lower = model_id.lower() - # Check full provider/model form - if normalized in {m.lower() for m in XHIGH_MODELS}: + if normalized in XHIGH_MODELS_LOWER: return True - # Check just model ID (without provider) - model_only = model_lower.split("/")[-1] if "/" in model_lower else model_lower - return model_only in {m.split("/")[-1].lower() for m in XHIGH_MODELS} + + # Handle openrouter/provider/model format. + parts = normalized.split("/") + if len(parts) == 3 and parts[0] == "openrouter": + provider_model = f"{parts[1]}/{parts[2]}" + if provider_model in XHIGH_MODELS_LOWER: + return True + + # Only allow bare model-id fallback when the caller did not provide a provider. 
+ if "/" not in model_id: + return model_id.lower() in XHIGH_MODEL_IDS_LOWER + + return False + + +def supports_adaptive_thinking(model_id: str) -> bool: + """Check if a model natively supports adaptive thinking.""" + normalized = normalize_model_id(model_id).lower() + parts = normalized.split("/") + + # openrouter/provider/model -> provider/model + if len(parts) == 3 and parts[0] == "openrouter": + provider = parts[1] + model = parts[2] + elif len(parts) >= 2: + provider = parts[-2] + model = parts[-1] + else: + return False + + if provider != ADAPTIVE_PROVIDER: + return False + + return any(model.startswith(prefix) for prefix in ADAPTIVE_MODEL_PREFIXES) def validate_thinking_level(level: str, model_id: Optional[str] = None) -> Optional[str]: @@ -87,9 +128,15 @@ def validate_thinking_level(level: str, model_id: Optional[str] = None) -> Optio if level_lower == "xhigh" and model_id and not supports_xhigh_thinking(model_id): logger.warning( "Thinking level 'xhigh' not supported by model '%s'. " - "xhigh is only available for: %s", + "xhigh is only available for GPT-5.x model families.", + model_id, + ) + return None + if level_lower == "adaptive" and model_id and not supports_adaptive_thinking(model_id): + logger.warning( + "Thinking level 'adaptive' is not natively supported by model '%s'. " + "adaptive is currently intended for Anthropic Claude 4.6 models.", model_id, - ", ".join(sorted(set(m.split("/")[1] for m in XHIGH_MODELS))), ) return None return level_lower