From 87bb258215f24b5bb8066ca25f4a621a1424c0d2 Mon Sep 17 00:00:00 2001
From: OpenClaw Agent <agent@openclaw.local>
Date: Sat, 7 Mar 2026 09:04:04 -0800
Subject: [PATCH 1/3] Add thinking levels support for benchmarking reasoning
 depth

- Add --thinking CLI argument to specify comma-separated thinking levels
- Pass thinking level to OpenClaw agent via --thinking flag
- Run each task across all specified thinking levels
- Include thinking_level in task results
- Add thinking_aggregates section with per-level statistics
- Support levels: off, minimal, low, medium, high
- Update SKILL.md and README.md with documentation

Closes #9
---
 README.md            |  17 ++++
 SKILL.md             |  21 +++++
 scripts/benchmark.py | 213 +++++++++++++++++++++++++++++--------------
 scripts/lib_agent.py |  55 +++++++----
 4 files changed, 220 insertions(+), 86 deletions(-)

diff --git a/README.md b/README.md
index 4009eb5..acd6afc 100644
--- a/README.md
+++ b/README.md
@@ -78,11 +78,28 @@ Skip uploading with `--no-upload` if you just want local results.
 | `--suite SUITE` | `all`, `automated-only`, or comma-separated task IDs |
 | `--runs N` | Number of runs per task for averaging |
 | `--timeout-multiplier N` | Scale timeouts for slower models |
+| `--thinking LEVELS` | Comma-separated thinking levels (e.g., `low,medium,high`) |
 | `--output-dir DIR` | Where to save results (default: `results/`) |
 | `--no-upload` | Skip uploading to leaderboard |
 | `--register` | Request an API token for submissions |
 | `--upload FILE` | Upload a previous results JSON |
 
+## Thinking Levels
+
+Many models support configurable thinking/reasoning levels. Test how different reasoning depths affect task performance:
+
+```bash
+# Test multiple thinking levels
+./scripts/run.sh --model anthropic/claude-sonnet-4 --thinking low,medium,high
+
+# Run with a single thinking level
+./scripts/run.sh --model anthropic/claude-sonnet-4 --thinking high
+```
+
+Valid levels: `off`, `minimal`, `low`, `medium`, `high`
+
+Results include a `thinking_aggregates` section with per-level statistics, and each task result includes the `thinking_level` used.
+
 ## Contributing Tasks
 
 We welcome new tasks! Check out [`tasks/TASK_TEMPLATE.md`](tasks/TASK_TEMPLATE.md) for the format. Good tasks are:
diff --git a/SKILL.md b/SKILL.md
index d6f0284..0cfbd98 100644
--- a/SKILL.md
+++ b/SKILL.md
@@ -73,6 +73,7 @@ uv run benchmark.py --model anthropic/claude-sonnet-4 --no-upload
 | `--output-dir` | Results directory (default: `results/`) |
 | `--timeout-multiplier` | Scale task timeouts for slower models |
 | `--runs` | Number of runs per task for averaging |
+| `--thinking` | Comma-separated thinking levels (e.g., `low,medium,high`) |
 | `--no-upload` | Skip uploading to leaderboard |
 | `--register` | Request new API token for submissions |
 | `--upload FILE` | Upload previous results JSON |
@@ -89,6 +90,26 @@ uv run benchmark.py --register
 uv run benchmark.py --model anthropic/claude-sonnet-4
 ```
 
+## Thinking Levels
+
+Many models support different thinking/reasoning levels (e.g., Claude's extended thinking). PinchBench can run tasks across multiple thinking levels to measure how reasoning depth affects performance:
+
+```bash
+# Run with multiple thinking levels
+uv run benchmark.py --model anthropic/claude-sonnet-4 --thinking low,medium,high
+
+# Run with a single thinking level
+uv run benchmark.py --model anthropic/claude-sonnet-4 --thinking high
+```
+
+Valid thinking levels: `off`, `minimal`, `low`, `medium`, `high`
+
+Results include per-level aggregates:
+- `thinking_aggregates`: Summary statistics for each thinking level
+- Per-task results include `thinking_level` field
+
+**Note:** Thinking levels are passed directly to OpenClaw's `--thinking` flag. Not all models support all levels.
+
 ## Results
 
 Results are saved as JSON in the output directory:
diff --git a/scripts/benchmark.py b/scripts/benchmark.py
index 10fe48a..6dce252 100644
--- a/scripts/benchmark.py
+++ b/scripts/benchmark.py
@@ -28,6 +28,7 @@
     ensure_agent_exists,
     execute_openclaw_task,
     slugify_model,
+    VALID_THINKING_LEVELS,
 )
 from lib_grading import GradeResult, grade_task
 from lib_tasks import Task, TaskLoader
@@ -212,6 +213,14 @@ def _parse_args() -> argparse.Namespace:
         default=1,
         help="Number of runs per task for averaging",
     )
+    parser.add_argument(
+        "--thinking",
+        type=str,
+        default=None,
+        help="Comma-separated thinking levels to test (e.g., 'low,medium,high'). "
+        f"Valid levels: {', '.join(VALID_THINKING_LEVELS)}. "
+        "If not specified, runs without explicit thinking level.",
+    )
     return parser.parse_args()
 
 
@@ -223,6 +232,31 @@ def _select_task_ids(tasks: List[Task], suite: str) -> Optional[List[str]]:
     return [task_id.strip() for task_id in suite.split(",") if task_id.strip()]
 
 
+def _parse_thinking_levels(thinking_arg: Optional[str]) -> List[Optional[str]]:
+    """
+    Parse thinking levels from the argument.
+
+    Returns a list of thinking levels to test.
+    Each element is either a valid thinking level string or None (no explicit level).
+    """
+    if thinking_arg is None:
+        return [None]  # Run once without explicit thinking level
+
+    levels = []
+    for level in thinking_arg.split(","):
+        level = level.strip().lower()
+        if level in VALID_THINKING_LEVELS:
+            levels.append(level)
+        else:
+            logger.warning(
+                "Invalid thinking level '%s', skipping. Valid levels: %s",
+                level,
+                ", ".join(VALID_THINKING_LEVELS),
+            )
+
+    return levels if levels else [None]
+
+
 def _next_run_id(run_root: Path) -> str:
     run_root.mkdir(parents=True, exist_ok=True)
     existing = []
@@ -357,8 +391,9 @@ def main():
     cleanup_agent_sessions(agent_id)
 
     task_ids = _select_task_ids(runner.tasks, args.suite)
+    thinking_levels = _parse_thinking_levels(args.thinking)
     results = []
-    grades_by_task_id = {}
+    grades_by_task_and_thinking: Dict[str, Dict[str, Any]] = {}
 
     tasks_to_run = runner.tasks
     if task_ids is not None:
@@ -366,72 +401,112 @@ def main():
     tasks_by_id = {task.task_id: task for task in tasks_to_run}
 
     runs_per_task = max(1, args.runs)
-    for i, task in enumerate(tasks_to_run, 1):
-        task_grades = []
-        for run_index in range(runs_per_task):
-            logger.info("\n%s", "=" * 80)
-            logger.info(
-                "📋 Task %s/%s (Run %s/%s)",
-                i,
-                len(tasks_to_run),
-                run_index + 1,
-                runs_per_task,
-            )
-            logger.info("%s", "=" * 80)
-            execution_error = None
-            try:
-                result = execute_openclaw_task(
-                    task=task,
-                    agent_id=agent_id,
-                    model_id=args.model,
-                    run_id=f"{run_id}-{run_index + 1}",
-                    timeout_multiplier=args.timeout_multiplier,
-                    skill_dir=skill_dir,
-                )
-            except Exception as exc:
-                execution_error = str(exc)
-                logger.warning(
-                    "Task execution failed for %s, continuing: %s", task.task_id, exc
-                )
-                result = {
-                    "agent_id": agent_id,
-                    "task_id": task.task_id,
-                    "status": "error",
-                    "transcript": [],
-                    "usage": {},
-                    "workspace": "",
-                    "exit_code": -1,
-                    "timed_out": False,
-                    "execution_time": 0.0,
-                    "stdout": "",
-                    "stderr": execution_error,
-                }
-            try:
-                grade = grade_task(task=task, execution_result=result, skill_dir=skill_dir)
-            except Exception as exc:
-                if execution_error:
-                    note = f"Execution failed: {execution_error}; Grading failed: {exc}"
-                else:
-                    note = f"Grading failed: {exc}"
-                logger.warning("Task grading failed for %s, continuing: %s", task.task_id, exc)
-                grade = GradeResult(
-                    task_id=task.task_id,
-                    score=0.0,
-                    max_score=1.0,
-                    grading_type=task.grading_type,
-                    breakdown={},
-                    notes=note,
+    total_runs = len(tasks_to_run) * runs_per_task * len(thinking_levels)
+    run_counter = 0
+
+    for thinking_level in thinking_levels:
+        thinking_label = thinking_level or "default"
+        logger.info("\n%s", "=" * 80)
+        logger.info("🧠 Thinking Level: %s", thinking_label)
+        logger.info("%s", "=" * 80)
+
+        for i, task in enumerate(tasks_to_run, 1):
+            task_key = f"{task.task_id}:{thinking_label}" if thinking_level else task.task_id
+            task_grades = []
+
+            for run_index in range(runs_per_task):
+                run_counter += 1
+                logger.info("\n%s", "-" * 80)
+                logger.info(
+                    "📋 Task %s/%s (Run %s/%s) [%s] — Overall progress: %s/%s",
+                    i,
+                    len(tasks_to_run),
+                    run_index + 1,
+                    runs_per_task,
+                    thinking_label,
+                    run_counter,
+                    total_runs,
                 )
-            task_grades.append(grade)
-            results.append(result)
+                logger.info("%s", "-" * 80)
+                execution_error = None
+                try:
+                    result = execute_openclaw_task(
+                        task=task,
+                        agent_id=agent_id,
+                        model_id=args.model,
+                        run_id=f"{run_id}-{run_index + 1}",
+                        timeout_multiplier=args.timeout_multiplier,
+                        skill_dir=skill_dir,
+                        thinking_level=thinking_level,
+                    )
+                except Exception as exc:
+                    execution_error = str(exc)
+                    logger.warning(
+                        "Task execution failed for %s, continuing: %s", task.task_id, exc
+                    )
+                    result = {
+                        "agent_id": agent_id,
+                        "task_id": task.task_id,
+                        "thinking_level": thinking_level,
+                        "status": "error",
+                        "transcript": [],
+                        "usage": {},
+                        "workspace": "",
+                        "exit_code": -1,
+                        "timed_out": False,
+                        "execution_time": 0.0,
+                        "stdout": "",
+                        "stderr": execution_error,
+                    }
+                try:
+                    grade = grade_task(task=task, execution_result=result, skill_dir=skill_dir)
+                except Exception as exc:
+                    if execution_error:
+                        note = f"Execution failed: {execution_error}; Grading failed: {exc}"
+                    else:
+                        note = f"Grading failed: {exc}"
+                    logger.warning("Task grading failed for %s, continuing: %s", task.task_id, exc)
+                    grade = GradeResult(
+                        task_id=task.task_id,
+                        score=0.0,
+                        max_score=1.0,
+                        grading_type=task.grading_type,
+                        breakdown={},
+                        notes=note,
+                    )
+                task_grades.append(grade)
+                result["thinking_level"] = thinking_level
+                results.append(result)
+
+            task_scores = [grade.score for grade in task_grades]
+            grades_by_task_and_thinking[task_key] = {
+                "task_id": task.task_id,
+                "thinking_level": thinking_level,
+                "runs": [grade.to_dict() for grade in task_grades],
+                "mean": statistics.mean(task_scores),
+                "std": statistics.stdev(task_scores) if len(task_scores) > 1 else 0.0,
+                "min": min(task_scores),
+                "max": max(task_scores),
+            }
 
-        task_scores = [grade.score for grade in task_grades]
-        grades_by_task_id[task.task_id] = {
-            "runs": [grade.to_dict() for grade in task_grades],
-            "mean": statistics.mean(task_scores),
-            "std": statistics.stdev(task_scores) if len(task_scores) > 1 else 0.0,
-            "min": min(task_scores),
-            "max": max(task_scores),
+    # Compute per-thinking-level aggregates
+    thinking_aggregates: Dict[str, Dict[str, Any]] = {}
+    for thinking_level in thinking_levels:
+        thinking_label = thinking_level or "default"
+        level_keys = [
+            k for k, v in grades_by_task_and_thinking.items()
+            if v.get("thinking_level") == thinking_level
+        ]
+        if not level_keys:
+            continue
+        scores = [grades_by_task_and_thinking[k]["mean"] for k in level_keys]
+        thinking_aggregates[thinking_label] = {
+            "thinking_level": thinking_label,
+            "task_count": len(scores),
+            "mean_score": statistics.mean(scores) if scores else 0.0,
+            "std_score": statistics.stdev(scores) if len(scores) > 1 else 0.0,
+            "min_score": min(scores) if scores else 0.0,
+            "max_score": max(scores) if scores else 0.0,
         }
 
     output_dir = Path(args.output_dir)
@@ -443,16 +518,22 @@ def main():
         "timestamp": time.time(),
         "suite": args.suite,
         "runs_per_task": runs_per_task,
+        "thinking_levels": [tl or "default" for tl in thinking_levels],
+        "thinking_aggregates": thinking_aggregates,
         "tasks": [
             {
                 "task_id": result["task_id"],
+                "thinking_level": result.get("thinking_level"),
                 "status": result["status"],
                 "timed_out": result["timed_out"],
                 "execution_time": result["execution_time"],
                 "transcript_length": len(result["transcript"]),
                 "usage": result.get("usage", {}),
                 "workspace": result["workspace"],
-                "grading": grades_by_task_id[result["task_id"]],
+                "grading": grades_by_task_and_thinking.get(
+                    f"{result['task_id']}:{result.get('thinking_level') or 'default'}",
+                    grades_by_task_and_thinking.get(result["task_id"], {}),
+                ),
                 "frontmatter": tasks_by_id[result["task_id"]].frontmatter,
             }
             for result in results
diff --git a/scripts/lib_agent.py b/scripts/lib_agent.py
index 46e3355..bfd1f63 100644
--- a/scripts/lib_agent.py
+++ b/scripts/lib_agent.py
@@ -386,6 +386,10 @@ def _extract_usage_from_transcript(transcript: List[Dict[str, Any]]) -> Dict[str
     return totals
 
 
+# Valid thinking levels for OpenClaw agents
+VALID_THINKING_LEVELS = ("off", "minimal", "low", "medium", "high")
+
+
 def execute_openclaw_task(
     *,
     task: Task,
@@ -394,10 +398,13 @@ def execute_openclaw_task(
     run_id: str,
     timeout_multiplier: float,
     skill_dir: Path,
+    thinking_level: str | None = None,
 ) -> Dict[str, Any]:
     logger.info("🤖 Agent [%s] starting task: %s", agent_id, task.task_id)
     logger.info("   Task: %s", task.name)
     logger.info("   Category: %s", task.category)
+    if thinking_level:
+        logger.info("   Thinking: %s", thinking_level)
 
     # Clean up previous session transcripts so we can reliably find this task's
     # transcript (OpenClaw uses its own UUID-based naming, not our session ID).
@@ -413,17 +420,20 @@ def execute_openclaw_task(
     timed_out = False
 
     try:
+        cmd = [
+            "openclaw",
+            "agent",
+            "--agent",
+            agent_id,
+            "--session-id",
+            session_id,
+            "--message",
+            task.prompt,
+        ]
+        if thinking_level:
+            cmd.extend(["--thinking", thinking_level])
         result = subprocess.run(
-            [
-                "openclaw",
-                "agent",
-                "--agent",
-                agent_id,
-                "--session-id",
-                session_id,
-                "--message",
-                task.prompt,
-            ],
+            cmd,
             capture_output=True,
             text=True,
             cwd=str(workspace),
@@ -457,6 +467,7 @@ def execute_openclaw_task(
     return {
         "agent_id": agent_id,
         "task_id": task.task_id,
+        "thinking_level": thinking_level,
         "status": status,
         "transcript": transcript,
         "usage": usage,
@@ -475,6 +486,7 @@ def run_openclaw_prompt(
     prompt: str,
     workspace: Path,
     timeout_seconds: float,
+    thinking_level: str | None = None,
 ) -> Dict[str, Any]:
     """Run a single OpenClaw prompt for helper agents like the judge."""
     # Clean up previous session transcripts so we can reliably find this
@@ -519,17 +531,20 @@ def run_openclaw_prompt(
             timed_out = True
             break
         try:
+            cmd = [
+                "openclaw",
+                "agent",
+                "--agent",
+                agent_id,
+                "--session-id",
+                session_id,
+                "--message",
+                chunk,
+            ]
+            if thinking_level:
+                cmd.extend(["--thinking", thinking_level])
             result = subprocess.run(
-                [
-                    "openclaw",
-                    "agent",
-                    "--agent",
-                    agent_id,
-                    "--session-id",
-                    session_id,
-                    "--message",
-                    chunk,
-                ],
+                cmd,
                 capture_output=True,
                 text=True,
                 cwd=str(workspace),

From 268dd538ebc288e9dd5db920a4f17f8004b063a3 Mon Sep 17 00:00:00 2001
From: OpenClaw Agent <agent@openclaw.local>
Date: Sat, 7 Mar 2026 10:00:49 -0800
Subject: [PATCH 2/3] Fix thinking levels: use
 off/minimal/low/medium/high/xhigh/adaptive

- Add xhigh and adaptive to valid thinking levels (matching OpenClaw)
- Add model-aware xhigh validation (only GPT-5.x models support it)
- Validate thinking levels before passing to OpenClaw subprocess
- Document model-specific restrictions in help text and docs
- Follow existing code style (Optional[str] instead of str | None)
- No unnecessary changes to existing code
---
 README.md            |  7 ++++-
 SKILL.md             |  9 ++++--
 scripts/benchmark.py | 33 +++++++++++---------
 scripts/lib_agent.py | 74 +++++++++++++++++++++++++++++++++++++++-----
 4 files changed, 97 insertions(+), 26 deletions(-)

diff --git a/README.md b/README.md
index acd6afc..7248c44 100644
--- a/README.md
+++ b/README.md
@@ -96,7 +96,12 @@ Many models support configurable thinking/reasoning levels. Test how different r
 ./scripts/run.sh --model anthropic/claude-sonnet-4 --thinking high
 ```
 
-Valid levels: `off`, `minimal`, `low`, `medium`, `high`
+Valid levels: `off`, `minimal`, `low`, `medium`, `high`, `xhigh`, `adaptive`
+
+**Model-specific notes:**
+- `xhigh` is only available for GPT-5.x models (gpt-5.4, gpt-5.2, codex variants)
+- `adaptive` uses a provider-managed reasoning budget (Anthropic Claude 4.6 family)
+- Levels that are not valid for your model are skipped with a warning
 
 Results include a `thinking_aggregates` section with per-level statistics, and each task result includes the `thinking_level` used.
 
diff --git a/SKILL.md b/SKILL.md
index 0cfbd98..a1fcba9 100644
--- a/SKILL.md
+++ b/SKILL.md
@@ -102,14 +102,17 @@ uv run benchmark.py --model anthropic/claude-sonnet-4 --thinking low,medium,high
 uv run benchmark.py --model anthropic/claude-sonnet-4 --thinking high
 ```
 
-Valid thinking levels: `off`, `minimal`, `low`, `medium`, `high`
+Valid thinking levels: `off`, `minimal`, `low`, `medium`, `high`, `xhigh`, `adaptive`
+
+**Model-specific notes:**
+- `xhigh` is only supported by GPT-5.x models (gpt-5.4, gpt-5.2, codex variants)
+- `adaptive` uses provider-managed reasoning (Anthropic Claude 4.6 family)
+- Levels that are not valid for your model are skipped with a warning
 
 Results include per-level aggregates:
 - `thinking_aggregates`: Summary statistics for each thinking level
 - Per-task results include `thinking_level` field
 
-**Note:** Thinking levels are passed directly to OpenClaw's `--thinking` flag. Not all models support all levels.
-
 ## Results
 
 Results are saved as JSON in the output directory:
diff --git a/scripts/benchmark.py b/scripts/benchmark.py
index 6dce252..b2dacf2 100644
--- a/scripts/benchmark.py
+++ b/scripts/benchmark.py
@@ -28,7 +28,8 @@
     ensure_agent_exists,
     execute_openclaw_task,
     slugify_model,
-    VALID_THINKING_LEVELS,
+    THINKING_LEVELS,
+    validate_thinking_level,
 )
 from lib_grading import GradeResult, grade_task
 from lib_tasks import Task, TaskLoader
@@ -218,7 +219,8 @@ def _parse_args() -> argparse.Namespace:
         type=str,
         default=None,
         help="Comma-separated thinking levels to test (e.g., 'low,medium,high'). "
-        f"Valid levels: {', '.join(VALID_THINKING_LEVELS)}. "
+        f"Valid levels: {', '.join(THINKING_LEVELS)}. "
+        "Note: 'xhigh' is only supported by GPT-5.x models. "
         "If not specified, runs without explicit thinking level.",
     )
     return parser.parse_args()
@@ -232,27 +234,28 @@ def _select_task_ids(tasks: List[Task], suite: str) -> Optional[List[str]]:
     return [task_id.strip() for task_id in suite.split(",") if task_id.strip()]
 
 
-def _parse_thinking_levels(thinking_arg: Optional[str]) -> List[Optional[str]]:
+def _parse_thinking_levels(
+    thinking_arg: Optional[str],
+    model_id: Optional[str] = None,
+) -> List[Optional[str]]:
     """
     Parse thinking levels from the argument.
 
-    Returns a list of thinking levels to test.
-    Each element is either a valid thinking level string or None (no explicit level).
+    Args:
+        thinking_arg: Comma-separated thinking levels or None
+        model_id: Optional model ID to check xhigh compatibility
+
+    Returns:
+        List of validated thinking levels (or [None] if no explicit level).
     """
     if thinking_arg is None:
         return [None]  # Run once without explicit thinking level
 
     levels = []
     for level in thinking_arg.split(","):
-        level = level.strip().lower()
-        if level in VALID_THINKING_LEVELS:
-            levels.append(level)
-        else:
-            logger.warning(
-                "Invalid thinking level '%s', skipping. Valid levels: %s",
-                level,
-                ", ".join(VALID_THINKING_LEVELS),
-            )
+        validated = validate_thinking_level(level.strip(), model_id)
+        if validated:
+            levels.append(validated)
 
     return levels if levels else [None]
 
@@ -391,7 +394,7 @@ def main():
     cleanup_agent_sessions(agent_id)
 
     task_ids = _select_task_ids(runner.tasks, args.suite)
-    thinking_levels = _parse_thinking_levels(args.thinking)
+    thinking_levels = _parse_thinking_levels(args.thinking, args.model)
     results = []
     grades_by_task_and_thinking: Dict[str, Dict[str, Any]] = {}
 
diff --git a/scripts/lib_agent.py b/scripts/lib_agent.py
index bfd1f63..5ec99fe 100644
--- a/scripts/lib_agent.py
+++ b/scripts/lib_agent.py
@@ -9,7 +9,7 @@
 import subprocess
 import time
 from pathlib import Path
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional
 
 from lib_tasks import Task
 
@@ -17,6 +17,28 @@
 logger = logging.getLogger(__name__)
 MAX_OPENCLAW_MESSAGE_CHARS = 4000
 
+# Thinking levels supported by OpenClaw
+# See: https://docs.openclaw.ai/tools/thinking
+THINKING_LEVELS = ("off", "minimal", "low", "medium", "high", "xhigh", "adaptive")
+
+# Models that support xhigh thinking level (high reasoning budget)
+# Sourced from OpenClaw src/auto-reply/thinking.ts XHIGH_MODEL_REFS
+XHIGH_MODELS = {
+    # OpenAI
+    "openai/gpt-5.4",
+    "openai/gpt-5.4-pro",
+    "openai/gpt-5.2",
+    # OpenAI Codex
+    "openai-codex/gpt-5.4",
+    "openai-codex/gpt-5.3-codex",
+    "openai-codex/gpt-5.3-codex-spark",
+    "openai-codex/gpt-5.2-codex",
+    "openai-codex/gpt-5.1-codex",
+    # GitHub Copilot
+    "github-copilot/gpt-5.2-codex",
+    "github-copilot/gpt-5.2",
+}
+
 
 def slugify_model(model_id: str) -> str:
     return model_id.replace("/", "-").replace(".", "-")
@@ -31,6 +53,48 @@ def normalize_model_id(model_id: str) -> str:
     return f"openrouter/{model_id}"
 
 
+def supports_xhigh_thinking(model_id: str) -> bool:
+    """Check if a model supports xhigh thinking level."""
+    normalized = normalize_model_id(model_id).lower()
+    model_lower = model_id.lower()
+    # Check full provider/model form
+    if normalized in {m.lower() for m in XHIGH_MODELS}:
+        return True
+    # Check just model ID (without provider)
+    model_only = model_lower.split("/")[-1] if "/" in model_lower else model_lower
+    return model_only in {m.split("/")[-1].lower() for m in XHIGH_MODELS}
+
+
+def validate_thinking_level(level: str, model_id: Optional[str] = None) -> Optional[str]:
+    """
+    Validate a thinking level and check model compatibility.
+
+    Args:
+        level: The thinking level to validate
+        model_id: Optional model ID to check xhigh compatibility
+
+    Returns:
+        The validated level, or None if invalid
+    """
+    level_lower = level.lower().strip()
+    if level_lower not in THINKING_LEVELS:
+        logger.warning(
+            "Invalid thinking level '%s'. Valid levels: %s",
+            level,
+            ", ".join(THINKING_LEVELS),
+        )
+        return None
+    if level_lower == "xhigh" and model_id and not supports_xhigh_thinking(model_id):
+        logger.warning(
+            "Thinking level 'xhigh' not supported by model '%s'. "
+            "xhigh is only available for: %s",
+            model_id,
+            ", ".join(sorted(set(m.split("/")[1] for m in XHIGH_MODELS))),
+        )
+        return None
+    return level_lower
+
+
 def _get_agent_workspace(agent_id: str) -> Path | None:
     """Get the workspace path for an agent from OpenClaw config."""
     try:
@@ -386,10 +450,6 @@ def _extract_usage_from_transcript(transcript: List[Dict[str, Any]]) -> Dict[str
     return totals
 
 
-# Valid thinking levels for OpenClaw agents
-VALID_THINKING_LEVELS = ("off", "minimal", "low", "medium", "high")
-
-
 def execute_openclaw_task(
     *,
     task: Task,
@@ -398,7 +458,7 @@ def execute_openclaw_task(
     run_id: str,
     timeout_multiplier: float,
     skill_dir: Path,
-    thinking_level: str | None = None,
+    thinking_level: Optional[str] = None,
 ) -> Dict[str, Any]:
     logger.info("🤖 Agent [%s] starting task: %s", agent_id, task.task_id)
     logger.info("   Task: %s", task.name)
@@ -486,7 +546,7 @@ def run_openclaw_prompt(
     prompt: str,
     workspace: Path,
     timeout_seconds: float,
-    thinking_level: str | None = None,
+    thinking_level: Optional[str] = None,
 ) -> Dict[str, Any]:
     """Run a single OpenClaw prompt for helper agents like the judge."""
     # Clean up previous session transcripts so we can reliably find this

From cbc34329ad48f990aef0c28ab3445f749c049f39 Mon Sep 17 00:00:00 2001
From: OpenClaw Agent <agent@openclaw.local>
Date: Sat, 7 Mar 2026 10:25:58 -0800
Subject: [PATCH 3/3] Harden thinking-level validation and model compatibility
 checks

- Add strict xhigh model matching (provider-aware)
- Add adaptive support detection (Anthropic Claude 4.6 family)
- Deduplicate requested thinking levels while preserving order
- Fail fast when --thinking is provided but no valid levels remain
- Keep subprocess input constrained to validated levels
---
 scripts/benchmark.py | 27 ++++++++++++++-----
 scripts/lib_agent.py | 63 ++++++++++++++++++++++++++++++++++++++------
 2 files changed, 76 insertions(+), 14 deletions(-)

diff --git a/scripts/benchmark.py b/scripts/benchmark.py
index b2dacf2..9839671 100644
--- a/scripts/benchmark.py
+++ b/scripts/benchmark.py
@@ -220,7 +220,7 @@ def _parse_args() -> argparse.Namespace:
         default=None,
         help="Comma-separated thinking levels to test (e.g., 'low,medium,high'). "
         f"Valid levels: {', '.join(THINKING_LEVELS)}. "
-        "Note: 'xhigh' is only supported by GPT-5.x models. "
+        "Note: 'xhigh' requires GPT-5.x models; 'adaptive' is for Anthropic Claude 4.6. "
         "If not specified, runs without explicit thinking level.",
     )
     return parser.parse_args()
@@ -243,21 +243,32 @@ def _parse_thinking_levels(
 
     Args:
         thinking_arg: Comma-separated thinking levels or None
-        model_id: Optional model ID to check xhigh compatibility
+        model_id: Optional model ID to check level compatibility
 
     Returns:
         List of validated thinking levels (or [None] if no explicit level).
+
+    Raises:
+        ValueError: If --thinking was provided but no levels are valid for the model.
     """
     if thinking_arg is None:
         return [None]  # Run once without explicit thinking level
 
-    levels = []
+    levels: List[str] = []
+    seen = set()
     for level in thinking_arg.split(","):
         validated = validate_thinking_level(level.strip(), model_id)
-        if validated:
+        if validated and validated not in seen:
             levels.append(validated)
+            seen.add(validated)
+
+    if not levels:
+        raise ValueError(
+            "No valid thinking levels remain after validation. "
+            "Check your --thinking values for this model."
+        )
 
-    return levels if levels else [None]
+    return levels
 
 
 def _next_run_id(run_root: Path) -> str:
@@ -394,7 +405,11 @@ def main():
     cleanup_agent_sessions(agent_id)
 
     task_ids = _select_task_ids(runner.tasks, args.suite)
-    thinking_levels = _parse_thinking_levels(args.thinking, args.model)
+    try:
+        thinking_levels = _parse_thinking_levels(args.thinking, args.model)
+    except ValueError as exc:
+        logger.error(str(exc))
+        sys.exit(2)
     results = []
     grades_by_task_and_thinking: Dict[str, Dict[str, Any]] = {}
 
diff --git a/scripts/lib_agent.py b/scripts/lib_agent.py
index 5ec99fe..b73c8e5 100644
--- a/scripts/lib_agent.py
+++ b/scripts/lib_agent.py
@@ -39,6 +39,18 @@
     "github-copilot/gpt-5.2",
 }
 
+XHIGH_MODELS_LOWER = {model.lower() for model in XHIGH_MODELS}
+XHIGH_MODEL_IDS_LOWER = {model.split("/")[-1].lower() for model in XHIGH_MODELS}
+
+# Adaptive thinking is currently provider-managed for Anthropic Claude 4.6 models.
+ADAPTIVE_PROVIDER = "anthropic"
+ADAPTIVE_MODEL_PREFIXES = (
+    "claude-opus-4-6",
+    "claude-opus-4.6",
+    "claude-sonnet-4-6",
+    "claude-sonnet-4.6",
+)
+
 
 def slugify_model(model_id: str) -> str:
     return model_id.replace("/", "-").replace(".", "-")
@@ -56,13 +68,42 @@ def normalize_model_id(model_id: str) -> str:
 def supports_xhigh_thinking(model_id: str) -> bool:
     """Check if a model supports xhigh thinking level."""
     normalized = normalize_model_id(model_id).lower()
-    model_lower = model_id.lower()
-    # Check full provider/model form
-    if normalized in {m.lower() for m in XHIGH_MODELS}:
+    if normalized in XHIGH_MODELS_LOWER:
         return True
-    # Check just model ID (without provider)
-    model_only = model_lower.split("/")[-1] if "/" in model_lower else model_lower
-    return model_only in {m.split("/")[-1].lower() for m in XHIGH_MODELS}
+
+    # Handle openrouter/provider/model format.
+    parts = normalized.split("/")
+    if len(parts) == 3 and parts[0] == "openrouter":
+        provider_model = f"{parts[1]}/{parts[2]}"
+        if provider_model in XHIGH_MODELS_LOWER:
+            return True
+
+    # Only allow bare model-id fallback when the caller did not provide a provider.
+    if "/" not in model_id:
+        return model_id.lower() in XHIGH_MODEL_IDS_LOWER
+
+    return False
+
+
+def supports_adaptive_thinking(model_id: str) -> bool:
+    """Check if a model natively supports adaptive thinking."""
+    normalized = normalize_model_id(model_id).lower()
+    parts = normalized.split("/")
+
+    # openrouter/provider/model -> provider/model
+    if len(parts) == 3 and parts[0] == "openrouter":
+        provider = parts[1]
+        model = parts[2]
+    elif len(parts) >= 2:
+        provider = parts[-2]
+        model = parts[-1]
+    else:
+        return False
+
+    if provider != ADAPTIVE_PROVIDER:
+        return False
+
+    return any(model.startswith(prefix) for prefix in ADAPTIVE_MODEL_PREFIXES)
 
 
 def validate_thinking_level(level: str, model_id: Optional[str] = None) -> Optional[str]:
@@ -87,9 +128,15 @@ def validate_thinking_level(level: str, model_id: Optional[str] = None) -> Optio
     if level_lower == "xhigh" and model_id and not supports_xhigh_thinking(model_id):
         logger.warning(
             "Thinking level 'xhigh' not supported by model '%s'. "
-            "xhigh is only available for: %s",
+            "xhigh is only available for GPT-5.x model families.",
+            model_id,
+        )
+        return None
+    if level_lower == "adaptive" and model_id and not supports_adaptive_thinking(model_id):
+        logger.warning(
+            "Thinking level 'adaptive' is not natively supported by model '%s'. "
+            "adaptive is currently intended for Anthropic Claude 4.6 models.",
             model_id,
-            ", ".join(sorted(set(m.split("/")[1] for m in XHIGH_MODELS))),
         )
         return None
     return level_lower