diff --git a/README.md b/README.md index a306faf..a11375f 100644 --- a/README.md +++ b/README.md @@ -93,7 +93,7 @@ export PINCHBENCH_OFFICIAL_KEY=your_official_key | Flag | Description | | ------------------------ | ----------------------------------------------------------------------------- | | `--model MODEL` | Model to test (e.g., `openrouter/anthropic/claude-sonnet-4`) | -| `--judge MODEL` | Judge model for LLM grading (default: `openrouter/anthropic/claude-opus-4.5`) | +| `--judge MODEL` | Judge model for LLM grading; uses direct API when set (see below) | | `--suite SUITE` | `all`, `automated-only`, or comma-separated task IDs | | `--runs N` | Number of runs per task for averaging | | `--timeout-multiplier N` | Scale timeouts for slower models | @@ -103,6 +103,29 @@ export PINCHBENCH_OFFICIAL_KEY=your_official_key | `--upload FILE` | Upload a previous results JSON | | `--official-key KEY` | Mark submission as official (or use `PINCHBENCH_OFFICIAL_KEY` env var) | +### Judge + +By default (no `--judge` flag), the LLM judge runs as an OpenClaw agent session. When `--judge` is specified, it calls the model API directly instead, bypassing OpenClaw personality injection. + +```bash +# Default: OpenClaw agent session (no --judge needed) +./scripts/run.sh --model openrouter/anthropic/claude-sonnet-4 + +# Direct API via OpenRouter +./scripts/run.sh --model openai/gpt-4o --judge openrouter/anthropic/claude-sonnet-4-5 + +# Direct API via Anthropic +./scripts/run.sh --model openai/gpt-4o --judge anthropic/claude-sonnet-4-5-20250929 + +# Direct API via OpenAI +./scripts/run.sh --model openai/gpt-4o --judge openai/gpt-4o + +# Headless Claude CLI +./scripts/run.sh --model openai/gpt-4o --judge claude +``` + +Required env vars: `OPENROUTER_API_KEY`, `ANTHROPIC_API_KEY`, or `OPENAI_API_KEY`, depending on the judge model prefix. The `claude` CLI option needs no API key env var — it uses your local Claude CLI authentication. + ## Contributing Tasks We welcome new tasks! Check out [`tasks/TASK_TEMPLATE.md`](tasks/TASK_TEMPLATE.md) for the format. 
Good tasks are: diff --git a/scripts/benchmark.py b/scripts/benchmark.py index 566bcb3..5b6e4e3 100644 --- a/scripts/benchmark.py +++ b/scripts/benchmark.py @@ -217,7 +217,11 @@ def _parse_args() -> argparse.Namespace: parser.add_argument( "--judge", default=None, - help="Judge model identifier (default: openrouter/anthropic/claude-opus-4.5)", + help=( + "Judge model or backend. Default (unset): OpenClaw agent session with " + "openrouter/anthropic/claude-opus-4.5. Set to a model ID to call its API " + "directly (e.g. openai/gpt-4o, anthropic/claude-sonnet-4-5-20250514, claude)" + ), ) parser.add_argument( "--verbose", @@ -628,6 +632,7 @@ def main(): ) if args.judge: grade_kwargs["judge_model"] = args.judge + grade_kwargs["judge_backend"] = "api" grade = grade_task(**grade_kwargs) except Exception as exc: if execution_error: diff --git a/scripts/lib_agent.py b/scripts/lib_agent.py index 7f261bf..3e7ae0e 100644 --- a/scripts/lib_agent.py +++ b/scripts/lib_agent.py @@ -366,14 +366,12 @@ def prepare_task_workspace(skill_dir: Path, run_id: str, task: Task, agent_id: s _BOOTSTRAP_FILES = ["SOUL.md", "BOOTSTRAP.md", "USER.md", "IDENTITY.md", "HEARTBEAT.md", "TOOLS.md"] - def _remove_readonly(func, path, _): def _remove_readonly(func, path, _): try: os.chmod(path, stat.S_IWRITE) func(path) except OSError: pass - func(path) saved_bootstrap: dict[str, bytes] = {} if workspace.exists(): @@ -972,3 +970,179 @@ def run_openclaw_prompt( "stdout": stdout, "stderr": stderr, } + + +_JUDGE_SYSTEM_MSG = ( + "You are a strict grading function. " + "Respond with ONLY a JSON object, no prose, no markdown fences, no extra text." +) + + +def call_judge_api( + *, + prompt: str, + model: str, + timeout_seconds: float = 120.0, +) -> Dict[str, Any]: + """Call a judge model directly via API, bypassing OpenClaw. 
+ + Dispatches based on model prefix: + - openrouter/* -> OpenRouter chat completions API + - anthropic/* -> Anthropic Messages API + - openai/* -> OpenAI chat completions API + - claude -> headless Claude CLI (claude -p) + + Returns {"status": str, "text": str, "error"?: str}. + """ + if model == "claude" or model.startswith("claude:"): + return _judge_via_claude_cli(prompt, model, timeout_seconds) + if model.startswith("anthropic/"): + return _judge_via_anthropic(prompt, model, timeout_seconds) + if model.startswith("openai/"): + return _judge_via_openai(prompt, model, timeout_seconds) + # Default: OpenRouter (handles openrouter/ prefix and bare provider/model) + return _judge_via_openrouter(prompt, model, timeout_seconds) + + +def _judge_via_openai_compat( + prompt: str, + api_model: str, + endpoint: str, + api_key: str, + timeout_seconds: float, + extra_headers: Optional[Dict[str, str]] = None, +) -> Dict[str, Any]: + """Shared implementation for OpenAI-compatible chat completions APIs.""" + payload = json.dumps({ + "model": api_model, + "messages": [ + {"role": "system", "content": _JUDGE_SYSTEM_MSG}, + {"role": "user", "content": prompt}, + ], + "temperature": 0.0, + "max_tokens": 2048, + }).encode("utf-8") + + headers = { + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json", + } + if extra_headers: + headers.update(extra_headers) + + req = request.Request(endpoint, data=payload, headers=headers, method="POST") + try: + with request.urlopen(req, timeout=timeout_seconds) as resp: + data = json.loads(resp.read().decode("utf-8")) + except error.HTTPError as exc: + body = "" + try: + body = exc.read().decode("utf-8", errors="replace")[:500] + except Exception: + pass + logger.error("Judge API error (%s): %s", exc.code, body) + return {"status": "error", "text": "", "error": f"HTTP {exc.code}: {body}"} + except error.URLError as exc: + logger.error("Judge network error: %s", exc) + return {"status": "error", "text": "", "error": str(exc)} + 
except TimeoutError: + return {"status": "timeout", "text": "", "error": "Request timed out"} + + choices = data.get("choices", []) + if not choices: + return {"status": "error", "text": "", "error": "No choices in response"} + text = choices[0].get("message", {}).get("content", "") + return {"status": "success", "text": text} + + +def _judge_via_openrouter(prompt: str, model: str, timeout_seconds: float) -> Dict[str, Any]: + api_key = os.environ.get("OPENROUTER_API_KEY") + if not api_key: + return {"status": "error", "text": "", "error": "OPENROUTER_API_KEY not set"} + bare_model = model.removeprefix("openrouter/") + return _judge_via_openai_compat( + prompt, bare_model, + "https://openrouter.ai/api/v1/chat/completions", + api_key, timeout_seconds, + extra_headers={"HTTP-Referer": "https://pinchbench.com", "X-Title": "PinchBench-Judge"}, + ) + + +def _judge_via_openai(prompt: str, model: str, timeout_seconds: float) -> Dict[str, Any]: + api_key = os.environ.get("OPENAI_API_KEY") + if not api_key: + return {"status": "error", "text": "", "error": "OPENAI_API_KEY not set"} + bare_model = model.removeprefix("openai/") + return _judge_via_openai_compat( + prompt, bare_model, + "https://api.openai.com/v1/chat/completions", + api_key, timeout_seconds, + ) + + +def _judge_via_anthropic(prompt: str, model: str, timeout_seconds: float) -> Dict[str, Any]: + api_key = os.environ.get("ANTHROPIC_API_KEY") + if not api_key: + return {"status": "error", "text": "", "error": "ANTHROPIC_API_KEY not set"} + bare_model = model.removeprefix("anthropic/") + payload = json.dumps({ + "model": bare_model, + "max_tokens": 2048, + "temperature": 0.0, + "system": _JUDGE_SYSTEM_MSG, + "messages": [{"role": "user", "content": prompt}], + }).encode("utf-8") + headers = { + "x-api-key": api_key, + "Content-Type": "application/json", + "anthropic-version": "2023-06-01", + } + req = request.Request( + "https://api.anthropic.com/v1/messages", + data=payload, headers=headers, method="POST", + ) + 
try: + with request.urlopen(req, timeout=timeout_seconds) as resp: + data = json.loads(resp.read().decode("utf-8")) + except error.HTTPError as exc: + body = "" + try: + body = exc.read().decode("utf-8", errors="replace")[:500] + except Exception: + pass + logger.error("Anthropic judge API error (%s): %s", exc.code, body) + return {"status": "error", "text": "", "error": f"HTTP {exc.code}: {body}"} + except error.URLError as exc: + logger.error("Anthropic judge network error: %s", exc) + return {"status": "error", "text": "", "error": str(exc)} + except TimeoutError: + return {"status": "timeout", "text": "", "error": "Request timed out"} + + content = data.get("content", []) + text = "".join(block.get("text", "") for block in content if block.get("type") == "text") + return {"status": "success", "text": text} + + +def _judge_via_claude_cli(prompt: str, model: str, timeout_seconds: float) -> Dict[str, Any]: + """Use headless Claude CLI (claude -p) as judge.""" + cmd: List[str] = ["claude", "-p"] + # Support "claude:model-name" to pass --model + if ":" in model: + _, cli_model = model.split(":", 1) + cmd.extend(["--model", cli_model]) + try: + result = subprocess.run( + cmd, + input=f"{_JUDGE_SYSTEM_MSG}\n\n{prompt}", + capture_output=True, + text=True, + timeout=timeout_seconds, + check=False, + ) + except FileNotFoundError: + return {"status": "error", "text": "", "error": "claude CLI not found"} + except subprocess.TimeoutExpired: + return {"status": "timeout", "text": "", "error": "claude -p timed out"} + if result.returncode != 0: + return {"status": "error", "text": "", "error": f"claude exit {result.returncode}: {result.stderr[:300]}"} + return {"status": "success", "text": result.stdout} diff --git a/scripts/lib_grading.py b/scripts/lib_grading.py index bf25b2d..87de7e5 100644 --- a/scripts/lib_grading.py +++ b/scripts/lib_grading.py @@ -9,9 +9,9 @@ import re from dataclasses import dataclass from pathlib import Path -from typing import Any, Dict, List +from 
typing import Any, Dict, List, Optional -from lib_agent import ensure_agent_exists, run_openclaw_prompt, slugify_model +from lib_agent import call_judge_api, ensure_agent_exists, run_openclaw_prompt, slugify_model from lib_tasks import Task @@ -51,6 +51,7 @@ def grade_task( judge_model: str = DEFAULT_JUDGE_MODEL, judge_agent_prefix: str = DEFAULT_JUDGE_AGENT_PREFIX, judge_timeout_seconds: float = DEFAULT_JUDGE_TIMEOUT_SECONDS, + judge_backend: str = "openclaw", verbose: bool = False, ) -> GradeResult: grading_type = task.grading_type @@ -70,6 +71,7 @@ def grade_task( judge_model=judge_model, judge_agent_prefix=judge_agent_prefix, judge_timeout_seconds=judge_timeout_seconds, + judge_backend=judge_backend, skill_dir=skill_dir, verbose=verbose, ) @@ -84,6 +86,7 @@ def grade_task( judge_model=judge_model, judge_agent_prefix=judge_agent_prefix, judge_timeout_seconds=judge_timeout_seconds, + judge_backend=judge_backend, skill_dir=skill_dir, verbose=verbose, ) @@ -144,7 +147,8 @@ def _grade_llm_judge( judge_model: str, judge_agent_prefix: str, judge_timeout_seconds: float, - skill_dir: Path, + judge_backend: str = "openclaw", + skill_dir: Optional[Path] = None, verbose: bool = False, ) -> GradeResult: transcript = execution_result.get("transcript", []) @@ -174,24 +178,44 @@ def _grade_llm_judge( rubric = task.llm_judge_rubric or _format_grading_criteria(task) prompt = _build_judge_prompt(task, transcript_summary, rubric, workspace_content) - agent_id = _ensure_judge_agent(judge_agent_prefix, judge_model, skill_dir) - judge_workspace = Path(f"/tmp/pinchbench/judge/{task.task_id}") - judge_result = run_openclaw_prompt( - agent_id=agent_id, - prompt=prompt, - workspace=judge_workspace, - timeout_seconds=judge_timeout_seconds, - ) + if judge_backend == "api": + # Direct API call — bypasses OpenClaw personality injection + judge_result = call_judge_api( + prompt=prompt, + model=judge_model, + timeout_seconds=judge_timeout_seconds, + ) - if verbose: - logger.info(" [VERBOSE] 
Judge execution status: %s", judge_result.get("status")) - logger.info(" [VERBOSE] Judge exit code: %s", judge_result.get("exit_code")) - logger.info(" [VERBOSE] Judge stderr: %s", judge_result.get("stderr", "")[:500]) + if verbose: + logger.info(" [VERBOSE] Judge execution status: %s", judge_result.get("status")) + if judge_result.get("error"): + logger.info(" [VERBOSE] Judge error: %s", judge_result["error"]) + + if judge_result.get("status") != "success": + logger.warning("Judge API call failed: %s", judge_result.get("error", judge_result.get("status"))) + + raw_parsed = _parse_judge_text(judge_result.get("text", "")) + else: + # Default: OpenClaw agent session + agent_id = _ensure_judge_agent(judge_agent_prefix, judge_model, skill_dir) + judge_workspace = Path(f"/tmp/pinchbench/judge/{task.task_id}") + judge_result = run_openclaw_prompt( + agent_id=agent_id, + prompt=prompt, + workspace=judge_workspace, + timeout_seconds=judge_timeout_seconds, + ) + + if verbose: + logger.info(" [VERBOSE] Judge execution status: %s", judge_result.get("status")) + logger.info(" [VERBOSE] Judge exit code: %s", judge_result.get("exit_code")) + logger.info(" [VERBOSE] Judge stderr: %s", judge_result.get("stderr", "")[:500]) + + if judge_result.get("status") != "success": + logger.warning("Judge execution failed: %s", judge_result.get("status")) - if judge_result.get("status") != "success": - logger.warning("Judge execution failed: %s", judge_result.get("status")) + raw_parsed = _parse_judge_response(judge_result.get("transcript", [])) - raw_parsed = _parse_judge_response(judge_result.get("transcript", [])) if verbose: logger.info(" [VERBOSE] Judge raw response parsed: %s", raw_parsed) @@ -464,6 +488,80 @@ def _parse_judge_response(transcript: List[Dict[str, Any]]) -> Dict[str, Any]: return {} +def _parse_judge_text(raw_text: str) -> Dict[str, Any]: + """Parse judge response from raw text (direct API call, no OpenClaw transcript).""" + raw_text = raw_text.strip() + if not raw_text: 
+ return {} + + # Try direct JSON parse first (ideal case with system prompt enforcement) + try: + parsed = json.loads(raw_text) + if isinstance(parsed, dict): + return parsed + except json.JSONDecodeError: + pass + + # Try extracting from code blocks + code_block_match = re.search(r"```(?:json)?\s*(.*?)\s*```", raw_text, re.DOTALL) + if code_block_match: + try: + parsed = json.loads(code_block_match.group(1)) + if isinstance(parsed, dict): + return parsed + except json.JSONDecodeError: + pass + + # Find balanced-brace JSON objects + json_candidates: List[str] = [] + brace_depth = 0 + current_json: List[str] = [] + for char in raw_text: + if char == "{": + if brace_depth == 0: + current_json = [] + brace_depth += 1 + if brace_depth > 0: + current_json.append(char) + if char == "}": + brace_depth -= 1 + if brace_depth == 0 and current_json: + json_candidates.append("".join(current_json)) + + for candidate in reversed(json_candidates): + try: + parsed = json.loads(candidate) + if isinstance(parsed, dict) and "scores" in parsed: + return parsed + except json.JSONDecodeError: + continue + for candidate in reversed(json_candidates): + try: + parsed = json.loads(candidate) + if isinstance(parsed, dict): + return parsed + except json.JSONDecodeError: + continue + + # Fallback: regex for total score + score_pattern = re.search( + r"(?:total|overall|final)\s*(?:score)?[:\s]*(0\.\d+|1\.0+)", + raw_text, + re.IGNORECASE, + ) + if score_pattern: + try: + total = float(score_pattern.group(1)) + if 0.0 <= total <= 1.0: + logger.warning("Fell back to regex score extraction (total=%.2f)", total) + return {"scores": {}, "total": total, "notes": "Score extracted from prose"} + except ValueError: + pass + + logger.warning("Failed to parse judge text response") + return {} + + def _normalize_judge_response(parsed: Dict[str, Any]) -> Dict[str, Any]: """ Normalize judge response to expected format with 'scores', 'total', and 'notes'.