From 0c419e45aac78f6bdeadf5ac83eca9aa8cd14b49 Mon Sep 17 00:00:00 2001 From: zhaixu2 Date: Tue, 24 Mar 2026 18:38:38 +0800 Subject: [PATCH 1/2] fix: Windows compat, grading correctness, and judge efficiency 5 bugs found while benchmarking lenovo/aly model on Windows with OpenClaw: 1. fix(lib_agent): preserve bootstrap files across task workspace resets - save SOUL.md/BOOTSTRAP.md/USER.md/IDENTITY.md/HEARTBEAT.md/TOOLS.md before shutil.rmtree() and restore afterward - without this the agent re-enters onboarding flow on every task after the first, producing useless responses 2. fix(lib_agent): handle read-only files when clearing workspace on Windows - add onerror handler to shutil.rmtree() calls that chmod+deletes read-only files (git objects, etc.) - tasks that create a git repo (e.g. task_11_clawdhub) mark .git/objects as read-only; rmtree would crash with [WinError 5] on the next task, causing all subsequent tasks to score 0 3. fix(lib_grading): skip LLM judge when task execution failed with no transcript - when status != success and transcript is empty, return score=0.0 directly - previously every failed task still triggered a full judge agent run (~2-3 min each), wasting API quota and always returning 0 anyway 4. fix(lib_tasks): skip TASK_TEMPLATE.md placeholder task - filter out tasks whose id contains _XX_ or equals task_XX_name - the template was being loaded and run as a real task, consuming one benchmark slot and always scoring 0 5. 
fix(task_08_memory): support OpenClaw read tool name in read_notes grading - grading code only checked read_file/readFile (Cursor/Windsurf names) - OpenClaw uses read as the tool name; add it to the check so the criterion scores correctly --- scripts/lib_agent.py | 113 ++++++++++++++++++++++++++++++++-------- scripts/lib_grading.py | 86 ++++++++++++++++++++++++++++-- scripts/lib_tasks.py | 3 ++ tasks/task_08_memory.md | 13 +++-- 4 files changed, 184 insertions(+), 31 deletions(-) diff --git a/scripts/lib_agent.py b/scripts/lib_agent.py index bac28d2..67c5c22 100644 --- a/scripts/lib_agent.py +++ b/scripts/lib_agent.py @@ -7,6 +7,8 @@ import json import logging import os +import platform +import stat import subprocess import time from pathlib import Path @@ -18,6 +20,8 @@ logger = logging.getLogger(__name__) +USE_SHELL = platform.system() == "Windows" + class ModelValidationError(Exception): """Raised when a model ID is invalid or inaccessible.""" @@ -25,7 +29,8 @@ class ModelValidationError(Exception): pass -MAX_OPENCLAW_MESSAGE_CHARS = int(os.environ.get("PINCHBENCH_MAX_MSG_CHARS", "4000")) +MAX_OPENCLAW_MESSAGE_CHARS = int(os.environ.get("PINCHBENCH_MAX_MSG_CHARS", "8000")) +JUDGE_MAX_MSG_CHARS = int(os.environ.get("PINCHBENCH_JUDGE_MAX_MSG_CHARS", "3000")) def _coerce_subprocess_output(value: Any) -> str: @@ -163,6 +168,7 @@ def _get_agent_workspace(agent_id: str) -> Path | None: capture_output=True, text=True, check=False, + shell=USE_SHELL, ) if list_result.returncode != 0: return None @@ -206,6 +212,7 @@ def ensure_agent_exists(agent_id: str, model_id: str, workspace_dir: Path) -> bo capture_output=True, text=True, check=False, + shell=USE_SHELL, ) except FileNotFoundError: logger.error("openclaw CLI not found while listing agents") @@ -248,6 +255,7 @@ def ensure_agent_exists(agent_id: str, model_id: str, workspace_dir: Path) -> bo capture_output=True, text=True, check=False, + shell=USE_SHELL, ) logger.info("Creating OpenClaw agent %s", agent_id) @@ -267,6 
+275,7 @@ def ensure_agent_exists(agent_id: str, model_id: str, workspace_dir: Path) -> bo capture_output=True, text=True, check=False, + shell=USE_SHELL, ) except FileNotFoundError: logger.error("openclaw CLI not found while creating agent") @@ -276,6 +285,44 @@ def ensure_agent_exists(agent_id: str, model_id: str, workspace_dir: Path) -> bo logger.warning( "Agent creation returned %s: %s", create_result.returncode, create_result.stderr ) + + # Copy main agent's models.json to bench agent so custom providers (e.g. lenovo) + # are available. OpenClaw only copies a subset of providers when creating a new agent. + main_models = Path.home() / ".openclaw" / "agents" / "main" / "agent" / "models.json" + if main_models.exists(): + bench_agent_dir = _get_agent_store_dir(agent_id) / "agent" + bench_agent_dir.mkdir(parents=True, exist_ok=True) + bench_models = bench_agent_dir / "models.json" + import shutil as _shutil + _shutil.copy2(main_models, bench_models) + # Set defaultProvider/defaultModel so OpenClaw uses the requested model + if "/" in model_id: + provider_name, model_name = model_id.split("/", 1) + try: + import json as _json + raw = bench_models.read_text("utf-8-sig") + data = _json.loads(raw) + data["defaultProvider"] = provider_name + data["defaultModel"] = model_name + bench_models.write_text(_json.dumps(data, indent=2, ensure_ascii=False), "utf-8") + logger.info( + "Set bench agent default model to %s / %s", provider_name, model_name + ) + except Exception as exc: + logger.warning("Failed to set default model in bench models.json: %s", exc) + logger.info("Copied main agent models.json to bench agent %s", agent_id) + + # Delete sessions.json so OpenClaw picks up the new defaultProvider/defaultModel + # instead of reusing a cached session entry that still points to an old model. 
+ bench_sessions_dir = _get_agent_store_dir(agent_id) / "sessions" + sessions_store = bench_sessions_dir / "sessions.json" + if sessions_store.exists(): + try: + sessions_store.unlink() + logger.info("Deleted stale sessions.json for bench agent %s", agent_id) + except OSError as exc: + logger.warning("Failed to delete sessions.json: %s", exc) + return True @@ -317,12 +364,24 @@ def prepare_task_workspace(skill_dir: Path, run_id: str, task: Task, agent_id: s logger.warning("Could not find agent workspace, using fallback") workspace = Path(f"/tmp/pinchbench/{run_id}/{task.task_id}") - # Clear workspace before each task to prevent stale files from prior tasks - # from contaminating the agent's context. + _BOOTSTRAP_FILES = ["SOUL.md", "BOOTSTRAP.md", "USER.md", "IDENTITY.md", "HEARTBEAT.md", "TOOLS.md"] + + def _remove_readonly(func, path, _): + os.chmod(path, stat.S_IWRITE) + func(path) + + saved_bootstrap: dict[str, bytes] = {} if workspace.exists(): - shutil.rmtree(workspace) + for fname in _BOOTSTRAP_FILES: + fpath = workspace / fname + if fpath.exists(): + saved_bootstrap[fname] = fpath.read_bytes() + shutil.rmtree(workspace, onerror=_remove_readonly) workspace.mkdir(parents=True, exist_ok=True) + for fname, content in saved_bootstrap.items(): + (workspace / fname).write_bytes(content) + for file_spec in task.workspace_files: if "content" in file_spec: dest = workspace / file_spec["path"] @@ -339,17 +398,6 @@ def prepare_task_workspace(skill_dir: Path, run_id: str, task: Task, agent_id: s logger.error("Workspace file not found: %s", source) raise - # Remove bootstrap files that would trigger the onboarding flow - # These interfere with benchmark tasks - for bootstrap_file in ["BOOTSTRAP.md", "SOUL.md", "USER.md", "IDENTITY.md"]: - bootstrap_path = workspace / bootstrap_file - if bootstrap_path.exists(): - try: - bootstrap_path.unlink() - logger.info("Removed bootstrap file: %s", bootstrap_file) - except OSError as exc: - logger.warning("Failed to remove %s: %s", 
bootstrap_file, exc) - # Copy skills from main workspace to benchmark workspace # This enables benchmark agents to use installed skills like nano-pdf main_skills_dir = Path.home() / ".openclaw" / "workspace" / "skills" @@ -363,7 +411,7 @@ def prepare_task_workspace(skill_dir: Path, run_id: str, task: Task, agent_id: s import shutil if dest_skill_dir.exists(): - shutil.rmtree(dest_skill_dir) + shutil.rmtree(dest_skill_dir, onerror=_remove_readonly) shutil.copytree(skill_dir_src, dest_skill_dir) logger.info("Copied skill to benchmark workspace: %s", skill_dir_src.name) @@ -684,6 +732,7 @@ def execute_openclaw_task( cwd=str(workspace), timeout=remaining, check=False, + shell=USE_SHELL, ) stdout += result.stdout stderr += result.stderr @@ -717,6 +766,7 @@ def execute_openclaw_task( cwd=str(workspace), timeout=timeout_seconds, check=False, + shell=USE_SHELL, ) stdout = result.stdout stderr = result.stderr @@ -801,10 +851,19 @@ def run_openclaw_prompt( timeout_seconds: float, ) -> Dict[str, Any]: """Run a single OpenClaw prompt for helper agents like the judge.""" - # Clean up previous session transcripts so we can reliably find this - # prompt's transcript (OpenClaw uses its own UUID-based naming). 
cleanup_agent_sessions(agent_id) + agent_workspace = _get_agent_workspace(agent_id) + if agent_workspace and agent_workspace.exists(): + for bootstrap_file in ["BOOTSTRAP.md", "SOUL.md", "USER.md", "IDENTITY.md", "HEARTBEAT.md"]: + bp = agent_workspace / bootstrap_file + if bp.exists(): + try: + bp.unlink() + logger.debug("Removed bootstrap file from judge workspace: %s", bootstrap_file) + except OSError as exc: + logger.warning("Failed to remove bootstrap file %s: %s", bootstrap_file, exc) + start_time = time.time() workspace.mkdir(parents=True, exist_ok=True) session_id = f"judge_{int(time.time() * 1000)}" @@ -814,8 +873,8 @@ def run_openclaw_prompt( timed_out = False chunks = [ - prompt[i : i + MAX_OPENCLAW_MESSAGE_CHARS] - for i in range(0, max(1, len(prompt)), MAX_OPENCLAW_MESSAGE_CHARS) + prompt[i : i + JUDGE_MAX_MSG_CHARS] + for i in range(0, max(1, len(prompt)), JUDGE_MAX_MSG_CHARS) ] if len(chunks) > 1: total_chunks = len(chunks) @@ -843,22 +902,32 @@ def run_openclaw_prompt( timed_out = True break try: + openclaw_path = os.environ.get("OPENCLAW_PATH", "openclaw") + # On Windows, cmd.exe splits command-line arguments at literal newlines, + # causing the message to be truncated after the first line. + # Escape newlines to literal \n sequences so the full prompt is received. 
+ send_chunk = ( + chunk.replace("\r\n", "\\n").replace("\n", "\\n").replace("\r", "\\n") + if USE_SHELL + else chunk + ) result = subprocess.run( [ - "openclaw", + openclaw_path, "agent", "--agent", agent_id, "--session-id", session_id, "--message", - chunk, + send_chunk, ], capture_output=True, text=True, cwd=str(workspace), timeout=remaining, check=False, + shell=USE_SHELL, ) stdout += result.stdout stderr += result.stderr diff --git a/scripts/lib_grading.py b/scripts/lib_grading.py index 46961bb..02bb924 100644 --- a/scripts/lib_grading.py +++ b/scripts/lib_grading.py @@ -147,11 +147,32 @@ def _grade_llm_judge( skill_dir: Path, verbose: bool = False, ) -> GradeResult: - transcript_summary = _summarize_transcript(execution_result.get("transcript", [])) + transcript = execution_result.get("transcript", []) + execution_status = execution_result.get("status", "unknown") + + if not transcript and execution_status != "success": + if verbose: + logger.info( + " [VERBOSE] Skipping LLM judge: status=%s, transcript empty", + execution_status, + ) + return GradeResult( + task_id=task.task_id, + score=0.0, + max_score=1.0, + grading_type="llm_judge", + breakdown={}, + notes=f"Skipped: task execution failed ({execution_status}), no transcript to evaluate", + ) + + transcript_summary = _summarize_transcript(transcript) if verbose: logger.info(" [VERBOSE] Transcript summary for judge (first 1000 chars):\n%s", transcript_summary[:1000]) + workspace_content = _read_workspace_files(execution_result.get("workspace", "")) + if verbose and workspace_content: + logger.info(" [VERBOSE] Workspace files passed to judge (first 500 chars):\n%s", workspace_content[:500]) rubric = task.llm_judge_rubric or _format_grading_criteria(task) - prompt = _build_judge_prompt(task, transcript_summary, rubric) + prompt = _build_judge_prompt(task, transcript_summary, rubric, workspace_content) agent_id = _ensure_judge_agent(judge_agent_prefix, judge_model, skill_dir) judge_workspace = 
Path(f"/tmp/pinchbench/judge/{task.task_id}") @@ -162,6 +183,14 @@ def _grade_llm_judge( timeout_seconds=judge_timeout_seconds, ) + if verbose: + logger.info(" [VERBOSE] Judge execution status: %s", judge_result.get("status")) + logger.info(" [VERBOSE] Judge exit code: %s", judge_result.get("exit_code")) + logger.info(" [VERBOSE] Judge stderr: %s", judge_result.get("stderr", "")[:500]) + + if judge_result.get("status") != "success": + logger.warning("Judge execution failed: %s", judge_result.get("status")) + raw_parsed = _parse_judge_response(judge_result.get("transcript", [])) if verbose: logger.info(" [VERBOSE] Judge raw response parsed: %s", raw_parsed) @@ -252,9 +281,20 @@ def _summarize_transcript(transcript: List[Dict[str, Any]]) -> str: if role == "assistant": for item in msg.get("content", []): if item.get("type") == "toolCall": + args = item.get("arguments", {}) + truncated_args: Dict[str, Any] = {} + for k, v in args.items(): + if isinstance(v, str) and len(v) > 200: + truncated_args[k] = v[:200] + "...[truncated]" + else: + truncated_args[k] = v summary_parts.append( - f"Tool: {item.get('name')}({json.dumps(item.get('arguments', {}))})" + f"Tool: {item.get('name')}({json.dumps(truncated_args)})" ) + elif item.get("type") == "text": + text = item.get("text", "").strip() + if text: + summary_parts.append(f"Assistant: {text[:2000]}") elif role == "toolResult": content = msg.get("content", []) if content: @@ -267,7 +307,43 @@ def _summarize_transcript(transcript: List[Dict[str, Any]]) -> str: return "\n".join(summary_parts) -def _build_judge_prompt(task: Task, transcript_summary: str, rubric: str) -> str: +def _read_workspace_files(workspace_path: str) -> str: + """Read user-created text files from workspace to provide grading context.""" + if not workspace_path: + return "" + workspace = Path(workspace_path) + if not workspace.exists(): + return "" + skip_names = { + "BOOTSTRAP.md", "SOUL.md", "USER.md", "IDENTITY.md", + "HEARTBEAT.md", "TOOLS.md", 
"AGENTS.md", + } + skip_dirs = {".git", ".openclaw", "__pycache__", "node_modules", "skills"} + file_contents: List[str] = [] + for f in sorted(workspace.rglob("*")): + if not f.is_file(): + continue + rel = f.relative_to(workspace) + parts = rel.parts + if any(part.startswith(".") or part in skip_dirs for part in parts): + continue + if f.name in skip_names: + continue + try: + content = f.read_text(encoding="utf-8") + file_contents.append(f"### File: {rel}\n{content[:3000]}") + except (OSError, UnicodeDecodeError): + pass + return "\n\n".join(file_contents) + + +def _build_judge_prompt(task: Task, transcript_summary: str, rubric: str, workspace_content: str = "") -> str: + workspace_section = "" + if workspace_content.strip(): + workspace_section = ( + "## Workspace Files Created by Agent\n" + f"{workspace_content}\n\n" + ) return ( "You are a grading function. Your ONLY job is to output a single JSON object.\n\n" "CRITICAL RULES:\n" @@ -284,6 +360,7 @@ def _build_judge_prompt(task: Task, transcript_summary: str, rubric: str) -> str f"{task.expected_behavior}\n\n" "## Agent Transcript (summarized)\n" f"{transcript_summary}\n\n" + f"{workspace_section}" "## Grading Rubric\n" f"{rubric}\n\n" "Score each criterion from 0.0 to 1.0.\n\n" @@ -312,6 +389,7 @@ def _parse_judge_response(transcript: List[Dict[str, Any]]) -> Dict[str, Any]: if item.get("type") == "text": content_chunks.append(item.get("text", "")) raw_text = "\n".join(content_chunks).strip() + logger.info(" [VERBOSE] Judge raw response text (first 2000 chars):\n%s", raw_text[:2000]) if not raw_text: return {} diff --git a/scripts/lib_tasks.py b/scripts/lib_tasks.py index bee1d19..0f81d8f 100644 --- a/scripts/lib_tasks.py +++ b/scripts/lib_tasks.py @@ -90,6 +90,9 @@ def load_all_tasks(self) -> List[Task]: for task_file in task_files: try: task = self.load_task(task_file) + if "_XX_" in task.task_id or task.task_id == "task_XX_name": + logger.debug("Skipping template task: %s", task.task_id) + continue 
tasks.append(task) logger.info(f"Successfully loaded task: {task.task_id}") except Exception as e: diff --git a/tasks/task_08_memory.md b/tasks/task_08_memory.md index 5050f10..9cb5f6f 100644 --- a/tasks/task_08_memory.md +++ b/tasks/task_08_memory.md @@ -146,11 +146,14 @@ def grade(transcript: list, workspace_path: str) -> dict: for item in msg.get("content", []): if item.get("type") == "toolCall": tool_name = item.get("name", "") - params = item.get("params", {}) - # Check if agent read notes.md - if tool_name in ["read_file", "readFile"]: - files = params.get("files", []) - if any("notes.md" in str(f) for f in files): + # Support multiple tool name conventions: + # - read (OpenClaw) + # - read_file / readFile (Cursor, Windsurf, Claude Code) + if tool_name in ["read", "read_file", "readFile"]: + args = item.get("arguments", item.get("params", {})) + path_val = str(args.get("path", args.get("file_path", ""))) + files = args.get("files", []) + if "notes.md" in path_val or any("notes.md" in str(f) for f in files): read_notes = True break From 7d5eadbf0f2f56df7da0dc2b4a093c4d7b285a4a Mon Sep 17 00:00:00 2001 From: Brendan O'Leary Date: Tue, 31 Mar 2026 11:02:25 -0400 Subject: [PATCH 2/2] Apply suggestions from code review Co-authored-by: kilo-code-bot[bot] <240665456+kilo-code-bot[bot]@users.noreply.github.com> --- scripts/lib_agent.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/lib_agent.py b/scripts/lib_agent.py index 67c5c22..7f261bf 100644 --- a/scripts/lib_agent.py +++ b/scripts/lib_agent.py @@ -367,7 +367,10 @@ def prepare_task_workspace(skill_dir: Path, run_id: str, task: Task, agent_id: s _BOOTSTRAP_FILES = ["SOUL.md", "BOOTSTRAP.md", "USER.md", "IDENTITY.md", "HEARTBEAT.md", "TOOLS.md"] def _remove_readonly(func, path, _): - os.chmod(path, stat.S_IWRITE) - func(path) + try: + os.chmod(path, stat.S_IWRITE) + func(path) + except OSError: + pass saved_bootstrap: dict[str, bytes] = {}