diff --git a/README.md b/README.md index a306faf..2154bdd 100644 --- a/README.md +++ b/README.md @@ -112,6 +112,10 @@ We welcome new tasks! Check out [`tasks/TASK_TEMPLATE.md`](tasks/TASK_TEMPLATE.m - **Reproducible** — Same task should produce consistent grading - **Challenging** — Tests agent capabilities, not just LLM knowledge +### Transcript Archive + +Session transcripts are automatically saved to a `{run_id}_transcripts/` directory inside the output directory (e.g. `results/{run_id}_transcripts/`), alongside the results JSON. Each task's full agent conversation is preserved as a JSONL file named after its task ID (e.g. `task_01_calendar.jsonl`) for post-run analysis. If the same task is executed more than once under one run ID, each execution overwrites the previous archive, so only the latest transcript is kept. + ## Links - **Leaderboard:** [pinchbench.com](https://pinchbench.com) diff --git a/scripts/benchmark.py b/scripts/benchmark.py index 566bcb3..8cd0d52 100644 --- a/scripts/benchmark.py +++ b/scripts/benchmark.py @@ -604,6 +604,7 @@ def main(): run_id=f"{run_id}-{run_index + 1}", timeout_multiplier=args.timeout_multiplier, skill_dir=skill_dir, + output_dir=Path(args.output_dir) / f"{run_id}_transcripts", verbose=args.verbose, ) except Exception as exc: diff --git a/scripts/lib_agent.py b/scripts/lib_agent.py index 7f261bf..6cc0d0d 100644 --- a/scripts/lib_agent.py +++ b/scripts/lib_agent.py @@ -366,14 +366,12 @@ def prepare_task_workspace(skill_dir: Path, run_id: str, task: Task, agent_id: s _BOOTSTRAP_FILES = ["SOUL.md", "BOOTSTRAP.md", "USER.md", "IDENTITY.md", "HEARTBEAT.md", "TOOLS.md"] - def _remove_readonly(func, path, _): def _remove_readonly(func, path, _): try: os.chmod(path, stat.S_IWRITE) func(path) except OSError: pass - func(path) saved_bootstrap: dict[str, bytes] = {} if workspace.exists(): @@ -528,7 +526,9 @@ def _find_recent_session_path(agent_dir: Path, started_at: float) -> Path | None return max(pool, key=lambda path: path.stat().st_mtime) -def _load_transcript(agent_id: str, session_id: str, started_at: float) -> List[Dict[str, Any]]: +def _load_transcript( + agent_id: str, session_id: str, started_at: float +) -> tuple[List[Dict[str, Any]], Optional[Path]]: agent_dir = 
_get_agent_store_dir(agent_id) transcript_path = None @@ -623,7 +623,7 @@ def _load_transcript(agent_id: str, session_id: str, started_at: float) -> List[ "Transcript not found — sessions dir does not exist: %s", sessions_dir, ) - return [] + return [], None transcript: List[Dict[str, Any]] = [] for line in transcript_path.read_text(encoding="utf-8").splitlines(): @@ -634,7 +634,7 @@ def _load_transcript(agent_id: str, session_id: str, started_at: float) -> List[ except json.JSONDecodeError as exc: logger.warning("Failed to parse transcript line: %s", exc) transcript.append({"raw": line, "parse_error": str(exc)}) - return transcript + return transcript, transcript_path def _extract_usage_from_transcript(transcript: List[Dict[str, Any]]) -> Dict[str, Any]: @@ -676,6 +676,7 @@ def execute_openclaw_task( run_id: str, timeout_multiplier: float, skill_dir: Path, + output_dir: Optional[Path] = None, verbose: bool = False, ) -> Dict[str, Any]: logger.info("🤖 Agent [%s] starting task: %s", agent_id, task.task_id) @@ -783,10 +784,21 @@ def execute_openclaw_task( except FileNotFoundError as exc: stderr = f"openclaw command not found: {exc}" - transcript = _load_transcript(agent_id, session_id, start_time) + transcript, transcript_path = _load_transcript(agent_id, session_id, start_time) usage = _extract_usage_from_transcript(transcript) execution_time = time.time() - start_time + # Archive the raw transcript JSONL before cleanup_agent_sessions deletes it + if transcript_path and output_dir: + import shutil as _shutil + output_dir.mkdir(parents=True, exist_ok=True) + archive_dest = output_dir / f"{task.task_id}.jsonl" + try: + _shutil.copy2(transcript_path, archive_dest) + logger.info("Archived transcript to %s", archive_dest) + except OSError as exc: + logger.warning("Failed to archive transcript: %s", exc) + status = "success" if timed_out: status = "timeout" @@ -948,7 +960,7 @@ def run_openclaw_prompt( stderr += f"openclaw command not found: {exc}" break - transcript = 
_load_transcript(agent_id, session_id, start_time) + transcript, _ = _load_transcript(agent_id, session_id, start_time) execution_time = time.time() - start_time status = "success"