pinchbench · juppytt · Apr 1, 2026
diff --git a/README.md b/README.md
@@ -112,6 +112,10 @@ We welcome new tasks! Check out [`tasks/TASK_TEMPLATE.md`](tasks/TASK_TEMPLATE.m
 - **Reproducible** — Same task should produce consistent grading
 - **Challenging** — Tests agent capabilities, not just LLM knowledge
 
+### Transcript Archive
+
+Session transcripts are automatically saved to `{output_dir}/{run_id}_transcripts/` (for example `results/{run_id}_transcripts/`), alongside the results JSON. Each task's raw agent session transcript is copied there as a JSONL file named after the task ID (e.g. `task_01_calendar.jsonl`) for post-run analysis.
+
 ## Links
 
 - **Leaderboard:** [pinchbench.com](https://pinchbench.com)

diff --git a/scripts/benchmark.py b/scripts/benchmark.py
@@ -604,6 +604,7 @@ def main():
                     run_id=f"{run_id}-{run_index + 1}",
                     timeout_multiplier=args.timeout_multiplier,
                     skill_dir=skill_dir,
+                    output_dir=Path(args.output_dir) / f"{run_id}_transcripts",
                     verbose=args.verbose,
                 )
             except Exception as exc:

diff --git a/scripts/lib_agent.py b/scripts/lib_agent.py
@@ -366,14 +366,12 @@ def prepare_task_workspace(skill_dir: Path, run_id: str, task: Task, agent_id: s
 
     _BOOTSTRAP_FILES = ["SOUL.md", "BOOTSTRAP.md", "USER.md", "IDENTITY.md", "HEARTBEAT.md", "TOOLS.md"]
 
-    def _remove_readonly(func, path, _):
     def _remove_readonly(func, path, _):
         try:
             os.chmod(path, stat.S_IWRITE)
             func(path)
         except OSError:
             pass
-        func(path)
 
     saved_bootstrap: dict[str, bytes] = {}
     if workspace.exists():
@@ -528,7 +526,9 @@ def _find_recent_session_path(agent_dir: Path, started_at: float) -> Path | None
     return max(pool, key=lambda path: path.stat().st_mtime)
 
 
-def _load_transcript(agent_id: str, session_id: str, started_at: float) -> List[Dict[str, Any]]:
+def _load_transcript(
+    agent_id: str, session_id: str, started_at: float
+) -> tuple[List[Dict[str, Any]], Optional[Path]]:
     agent_dir = _get_agent_store_dir(agent_id)
     transcript_path = None
 
@@ -623,7 +623,7 @@ def _load_transcript(agent_id: str, session_id: str, started_at: float) -> List[
                 "Transcript not found — sessions dir does not exist: %s",
                 sessions_dir,
             )
-        return []
+        return [], None
 
     transcript: List[Dict[str, Any]] = []
     for line in transcript_path.read_text(encoding="utf-8").splitlines():
@@ -634,7 +634,7 @@ def _load_transcript(agent_id: str, session_id: str, started_at: float) -> List[
         except json.JSONDecodeError as exc:
             logger.warning("Failed to parse transcript line: %s", exc)
             transcript.append({"raw": line, "parse_error": str(exc)})
-    return transcript
+    return transcript, transcript_path
 
 
 def _extract_usage_from_transcript(transcript: List[Dict[str, Any]]) -> Dict[str, Any]:
@@ -676,6 +676,7 @@ def execute_openclaw_task(
     run_id: str,
     timeout_multiplier: float,
     skill_dir: Path,
+    output_dir: Optional[Path] = None,
     verbose: bool = False,
 ) -> Dict[str, Any]:
     logger.info("🤖 Agent [%s] starting task: %s", agent_id, task.task_id)
@@ -783,10 +784,21 @@ def execute_openclaw_task(
         except FileNotFoundError as exc:
             stderr = f"openclaw command not found: {exc}"
 
-    transcript = _load_transcript(agent_id, session_id, start_time)
+    transcript, transcript_path = _load_transcript(agent_id, session_id, start_time)
     usage = _extract_usage_from_transcript(transcript)
     execution_time = time.time() - start_time
 
+    # Archive the raw transcript JSONL before cleanup_agent_sessions deletes it
+    if transcript_path and output_dir:
+        import shutil as _shutil
+        output_dir.mkdir(parents=True, exist_ok=True)
+        archive_dest = output_dir / f"{task.task_id}.jsonl"
+        try:
+            _shutil.copy2(transcript_path, archive_dest)
+            logger.info("Archived transcript to %s", archive_dest)
+        except OSError as exc:
+            logger.warning("Failed to archive transcript: %s", exc)
+
     status = "success"
     if timed_out:
         status = "timeout"
@@ -948,7 +960,7 @@ def run_openclaw_prompt(
             stderr += f"openclaw command not found: {exc}"
             break
 
-    transcript = _load_transcript(agent_id, session_id, start_time)
+    transcript, _ = _load_transcript(agent_id, session_id, start_time)
     execution_time = time.time() - start_time
 
     status = "success"