Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,10 @@ We welcome new tasks! Check out [`tasks/TASK_TEMPLATE.md`](tasks/TASK_TEMPLATE.m
- **Reproducible** — Same task should produce consistent grading
- **Challenging** — Tests agent capabilities, not just LLM knowledge

### Transcript Archive

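Session transcripts are automatically saved to `results/{run_id}_transcripts/` alongside the results JSON. Each task's full agent conversation is copied there, before the agent's sessions are cleaned up, as a JSONL file named after the task ID (e.g. `task_01_calendar.jsonl`) for post-run analysis. The archive directory is keyed by the base run ID, so when a benchmark is repeated over several runs, a later run's transcript for a task replaces the earlier one.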

## Links

- **Leaderboard:** [pinchbench.com](https://pinchbench.com)
Expand Down
1 change: 1 addition & 0 deletions scripts/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -604,6 +604,7 @@ def main():
run_id=f"{run_id}-{run_index + 1}",
timeout_multiplier=args.timeout_multiplier,
skill_dir=skill_dir,
output_dir=Path(args.output_dir) / f"{run_id}_transcripts",
verbose=args.verbose,
)
except Exception as exc:
Expand Down
26 changes: 19 additions & 7 deletions scripts/lib_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,14 +366,12 @@ def prepare_task_workspace(skill_dir: Path, run_id: str, task: Task, agent_id: s

_BOOTSTRAP_FILES = ["SOUL.md", "BOOTSTRAP.md", "USER.md", "IDENTITY.md", "HEARTBEAT.md", "TOOLS.md"]

def _remove_readonly(func, path, _):
def _remove_readonly(func, path, _):
try:
os.chmod(path, stat.S_IWRITE)
func(path)
except OSError:
pass
func(path)

saved_bootstrap: dict[str, bytes] = {}
if workspace.exists():
Expand Down Expand Up @@ -528,7 +526,9 @@ def _find_recent_session_path(agent_dir: Path, started_at: float) -> Path | None
return max(pool, key=lambda path: path.stat().st_mtime)


def _load_transcript(agent_id: str, session_id: str, started_at: float) -> List[Dict[str, Any]]:
def _load_transcript(
agent_id: str, session_id: str, started_at: float
) -> tuple[List[Dict[str, Any]], Optional[Path]]:
agent_dir = _get_agent_store_dir(agent_id)
transcript_path = None

Expand Down Expand Up @@ -623,7 +623,7 @@ def _load_transcript(agent_id: str, session_id: str, started_at: float) -> List[
"Transcript not found — sessions dir does not exist: %s",
sessions_dir,
)
return []
return [], None

transcript: List[Dict[str, Any]] = []
for line in transcript_path.read_text(encoding="utf-8").splitlines():
Expand All @@ -634,7 +634,7 @@ def _load_transcript(agent_id: str, session_id: str, started_at: float) -> List[
except json.JSONDecodeError as exc:
logger.warning("Failed to parse transcript line: %s", exc)
transcript.append({"raw": line, "parse_error": str(exc)})
return transcript
return transcript, transcript_path


def _extract_usage_from_transcript(transcript: List[Dict[str, Any]]) -> Dict[str, Any]:
Expand Down Expand Up @@ -676,6 +676,7 @@ def execute_openclaw_task(
run_id: str,
timeout_multiplier: float,
skill_dir: Path,
output_dir: Optional[Path] = None,
verbose: bool = False,
) -> Dict[str, Any]:
logger.info("🤖 Agent [%s] starting task: %s", agent_id, task.task_id)
Expand Down Expand Up @@ -783,10 +784,21 @@ def execute_openclaw_task(
except FileNotFoundError as exc:
stderr = f"openclaw command not found: {exc}"

transcript = _load_transcript(agent_id, session_id, start_time)
transcript, transcript_path = _load_transcript(agent_id, session_id, start_time)
usage = _extract_usage_from_transcript(transcript)
execution_time = time.time() - start_time

# Archive the raw transcript JSONL before cleanup_agent_sessions deletes it
if transcript_path and output_dir:
import shutil as _shutil
output_dir.mkdir(parents=True, exist_ok=True)
archive_dest = output_dir / f"{task.task_id}.jsonl"
try:
_shutil.copy2(transcript_path, archive_dest)
logger.info("Archived transcript to %s", archive_dest)
except OSError as exc:
logger.warning("Failed to archive transcript: %s", exc)

status = "success"
if timed_out:
status = "timeout"
Expand Down Expand Up @@ -948,7 +960,7 @@ def run_openclaw_prompt(
stderr += f"openclaw command not found: {exc}"
break

transcript = _load_transcript(agent_id, session_id, start_time)
transcript, _ = _load_transcript(agent_id, session_id, start_time)
execution_time = time.time() - start_time

status = "success"
Expand Down