From 58911a81acb64ef6172eb49d255128e44cd0ca6a Mon Sep 17 00:00:00 2001 From: CKwin26 <156837805+CKwin26@users.noreply.github.com> Date: Tue, 31 Mar 2026 01:34:17 -0400 Subject: [PATCH 1/2] harden repair reruns --- config.researchclaw.example.yaml | 23 + researchclaw/config.py | 38 + researchclaw/docker/entrypoint.sh | 7 +- researchclaw/experiment/colab_sandbox.py | 2 + researchclaw/experiment/docker_sandbox.py | 30 +- researchclaw/experiment/sandbox.py | 20 +- researchclaw/experiment/ssh_sandbox.py | 64 +- researchclaw/llm/acp_client.py | 523 +++++++- researchclaw/pipeline/_helpers.py | 144 ++- researchclaw/pipeline/runner.py | 14 + .../pipeline/stage_impls/_code_generation.py | 1124 +++++++++++++++-- .../pipeline/stage_impls/_execution.py | 429 ++++++- tests/test_rc_cli.py | 51 + tests/test_rc_executor.py | 505 ++++++++ tests/test_rc_runner.py | 43 + tests/test_ssh_and_colab_sandbox.py | 37 +- 16 files changed, 2896 insertions(+), 158 deletions(-) diff --git a/config.researchclaw.example.yaml b/config.researchclaw.example.yaml index 5b8d43f2..da4f3158 100644 --- a/config.researchclaw.example.yaml +++ b/config.researchclaw.example.yaml @@ -50,6 +50,13 @@ llm: # primary_model: "MiniMax-M2.5" # fallback_models: # - "MiniMax-M2.5-highspeed" + # acp: + # reconnect_retries: 2 + # reconnect_backoff_sec: 2.0 + # verbose: true + # capture_status_on_failure: true + # debug_log_path: "artifacts/acp_debug.jsonl" + # archive_failed_prompt_files: true security: hitl_required_stages: [5, 9, 20] @@ -66,6 +73,22 @@ experiment: max_iterations: 10 metric_key: "primary_metric" metric_direction: "minimize" + # Optional hard guards for trust-first experiment runs. + # When enabled, generated experiments must use real local assets/caches, fail fast + # if those assets are missing, and emit structured machine-readable results. 
+ require_real_data: false + forbid_synthetic_proxy: false + fail_on_stdout_parsed_results: false + required_real_data_refs: [] + benchmark_agent: + enabled: true + preserve_existing_assets: true + pass_existing_assets_as_reference: true + code_agent: + enabled: true + # If the ACP transport drops during multi-round Stage 10 generation, + # fall back to the older one-shot generator instead of failing immediately. + fallback_to_legacy_on_acp_failure: false sandbox: # Use ".venv/Scripts/python.exe" on Windows python_path: ".venv/bin/python3" diff --git a/researchclaw/config.py b/researchclaw/config.py index 8b2173f2..c6154e5a 100644 --- a/researchclaw/config.py +++ b/researchclaw/config.py @@ -184,6 +184,13 @@ class AcpConfig: acpx_command: str = "" session_name: str = "researchclaw" timeout_sec: int = 1800 + verbose: bool = False + stateless_prompt: bool = False + reconnect_retries: int = 2 + reconnect_backoff_sec: float = 2.0 + capture_status_on_failure: bool = False + debug_log_path: str = "" + archive_failed_prompt_files: bool = False @dataclass(frozen=True) @@ -295,6 +302,7 @@ class CodeAgentConfig: """Configuration for the advanced multi-phase code generation agent.""" enabled: bool = True + fallback_to_legacy_on_acp_failure: bool = False # Phase 1: Blueprint planning (deep implementation blueprint) architecture_planning: bool = True # Phase 2: Sequential file generation (one-by-one following blueprint) @@ -347,6 +355,8 @@ class BenchmarkAgentConfig: min_benchmarks: int = 1 min_baselines: int = 2 prefer_cached: bool = True + preserve_existing_assets: bool = True + pass_existing_assets_as_reference: bool = True # Orchestrator max_iterations: int = 2 @@ -426,6 +436,10 @@ class ExperimentConfig: metric_key: str = "primary_metric" metric_direction: str = "minimize" keep_threshold: float = 0.0 + require_real_data: bool = False + forbid_synthetic_proxy: bool = False + fail_on_stdout_parsed_results: bool = False + required_real_data_refs: tuple[str, ...] 
= () sandbox: SandboxConfig = field(default_factory=SandboxConfig) docker: DockerSandboxConfig = field(default_factory=DockerSandboxConfig) agentic: AgenticConfig = field(default_factory=AgenticConfig) @@ -972,6 +986,17 @@ def _parse_llm_config(data: dict[str, Any]) -> LlmConfig: acpx_command=acp_data.get("acpx_command", ""), session_name=acp_data.get("session_name", "researchclaw"), timeout_sec=int(acp_data.get("timeout_sec", 1800)), + verbose=bool(acp_data.get("verbose", False)), + stateless_prompt=bool(acp_data.get("stateless_prompt", False)), + reconnect_retries=_safe_int(acp_data.get("reconnect_retries"), 2), + reconnect_backoff_sec=_safe_float(acp_data.get("reconnect_backoff_sec"), 2.0), + capture_status_on_failure=bool( + acp_data.get("capture_status_on_failure", False) + ), + debug_log_path=str(acp_data.get("debug_log_path", "")), + archive_failed_prompt_files=bool( + acp_data.get("archive_failed_prompt_files", False) + ), ), ) @@ -1008,6 +1033,12 @@ def _parse_experiment_config(data: dict[str, Any]) -> ExperimentConfig: metric_key=data.get("metric_key", "primary_metric"), metric_direction=data.get("metric_direction", "minimize"), keep_threshold=_safe_float(data.get("keep_threshold"), 0.0), + require_real_data=bool(data.get("require_real_data", False)), + forbid_synthetic_proxy=bool(data.get("forbid_synthetic_proxy", False)), + fail_on_stdout_parsed_results=bool( + data.get("fail_on_stdout_parsed_results", False) + ), + required_real_data_refs=tuple(data.get("required_real_data_refs") or ()), sandbox=SandboxConfig( python_path=sandbox_data.get("python_path", DEFAULT_PYTHON_PATH), gpu_required=bool(sandbox_data.get("gpu_required", False)), @@ -1086,6 +1117,10 @@ def _parse_benchmark_agent_config(data: dict[str, Any]) -> BenchmarkAgentConfig: min_benchmarks=_safe_int(data.get("min_benchmarks"), 1), min_baselines=_safe_int(data.get("min_baselines"), 2), prefer_cached=bool(data.get("prefer_cached", True)), + 
preserve_existing_assets=bool(data.get("preserve_existing_assets", True)), + pass_existing_assets_as_reference=bool( + data.get("pass_existing_assets_as_reference", True) + ), max_iterations=_safe_int(data.get("max_iterations"), 2), ) @@ -1142,6 +1177,9 @@ def _parse_code_agent_config(data: dict[str, Any]) -> CodeAgentConfig: return CodeAgentConfig() return CodeAgentConfig( enabled=bool(data.get("enabled", True)), + fallback_to_legacy_on_acp_failure=bool( + data.get("fallback_to_legacy_on_acp_failure", False) + ), architecture_planning=bool(data.get("architecture_planning", True)), sequential_generation=bool(data.get("sequential_generation", True)), hard_validation=bool(data.get("hard_validation", True)), diff --git a/researchclaw/docker/entrypoint.sh b/researchclaw/docker/entrypoint.sh index 316039c0..5104bd9c 100755 --- a/researchclaw/docker/entrypoint.sh +++ b/researchclaw/docker/entrypoint.sh @@ -11,7 +11,10 @@ set -e WORKSPACE="/workspace" -ENTRY_POINT="${1:-main.py}" +ENTRY_POINT="${RC_ENTRY_POINT:-${1:-main.py}}" +if [ "$#" -gt 0 ]; then + shift +fi # ---------------------------------------------------------------- # Phase 0: Install additional pip packages @@ -51,4 +54,4 @@ fi # Phase 2: Run experiment # ---------------------------------------------------------------- echo "[RC] Phase 2: Running experiment ($ENTRY_POINT)..." -exec python3 -u "$WORKSPACE/$ENTRY_POINT" +exec python3 -u "$WORKSPACE/$ENTRY_POINT" "$@" diff --git a/researchclaw/experiment/colab_sandbox.py b/researchclaw/experiment/colab_sandbox.py index b6a46542..eec6ad7e 100644 --- a/researchclaw/experiment/colab_sandbox.py +++ b/researchclaw/experiment/colab_sandbox.py @@ -158,6 +158,8 @@ def run_project( *, entry_point: str = "main.py", timeout_sec: int = 300, + args: list[str] | None = None, + env_overrides: dict[str, str] | None = None, ) -> SandboxResult: # BUG-DA8-07: Validate entry_point (path traversal, etc.) 
like other backends from researchclaw.experiment.sandbox import validate_entry_point diff --git a/researchclaw/experiment/docker_sandbox.py b/researchclaw/experiment/docker_sandbox.py index 3eda27c9..b45f21cd 100644 --- a/researchclaw/experiment/docker_sandbox.py +++ b/researchclaw/experiment/docker_sandbox.py @@ -138,6 +138,8 @@ def run_project( *, entry_point: str = "main.py", timeout_sec: int = 300, + args: list[str] | None = None, + env_overrides: dict[str, str] | None = None, ) -> SandboxResult: """Run a multi-file experiment project inside a container.""" self._run_counter += 1 @@ -189,7 +191,13 @@ def run_project( metrics={}, ) - return self._execute(staging, entry_point=entry_point, timeout_sec=timeout_sec) + return self._execute( + staging, + entry_point=entry_point, + timeout_sec=timeout_sec, + entry_args=args, + env_overrides=env_overrides, + ) # ------------------------------------------------------------------ # Static helpers @@ -254,7 +262,13 @@ def _inject_harness(target_dir: Path) -> None: # ------------------------------------------------------------------ def _execute( - self, staging_dir: Path, *, entry_point: str, timeout_sec: int + self, + staging_dir: Path, + *, + entry_point: str, + timeout_sec: int, + entry_args: list[str] | None = None, + env_overrides: dict[str, str] | None = None, ) -> SandboxResult: """Core execution: single container, three-phase via entrypoint.sh.""" cfg = self.config @@ -269,6 +283,8 @@ def _execute( staging_dir, entry_point=entry_point, container_name=container_name, + entry_args=entry_args, + env_overrides=env_overrides, ) start = time.monotonic() @@ -349,6 +365,8 @@ def _build_run_command( *, entry_point: str, container_name: str, + entry_args: list[str] | None = None, + env_overrides: dict[str, str] | None = None, ) -> list[str]: """Build the ``docker run`` command list. 
@@ -453,9 +471,17 @@ def _user_flag() -> list[str]: else: cmd.extend(["--gpus", "all"]) + if env_overrides: + for name, value in sorted(env_overrides.items()): + if not value: + continue + cmd.extend(["-e", f"{name}={value}"]) + # Image + entry point (passed as CMD arg to entrypoint.sh) cmd.append(cfg.image) cmd.append(entry_point) + if entry_args: + cmd.extend(entry_args) return cmd diff --git a/researchclaw/experiment/sandbox.py b/researchclaw/experiment/sandbox.py index 09ef276f..f8c0d3d1 100644 --- a/researchclaw/experiment/sandbox.py +++ b/researchclaw/experiment/sandbox.py @@ -297,6 +297,8 @@ def run_project( *, entry_point: str = "main.py", timeout_sec: int = 300, + args: list[str] | None = None, + env_overrides: dict[str, str] | None = None, ) -> SandboxResult: ... @@ -350,6 +352,8 @@ def run_project( *, entry_point: str = "main.py", timeout_sec: int = 300, + args: list[str] | None = None, + env_overrides: dict[str, str] | None = None, ) -> SandboxResult: """Run a multi-file experiment project in the sandbox. @@ -409,12 +413,14 @@ def run_project( ) start = time.monotonic() - command = self._build_command(entry) + command = self._build_command(entry, args=args) logger.debug("Running project sandbox command: %s (cwd=%s)", command, sandbox_project) result: SandboxResult try: env = {**os.environ, "PYTHONUNBUFFERED": "1"} + if env_overrides: + env.update(env_overrides) completed = subprocess.run( command, capture_output=True, @@ -457,7 +463,12 @@ def _next_script_path(self) -> Path: def _write_script(script_path: Path, code: str) -> None: _ = script_path.write_text(code, encoding="utf-8") - def _build_command(self, script_path: Path) -> list[str]: + def _build_command( + self, + script_path: Path, + *, + args: list[str] | None = None, + ) -> list[str]: # Convert relative python_path to absolute WITHOUT resolving symlinks. 
# Using .resolve() would follow venv symlinks to the system Python binary, # which loses the venv context (site-packages like numpy become unavailable). @@ -466,7 +477,10 @@ def _build_command(self, script_path: Path) -> list[str]: if not python_path.is_absolute(): python_path = Path.cwd() / python_path # -u: unbuffered stdout/stderr so subprocess.run captures all output - return [str(python_path), "-u", str(script_path)] + command = [str(python_path), "-u", str(script_path)] + if args: + command.extend(args) + return command @staticmethod def _result_from_completed( diff --git a/researchclaw/experiment/ssh_sandbox.py b/researchclaw/experiment/ssh_sandbox.py index aad97fca..ec5026da 100644 --- a/researchclaw/experiment/ssh_sandbox.py +++ b/researchclaw/experiment/ssh_sandbox.py @@ -71,6 +71,8 @@ def run_project( *, entry_point: str = "main.py", timeout_sec: int = 300, + args: list[str] | None = None, + env_overrides: dict[str, str] | None = None, ) -> SandboxResult: """Run a multi-file experiment project on the remote host.""" self._run_counter += 1 @@ -119,7 +121,13 @@ def run_project( metrics={}, ) - return self._execute(staging, entry_point=entry_point, timeout_sec=timeout_sec) + return self._execute( + staging, + entry_point=entry_point, + timeout_sec=timeout_sec, + entry_args=args, + env_overrides=env_overrides, + ) # ------------------------------------------------------------------ # Static helpers @@ -158,7 +166,13 @@ def _inject_harness(target_dir: Path) -> None: # ------------------------------------------------------------------ def _execute( - self, staging_dir: Path, *, entry_point: str, timeout_sec: int + self, + staging_dir: Path, + *, + entry_point: str, + timeout_sec: int, + entry_args: list[str] | None = None, + env_overrides: dict[str, str] | None = None, ) -> SandboxResult: """Core execution flow for remote experiments. @@ -213,11 +227,17 @@ def _execute( # 4. 
Execute experiment if cfg.use_docker: exec_cmd = self._build_docker_exec_cmd( - remote_dir, entry_point=entry_point, + remote_dir, + entry_point=entry_point, + args=entry_args, + env_overrides=env_overrides, ) else: exec_cmd = self._build_bare_exec_cmd( - remote_dir, entry_point=entry_point, + remote_dir, + entry_point=entry_point, + args=entry_args, + env_overrides=env_overrides, ) start = time.monotonic() @@ -242,13 +262,26 @@ def _execute( ) def _build_bare_exec_cmd( - self, remote_dir: str, *, entry_point: str, + self, + remote_dir: str, + *, + entry_point: str, + args: list[str] | None = None, + env_overrides: dict[str, str] | None = None, ) -> str: """Build command to run Python directly on remote host (with basic sandboxing).""" cfg = self.config rd = shlex.quote(remote_dir) ep = shlex.quote(entry_point) py = shlex.quote(cfg.remote_python) + arg_text = " ".join(shlex.quote(arg) for arg in (args or [])) + arg_suffix = f" {arg_text}" if arg_text else "" + env_parts = [ + f"{name}={shlex.quote(value)}" + for name, value in sorted((env_overrides or {}).items()) + if value + ] + env_prefix = (" ".join(env_parts) + " ") if env_parts else "" gpu_env = "" if cfg.gpu_ids: @@ -264,17 +297,24 @@ def _build_bare_exec_cmd( f"if command -v unshare >/dev/null 2>&1; then " f"HOME={rd} " f"{gpu_env}" - f"unshare --net {py} -u {ep}; " + f"{env_prefix}" + f"unshare --net {py} -u {ep}{arg_suffix}; " f"else " f"echo 'WARNING: unshare not available, running without network isolation' >&2; " f"HOME={rd} " f"{gpu_env}" - f"{py} -u {ep}; " + f"{env_prefix}" + f"{py} -u {ep}{arg_suffix}; " f"fi" ) def _build_docker_exec_cmd( - self, remote_dir: str, *, entry_point: str, + self, + remote_dir: str, + *, + entry_point: str, + args: list[str] | None = None, + env_overrides: dict[str, str] | None = None, ) -> str: """Build command to run inside a Docker container on the remote host. 
@@ -307,8 +347,16 @@ def _build_docker_exec_cmd( # Try to pass all GPUs; fails gracefully if none available parts.extend(["--gpus", "all"]) + if env_overrides: + for name, value in sorted(env_overrides.items()): + if not value: + continue + parts.extend(["-e", shlex.quote(f"{name}={value}")]) + parts.append(shlex.quote(cfg.docker_image)) parts.extend(["python3", "-u", shlex.quote(entry_point)]) + if args: + parts.extend(shlex.quote(arg) for arg in args) return " ".join(parts) diff --git a/researchclaw/llm/acp_client.py b/researchclaw/llm/acp_client.py index d5e13bc4..f2fdd1d6 100644 --- a/researchclaw/llm/acp_client.py +++ b/researchclaw/llm/acp_client.py @@ -10,13 +10,17 @@ from __future__ import annotations import atexit +import json import logging import os +from pathlib import Path import re import shutil import subprocess import sys import tempfile +import time +import uuid import weakref from dataclasses import dataclass from typing import Any @@ -41,6 +45,13 @@ class ACPConfig: acpx_command: str = "" # auto-detect if empty session_name: str = "researchclaw" timeout_sec: int = 1800 # per-prompt timeout + verbose: bool = False + stateless_prompt: bool = False + reconnect_retries: int = 2 + reconnect_backoff_sec: float = 2.0 + capture_status_on_failure: bool = False + debug_log_path: str = "" + archive_failed_prompt_files: bool = False def _find_acpx() -> str | None: @@ -90,6 +101,13 @@ def from_rc_config(cls, rc_config: Any) -> ACPClient: acpx_command=getattr(acp, "acpx_command", ""), session_name=getattr(acp, "session_name", "researchclaw"), timeout_sec=getattr(acp, "timeout_sec", 1800), + verbose=getattr(acp, "verbose", False), + stateless_prompt=getattr(acp, "stateless_prompt", False), + reconnect_retries=getattr(acp, "reconnect_retries", 2), + reconnect_backoff_sec=getattr(acp, "reconnect_backoff_sec", 2.0), + capture_status_on_failure=getattr(acp, "capture_status_on_failure", False), + debug_log_path=getattr(acp, "debug_log_path", ""), + 
archive_failed_prompt_files=getattr(acp, "archive_failed_prompt_files", False), )) # ------------------------------------------------------------------ @@ -137,6 +155,8 @@ def preflight(self) -> tuple[bool, str]: agent = self.config.agent if not shutil.which(agent): return False, f"ACP agent CLI not found: {agent!r} (not on PATH)" + if self.config.stateless_prompt: + return True, f"OK - ACP stateless prompt mode ready ({agent} via acpx)" # Create the session try: self._ensure_session() @@ -146,6 +166,9 @@ def preflight(self) -> tuple[bool, str]: def close(self) -> None: """Close the acpx session.""" + if self.config.stateless_prompt: + self._session_ready = False + return if not self._session_ready: return acpx = self._resolve_acpx() @@ -153,9 +176,12 @@ def close(self) -> None: return try: subprocess.run( - [acpx, "--ttl", "0", "--cwd", self._abs_cwd(), - self.config.agent, "sessions", "close", - self.config.session_name], + [ + *self._acpx_base_command(acpx, approve_all=False), + "sessions", + "close", + self.config.session_name, + ], capture_output=True, timeout=15, ) except Exception: # noqa: BLE001 @@ -195,6 +221,128 @@ def _resolve_acpx(self) -> str | None: def _abs_cwd(self) -> str: return os.path.abspath(self.config.cwd) + def _acpx_base_command(self, acpx: str, *, approve_all: bool) -> list[str]: + cmd = [acpx] + if self.config.verbose: + cmd.append("--verbose") + if approve_all: + cmd.append("--approve-all") + cmd.extend(["--ttl", "0", "--cwd", self._abs_cwd(), self.config.agent]) + return cmd + + def _debug_log_path(self) -> Path | None: + raw = str(getattr(self.config, "debug_log_path", "") or "").strip() + if not raw: + return None + return Path(raw) + + @staticmethod + def _debug_timestamp() -> str: + return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()) + + def _append_debug_event(self, event: str, **payload: Any) -> None: + record = { + "ts": self._debug_timestamp(), + "event": event, + **payload, + } + serialized = json.dumps(record, 
ensure_ascii=False, sort_keys=True) + logger.info("ACP_DEBUG %s", serialized) + debug_path = self._debug_log_path() + if debug_path is None: + return + try: + debug_path.parent.mkdir(parents=True, exist_ok=True) + with debug_path.open("a", encoding="utf-8") as handle: + handle.write(serialized + "\n") + except Exception as exc: # noqa: BLE001 + logger.warning("Failed to append ACP debug log %s: %s", debug_path, exc) + + def _archive_prompt_file(self, prompt_path: str, *, session_name: str) -> str: + if not self.config.archive_failed_prompt_files: + return "" + source_path = Path(prompt_path) + if not source_path.exists(): + return "" + debug_path = self._debug_log_path() + base_dir = debug_path.parent if debug_path is not None else Path(self._abs_cwd()) + archive_dir = base_dir / "acp_failed_prompts" + archive_dir.mkdir(parents=True, exist_ok=True) + target_name = f"{session_name}-{source_path.name}" + target_path = archive_dir / target_name + shutil.copy2(source_path, target_path) + return str(target_path) + + def _status_command(self, acpx: str, session_name: str) -> list[str]: + cmd = self._acpx_base_command(acpx, approve_all=False) + cmd.extend(["status", "-s", session_name]) + return cmd + + def _capture_session_status(self, acpx: str, session_name: str) -> str: + if not self.config.capture_status_on_failure: + return "" + try: + result = subprocess.run( + self._status_command(acpx, session_name), + capture_output=True, + text=True, + encoding="utf-8", + errors="replace", + timeout=15, + ) + except Exception as exc: # noqa: BLE001 + return f"" + + stdout = (result.stdout or "").strip() + stderr = (result.stderr or "").strip() + chunks = [f"exit={result.returncode}"] + if stdout: + chunks.append(f"stdout:\n{stdout}") + if stderr: + chunks.append(f"stderr:\n{stderr}") + return "\n".join(chunks) + + def _record_failure_context( + self, + *, + acpx: str, + session_name: str, + transport: str, + prompt_bytes: int, + prompt_limit: int, + use_file: bool, + 
error_text: str, + prompt_path: str | None = None, + returncode: int | None = None, + timed_out: bool = False, + ) -> None: + archived_prompt_path = "" + if prompt_path: + try: + archived_prompt_path = self._archive_prompt_file( + prompt_path, + session_name=session_name, + ) + except Exception as exc: # noqa: BLE001 + archived_prompt_path = "" + logger.warning("Failed to archive ACP prompt file %s: %s", prompt_path, exc) + + status_text = self._capture_session_status(acpx, session_name) + self._append_debug_event( + "prompt_failure", + session_name=session_name, + transport=transport, + prompt_bytes=prompt_bytes, + prompt_limit=prompt_limit, + use_file=use_file, + prompt_path=prompt_path or "", + archived_prompt_path=archived_prompt_path, + returncode=returncode, + timed_out=timed_out, + error=error_text, + session_status=status_text, + ) + def _ensure_session(self) -> None: """Find or create the named acpx session.""" if self._session_ready: @@ -202,28 +350,7 @@ def _ensure_session(self) -> None: acpx = self._resolve_acpx() if not acpx: raise RuntimeError("acpx not found") - - # Use 'ensure' which finds existing or creates new - result = subprocess.run( - [acpx, "--ttl", "0", "--cwd", self._abs_cwd(), - self.config.agent, "sessions", "ensure", - "--name", self.config.session_name], - capture_output=True, text=True, encoding="utf-8", - errors="replace", timeout=30, - ) - if result.returncode != 0: - # Fall back to 'new' - result = subprocess.run( - [acpx, "--ttl", "0", "--cwd", self._abs_cwd(), - self.config.agent, "sessions", "new", - "--name", self.config.session_name], - capture_output=True, text=True, encoding="utf-8", - errors="replace", timeout=30, - ) - if result.returncode != 0: - raise RuntimeError( - f"Failed to create ACP session: {result.stderr.strip()}" - ) + self._create_or_ensure_session(acpx, self.config.session_name, ensure=True) self._session_ready = True logger.info("ACP session '%s' ready (%s)", self.config.session_name, self.config.agent) @@ 
-231,7 +358,7 @@ def _ensure_session(self) -> None: # for the entire command line, not just the prompt payload. acpx adds # several fixed arguments plus quoting overhead, so leave generous headroom # on Windows and switch to temp-file transport earlier. - _MAX_CLI_PROMPT_BYTES = 20_000 if sys.platform == "win32" else 100_000 + _MAX_CLI_PROMPT_BYTES = 6_000 # On Windows, npm-installed CLIs usually resolve to ``.cmd`` launchers, # which are routed through ``cmd.exe`` and hit a much smaller practical # command-line limit (~8 KB). Use file transport much earlier there. @@ -250,9 +377,9 @@ def _ensure_session(self) -> None: _RECONNECT_ERRORS = ( "agent needs reconnect", "session not found", - "Query closed", + "query closed", + "queue owner disconnected before prompt completion", ) - _MAX_RECONNECT_ATTEMPTS = 2 @classmethod def _cli_prompt_limit(cls, acpx: str | None) -> int: @@ -264,6 +391,18 @@ def _cli_prompt_limit(cls, acpx: str | None) -> int: return min(limit, cls._MAX_CMD_WRAPPER_PROMPT_BYTES) return limit + @staticmethod + def _sanitize_prompt(prompt: str) -> str: + """Strip NUL bytes before subprocess transport. + + ``subprocess.run()`` rejects arguments containing ``\\x00`` with + ``ValueError: embedded null byte``. This can happen when upstream + scraping or artifact text accidentally carries NULs into the prompt. + """ + if "\x00" not in prompt: + return prompt + return prompt.replace("\x00", "") + def _send_prompt(self, prompt: str) -> str: """Send a prompt via acpx and return the response text. @@ -272,12 +411,14 @@ def _send_prompt(self, prompt: str) -> str: is asked to read it. If the session has died (common after long-running stages), retries - up to ``_MAX_RECONNECT_ATTEMPTS`` times with automatic reconnection. + up to the configured reconnect retry count with automatic reconnection. 
""" acpx = self._resolve_acpx() if not acpx: raise RuntimeError("acpx not found") + prompt = self._sanitize_prompt(prompt) + prompt_bytes = len(prompt.encode("utf-8")) prompt_limit = self._cli_prompt_limit(acpx) use_file = prompt_bytes > prompt_limit @@ -288,13 +429,123 @@ def _send_prompt(self, prompt: str) -> str: prompt_limit, ) + if self.config.stateless_prompt: + last_exc: RuntimeError | None = None + for attempt in range(1 + self.config.reconnect_retries): + session_name = self._new_ephemeral_session(acpx) + self._append_debug_event( + "prompt_attempt", + session_name=session_name, + stateless=True, + attempt=attempt + 1, + max_attempts=1 + self.config.reconnect_retries, + prompt_bytes=prompt_bytes, + prompt_limit=prompt_limit, + use_file=use_file, + ) + try: + if use_file: + return self._send_prompt_via_file( + acpx, + prompt, + session_name=session_name, + prompt_bytes=prompt_bytes, + prompt_limit=prompt_limit, + ) + return self._send_prompt_cli( + acpx, + prompt, + session_name=session_name, + prompt_bytes=prompt_bytes, + prompt_limit=prompt_limit, + ) + except OSError as os_exc: + if not use_file: + logger.warning( + "Stateless ACP subprocess raised OSError, " + "falling back to temp file: %s", + os_exc, + ) + use_file = True + return self._send_prompt_via_file( + acpx, + prompt, + session_name=session_name, + prompt_bytes=prompt_bytes, + prompt_limit=prompt_limit, + ) + raise RuntimeError( + f"ACP prompt failed: {os_exc}" + ) from os_exc + except RuntimeError as exc: + exc_lower = str(exc).lower() + if not use_file and any( + h in exc_lower for h in self._CMD_TOO_LONG_HINTS + ): + logger.warning( + "Stateless ACP prompt too long for OS, " + "falling back to temp file: %s", + exc, + ) + use_file = True + return self._send_prompt_via_file( + acpx, + prompt, + session_name=session_name, + ) + if not self._is_reconnect_error(exc): + raise + last_exc = exc + if attempt < self.config.reconnect_retries: + self._append_debug_event( + "prompt_retrying", + 
session_name=session_name, + stateless=True, + attempt=attempt + 1, + remaining_retries=self.config.reconnect_retries - attempt, + error=str(exc), + ) + logger.warning( + "Stateless ACP session died (%s), retrying " + "with a fresh ephemeral session (attempt %d/%d)...", + exc, + attempt + 1, + self.config.reconnect_retries, + ) + self._sleep_before_retry() + continue + finally: + self._close_named_session(acpx, session_name) + + raise last_exc # type: ignore[misc] + last_exc: RuntimeError | None = None - for attempt in range(1 + self._MAX_RECONNECT_ATTEMPTS): + for attempt in range(1 + self.config.reconnect_retries): self._ensure_session() + self._append_debug_event( + "prompt_attempt", + session_name=self.config.session_name, + stateless=False, + attempt=attempt + 1, + max_attempts=1 + self.config.reconnect_retries, + prompt_bytes=prompt_bytes, + prompt_limit=prompt_limit, + use_file=use_file, + ) try: if use_file: - return self._send_prompt_via_file(acpx, prompt) - return self._send_prompt_cli(acpx, prompt) + return self._send_prompt_via_file( + acpx, + prompt, + prompt_bytes=prompt_bytes, + prompt_limit=prompt_limit, + ) + return self._send_prompt_cli( + acpx, + prompt, + prompt_bytes=prompt_bytes, + prompt_limit=prompt_limit, + ) except OSError as os_exc: # OS-level failure (e.g., Windows CreateProcess arg limit). # Fall back to temp-file transport automatically. 
@@ -305,7 +556,12 @@ def _send_prompt(self, prompt: str) -> str: os_exc, ) use_file = True - return self._send_prompt_via_file(acpx, prompt) + return self._send_prompt_via_file( + acpx, + prompt, + prompt_bytes=prompt_bytes, + prompt_limit=prompt_limit, + ) raise RuntimeError( f"ACP prompt failed: {os_exc}" ) from os_exc @@ -322,17 +578,26 @@ def _send_prompt(self, prompt: str) -> str: ) use_file = True return self._send_prompt_via_file(acpx, prompt) - if not any(pat in str(exc) for pat in self._RECONNECT_ERRORS): + if not self._is_reconnect_error(exc): raise last_exc = exc - if attempt < self._MAX_RECONNECT_ATTEMPTS: + if attempt < self.config.reconnect_retries: + self._append_debug_event( + "prompt_retrying", + session_name=self.config.session_name, + stateless=False, + attempt=attempt + 1, + remaining_retries=self.config.reconnect_retries - attempt, + error=str(exc), + ) logger.warning( "ACP session died (%s), reconnecting (attempt %d/%d)...", exc, attempt + 1, - self._MAX_RECONNECT_ATTEMPTS, + self.config.reconnect_retries, ) self._force_reconnect() + self._sleep_before_retry() raise last_exc # type: ignore[misc] @@ -344,35 +609,97 @@ def _force_reconnect(self) -> None: pass self._session_ready = False - def _send_prompt_cli(self, acpx: str, prompt: str) -> str: + def _is_reconnect_error(self, exc: Exception) -> bool: + text = str(exc).lower() + return any(pattern in text for pattern in self._RECONNECT_ERRORS) + + def _sleep_before_retry(self) -> None: + delay = max(float(getattr(self.config, "reconnect_backoff_sec", 0.0) or 0.0), 0.0) + if delay > 0: + time.sleep(delay) + + def _send_prompt_cli( + self, + acpx: str, + prompt: str, + *, + session_name: str | None = None, + prompt_bytes: int, + prompt_limit: int, + ) -> str: """Send prompt as a CLI argument (original path).""" + active_session = session_name or self.config.session_name try: result = subprocess.run( - [acpx, "--approve-all", "--ttl", "0", "--cwd", self._abs_cwd(), - self.config.agent, "-s", 
self.config.session_name, - prompt], + self._prompt_command(acpx, prompt, session_name=active_session), capture_output=True, text=True, encoding="utf-8", errors="replace", timeout=self.config.timeout_sec, ) except subprocess.TimeoutExpired as exc: + self._record_failure_context( + acpx=acpx, + session_name=active_session, + transport="cli", + prompt_bytes=prompt_bytes, + prompt_limit=prompt_limit, + use_file=False, + error_text=f"ACP prompt timed out after {self.config.timeout_sec}s", + timed_out=True, + ) raise RuntimeError( f"ACP prompt timed out after {self.config.timeout_sec}s" ) from exc if result.returncode != 0: stderr = (result.stderr or "").strip() + self._record_failure_context( + acpx=acpx, + session_name=active_session, + transport="cli", + prompt_bytes=prompt_bytes, + prompt_limit=prompt_limit, + use_file=False, + error_text=stderr, + returncode=result.returncode, + ) raise RuntimeError(f"ACP prompt failed (exit {result.returncode}): {stderr}") - return self._extract_response(result.stdout) + response = self._extract_response(result.stdout) + self._append_debug_event( + "prompt_success", + session_name=active_session, + transport="cli", + prompt_bytes=prompt_bytes, + use_file=False, + response_bytes=len(response.encode("utf-8")), + ) + return response - def _send_prompt_via_file(self, acpx: str, prompt: str) -> str: + def _send_prompt_via_file( + self, + acpx: str, + prompt: str, + *, + session_name: str | None = None, + prompt_bytes: int, + prompt_limit: int, + ) -> str: """Write prompt to a temp file, ask the agent to read and respond.""" fd, prompt_path = tempfile.mkstemp( suffix=".md", prefix="rc_prompt_", ) + active_session = session_name or self.config.session_name try: with os.fdopen(fd, "w", encoding="utf-8") as f: f.write(prompt) + self._append_debug_event( + "prompt_file_written", + session_name=active_session, + transport="file", + prompt_bytes=prompt_bytes, + prompt_limit=prompt_limit, + prompt_path=prompt_path, + ) short_prompt = ( f"Read 
the file at {prompt_path} in its entirety. " @@ -383,30 +710,134 @@ def _send_prompt_via_file(self, acpx: str, prompt: str) -> str: try: result = subprocess.run( - [acpx, "--approve-all", "--ttl", "0", "--cwd", self._abs_cwd(), - self.config.agent, "-s", self.config.session_name, - short_prompt], + self._prompt_command(acpx, short_prompt, session_name=active_session), capture_output=True, text=True, encoding="utf-8", errors="replace", timeout=self.config.timeout_sec, ) except subprocess.TimeoutExpired as exc: + self._record_failure_context( + acpx=acpx, + session_name=active_session, + transport="file", + prompt_bytes=prompt_bytes, + prompt_limit=prompt_limit, + use_file=True, + error_text=f"ACP prompt timed out after {self.config.timeout_sec}s", + prompt_path=prompt_path, + timed_out=True, + ) raise RuntimeError( f"ACP prompt timed out after {self.config.timeout_sec}s" ) from exc if result.returncode != 0: stderr = (result.stderr or "").strip() + self._record_failure_context( + acpx=acpx, + session_name=active_session, + transport="file", + prompt_bytes=prompt_bytes, + prompt_limit=prompt_limit, + use_file=True, + error_text=stderr, + prompt_path=prompt_path, + returncode=result.returncode, + ) raise RuntimeError( f"ACP prompt failed (exit {result.returncode}): {stderr}" ) - return self._extract_response(result.stdout) + response = self._extract_response(result.stdout) + self._append_debug_event( + "prompt_success", + session_name=active_session, + transport="file", + prompt_bytes=prompt_bytes, + use_file=True, + prompt_path=prompt_path, + response_bytes=len(response.encode("utf-8")), + ) + return response finally: try: os.unlink(prompt_path) except OSError: pass + def _prompt_command( + self, + acpx: str, + prompt: str, + *, + session_name: str | None = None, + ) -> list[str]: + """Build the acpx prompt command for session or stateless mode.""" + cmd = self._acpx_base_command(acpx, approve_all=True) + cmd.append("prompt") + active_session = session_name or 
self.config.session_name + cmd.extend(["-s", active_session]) + cmd.append(prompt) + return cmd + + def _create_or_ensure_session( + self, + acpx: str, + session_name: str, + *, + ensure: bool, + ) -> None: + action = "ensure" if ensure else "new" + result = subprocess.run( + [ + *self._acpx_base_command(acpx, approve_all=False), + "sessions", + action, + "--name", + session_name, + ], + capture_output=True, text=True, encoding="utf-8", + errors="replace", timeout=30, + ) + if result.returncode == 0: + self._append_debug_event( + "session_ready", + session_name=session_name, + ensure=ensure, + stateless=self.config.stateless_prompt, + ) + return + if ensure: + self._create_or_ensure_session(acpx, session_name, ensure=False) + return + raise RuntimeError( + f"Failed to create ACP session: {(result.stderr or '').strip()}" + ) + + def _new_ephemeral_session(self, acpx: str) -> str: + session_name = f"{self.config.session_name}-{uuid.uuid4().hex[:8]}" + self._create_or_ensure_session(acpx, session_name, ensure=False) + logger.info("ACP ephemeral session '%s' ready (%s)", session_name, self.config.agent) + return session_name + + def _close_named_session(self, acpx: str, session_name: str) -> None: + try: + subprocess.run( + [ + *self._acpx_base_command(acpx, approve_all=False), + "sessions", + "close", + session_name, + ], + capture_output=True, timeout=15, + ) + self._append_debug_event( + "session_closed", + session_name=session_name, + stateless=self.config.stateless_prompt, + ) + except Exception: # noqa: BLE001 + pass + @staticmethod def _extract_response(raw_output: str | None) -> str: """Extract the agent's actual response from acpx output. diff --git a/researchclaw/pipeline/_helpers.py b/researchclaw/pipeline/_helpers.py index 74eda81d..ea49a2e2 100644 --- a/researchclaw/pipeline/_helpers.py +++ b/researchclaw/pipeline/_helpers.py @@ -42,6 +42,70 @@ class StageResult: evidence_refs: tuple[str, ...] 
= () +def detect_synthetic_proxy_signals(file_texts: dict[str, str]) -> list[str]: + """Heuristically detect toy/proxy dataset generation in experiment code.""" + if not file_texts: + return [] + + combined = "\n\n".join(file_texts.values()) + combined_lower = combined.lower() + signals: list[str] = [] + + if "class cachedevidencerepository" in combined_lower: + signals.append( + "contains `CachedEvidenceRepository`, a repository-local synthetic evidence scaffold" + ) + if re.search(r"def\s+_build_example\s*\(", combined): + signals.append("contains `_build_example(...)`, suggesting in-code sample synthesis") + if re.search(r"def\s+_build_splits\s*\(", combined): + signals.append("contains `_build_splits(...)`, suggesting in-code dataset assembly") + if re.search(r"def\s+_sample_circle\s*\(", combined): + signals.append("contains `_sample_circle(...)`, suggesting synthetic circle generation") + + split_match = re.search( + r"_SPLIT_SIZES\s*=\s*\{[^}]*['\"]train['\"]\s*:\s*(\d+)" + r"[^}]*['\"]val['\"]\s*:\s*(\d+)" + r"[^}]*['\"]test['\"]\s*:\s*(\d+)", + combined, + re.DOTALL, + ) + if split_match: + split_sizes = tuple(int(split_match.group(i)) for i in range(1, 4)) + if sum(split_sizes) <= 500: + signals.append( + "contains hard-coded tiny split sizes " + f"train/val/test={split_sizes[0]}/{split_sizes[1]}/{split_sizes[2]}" + ) + + for phrase in ( + "toy dataset", + "proxy dataset", + "synthetic benchmark", + "repository-local benchmark", + ): + if phrase in combined_lower: + signals.append(f"contains suspicious phrase `{phrase}`") + + return signals + + +def should_fail_synthetic_proxy_guard(signals: list[str]) -> bool: + """Return True when synthetic/proxy signals are strong enough to hard-fail.""" + if not signals: + return False + + strong_markers = ("CachedEvidenceRepository", "hard-coded tiny split sizes") + if any(any(marker in signal for marker in strong_markers) for signal in signals): + return True + + has_build_example = any("_build_example" in signal 
for signal in signals) + has_build_splits = any("_build_splits" in signal for signal in signals) + if has_build_example and has_build_splits: + return True + + return len(signals) >= 3 + + # --------------------------------------------------------------------------- # Constants # --------------------------------------------------------------------------- @@ -235,7 +299,12 @@ def _build_fallback_queries(topic: str) -> list[str]: def _write_stage_meta( stage_dir: Path, stage: Stage, run_id: str, result: "StageResult" ) -> None: - next_stage = NEXT_STAGE[stage] + if result.status is StageStatus.DONE: + next_stage = NEXT_STAGE[stage] + else: + # Failed / paused / blocked stages should point back to themselves so + # retry-resume tooling does not imply that the pipeline advanced. + next_stage = stage meta = { "stage_id": f"{int(stage):02d}-{stage.name.lower()}", "run_id": run_id, @@ -371,6 +440,79 @@ def _load_hardware_profile(run_dir: Path) -> dict[str, Any] | None: return None +def _load_research_repair_metadata(run_dir: Path) -> dict[str, Any] | None: + """Load child-run repair metadata when this run was created via research-repair.""" + metadata_path = run_dir / "research_repair_parent.json" + if not metadata_path.exists(): + return None + try: + data = json.loads(metadata_path.read_text(encoding="utf-8")) + except (json.JSONDecodeError, OSError, ValueError): + return None + return data if isinstance(data, dict) else None + + +def _build_research_repair_brief( + run_dir: Path, + *, + max_feedback_items: int = 5, +) -> str: + """Return a compact repair brief for prompt injection. + + The child-run repair metadata is authoritative for repair semantics. + Use this short block instead of repeating the full repair narrative in + `research.topic`. 
+ """ + metadata = _load_research_repair_metadata(run_dir) + if not metadata: + return "" + + compact = str(metadata.get("compact_repair_brief", "")).strip() + if compact: + return compact + + parent_run_id = str(metadata.get("parent_run_id", "")).strip() + target_stage_name = str(metadata.get("target_stage_name", "")).strip() + repair_reason = " ".join( + str(metadata.get("repair_reason", "")).split() + ).strip() + reuse_policy = metadata.get("reuse_policy") + hard_reuse: list[str] = [] + if isinstance(reuse_policy, dict): + hard_reuse = [ + str(item).strip() + for item in reuse_policy.get("hard_reuse_stage_dirs", []) + if str(item).strip() + ] + + feedback_excerpt = str(metadata.get("feedback_excerpt", "")).strip() + feedback_items: list[str] = [] + for raw_line in feedback_excerpt.splitlines(): + line = raw_line.strip() + if not line.startswith("- "): + continue + payload = line[2:].strip() + lowered = payload.lower() + if lowered.startswith(("parent run:", "target stage:", "reason:")): + continue + feedback_items.append(" ".join(payload.split())) + if len(feedback_items) >= max_feedback_items: + break + + lines = ["## Repair Context"] + if parent_run_id: + lines.append(f"- Parent run: `{parent_run_id}`") + if target_stage_name: + lines.append(f"- Authoritative rerun starts at: `{target_stage_name}`") + if repair_reason: + lines.append(f"- Repair reason: {repair_reason}") + if hard_reuse: + lines.append("- Hard reuse: " + ", ".join(hard_reuse)) + lines.append("- Downstream parent analysis and paper artifacts are soft context only.") + lines.extend(f"- {item}" for item in feedback_items) + return "\n".join(lines).strip() + + # --------------------------------------------------------------------------- # Parsing utilities # --------------------------------------------------------------------------- diff --git a/researchclaw/pipeline/runner.py b/researchclaw/pipeline/runner.py index b81ffdb9..558b026d 100644 --- a/researchclaw/pipeline/runner.py +++ 
b/researchclaw/pipeline/runner.py @@ -47,6 +47,9 @@ def _build_pipeline_summary( "run_id": run_id, "stages_executed": len(results), "stages_done": sum(1 for item in results if item.status == StageStatus.DONE), + "stages_paused": sum( + 1 for item in results if item.status == StageStatus.PAUSED + ), "stages_blocked": sum( 1 for item in results if item.status == StageStatus.BLOCKED_APPROVAL ), @@ -463,6 +466,9 @@ def execute_pipeline( elif result.status == StageStatus.FAILED: err = result.error or "unknown error" print(f"{prefix} {stage.name} — FAILED ({elapsed:.1f}s) — {err}") + elif result.status == StageStatus.PAUSED: + err = result.error or "paused" + print(f"{prefix} {stage.name} — PAUSED ({elapsed:.1f}s) — {err}") elif result.status == StageStatus.BLOCKED_APPROVAL: print(f"{prefix} {stage.name} — blocked (awaiting approval)") results.append(result) @@ -604,6 +610,14 @@ def execute_pipeline( logger.warning("Noncritical stage %s failed - skipping", stage.name) else: break + if result.status == StageStatus.PAUSED: + logger.warning( + "[%s] Pipeline paused at %s: %s", + run_id, + stage.name, + result.error or result.decision, + ) + break if result.status == StageStatus.BLOCKED_APPROVAL and stop_on_gate: break diff --git a/researchclaw/pipeline/stage_impls/_code_generation.py b/researchclaw/pipeline/stage_impls/_code_generation.py index e5f21ddc..2f65a90b 100644 --- a/researchclaw/pipeline/stage_impls/_code_generation.py +++ b/researchclaw/pipeline/stage_impls/_code_generation.py @@ -2,12 +2,16 @@ from __future__ import annotations +import ast +import importlib.util import json import logging import re from pathlib import Path from typing import Any +import yaml + from researchclaw.adapters import AdapterBundle from researchclaw.config import RCConfig from researchclaw.experiment.validator import ( @@ -19,7 +23,9 @@ from researchclaw.pipeline._domain import _detect_domain from researchclaw.pipeline._helpers import ( StageResult, + _build_research_repair_brief, 
_LIKELY_LOCAL_HELPER_MODULES = {
    # Top-level module names that, when imported by generated experiment
    # code, almost always refer to a sibling project file rather than a
    # pip-installed package. Consumed by _find_missing_local_module_imports.
    "backbone",
    "backbones",
    "config",
    "configs",
    "constants",
    "data_loader",
    "data_utils",
    "dataloader",
    "dataset",
    "datasets",
    "decoder",
    "decoders",
    "encoder",
    "encoders",
    "helper",
    "helpers",
    "layer",
    "layers",
    "loader",
    "loaders",
    "loss",
    "losses",
    "metric",
    "metrics",
    "model",
    "models",
    "module",
    "modules",
    "network",
    "networks",
    "postprocess",
    "postprocessing",
    "preprocess",
    "preprocessing",
    "train_utils",
    "trainer",
    "trainers",
    "transform",
    "transforms",
    "util",
    "utils",
}

# Phrases that openly advertise a stubbed-out / demonstration experiment.
_PLACEHOLDER_EXPERIMENT_PATTERNS = (
    "dummy implementation",
    "dummy implementations",
    "dummy placeholder",
    "placeholder implementation",
    "replace with actual implementation",
    "replace with actual implementations",
    "for standalone operation",
    "for demonstration",
)

# Class-name fragments that suggest a class is an experiment condition.
_EXPERIMENT_CLASS_NAME_HINTS = (
    "ablation",
    "baseline",
    "detector",
    "fusion",
    "model",
    "reranker",
    "verifier",
)

# Method names treated as the "core" computation of an experiment class.
_CORE_EXPERIMENT_METHODS = {
    "evaluate",
    "forward",
    "predict",
    "run",
    "score",
    "train_step",
}

# Label/class-name fragments that mark a condition entry as an ablation.
_ABLATION_NAME_HINTS = (
    "without",
    "ablation",
    "abl_",
    "no_",
    "minus",
)

# Function-name fragments identifying an ablation distinctness self-check.
_DISTINCTNESS_CHECK_NAME_HINTS = (
    "ablation_check",
    "condition_outputs_differ",
    "distinctness",
    "outputs_differ",
    "sanity_check_condition",
    "verify_condition",
)

# Substrings of deep-review warnings that are severe enough to hard-fail.
_CRITICAL_DEEP_KEYWORDS = (
    "unboundlocalerror",
    "unregistered",
    "does not exist",
    "empty or trivial subclass",
    "does not override",
    "import-usage mismatch",
    "nameerror",
    "was removed",
    "ptp()",
    "copy-paste",
    "identical method signatures",
    "identical ast",
    "not a real ablation",
    "shadows stdlib/pip",
    "placeholder experiment text found",
    "placeholder experiment implementation",
    "fixed-constant core method",
    "demonstration stub",
    "no ablation/condition distinctness self-check",
    "distinctness self-check",
    "does not call distinctness check",
)


def _find_missing_local_module_imports(files: dict[str, str]) -> list[str]:
    """Detect local helper-module imports that are not present in *files*.

    This is intentionally narrower than generic import validation: we only flag
    imports that strongly indicate an intra-project Python module dependency.

    Args:
        files: Mapping of generated file name -> file contents.

    Returns:
        Human-readable issue strings, one per unique (file, module, line).
    """
    known_modules = {
        fname[:-3]
        for fname in files
        if fname.endswith(".py")
    }
    issues: list[str] = []
    seen: set[tuple[str, str, int | None]] = set()

    def _record_issue(
        file_name: str,
        module_name: str,
        *,
        line: int | None,
    ) -> None:
        # Skip modules that were actually generated and private names.
        if (
            module_name in known_modules
            or module_name.startswith("_")
        ):
            return
        key = (file_name, module_name, line)
        if key in seen:
            return
        seen.add(key)
        # FIX: include "at" inside the conditional fragment so the message
        # stays grammatical when the line number is unknown (previously it
        # read "... is imported at but was not generated.").
        location = f" at line {line}" if line is not None else ""
        issues.append(
            f"[{file_name}] Local helper module '{module_name}.py' is imported"
            f"{location} but was not generated. The experiment project must be"
            f" self-contained: either return '{module_name}.py' or inline its"
            f" code and remove the import."
        )

    for fname, code in files.items():
        if not fname.endswith(".py"):
            continue
        try:
            tree = ast.parse(code)
        except SyntaxError:
            # Syntax problems are surfaced by other validators.
            continue
        for node in ast.walk(tree):
            if isinstance(node, ast.Import):
                for alias in node.names:
                    top = alias.name.split(".")[0]
                    if top in _LIKELY_LOCAL_HELPER_MODULES:
                        _record_issue(
                            fname,
                            top,
                            line=getattr(node, "lineno", None),
                        )
            elif isinstance(node, ast.ImportFrom):
                line = getattr(node, "lineno", None)
                if node.level > 0:
                    # Any relative import is local by definition.
                    if node.module:
                        top = node.module.split(".")[0]
                        _record_issue(fname, top, line=line)
                    else:
                        for alias in node.names:
                            top = alias.name.split(".")[0]
                            _record_issue(fname, top, line=line)
                elif node.module:
                    top = node.module.split(".")[0]
                    if top in _LIKELY_LOCAL_HELPER_MODULES:
                        _record_issue(fname, top, line=line)

    return issues


def _strip_docstring(body: list[ast.stmt]) -> list[ast.stmt]:
    """Return *body* without a leading string-literal docstring statement."""
    if (
        body
        and isinstance(body[0], ast.Expr)
        and isinstance(body[0].value, ast.Constant)
        and isinstance(body[0].value.value, str)
    ):
        return body[1:]
    return body


def _is_literal_constant(node: ast.AST | None) -> bool:
    """True when *node* is a compile-time literal (possibly nested/signed)."""
    if node is None:
        return True
    if isinstance(node, ast.Constant):
        return True
    if isinstance(node, ast.UnaryOp) and isinstance(node.op, (ast.UAdd, ast.USub)):
        return _is_literal_constant(node.operand)
    if isinstance(node, (ast.Tuple, ast.List, ast.Set)):
        return all(_is_literal_constant(elt) for elt in node.elts)
    if isinstance(node, ast.Dict):
        return all(
            (key is None or _is_literal_constant(key)) and _is_literal_constant(value)
            for key, value in zip(node.keys, node.values, strict=False)
        )
    return False


def _method_is_pass_only(node: ast.FunctionDef | ast.AsyncFunctionDef) -> bool:
    """True when the function body (docstring aside) is a lone `pass`."""
    body = _strip_docstring(list(node.body))
    return len(body) == 1 and isinstance(body[0], ast.Pass)


def _method_returns_fixed_constant(
    node: ast.FunctionDef | ast.AsyncFunctionDef,
) -> bool:
    """True when the function body is a single `return <literal>`."""
    body = _strip_docstring(list(node.body))
    return (
        len(body) == 1
        and isinstance(body[0], ast.Return)
        and _is_literal_constant(body[0].value)
    )


def _looks_like_experiment_class(node: ast.ClassDef) -> bool:
    """Heuristic: class name hints or core-method names mark experiment classes."""
    lowered = node.name.lower()
    if any(hint in lowered for hint in _EXPERIMENT_CLASS_NAME_HINTS):
        return True
    return any(
        isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef))
        and item.name in _CORE_EXPERIMENT_METHODS
        for item in node.body
    )


def _call_name(node: ast.AST) -> str:
    """Best-effort simple name of a call target (`f(...)` or `obj.f(...)`)."""
    if isinstance(node, ast.Name):
        return node.id
    if isinstance(node, ast.Attribute):
        return node.attr
    return ""


def _extract_condition_entries(tree: ast.AST) -> list[tuple[str, str | None]]:
    """Collect (label, class_name) pairs from list/tuple condition tables."""
    entries: list[tuple[str, str | None]] = []

    def _extract_label(node: ast.AST) -> str | None:
        if isinstance(node, ast.Constant) and isinstance(node.value, str):
            return node.value
        return None

    def _extract_class_name(node: ast.AST) -> str | None:
        if isinstance(node, ast.Name):
            return node.id
        if isinstance(node, ast.Call):
            return _extract_class_name(node.func)
        if isinstance(node, ast.Attribute):
            return node.attr
        return None

    for assign in ast.walk(tree):
        if not isinstance(assign, ast.Assign):
            continue
        if not isinstance(assign.value, (ast.List, ast.Tuple)):
            continue
        for elt in assign.value.elts:
            if not isinstance(elt, ast.Tuple) or len(elt.elts) < 2:
                continue
            label = _extract_label(elt.elts[0]) or ""
            class_name = _extract_class_name(elt.elts[1])
            if label or class_name:
                entries.append((label, class_name))
    return entries


def _looks_like_ablation_entry(label: str, class_name: str | None) -> bool:
    """True when a condition entry's label/class name reads like an ablation."""
    lowered = f"{label} {class_name or ''}".lower()
    return any(hint in lowered for hint in _ABLATION_NAME_HINTS)


def _function_has_distinctness_logic(
    node: ast.FunctionDef | ast.AsyncFunctionDef,
) -> bool:
    """True when the function contains comparison/assertion-style logic."""
    body = _strip_docstring(list(node.body))
    if not body:
        return False
    for sub in ast.walk(node):
        if isinstance(sub, ast.Assert):
            return True
        if isinstance(sub, ast.Compare):
            return True
        if isinstance(sub, ast.Call):
            call_name = _call_name(sub.func).lower()
            if call_name in {"allclose", "array_equal"}:
                return True
            if "assert" in call_name or "raise" in call_name:
                return True
    return False


def _find_placeholder_experiment_issues(files: dict[str, str]) -> list[str]:
    """Detect obviously placeholder experiment implementations.

    This is stricter than generic code-complexity warnings: it looks for
    generated experiments that openly advertise themselves as demonstrations,
    or condition classes whose core methods are pass-only / fixed-constant stubs.
    """
    issues: list[str] = []

    for fname, code in files.items():
        if not fname.endswith(".py"):
            continue
        lowered_code = code.lower()
        for pattern in _PLACEHOLDER_EXPERIMENT_PATTERNS:
            if pattern in lowered_code:
                issues.append(
                    f"[{fname}] Placeholder experiment text found ('{pattern}') — "
                    "generated experiment code must implement real logic, not "
                    "demonstration stubs."
                )
                break  # one textual report per file is enough

        try:
            tree = ast.parse(code)
        except SyntaxError:
            continue

        for node in ast.walk(tree):
            if not isinstance(node, ast.ClassDef) or not _looks_like_experiment_class(node):
                continue

            methods = [
                item
                for item in node.body
                if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef))
            ]
            if not methods:
                continue

            pass_only_init = any(
                method.name == "__init__" and _method_is_pass_only(method)
                for method in methods
            )
            constant_core_methods = [
                method.name
                for method in methods
                if method.name in _CORE_EXPERIMENT_METHODS
                and _method_returns_fixed_constant(method)
            ]
            trivial_core_methods = [
                method.name
                for method in methods
                if method.name in _CORE_EXPERIMENT_METHODS
                and (
                    _method_is_pass_only(method)
                    or _method_returns_fixed_constant(method)
                )
            ]

            if pass_only_init and constant_core_methods:
                issues.append(
                    f"[{fname}] Class '{node.name}' looks like a placeholder "
                    "experiment implementation: __init__ is pass-only and core "
                    "method(s) "
                    + ", ".join(sorted(constant_core_methods))
                    + " use fixed-constant core method returns. Ablation/condition "
                    "classes must exercise real differentiating logic."
                )
                continue

            # All core methods trivial => the whole class is a stub.
            if trivial_core_methods and len(trivial_core_methods) == len(
                [
                    method
                    for method in methods
                    if method.name in _CORE_EXPERIMENT_METHODS
                ]
            ):
                issues.append(
                    f"[{fname}] Class '{node.name}' is a demonstration stub: all "
                    "core experiment methods ("
                    + ", ".join(sorted(trivial_core_methods))
                    + ") are pass-only or fixed-constant. Generated ablations must "
                    "implement real computation."
                )

    return issues


def _find_condition_distinctness_issues(files: dict[str, str]) -> list[str]:
    """Detect missing or non-functional ablation distinctness self-checks."""
    issues: list[str] = []
    condition_entries: list[tuple[str, str | None]] = []
    distinctness_functions: dict[str, tuple[str, ast.FunctionDef | ast.AsyncFunctionDef]] = {}
    called_distinctness_functions: set[str] = set()

    for fname, code in files.items():
        if not fname.endswith(".py"):
            continue
        try:
            tree = ast.parse(code)
        except SyntaxError:
            continue

        condition_entries.extend(_extract_condition_entries(tree))

        for node in ast.walk(tree):
            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                lowered_name = node.name.lower()
                if any(hint in lowered_name for hint in _DISTINCTNESS_CHECK_NAME_HINTS):
                    distinctness_functions[node.name] = (fname, node)
            elif isinstance(node, ast.Call):
                call_name = _call_name(node.func)
                if any(hint in call_name.lower() for hint in _DISTINCTNESS_CHECK_NAME_HINTS):
                    called_distinctness_functions.add(call_name)

    if not condition_entries:
        return issues

    ablation_entries = [
        (label, class_name)
        for label, class_name in condition_entries
        if _looks_like_ablation_entry(label, class_name)
    ]
    # Only enforce the self-check for experiments with a real condition grid.
    if len(condition_entries) < 4 or len(ablation_entries) < 2:
        return issues

    if not distinctness_functions:
        issues.append(
            "No ablation/condition distinctness self-check found. Experiments with "
            "multiple ablation-like conditions must include a startup check that "
            "compares condition outputs on the same probe input and fails if they "
            "are identical."
        )
        return issues

    valid_function_names: set[str] = set()
    for func_name, (fname, func_node) in distinctness_functions.items():
        if _method_is_pass_only(func_node):
            issues.append(
                f"[{fname}] Distinctness self-check '{func_name}' is pass-only. It "
                "must actively compare condition outputs and fail loudly on "
                "identical behavior."
            )
            continue
        if _method_returns_fixed_constant(func_node):
            issues.append(
                f"[{fname}] Distinctness self-check '{func_name}' returns a fixed "
                "constant instead of validating ablation behavior."
            )
            continue
        if not _function_has_distinctness_logic(func_node):
            issues.append(
                f"[{fname}] Distinctness self-check '{func_name}' exists but does "
                "not contain comparison/assertion logic. It must compare outputs "
                "from multiple conditions on the same probe input."
            )
            continue
        valid_function_names.add(func_name)

    if valid_function_names and not any(
        called in valid_function_names for called in called_distinctness_functions
    ):
        issues.append(
            "Experiment defines a condition distinctness self-check but does not "
            "call it before running the main evaluation. Call the self-check at "
            "startup and fail fast on identical outputs."
        )

    return issues


def _is_critical_deep_warning(message: str) -> bool:
    """True when a deep-review warning matches a hard-fail keyword."""
    lowered = message.lower()
    return any(keyword in lowered for keyword in _CRITICAL_DEEP_KEYWORDS)


def _repair_self_contained_project(
    *,
    llm: LLMClient,
    prompt_manager: PromptManager,
    files: dict[str, str],
    issues: list[str],
    max_tokens: int,
    max_repair: int,
) -> tuple[dict[str, str], list[str]]:
    """Repair missing local helper-module files by asking for a full file set.

    Args:
        llm: Chat client used for the repair round-trips.
        prompt_manager: Source of the code-generation system prompt.
        files: Current project files (name -> contents); not mutated.
        issues: Missing-local-module issue strings from the last validation.
        max_tokens: Token budget per repair request.
        max_repair: Maximum number of repair attempts.

    Returns:
        Tuple of (possibly-updated files, remaining issues); the issue list is
        empty when the project became self-contained.
    """
    current_files = dict(files)
    current_issues = list(issues)

    for attempt in range(1, max_repair + 1):
        repair_prompt = (
            "SELF-CONTAINMENT REPAIR REQUIRED.\n\n"
            "The generated experiment project is not self-contained. Some files "
            "import local helper modules that were never returned.\n\n"
            "Missing local-module issues:\n"
            + "\n".join(f"- {issue}" for issue in current_issues)
            + "\n\nRULES:\n"
            "- If any file imports a local helper module such as models, utils, "
            "data_utils, metrics, or loaders, you MUST return that helper file "
            "too.\n"
            "- If you do not want a helper file, inline its code into an existing "
            "file and remove the import.\n"
            "- Preserve working files unless you are intentionally replacing them.\n"
            "- The final project must be runnable via `python main.py`.\n"
            "- Return ALL project files using ```filename:...``` blocks.\n\n"
            "Current files:\n"
            + "\n\n".join(
                f"```filename:{fname}\n{code}\n```"
                for fname, code in current_files.items()
            )
        )

        resp = _chat_with_prompt(
            llm,
            prompt_manager.system("code_generation"),
            repair_prompt,
            max_tokens=max_tokens,
        )
        repaired_files = _extract_multi_file_blocks(resp.content)
        if not repaired_files:
            logger.warning(
                "Stage 10: Self-containment repair attempt %d returned no files",
                attempt,
            )
            continue

        # Merge: repaired files overwrite, untouched files survive.
        merged = dict(current_files)
        merged.update(repaired_files)
        current_files = merged
        current_issues = _find_missing_local_module_imports(current_files)
        if not current_issues:
            logger.info(
                "Stage 10: Self-containment repair succeeded on attempt %d",
                attempt,
            )
            return current_files, []

    return current_files, current_issues
Some files " + "import local helper modules that were never returned.\n\n" + "Missing local-module issues:\n" + + "\n".join(f"- {issue}" for issue in current_issues) + + "\n\nRULES:\n" + "- If any file imports a local helper module such as models, utils, " + "data_utils, metrics, or loaders, you MUST return that helper file " + "too.\n" + "- If you do not want a helper file, inline its code into an existing " + "file and remove the import.\n" + "- Preserve working files unless you are intentionally replacing them.\n" + "- The final project must be runnable via `python main.py`.\n" + "- Return ALL project files using ```filename:...``` blocks.\n\n" + "Current files:\n" + + "\n\n".join( + f"```filename:{fname}\n{code}\n```" + for fname, code in current_files.items() + ) + ) + + resp = _chat_with_prompt( + llm, + prompt_manager.system("code_generation"), + repair_prompt, + max_tokens=max_tokens, + ) + repaired_files = _extract_multi_file_blocks(resp.content) + if not repaired_files: + logger.warning( + "Stage 10: Self-containment repair attempt %d returned no files", + attempt, + ) + continue + + merged = dict(current_files) + merged.update(repaired_files) + current_files = merged + current_issues = _find_missing_local_module_imports(current_files) + if not current_issues: + logger.info( + "Stage 10: Self-containment repair succeeded on attempt %d", + attempt, + ) + return current_files, [] + + return current_files, current_issues + + +def _build_real_data_guard_guidance(config: RCConfig) -> str: + exp_cfg = config.experiment + if not ( + getattr(exp_cfg, "require_real_data", False) + or getattr(exp_cfg, "forbid_synthetic_proxy", False) + or getattr(exp_cfg, "fail_on_stdout_parsed_results", False) + or getattr(exp_cfg, "required_real_data_refs", ()) + ): + return "" + + refs = tuple(getattr(exp_cfg, "required_real_data_refs", ()) or ()) + refs_block = "" + if refs: + refs_block = "Required local data references (use these, do not invent substitutes):\n" + refs_block 
+= "".join(f"- {ref}\n" for ref in refs) + + asset_paths_block = _build_resolved_local_asset_guidance() + + return ( + "\n\nREAL DATA ENFORCEMENT (HARD RULE):\n" + "- This project MUST use real local project assets/caches, not an internally " + "generated proxy benchmark.\n" + "- If the required local assets are unavailable, FAIL FAST with a clear " + "FileNotFoundError or RuntimeError. Do NOT silently degrade to a toy dataset.\n" + "- FORBIDDEN fallback patterns include: helper functions such as " + "`_build_example`, `_build_splits`, or `_sample_circle` that generate the " + "benchmark in code; hard-coded tiny train/val/test split dictionaries; " + "repository-local synthetic evidence repositories; or any results source that " + "exists only as stdout metric lines.\n" + "- main.py must write a structured `results.json`; stdout-only metrics are " + "insufficient for this run.\n" + "- The execution harness invokes `python main.py` directly. Do NOT require " + "dataset/asset CLI flags just to start the experiment. Asset path flags may " + "exist only as optional overrides; the default path resolution must come " + "from the VECTRA_* env vars or the authoritative absolute roots below.\n" + "- Emit machine-readable provenance where practical, including " + "`data_manifest.json` and `protocol_manifest.json`, so later stages can verify " + "which local assets were actually used.\n" + "- Resolve data from the authoritative absolute roots or env vars below. 
def _build_resolved_local_asset_guidance() -> str:
    """Expose authoritative local asset roots from the repo's experiment config.

    Reads the repository's dataset specs via ``_load_project_dataset_specs``
    and renders one ``- <dataset> <label>: <path> (env: <VAR>)`` bullet per
    resolved path, so prompt consumers know which absolute roots / env vars
    are canonical.  Returns "" when no specs load or no dataset-specific path
    resolved.
    """
    specs = _load_project_dataset_specs()
    if not specs:
        return ""

    def _path_text(value: Any) -> str:
        # Normalize to forward slashes and collapse internal whitespace so the
        # path renders as one clean token inside the prompt.
        return " ".join(str(value).replace("\\", "/").split()).strip()

    lines = ["Authoritative local asset roots for this repository:"]

    def _append_path(dataset_name: str, label: str, env_name: str, value: Any) -> None:
        # BUG FIX: a missing spec value arrived here as None and str(None)
        # rendered the literal text "None" into the prompt; reject it early.
        if value is None:
            return
        text = _path_text(value)
        if not text:
            return
        lines.append(f"- {dataset_name} {label}: {text} (env: {env_name})")

    repo_root = Path(__file__).resolve().parents[3]
    lines.append(f"- repo root: {repo_root.as_posix()} (env: VECTRA_REPO_ROOT)")

    simple_key = "engineering_primitives_simple_scenes_noslot_v1_local_20260326"
    simple_spec = specs.get(simple_key)
    if isinstance(simple_spec, dict):
        cache_roots = simple_spec.get("cache_roots")
        # The same dataset root is published under two env aliases on purpose.
        _append_path(simple_key, "dataset_root", "VECTRA_SIMPLE_DATASET_ROOT", simple_spec.get("dataset_root"))
        _append_path(simple_key, "dataset_root", "VECTRA_SIMPLE_ASSET_ROOT", simple_spec.get("dataset_root"))
        _append_path(simple_key, "manifest_path", "VECTRA_SIMPLE_MANIFEST_PATH", simple_spec.get("manifest_path"))
        if isinstance(cache_roots, dict):
            _append_path(simple_key, "learned_cache", "VECTRA_SIMPLE_HEATMAP_DIR", cache_roots.get("learned"))

    page_key = "page_minus_titleblock"
    page_spec = specs.get(page_key)
    if isinstance(page_spec, dict):
        dataset_root = page_spec.get("dataset_root")
        split_manifest = page_spec.get("split_manifest_path")
        _append_path(page_key, "dataset_root", "VECTRA_PAGE_DATASET_ROOT", dataset_root)
        if dataset_root:
            dataset_root_path = Path(str(dataset_root))
            _append_path(page_key, "image_dir", "VECTRA_PAGE_IMAGE_DIR", dataset_root_path / "train2017")
            _append_path(page_key, "sidecar_dir", "VECTRA_PAGE_SIDECAR_DIR", dataset_root_path / "sidecars" / "train2017")
        _append_path(page_key, "split_manifest", "VECTRA_PAGE_SPLIT_JSON", split_manifest)
        if split_manifest:
            split_manifest_path = Path(str(split_manifest))
            # The split manifest lives two levels under the OneDrive PNG root.
            one_drive_png_root = split_manifest_path.parent.parent
            _append_path(page_key, "png_root", "VECTRA_ONE_DRIVE_PNG_ROOT", one_drive_png_root)
            _append_path(page_key, "gt_solid_csv", "VECTRA_PAGE_GT_SOLID_CSV", split_manifest_path.parent / "gt" / "train2017_solid.csv")
            _append_path(page_key, "gt_dashed_csv", "VECTRA_PAGE_GT_DASHED_CSV", split_manifest_path.parent / "gt" / "train2017_dashed.csv")

    probe_key = "DeepPatent2_negative_clutter_probe"
    probe_spec = specs.get(probe_key)
    if isinstance(probe_spec, dict):
        _append_path(probe_key, "dataset_root", "VECTRA_DEEPPATENT_DATASET_ROOT", probe_spec.get("dataset_root"))

    # BUG FIX: the original guard compared against 1, but the repo-root bullet
    # is appended unconditionally, so "nothing resolved" was unreachable.
    # Header + repo-root alone (2 lines) means no dataset path resolved.
    if len(lines) <= 2:
        return ""
    lines.extend(
        [
            "- Loader rule: first read the env vars above if they are set, otherwise fall back to the exact absolute paths above.",
            "- If a required asset path does not exist, raise FileNotFoundError naming the env var/path that was missing.",
        ]
    )
    return "\n" + "\n".join(lines) + "\n"
"build_dataset_specs", None) + if not callable(build_dataset_specs): + return {} + dataset_specs = build_dataset_specs() + if not isinstance(dataset_specs, dict): + return {} + return dataset_specs + except Exception: # noqa: BLE001 + logger.debug("Resolved local asset guidance unavailable", exc_info=True) + return {} + + +def _extract_named_plan_items(value: Any, *, limit: int = 8) -> list[str]: + items: list[str] = [] + if value is None: + return items + if isinstance(value, dict): + if "name" in value: + candidate = " ".join(str(value.get("name", "")).split()).strip() + if candidate: + items.append(candidate) + else: + for key in value: + candidate = " ".join(str(key).split()).strip() + if candidate: + items.append(candidate) + elif isinstance(value, (list, tuple, set)): + for item in value: + if isinstance(item, dict): + candidate = " ".join(str(item.get("name", "")).split()).strip() + else: + candidate = " ".join(str(item).split()).strip() + if candidate: + items.append(candidate) + else: + candidate = " ".join(str(value).split()).strip() + if candidate: + items.append(candidate) + + deduped: list[str] = [] + seen: set[str] = set() + for item in items: + key = item.lower() + if key in seen: + continue + seen.add(key) + deduped.append(item) + if len(deduped) >= limit: + break + return deduped + + +def _build_codegen_plan_summary(exp_plan_text: str, config: RCConfig) -> str: + """Compress the experiment plan into the subset needed for Stage 10.""" + if not exp_plan_text.strip(): + return "" + + try: + plan_data = yaml.safe_load(exp_plan_text) + except yaml.YAMLError: + plan_data = None + if not isinstance(plan_data, dict): + excerpt = exp_plan_text[:2200].rstrip() + suffix = "\n...\n" if len(exp_plan_text) > 2200 else "\n" + return "PLAN EXCERPT:\n" + excerpt + suffix + + lines = ["## Experiment Plan Summary"] + + plan_topic = " ".join(str(plan_data.get("topic", "")).split()).strip() + if plan_topic: + if len(plan_topic) > 320: + plan_topic = 
plan_topic[:320].rstrip() + "..." + lines.append(f"- Topic anchor: {plan_topic}") + + datasets = _extract_named_plan_items(plan_data.get("datasets"), limit=6) + if datasets: + lines.append("- Datasets: " + ", ".join(datasets)) + + baselines = _extract_named_plan_items(plan_data.get("baselines"), limit=8) + if baselines: + lines.append("- Baselines: " + ", ".join(baselines)) + + methods = _extract_named_plan_items(plan_data.get("proposed_methods"), limit=8) + if methods: + lines.append("- Proposed methods: " + ", ".join(methods)) + + ablations = _extract_named_plan_items(plan_data.get("ablations"), limit=8) + if ablations: + lines.append("- Ablations: " + ", ".join(ablations)) + + metrics = plan_data.get("metrics") + if isinstance(metrics, dict): + primary = metrics.get("primary_metric") + if isinstance(primary, dict): + primary_name = " ".join(str(primary.get("name", "")).split()).strip() + direction = " ".join( + str(primary.get("direction", config.experiment.metric_direction)).split() + ).strip() + if primary_name: + lines.append( + f"- Primary metric: {primary_name} ({direction or config.experiment.metric_direction})" + ) + secondary = _extract_named_plan_items(metrics.get("secondary_metrics"), limit=8) + if secondary: + lines.append("- Secondary metrics: " + ", ".join(secondary)) + + compute_budget = plan_data.get("compute_budget") + if isinstance(compute_budget, dict): + total_seconds = compute_budget.get("total_time_budget_seconds") + seeded_conditions = compute_budget.get("seeded_condition_count") + budget_bits: list[str] = [] + if total_seconds is not None: + budget_bits.append(f"total_time_budget_seconds={total_seconds}") + if seeded_conditions is not None: + budget_bits.append(f"seeded_condition_count={seeded_conditions}") + if budget_bits: + lines.append("- Compute budget: " + ", ".join(budget_bits)) + + refs = tuple(getattr(config.experiment, "required_real_data_refs", ()) or ()) + if refs: + lines.append("- Required local asset refs:") + 
lines.extend(f" - {ref}" for ref in refs[:8]) + + return "\n".join(lines).strip() + + +def _is_acp_transport_failure(exc: Exception) -> bool: + """Return True when a Stage-10 failure came from the ACP transport layer.""" + parts = [str(exc)] + cause = getattr(exc, "__cause__", None) + context = getattr(exc, "__context__", None) + if cause: + parts.append(str(cause)) + if context: + parts.append(str(context)) + text = " ".join(part.strip() for part in parts if part).lower() + if not text: + return False + indicators = ( + "acp prompt failed", + "acp prompt timed out after", + "queue owner disconnected before prompt completion", + "agent needs reconnect", + ) + return any(indicator in text for indicator in indicators) + + def _execute_code_generation( stage_dir: Path, run_dir: Path, @@ -74,6 +906,7 @@ def _execute_code_generation( prompts: PromptManager | None = None, ) -> StageResult: exp_plan = _read_prior_artifact(run_dir, "exp_plan.yaml") or "" + exp_plan_prompt = _build_codegen_plan_summary(exp_plan, config) metric = config.experiment.metric_key max_repair = 5 # BUG-14: Increased from 3 to give more chances for critical bugs files: dict[str, str] = {} @@ -208,11 +1041,29 @@ def _execute_code_generation( ) _bp_block = _bp.to_prompt_block() if _bp_block: - extra_guidance += ( - "\n\n## BenchmarkAgent Selections (USE THESE)\n" + _has_existing_plan_assets = any( + item.get("origin") == "existing_plan" + for item in (_bp.selected_benchmarks + _bp.selected_baselines) + if isinstance(item, dict) + ) + _bp_heading = "## BenchmarkAgent Selections (USE THESE)" + _bp_instruction = ( "The following datasets, baselines, and code snippets were " "automatically selected and validated by the BenchmarkAgent. 
" "You MUST use these selections in your experiment code.\n\n" + ) + if _has_existing_plan_assets: + _bp_heading = "## BenchmarkAgent Selections (PRESERVE IN-PROJECT ASSETS)" + _bp_instruction = ( + "The following datasets and baselines include in-project " + "assets carried over from the existing experiment plan plus " + "BenchmarkAgent supplements. You MUST preserve the in-project " + "datasets/baselines and may use the extra BenchmarkAgent " + "selections only as supplemental additions.\n\n" + ) + extra_guidance += ( + f"\n\n{_bp_heading}\n" + + _bp_instruction + _bp_block ) logger.info( @@ -317,7 +1168,18 @@ def _execute_code_generation( "- Prefer lightweight CPU-friendly libraries (numpy, scipy, " "sklearn, pandas) unless deep learning is inherent to the topic.\n" "- The experiment MUST be self-contained and runnable without GPU.\n" + "- The returned experiment project must be self-contained at the file " + "level. If `main.py` or any other file imports a local helper module " + "(for example `models`, `utils`, `data_utils`, `metrics`, `loaders`), " + "you MUST return that helper file too.\n" + "- Never reference a local Python module that is absent from the " + "returned file set. 
If in doubt, inline the helper code into an " + "existing returned file instead of importing a missing module.\n" ) + repair_brief = _build_research_repair_brief(run_dir) + if repair_brief: + extra_guidance += "\n\n" + repair_brief + extra_guidance += _build_real_data_guard_guidance(config) # --- Code generation: Beast Mode → CodeAgent → Legacy single-shot --- _code_agent_active = False @@ -401,7 +1263,7 @@ def _execute_code_generation( _oc_result: OpenCodeResult = _bridge.generate( stage_dir=stage_dir, topic=config.research.topic, - exp_plan=exp_plan, + exp_plan=exp_plan_prompt, metric=metric, pkg_hint=pkg_hint + "\n" + compute_budget, extra_guidance=extra_guidance, @@ -503,52 +1365,75 @@ def _execute_code_generation( except Exception: # noqa: BLE001 logger.debug("Domain detection unavailable", exc_info=True) - _agent = _CodeAgent( - llm=llm, - prompts=_pm, - config=_ca_cfg, - stage_dir=stage_dir, - sandbox_factory=_sandbox_factory, - experiment_config=config.experiment, - domain_profile=_domain_profile, - code_search_result=_code_search_result, - ) - _agent_result = _agent.generate( - topic=config.research.topic, - exp_plan=exp_plan, - metric=metric, - pkg_hint=pkg_hint + "\n" + compute_budget + "\n" + extra_guidance, - max_tokens=_code_max_tokens, - ) - files = _agent_result.files - _code_agent_active = True - - # Write agent artifacts - (stage_dir / "code_agent_log.json").write_text( - json.dumps( - { - "log": _agent_result.validation_log, - "llm_calls": _agent_result.total_llm_calls, - "sandbox_runs": _agent_result.total_sandbox_runs, - "best_score": _agent_result.best_score, - "tree_nodes_explored": _agent_result.tree_nodes_explored, - "review_rounds": _agent_result.review_rounds, - }, - indent=2, - ), - encoding="utf-8", - ) - if _agent_result.architecture_spec: - (stage_dir / "architecture_spec.yaml").write_text( - _agent_result.architecture_spec, encoding="utf-8", + try: + _agent = _CodeAgent( + llm=llm, + prompts=_pm, + config=_ca_cfg, + 
stage_dir=stage_dir, + sandbox_factory=_sandbox_factory, + experiment_config=config.experiment, + domain_profile=_domain_profile, + code_search_result=_code_search_result, ) - logger.info( - "CodeAgent: %d LLM calls, %d sandbox runs, score=%.2f", - _agent_result.total_llm_calls, - _agent_result.total_sandbox_runs, - _agent_result.best_score, - ) - elif not _beast_mode_used and llm is not None: + _agent_result = _agent.generate( + topic=config.research.topic, + exp_plan=exp_plan_prompt, + metric=metric, + pkg_hint=pkg_hint + "\n" + compute_budget + "\n" + extra_guidance, + max_tokens=_code_max_tokens, + ) + files = _agent_result.files + _code_agent_active = True + + # Write agent artifacts + (stage_dir / "code_agent_log.json").write_text( + json.dumps( + { + "log": _agent_result.validation_log, + "llm_calls": _agent_result.total_llm_calls, + "sandbox_runs": _agent_result.total_sandbox_runs, + "best_score": _agent_result.best_score, + "tree_nodes_explored": _agent_result.tree_nodes_explored, + "review_rounds": _agent_result.review_rounds, + }, + indent=2, + ), + encoding="utf-8", + ) + if _agent_result.architecture_spec: + (stage_dir / "architecture_spec.yaml").write_text( + _agent_result.architecture_spec, encoding="utf-8", + ) + logger.info( + "CodeAgent: %d LLM calls, %d sandbox runs, score=%.2f", + _agent_result.total_llm_calls, + _agent_result.total_sandbox_runs, + _agent_result.best_score, + ) + except Exception as exc: + fallback_enabled = bool( + getattr(_ca_cfg, "fallback_to_legacy_on_acp_failure", False) + ) + if fallback_enabled and _is_acp_transport_failure(exc): + fallback_payload = { + "fallback_triggered": True, + "reason": "code_agent_acp_transport_failure", + "error": str(exc), + "triggered_at": _utcnow_iso(), + } + (stage_dir / "code_agent_fallback.json").write_text( + json.dumps(fallback_payload, indent=2), + encoding="utf-8", + ) + logger.warning( + "CodeAgent ACP transport failure detected; falling back to legacy single-shot generation: %s", + 
exc, + ) + else: + raise + + if not _beast_mode_used and llm is not None and not _code_agent_active: # ── Legacy single-shot generation ───────────────────────────────── topic = config.research.topic _md = config.experiment.metric_direction @@ -563,7 +1448,7 @@ def _execute_code_generation( topic=topic, metric=metric, pkg_hint=pkg_hint + "\n" + compute_budget + "\n" + extra_guidance, - exp_plan=exp_plan, + exp_plan=exp_plan_prompt, metric_direction_hint=_md_hint, ) # R13-3: Use higher max_tokens for reasoning models (they consume tokens @@ -817,20 +1702,16 @@ def _execute_code_generation( # --- P1.1+P1.2: Deep quality analysis (class quality, scoping, API) --- deep_warnings = deep_validate_files(files) + placeholder_impl_issues = _find_placeholder_experiment_issues(files) + distinctness_issues = _find_condition_distinctness_issues(files) + deep_warnings.extend(placeholder_impl_issues) + deep_warnings.extend(distinctness_issues) for w in deep_warnings: logger.warning("Stage 10 deep quality: %s", w) complexity_warnings.extend(deep_warnings) # --- P1.2: If critical deep issues found, attempt one repair cycle --- - critical_deep = [w for w in deep_warnings if any( - kw in w for kw in ("UnboundLocalError", "unregistered", "does not exist", - "empty or trivial subclass", "does NOT override", - "Import-usage mismatch", "NameError", - "was removed", "ptp()", - "copy-paste", "identical method signatures", - "identical AST", "NOT a real ablation", - "shadows stdlib/pip") - )] + critical_deep = [w for w in deep_warnings if _is_critical_deep_warning(w)] if critical_deep and llm is not None: logger.info( "Stage 10: %d critical code issues found — triggering repair cycle", @@ -850,6 +1731,14 @@ def _execute_code_generation( f"- Use scipy.special.erf, not np.erf\n" f"- Ablation/variant classes must have genuinely different logic\n" f"- Every class must have a real implementation, not just `pass`\n" + f"- Do NOT ship dummy/placeholder/demo experiment code or comments " + f"saying 
'replace with actual implementation'\n" + f"- Core experiment methods such as evaluate/predict/forward must " + f"NOT return fixed constants like 0.2 or 0.5 as a stand-in for " + f"real computation\n" + f"- Multi-condition experiments MUST include and CALL a startup " + f"ablation distinctness self-check that compares outputs on the " + f"same probe input and raises/asserts if conditions are identical\n" f"- Ablation classes MUST override the parent method that implements " f"the component being ablated (e.g., if ablating attention, override " f"the attention method with a simpler alternative like mean pooling)\n" @@ -883,17 +1772,11 @@ def _execute_code_generation( (exp_dir / fname).write_text(code, encoding="utf-8") # Re-check after repair deep_warnings_after = deep_validate_files(files) + deep_warnings_after.extend(_find_placeholder_experiment_issues(files)) + deep_warnings_after.extend(_find_condition_distinctness_issues(files)) fixed = len(critical_deep) - len([ w for w in deep_warnings_after - if any(kw in w for kw in ( - "UnboundLocalError", "unregistered", "does not exist", - "empty or trivial subclass", "does NOT override", - "Import-usage mismatch", "NameError", - "was removed", "ptp()", - "copy-paste", "identical method signatures", - "identical AST", "NOT a real ablation", - "shadows stdlib/pip", - )) + if _is_critical_deep_warning(w) ]) logger.info( "Stage 10: Deep repair fixed %d/%d critical issues", @@ -913,6 +1796,46 @@ def _execute_code_generation( json.dumps(health, indent=2), encoding="utf-8" ) + # --- Hard gate: reject placeholder/dummy experiment implementations --- + unresolved_placeholder_issues = _find_placeholder_experiment_issues(files) + if unresolved_placeholder_issues: + for issue in unresolved_placeholder_issues: + logger.warning("Stage 10 placeholder gate: %s", issue) + validation_log.append(f"PLACEHOLDER_IMPL: {issue}") + (stage_dir / "validation_report.md").write_text( + "# Code Validation Report\n\n" + "**Status**: BLOCKED — 
generated experiment code still contains " + "placeholder or demonstration-only implementations\n\n" + + "\n".join(f"- {issue}" for issue in unresolved_placeholder_issues), + encoding="utf-8", + ) + return StageResult( + stage=Stage.CODE_GENERATION, + status=StageStatus.FAILED, + artifacts=("validation_report.md",), + evidence_refs=(), + ) + + # --- Hard gate: require active condition-distinctness self-checks --- + unresolved_distinctness_issues = _find_condition_distinctness_issues(files) + if unresolved_distinctness_issues: + for issue in unresolved_distinctness_issues: + logger.warning("Stage 10 distinctness gate: %s", issue) + validation_log.append(f"DISTINCTNESS_IMPL: {issue}") + (stage_dir / "validation_report.md").write_text( + "# Code Validation Report\n\n" + "**Status**: BLOCKED — generated experiment does not prove condition " + "wiring is distinct\n\n" + + "\n".join(f"- {issue}" for issue in unresolved_distinctness_issues), + encoding="utf-8", + ) + return StageResult( + stage=Stage.CODE_GENERATION, + status=StageStatus.FAILED, + artifacts=("validation_report.md",), + evidence_refs=(), + ) + # --- P1.4: LLM Code Review (Stage 10.5) --- # Skip when CodeAgent is active — Phase 4 review already covers this. 
if llm is not None and not _code_agent_active: @@ -925,7 +1848,7 @@ def _execute_code_generation( f"You are a senior researcher reviewing experiment code for a " f"research submission.\n\n" f"TOPIC: {config.research.topic}\n" - f"EXPERIMENT PLAN:\n{exp_plan[:3000]}\n\n" + f"EXPERIMENT PLAN:\n{exp_plan_prompt[:3000]}\n\n" f"CODE:\n```python\n{all_code_review}\n```\n\n" f"Review the code and return JSON with this EXACT structure:\n" f'{{"score": <1-10>, "issues": [' @@ -1158,9 +2081,12 @@ def _execute_code_generation( f"when the topic describes a tabular, bandit, or game-theoretic method.\n" f"- Use ONLY lightweight CPU-friendly libraries (numpy, scipy, " f"sklearn) unless the topic EXPLICITLY requires deep learning.\n" - f"- The experiment must be self-contained and runnable without GPU.\n\n" + f"- The experiment must be self-contained and runnable without GPU.\n" + f"- If any file imports a local helper module, return that helper " + f"file too. Do not leave unresolved imports like `from models import ...` " + f"without a generated `models.py`.\n\n" f"{pkg_hint}\n{compute_budget}\n" - f"PLAN:\n{exp_plan}\n\n" + f"PLAN:\n{exp_plan_prompt}\n\n" f"Return multiple files using ```filename:xxx.py format." 
) regen_resp = _chat_with_prompt( @@ -1302,7 +2228,64 @@ def _execute_code_generation( except Exception as exc: logger.debug("Ablation validation skipped: %s", exc) + # --- Self-contained project gate --- + unresolved_local_imports = _find_missing_local_module_imports(files) + if unresolved_local_imports: + for issue in unresolved_local_imports: + logger.warning("Stage 10 self-containment: %s", issue) + validation_log.append(f"SELF_CONTAINED: {issue}") + if llm is not None: + files, unresolved_local_imports = _repair_self_contained_project( + llm=llm, + prompt_manager=_pm, + files=files, + issues=unresolved_local_imports, + max_tokens=_code_max_tokens, + max_repair=max_repair, + ) + for fname, code in files.items(): + (exp_dir / fname).write_text(code, encoding="utf-8") + if unresolved_local_imports: + (stage_dir / "validation_report.md").write_text( + "# Code Validation Report\n\n" + "**Status**: BLOCKED — generated experiment project is not self-contained\n\n" + + "\n".join(f"- {issue}" for issue in unresolved_local_imports), + encoding="utf-8", + ) + return StageResult( + stage=Stage.CODE_GENERATION, + status=StageStatus.FAILED, + artifacts=("validation_report.md",), + evidence_refs=(), + ) + # --- Write spec --- + if getattr(config.experiment, "forbid_synthetic_proxy", False): + _proxy_signals = detect_synthetic_proxy_signals( + {fname: code for fname, code in files.items() if fname.endswith(".py")} + ) + if should_fail_synthetic_proxy_guard(_proxy_signals): + guard_payload = { + "status": "failed", + "reason": "synthetic_proxy_detected", + "signals": _proxy_signals, + "timestamp": _utcnow_iso(), + } + (stage_dir / "real_data_guard.json").write_text( + json.dumps(guard_payload, indent=2), encoding="utf-8" + ) + logger.error( + "Stage 10: Real-data guard blocked generated experiment code: %s", + "; ".join(_proxy_signals), + ) + return StageResult( + stage=Stage.CODE_GENERATION, + status=StageStatus.FAILED, + artifacts=("experiment/", "real_data_guard.json"), + 
evidence_refs=("stage-10/experiment/", "stage-10/real_data_guard.json"), + error="Real-data guard blocked synthetic/proxy fallback code generation.", + ) + file_list = ", ".join(f"`{f}`" for f in sorted(files.keys())) main_validation = validate_code(files.get("main.py", "")) _align_status = "ALIGNED" if alignment_ok else f"MISALIGNED: {alignment_note}" @@ -1361,4 +2344,3 @@ def _execute_code_generation( artifacts=tuple(artifacts), evidence_refs=tuple(f"stage-10/{a}" for a in artifacts), ) - diff --git a/researchclaw/pipeline/stage_impls/_execution.py b/researchclaw/pipeline/stage_impls/_execution.py index 8858cc2d..b9a257cc 100644 --- a/researchclaw/pipeline/stage_impls/_execution.py +++ b/researchclaw/pipeline/stage_impls/_execution.py @@ -5,11 +5,14 @@ import json import logging import math +import os import re import time as _time from pathlib import Path from typing import Any +import yaml + from researchclaw.adapters import AdapterBundle from researchclaw.config import RCConfig from researchclaw.experiment.validator import ( @@ -21,6 +24,7 @@ from researchclaw.pipeline._domain import _detect_domain from researchclaw.pipeline._helpers import ( StageResult, + _build_research_repair_brief, _chat_with_prompt, _detect_runtime_issues, _ensure_sandbox_deps, @@ -32,6 +36,8 @@ _read_prior_artifact, _safe_filename, _safe_json_loads, + detect_synthetic_proxy_signals, + should_fail_synthetic_proxy_guard, _utcnow_iso, _write_stage_meta, ) @@ -41,6 +47,238 @@ logger = logging.getLogger(__name__) +_KNOWN_REAL_ASSET_ARG_MAP: tuple[tuple[str, str], ...] 
= ( + ("--simple_manifest", "VECTRA_SIMPLE_MANIFEST_PATH"), + ("--simple_heatmap_cache", "VECTRA_SIMPLE_HEATMAP_DIR"), + ("--simple_asset_root", "VECTRA_SIMPLE_ASSET_ROOT"), + ("--page_dataset_root", "VECTRA_PAGE_DATASET_ROOT"), + ("--page_image_dir", "VECTRA_PAGE_IMAGE_DIR"), + ("--page_sidecar_dir", "VECTRA_PAGE_SIDECAR_DIR"), + ("--page_split_json", "VECTRA_PAGE_SPLIT_JSON"), + ("--gt_solid_csv", "VECTRA_PAGE_GT_SOLID_CSV"), + ("--gt_dashed_csv", "VECTRA_PAGE_GT_DASHED_CSV"), + ("--deep_patent_root", "VECTRA_DEEPPATENT_DATASET_ROOT"), +) + + +def _collect_vectra_env_overrides() -> dict[str, str]: + """Return non-empty VECTRA_* variables from the current runtime.""" + overrides: dict[str, str] = {} + for name, value in os.environ.items(): + if not name.startswith("VECTRA_"): + continue + cleaned = str(value).strip() + if cleaned: + overrides[name] = cleaned + return overrides + + +def _build_project_entrypoint_runtime_overrides( + project_dir: Path, + *, + entry_point: str = "main.py", +) -> tuple[list[str], dict[str, str]]: + """Derive optional CLI args/env for generated projects that require local assets. + + The harness still calls ``python main.py`` by default, but some generated + experiments insist on asset-path flags. When those flags are present, use the + already-resolved VECTRA_* runtime variables as optional overrides. 
+ """ + env_overrides = _collect_vectra_env_overrides() + entry_path = project_dir / entry_point + if not entry_path.exists(): + return [], env_overrides + + try: + source = entry_path.read_text(encoding="utf-8") + except (OSError, UnicodeDecodeError): + return [], env_overrides + + args: list[str] = [] + missing: list[str] = [] + for flag, env_name in _KNOWN_REAL_ASSET_ARG_MAP: + if flag not in source: + continue + value = env_overrides.get(env_name, "").strip() + if value: + args.extend([flag, value]) + else: + missing.append(f"{flag} <= {env_name}") + + if args: + logger.info( + "Execution harness injecting %d real-data CLI args for %s", + len(args) // 2, + entry_path, + ) + if missing: + logger.warning( + "Execution harness detected asset CLI flags in %s but runtime vars were missing: %s", + entry_path, + ", ".join(missing), + ) + return args, env_overrides + + +def _shorten_prompt_text(text: str, *, max_chars: int) -> str: + """Collapse whitespace and trim *text* without cutting mid-word when possible.""" + compact = re.sub(r"\s+", " ", text).strip() + if len(compact) <= max_chars: + return compact + head = compact[: max_chars - 3].rsplit(" ", 1)[0].strip() + return f"{head or compact[: max_chars - 3]}..." 
+ + +def _compact_research_topic(topic: str) -> str: + """Return a short topic line for Stage 13 without discarding the plan.""" + if not topic.strip(): + return "" + lines = [line.strip() for line in topic.splitlines()] + summary_lines: list[str] = [] + stop_headers = ( + "important constraints:", + "please produce:", + "existing assets", + "existing baselines:", + "main source doc:", + "core code path:", + "key functions", + "existing result artifacts:", + "existing conclusions to preserve:", + ) + for line in lines: + if not line: + if summary_lines: + break + continue + lowered = line.lower() + if lowered.startswith("- ") or re.match(r"^\d+\.", line): + break + if any(lowered.startswith(header) for header in stop_headers): + break + summary_lines.append(line) + summary = " ".join(summary_lines) if summary_lines else topic + return _shorten_prompt_text(summary, max_chars=260) + + +def _extract_topic_constraints(topic: str, *, max_items: int = 8) -> list[str]: + """Pull compact bullet constraints out of long topic briefs.""" + lines = [line.strip() for line in topic.splitlines()] + bullets: list[str] = [] + capture = False + wanted_headers = { + "important constraints", + "existing conclusions to preserve", + } + for line in lines: + lowered = line.lower().rstrip(":") + if lowered in wanted_headers: + capture = True + continue + if not capture: + continue + if not line: + continue + if line.startswith("- "): + bullets.append(line[2:].strip()) + if len(bullets) >= max_items: + break + continue + if re.match(r"^\d+\.", line) or line.endswith(":"): + capture = False + return bullets + + +def _named_plan_entries(entries: Any, *, max_items: int = 8) -> list[str]: + """Extract entry names from list-shaped plan sections.""" + names: list[str] = [] + if not isinstance(entries, list): + return names + for entry in entries: + if not isinstance(entry, dict): + continue + name = entry.get("name") + if isinstance(name, str) and name.strip(): + names.append(name.strip()) + if 
len(names) >= max_items: + break + return names + + +def _build_refine_prompt_context( + topic: str, + exp_plan_text: str, + repair_brief: str = "", +) -> tuple[str, str]: + """Split long research briefs into a short topic and a structured anchor.""" + compact_topic = _compact_research_topic(topic) + plan_summary_lines: list[str] = [] + try: + plan_data = yaml.safe_load(exp_plan_text) if exp_plan_text.strip() else {} + except yaml.YAMLError: + plan_data = {} + if isinstance(plan_data, dict): + objectives = plan_data.get("objectives") + if isinstance(objectives, dict): + for label, key in ( + ("Problem formulation", "problem_formulation"), + ("Novelty statement", "novelty_statement"), + ("Recommended first prototype", "recommended_first_prototype"), + ): + value = objectives.get(key) + if isinstance(value, str) and value.strip(): + plan_summary_lines.append( + f"- {label}: {_shorten_prompt_text(value, max_chars=220)}" + ) + research_questions = objectives.get("research_questions") + if isinstance(research_questions, list) and research_questions: + rq_text = "; ".join( + str(item).strip() for item in research_questions[:4] if str(item).strip() + ) + if rq_text: + plan_summary_lines.append( + f"- Research questions: {_shorten_prompt_text(rq_text, max_chars=220)}" + ) + metrics = plan_data.get("metrics") + if isinstance(metrics, dict): + primary_metric = metrics.get("primary_metric") + if isinstance(primary_metric, dict): + metric_name = primary_metric.get("name") + if isinstance(metric_name, str) and metric_name.strip(): + plan_summary_lines.append(f"- Primary metric: {metric_name.strip()}") + for label, key in ( + ("Baselines", "baselines"), + ("Proposed methods", "proposed_methods"), + ("Ablations", "ablations"), + ): + names = _named_plan_entries(plan_data.get(key)) + if names: + plan_summary_lines.append(f"- {label}: {', '.join(names)}") + + anchor_parts: list[str] = [] + if repair_brief.strip(): + anchor_parts.append(repair_brief.strip()) + if plan_summary_lines: 
+ anchor_parts.append( + "Structured experiment plan summary:\n" + "\n".join(plan_summary_lines) + ) + topic_constraints = _extract_topic_constraints(topic) + if topic_constraints: + anchor_parts.append( + "Key research constraints to preserve:\n" + + "\n".join(f"- {item}" for item in topic_constraints) + ) + if exp_plan_text.strip(): + excerpt_limit = 900 if repair_brief.strip() else 1600 + excerpt = exp_plan_text[:excerpt_limit].rstrip() + suffix = "\n...\n" if len(exp_plan_text) > excerpt_limit else "\n" + anchor_parts.append( + "Original experiment plan excerpt:\n" + f"```yaml\n{excerpt}{suffix}```\n" + ) + return compact_topic or _shorten_prompt_text(topic, max_chars=260), "\n\n".join(anchor_parts) + + def _execute_resource_planning( stage_dir: Path, run_dir: Path, @@ -131,6 +369,8 @@ def _execute_experiment_run( runs_dir.mkdir(parents=True, exist_ok=True) mode = config.experiment.mode if mode in ("sandbox", "docker"): + stage_status = StageStatus.DONE + stage_error: str | None = None # P7: Auto-install missing dependencies before subprocess sandbox if mode == "sandbox": _all_code = code_text @@ -145,8 +385,14 @@ def _execute_experiment_run( sandbox = create_sandbox(config.experiment, runs_dir / "sandbox") # Use run_project for multi-file, run for single-file if exp_dir_path and Path(exp_dir_path).is_dir(): + entry_args, env_overrides = _build_project_entrypoint_runtime_overrides( + Path(exp_dir_path) + ) result = sandbox.run_project( - Path(exp_dir_path), timeout_sec=config.experiment.time_budget_sec + Path(exp_dir_path), + timeout_sec=config.experiment.time_budget_sec, + args=entry_args, + env_overrides=env_overrides, ) else: result = sandbox.run( @@ -220,6 +466,51 @@ def _execute_experiment_run( } if structured_results is not None: run_payload["structured_results"] = structured_results + + guard_issues: list[str] = [] + _structured_source = ( + structured_results.get("source") + if isinstance(structured_results, dict) + else None + ) + if 
getattr(config.experiment, "fail_on_stdout_parsed_results", False): + if structured_results is None and effective_metrics: + guard_issues.append( + "structured results.json was missing; metrics were only recoverable via stdout parsing" + ) + elif _structured_source == "stdout_parsed": + guard_issues.append( + "results.json declares source=stdout_parsed instead of a structured experiment output" + ) + + if getattr(config.experiment, "forbid_synthetic_proxy", False) and sandbox_project.exists(): + _project_texts: dict[str, str] = {} + for _pyf in sandbox_project.glob("*.py"): + try: + _project_texts[_pyf.name] = _pyf.read_text(encoding="utf-8") + except (OSError, UnicodeDecodeError): + continue + _proxy_signals = detect_synthetic_proxy_signals(_project_texts) + if should_fail_synthetic_proxy_guard(_proxy_signals): + guard_issues.extend(_proxy_signals) + + if guard_issues: + guard_payload = { + "status": "failed", + "issues": guard_issues, + "structured_results_present": structured_results is not None, + "structured_results_source": _structured_source, + "timestamp": _utcnow_iso(), + } + (stage_dir / "real_data_guard.json").write_text( + json.dumps(guard_payload, indent=2), encoding="utf-8" + ) + run_status = "failed" + run_payload["status"] = run_status + run_payload["real_data_guard"] = guard_payload + stage_status = StageStatus.FAILED + stage_error = "Real-data guard blocked proxy or stdout-only experiment results." 
+ # Auto-generate results.json from parsed metrics if sandbox didn't produce one if structured_results is None and effective_metrics: auto_results = {"source": "stdout_parsed", "metrics": effective_metrics} @@ -325,6 +616,18 @@ def _execute_experiment_run( (runs_dir / f"{_safe_filename(run_id)}.json").write_text( json.dumps(payload, indent=2), encoding="utf-8" ) + artifacts = ["runs/"] + evidence_refs = ["stage-12/runs/"] + if (stage_dir / "real_data_guard.json").exists(): + artifacts.append("real_data_guard.json") + evidence_refs.append("stage-12/real_data_guard.json") + return StageResult( + stage=Stage.EXPERIMENT_RUN, + status=stage_status, + artifacts=tuple(artifacts), + evidence_refs=tuple(evidence_refs), + error=stage_error, + ) return StageResult( stage=Stage.EXPERIMENT_RUN, status=StageStatus.DONE, @@ -652,6 +955,41 @@ def _files_to_context(project_files: dict[str, str]) -> str: parts.append(f"```filename:{fname}\n{code}\n```") return "\n\n".join(parts) + def _write_refinement_log() -> None: + (stage_dir / "refinement_log.json").write_text( + json.dumps(log, indent=2), encoding="utf-8" + ) + + def _pause_refinement( + *, + reason: str, + stop_reason: str, + iteration: int | None = None, + ) -> StageResult: + log.update( + { + "paused": True, + "converged": False, + "stop_reason": stop_reason, + "pause_reason": reason, + "best_metric": best_metric, + "best_version": best_version, + "iterations_completed": len(log["iterations"]), + } + ) + if iteration is not None: + log["pause_iteration"] = iteration + _write_refinement_log() + artifacts = ("refinement_log.json",) + return StageResult( + stage=Stage.ITERATIVE_REFINE, + status=StageStatus.PAUSED, + artifacts=artifacts, + error=reason, + decision="resume", + evidence_refs=tuple(f"stage-13/{a}" for a in artifacts), + ) + if llm is None: logger.info("Stage 13: LLM unavailable, saving original experiment as final") final_dir = stage_dir / "experiment_final" @@ -677,9 +1015,7 @@ def 
_files_to_context(project_files: dict[str, str]) -> str: ], } ) - (stage_dir / "refinement_log.json").write_text( - json.dumps(log, indent=2), encoding="utf-8" - ) + _write_refinement_log() artifacts = ("refinement_log.json", "experiment_final/") return StageResult( stage=Stage.ITERATIVE_REFINE, @@ -693,6 +1029,12 @@ def _files_to_context(project_files: dict[str, str]) -> str: # R7-3: Read experiment plan to detect condition coverage gaps _exp_plan_text = _read_prior_artifact(run_dir, "exp_plan.yaml") or "" + _repair_brief = _build_research_repair_brief(run_dir) + _refine_topic, _exp_plan_anchor = _build_refine_prompt_context( + config.research.topic, + _exp_plan_text, + _repair_brief, + ) _condition_coverage_hint = "" if _exp_plan_text and run_summaries: # Check if stdout contains condition labels @@ -764,14 +1106,6 @@ def _files_to_context(project_files: dict[str, str]) -> str: logger.warning("Stage 13: metric saturation detected, injecting difficulty upgrade hint") files_context = _files_to_context(best_files) - # BUG-10 fix: anchor refinement to original experiment plan - _exp_plan_anchor = "" - if _exp_plan_text.strip(): - _exp_plan_anchor = ( - "Original experiment plan (exp_plan.yaml):\n" - "```yaml\n" + _exp_plan_text[:4000] + "\n```\n" - "You MUST preserve ALL condition names from this plan.\n\n" - ) ip = _pm.sub_prompt( "iterative_improve", metric_key=metric_key, @@ -779,7 +1113,7 @@ def _files_to_context(project_files: dict[str, str]) -> str: files_context=files_context, run_summaries=chr(10).join(run_summaries[:20]), condition_coverage_hint=_condition_coverage_hint, - topic=config.research.topic, + topic=_refine_topic, exp_plan_anchor=_exp_plan_anchor, ) @@ -803,12 +1137,25 @@ def _files_to_context(project_files: dict[str, str]) -> str: timeout_refine_attempts, ) - response = _chat_with_prompt( - llm, - ip.system, - user_prompt, - max_tokens=ip.max_tokens or 8192, - ) + try: + response = _chat_with_prompt( + llm, + ip.system, + user_prompt, + 
max_tokens=ip.max_tokens or 8192, + ) + except RuntimeError as exc: + if "ACP prompt timed out after" in str(exc): + logger.warning( + "Stage 13: ACP prompt timed out during iteration %d; pausing for resume", + iteration, + ) + return _pause_refinement( + reason=str(exc), + stop_reason="acp_prompt_timeout", + iteration=iteration, + ) + raise extracted_files = _extract_multi_file_blocks(response.content) # If LLM returns only single block, treat as main.py update if not extracted_files: @@ -865,7 +1212,20 @@ def _files_to_context(project_files: dict[str, str]) -> str: issue_text=issue_text, all_files_ctx=_files_to_context(candidate_files), ) - repair_response = _chat_with_prompt(llm, irp.system, irp.user) + try: + repair_response = _chat_with_prompt(llm, irp.system, irp.user) + except RuntimeError as exc: + if "ACP prompt timed out after" in str(exc): + logger.warning( + "Stage 13: ACP repair prompt timed out during iteration %d; pausing for resume", + iteration, + ) + return _pause_refinement( + reason=str(exc), + stop_reason="acp_prompt_timeout", + iteration=iteration, + ) + raise candidate_files["main.py"] = _extract_code_block(repair_response.content) validation = validate_code(candidate_files["main.py"]) repaired = True @@ -898,9 +1258,14 @@ def _files_to_context(project_files: dict[str, str]) -> str: config.experiment, stage_dir / f"refine_sandbox_v{iteration}", ) + rerun_args, rerun_env = _build_project_entrypoint_runtime_overrides( + version_dir + ) rerun = sandbox.run_project( version_dir, timeout_sec=config.experiment.time_budget_sec, + args=rerun_args, + env_overrides=rerun_env, ) metric_val = _find_metric(rerun.metrics, metric_key) # R19-1: Store stdout (capped) so PAIRED lines survive for Stage 14 @@ -977,7 +1342,20 @@ def _files_to_context(project_files: dict[str, str]) -> str: issue_text=runtime_issues, all_files_ctx=_files_to_context(candidate_files), ) - repair_resp = _chat_with_prompt(llm, rrp.system, rrp.user) + try: + repair_resp = 
_chat_with_prompt(llm, rrp.system, rrp.user) + except RuntimeError as exc: + if "ACP prompt timed out after" in str(exc): + logger.warning( + "Stage 13: ACP runtime-repair prompt timed out during iteration %d; pausing for resume", + iteration, + ) + return _pause_refinement( + reason=str(exc), + stop_reason="acp_prompt_timeout", + iteration=iteration, + ) + raise repaired_files = _extract_multi_file_blocks(repair_resp.content) if not repaired_files: single = _extract_code_block(repair_resp.content) @@ -996,9 +1374,14 @@ def _files_to_context(project_files: dict[str, str]) -> str: config.experiment, stage_dir / f"refine_sandbox_v{iteration}_fix", ) + rerun2_args, rerun2_env = _build_project_entrypoint_runtime_overrides( + version_dir + ) rerun2 = sandbox2.run_project( version_dir, timeout_sec=config.experiment.time_budget_sec, + args=rerun2_args, + env_overrides=rerun2_env, ) metric_val = _find_metric(rerun2.metrics, metric_key) iter_record["sandbox_after_fix"] = { @@ -1067,9 +1450,7 @@ def _files_to_context(project_files: dict[str, str]) -> str: ) if _all_ablation_identical: log["ablation_identical_warning"] = True - (stage_dir / "refinement_log.json").write_text( - json.dumps(log, indent=2), encoding="utf-8" - ) + _write_refinement_log() artifacts = ["refinement_log.json", "experiment_final/"] artifacts.extend( diff --git a/tests/test_rc_cli.py b/tests/test_rc_cli.py index 3123ba82..4bf67e0d 100644 --- a/tests/test_rc_cli.py +++ b/tests/test_rc_cli.py @@ -9,6 +9,8 @@ from researchclaw import cli as rc_cli from researchclaw.config import resolve_config_path +from researchclaw.pipeline.executor import StageResult +from researchclaw.pipeline.stages import Stage, StageStatus def _write_valid_config(path: Path) -> None: @@ -100,6 +102,55 @@ def test_cmd_validate_valid_config_returns_zero( assert "Config validation passed" in capsys.readouterr().out +def test_cmd_run_reports_paused_pipeline( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, + capsys: 
pytest.CaptureFixture[str], +) -> None: + config_path = tmp_path / "config.yaml" + _write_valid_config(config_path) + output_dir = tmp_path / "artifacts" / "paused-run" + + from researchclaw.pipeline import runner as rc_runner + + monkeypatch.setattr( + rc_runner, + "execute_pipeline", + lambda **kwargs: [ + StageResult( + stage=Stage.TOPIC_INIT, + status=StageStatus.DONE, + artifacts=("goal.md",), + ), + StageResult( + stage=Stage.PROBLEM_DECOMPOSE, + status=StageStatus.PAUSED, + artifacts=("refinement_log.json",), + error="ACP prompt timed out after 1800s", + decision="resume", + ), + ], + ) + monkeypatch.setattr(rc_runner, "read_checkpoint", lambda run_dir: None) + + args = argparse.Namespace( + config=str(config_path), + topic=None, + output=str(output_dir), + from_stage=None, + auto_approve=False, + skip_preflight=True, + resume=False, + skip_noncritical_stage=False, + no_graceful_degradation=False, + ) + code = rc_cli.cmd_run(args) + captured = capsys.readouterr() + assert code == 0 + assert "Pipeline paused:" in captured.out + assert "1 paused" in captured.out + + def test_main_dispatches_run_command(monkeypatch: pytest.MonkeyPatch) -> None: captured = {} diff --git a/tests/test_rc_executor.py b/tests/test_rc_executor.py index 8554ad87..80b8046d 100644 --- a/tests/test_rc_executor.py +++ b/tests/test_rc_executor.py @@ -13,6 +13,7 @@ from researchclaw.adapters import AdapterBundle from researchclaw.config import RCConfig from researchclaw.pipeline import executor as rc_executor +from researchclaw.pipeline.stage_impls import _code_generation as code_generation from researchclaw.pipeline.stages import Stage, StageStatus @@ -37,6 +38,26 @@ def __init__(self, response_text: str = "mock response"): ) +class SequencedFakeLLMClient(FakeLLMClient): + def __init__(self, responses: list[str]): + super().__init__(response_text=responses[-1] if responses else "mock response") + self._responses = list(responses) + self._idx = 0 + + def chat(self, messages: list[dict[str, 
str]], **kwargs: object): + _ = kwargs + self.calls.append(messages) + from researchclaw.llm.client import LLMResponse + + if self._responses: + idx = min(self._idx, len(self._responses) - 1) + content = self._responses[idx] + self._idx += 1 + else: + content = self.response_text + return LLMResponse(content=content, model="fake-model") + + @pytest.fixture() def rc_config(tmp_path: Path) -> RCConfig: data = { @@ -272,6 +293,29 @@ def test_write_stage_meta_writes_expected_json(run_dir: Path) -> None: assert re.match(r"\d{4}-\d{2}-\d{2}T", payload["ts"]) +def test_write_stage_meta_keeps_paused_stage_as_next_stage(run_dir: Path) -> None: + stage_dir = run_dir / "stage-02" + stage_dir.mkdir() + result = rc_executor.StageResult( + stage=Stage.PROBLEM_DECOMPOSE, + status=StageStatus.PAUSED, + artifacts=("refinement_log.json",), + decision="resume", + error="ACP prompt timed out after 1800s", + evidence_refs=("stage-02/refinement_log.json",), + ) + rc_executor._write_stage_meta( + stage_dir, Stage.PROBLEM_DECOMPOSE, "run-paused", result + ) + payload = cast( + dict[str, Any], + json.loads((stage_dir / "decision.json").read_text(encoding="utf-8")), + ) + assert payload["status"] == "paused" + assert payload["decision"] == "resume" + assert payload["next_stage"] == int(Stage.PROBLEM_DECOMPOSE) + + def test_execute_stage_creates_stage_dir_writes_artifacts_and_meta( monkeypatch: pytest.MonkeyPatch, run_dir: Path, @@ -751,6 +795,45 @@ def test_refine_no_llm_saves_original_as_final( assert payload["stop_reason"] == "llm_unavailable" assert result.status == StageStatus.DONE + def test_refine_acp_timeout_pauses_for_resume( + self, + run_dir: Path, + rc_config: RCConfig, + adapters: AdapterBundle, + monkeypatch: pytest.MonkeyPatch, + ) -> None: + self._prepare_refine_inputs(run_dir) + stage_dir = run_dir / "stage-13" + stage_dir.mkdir(parents=True, exist_ok=True) + + from researchclaw.pipeline.stage_impls import _execution as execution_impl + + def _timeout(*args, **kwargs): + _ = 
args, kwargs + raise RuntimeError("ACP prompt timed out after 1800s") + + monkeypatch.setattr(execution_impl, "_chat_with_prompt", _timeout) + + result = rc_executor._execute_iterative_refine( + stage_dir, + run_dir, + rc_config, + adapters, + llm=FakeLLMClient("unused"), + ) + + payload = json.loads( + (stage_dir / "refinement_log.json").read_text(encoding="utf-8") + ) + assert result.status == StageStatus.PAUSED + assert result.decision == "resume" + assert result.artifacts == ("refinement_log.json",) + assert payload["paused"] is True + assert payload["stop_reason"] == "acp_prompt_timeout" + assert payload["pause_iteration"] == 1 + assert payload["best_version"] == "experiment/" + assert not (stage_dir / "experiment_final").exists() + def test_refine_with_llm_generates_improved_code( self, run_dir: Path, @@ -1911,6 +1994,379 @@ def test_compute_budget_injected_into_code_generation( ) assert "60" in all_user_msgs or "Compute Budget" in all_user_msgs + def test_code_generation_repairs_missing_local_helper_modules( + self, tmp_path: Path, run_dir: Path, adapters: AdapterBundle + ) -> None: + data = { + "project": {"name": "rc-test", "mode": "docs-first"}, + "research": { + "topic": "optimizer comparison", + "domains": ["ml"], + "daily_paper_count": 2, + "quality_threshold": 8.2, + }, + "runtime": {"timezone": "UTC"}, + "notifications": { + "channel": "local", + "on_stage_start": True, + "on_stage_fail": False, + "on_gate_required": True, + }, + "knowledge_base": {"backend": "markdown", "root": str(tmp_path / "kb")}, + "openclaw_bridge": {"use_memory": True, "use_message": True}, + "llm": { + "provider": "openai-compatible", + "base_url": "http://localhost:1234/v1", + "api_key_env": "RC_TEST_KEY", + "api_key": "inline-test-key", + "primary_model": "fake-model", + "fallback_models": [], + }, + "security": {"hitl_required_stages": [5, 9, 20]}, + "experiment": { + "mode": "sandbox", + "time_budget_sec": 30, + "metric_key": "primary_metric", + "metric_direction": 
"minimize", + "code_agent": {"enabled": False}, + "opencode": {"enabled": False}, + }, + } + cfg = RCConfig.from_dict(data, project_root=tmp_path, check_paths=False) + + _write_prior_artifact(run_dir, 10, "exp_plan.yaml", "objectives: test") + + initial_generation = ( + "```filename:main.py\n" + "from models import ToyModel\n\n" + "def main():\n" + " print('primary_metric: 0.3')\n\n" + "if __name__ == '__main__':\n" + " main()\n" + "```\n" + "```filename:requirements.txt\n" + "numpy\n" + "```" + ) + review_json = '{"score": 8, "issues": [], "verdict": "pass"}' + alignment_json = '{"aligned": true, "reason": "", "suggestions": ""}' + self_contained_fix = ( + "```filename:main.py\n" + "from models import ToyModel\n\n" + "def main():\n" + " _ = ToyModel()\n" + " print('primary_metric: 0.3')\n\n" + "if __name__ == '__main__':\n" + " main()\n" + "```\n" + "```filename:models.py\n" + "class ToyModel:\n" + " pass\n" + "```\n" + "```filename:requirements.txt\n" + "numpy\n" + "```" + ) + llm = SequencedFakeLLMClient( + [initial_generation, review_json, alignment_json, self_contained_fix] + ) + stage_dir = run_dir / "stage-11" + stage_dir.mkdir(parents=True, exist_ok=True) + + result = rc_executor._execute_code_generation( + stage_dir, run_dir, cfg, adapters, llm=llm + ) + + assert result.status == StageStatus.DONE + exp_dir = stage_dir / "experiment" + assert (exp_dir / "main.py").exists() + assert (exp_dir / "models.py").exists() + + def test_detects_placeholder_ablation_stubs(self) -> None: + issues = code_generation._find_placeholder_experiment_issues( + { + "main.py": ( + "# Dummy Implementations for Standalone Operation\n" + "class ClutterAwareDisagreementRadiusAdaptiveReranker:\n" + " def __init__(self, hparams):\n" + " pass\n" + " def evaluate(self, seed=None, regime=None):\n" + " return 0.22\n" + ) + } + ) + + assert any("Placeholder experiment text found" in issue for issue in issues) + assert any( + "placeholder experiment implementation" in issue + or 
"demonstration stub" in issue + for issue in issues + ) + + def test_detects_missing_condition_distinctness_check(self) -> None: + issues = code_generation._find_condition_distinctness_issues( + { + "main.py": ( + "class BaselineVerifier:\n" + " def predict(self, value):\n" + " return {'score': value}\n\n" + "class AblationWithoutRadius:\n" + " def predict(self, value):\n" + " return {'score': value + 1}\n\n" + "models = [\n" + " ('Baseline', BaselineVerifier()),\n" + " ('Abl_NoRadius', AblationWithoutRadius()),\n" + " ('Abl_NoVoteShape', AblationWithoutRadius()),\n" + " ('FusionWithoutScaleBins', AblationWithoutRadius()),\n" + "]\n" + ) + } + ) + + assert any( + "No ablation/condition distinctness self-check found" in issue + for issue in issues + ) + + def test_code_generation_repairs_placeholder_ablation_stubs( + self, tmp_path: Path, run_dir: Path, adapters: AdapterBundle + ) -> None: + data = { + "project": {"name": "rc-test", "mode": "docs-first"}, + "research": { + "topic": "geometry-learning fusion ablation study", + "domains": ["ml"], + "daily_paper_count": 2, + "quality_threshold": 8.2, + }, + "runtime": {"timezone": "UTC"}, + "notifications": { + "channel": "local", + "on_stage_start": True, + "on_stage_fail": False, + "on_gate_required": True, + }, + "knowledge_base": {"backend": "markdown", "root": str(tmp_path / "kb")}, + "openclaw_bridge": {"use_memory": True, "use_message": True}, + "llm": { + "provider": "openai-compatible", + "base_url": "http://localhost:1234/v1", + "api_key_env": "RC_TEST_KEY", + "api_key": "inline-test-key", + "primary_model": "fake-model", + "fallback_models": [], + }, + "security": {"hitl_required_stages": [5, 9, 20]}, + "experiment": { + "mode": "sandbox", + "time_budget_sec": 30, + "metric_key": "primary_metric", + "metric_direction": "minimize", + "code_agent": {"enabled": False}, + "opencode": {"enabled": False}, + }, + } + cfg = RCConfig.from_dict(data, project_root=tmp_path, check_paths=False) + 
_write_prior_artifact(run_dir, 10, "exp_plan.yaml", "objectives: test") + + initial_generation = ( + "```filename:main.py\n" + "# Dummy Implementations for Standalone Operation\n" + "class GeometryFusionVerifier:\n" + " def __init__(self, hparams):\n" + " pass\n\n" + " def evaluate(self, seed=None, regime=None):\n" + " return 0.2\n\n" + "class AblationWithoutRadius:\n" + " def __init__(self, hparams):\n" + " pass\n\n" + " def evaluate(self, seed=None, regime=None):\n" + " return 0.2\n\n" + "def main():\n" + " print('primary_metric: 0.3')\n\n" + "if __name__ == '__main__':\n" + " main()\n" + "```\n" + "```filename:requirements.txt\n" + "numpy\n" + "```" + ) + repaired_generation = ( + "```filename:main.py\n" + "class GeometryFusionVerifier:\n" + " def __init__(self, hparams):\n" + " self.bias = float(hparams.get('bias', 0.0))\n\n" + " def evaluate(self, seed=None, regime=None):\n" + " seed = 0 if seed is None else int(seed)\n" + " base = 0.10 + 0.02 * (seed % 3)\n" + " if regime == 'hard':\n" + " base += 0.05\n" + " return base + self.bias\n\n" + "class AblationWithoutRadius(GeometryFusionVerifier):\n" + " def evaluate(self, seed=None, regime=None):\n" + " base = super().evaluate(seed=seed, regime=regime)\n" + " return base + 0.07\n\n" + "def main():\n" + " verifier = GeometryFusionVerifier({'bias': 0.01})\n" + " ablation = AblationWithoutRadius({'bias': 0.01})\n" + " primary_metric = min(\n" + " verifier.evaluate(seed=1, regime='hard'),\n" + " ablation.evaluate(seed=1, regime='hard'),\n" + " )\n" + " print(f'primary_metric: {primary_metric:.3f}')\n\n" + "if __name__ == '__main__':\n" + " main()\n" + "```\n" + "```filename:requirements.txt\n" + "numpy\n" + "```" + ) + review_json = '{"score": 8, "issues": [], "verdict": "pass"}' + alignment_json = '{"aligned": true, "reason": "", "suggestions": ""}' + llm = SequencedFakeLLMClient( + [initial_generation, repaired_generation, review_json, alignment_json] + ) + stage_dir = run_dir / "stage-11" + 
stage_dir.mkdir(parents=True, exist_ok=True) + + result = rc_executor._execute_code_generation( + stage_dir, run_dir, cfg, adapters, llm=llm + ) + + assert result.status == StageStatus.DONE + main_text = (stage_dir / "experiment" / "main.py").read_text( + encoding="utf-8" + ) + assert "Dummy Implementations" not in main_text + assert "return 0.2" not in main_text + + def test_code_generation_repairs_missing_condition_distinctness_check( + self, tmp_path: Path, run_dir: Path, adapters: AdapterBundle + ) -> None: + data = { + "project": {"name": "rc-test", "mode": "docs-first"}, + "research": { + "topic": "geometry-learning fusion ablation study", + "domains": ["ml"], + "daily_paper_count": 2, + "quality_threshold": 8.2, + }, + "runtime": {"timezone": "UTC"}, + "notifications": { + "channel": "local", + "on_stage_start": True, + "on_stage_fail": False, + "on_gate_required": True, + }, + "knowledge_base": {"backend": "markdown", "root": str(tmp_path / "kb")}, + "openclaw_bridge": {"use_memory": True, "use_message": True}, + "llm": { + "provider": "openai-compatible", + "base_url": "http://localhost:1234/v1", + "api_key_env": "RC_TEST_KEY", + "api_key": "inline-test-key", + "primary_model": "fake-model", + "fallback_models": [], + }, + "security": {"hitl_required_stages": [5, 9, 20]}, + "experiment": { + "mode": "sandbox", + "time_budget_sec": 30, + "metric_key": "primary_metric", + "metric_direction": "minimize", + "code_agent": {"enabled": False}, + "opencode": {"enabled": False}, + }, + } + cfg = RCConfig.from_dict(data, project_root=tmp_path, check_paths=False) + _write_prior_artifact(run_dir, 10, "exp_plan.yaml", "objectives: test") + + initial_generation = ( + "```filename:main.py\n" + "class BaselineVerifier:\n" + " def predict(self, value):\n" + " return {'score': value}\n\n" + "class AblationWithoutRadius:\n" + " def predict(self, value):\n" + " return {'score': value + 1}\n\n" + "class AblationWithoutVoteShape:\n" + " def predict(self, value):\n" + " return 
{'score': value + 2}\n\n" + "class FusionWithoutScaleBins:\n" + " def predict(self, value):\n" + " return {'score': value + 3}\n\n" + "models = [\n" + " ('Baseline', BaselineVerifier()),\n" + " ('Abl_NoRadius', AblationWithoutRadius()),\n" + " ('Abl_NoVoteShape', AblationWithoutVoteShape()),\n" + " ('FusionWithoutScaleBins', FusionWithoutScaleBins()),\n" + "]\n\n" + "def sanity_check_condition_outputs_differ():\n" + " pass\n\n" + "def main():\n" + " print('primary_metric: 0.3')\n\n" + "if __name__ == '__main__':\n" + " main()\n" + "```\n" + "```filename:requirements.txt\n" + "numpy\n" + "```" + ) + repaired_generation = ( + "```filename:main.py\n" + "class BaselineVerifier:\n" + " def predict(self, value):\n" + " return {'score': value}\n\n" + "class AblationWithoutRadius:\n" + " def predict(self, value):\n" + " return {'score': value + 1}\n\n" + "class AblationWithoutVoteShape:\n" + " def predict(self, value):\n" + " return {'score': value + 2}\n\n" + "class FusionWithoutScaleBins:\n" + " def predict(self, value):\n" + " return {'score': value + 3}\n\n" + "models = [\n" + " ('Baseline', BaselineVerifier()),\n" + " ('Abl_NoRadius', AblationWithoutRadius()),\n" + " ('Abl_NoVoteShape', AblationWithoutVoteShape()),\n" + " ('FusionWithoutScaleBins', FusionWithoutScaleBins()),\n" + "]\n\n" + "def sanity_check_condition_outputs_differ():\n" + " probe = 5\n" + " outputs = {name: model.predict(probe)['score'] for name, model in models}\n" + " assert len(set(outputs.values())) == len(outputs), outputs\n" + " print('ABLATION_CHECK: outputs_differ=True')\n\n" + "def main():\n" + " sanity_check_condition_outputs_differ()\n" + " print('primary_metric: 0.3')\n\n" + "if __name__ == '__main__':\n" + " main()\n" + "```\n" + "```filename:requirements.txt\n" + "numpy\n" + "```" + ) + review_json = '{"score": 8, "issues": [], "verdict": "pass"}' + alignment_json = '{"aligned": true, "reason": "", "suggestions": ""}' + llm = SequencedFakeLLMClient( + [initial_generation, 
repaired_generation, review_json, alignment_json] + ) + stage_dir = run_dir / "stage-11" + stage_dir.mkdir(parents=True, exist_ok=True) + + result = rc_executor._execute_code_generation( + stage_dir, run_dir, cfg, adapters, llm=llm + ) + + assert result.status == StageStatus.DONE + main_text = (stage_dir / "experiment" / "main.py").read_text( + encoding="utf-8" + ) + assert "def sanity_check_condition_outputs_differ" in main_text + assert "pass" not in main_text + assert "sanity_check_condition_outputs_differ()" in main_text + class TestPartialTimeoutStatus: """Test partial status for timed-out experiments with data (R4-1c).""" @@ -3172,6 +3628,55 @@ def test_topic_alignment_in_refine_prompt(self) -> None: assert "NEVER rename" in sp.user +class TestRefinePromptCompaction: + def test_build_refine_prompt_context_preserves_constraints(self) -> None: + from researchclaw.pipeline.stage_impls._execution import ( + _build_refine_prompt_context, + ) + + topic = """ + Design a research project around hybrid circle localization for engineering drawings. + + Important constraints: + - Keep the direction hybrid geometry + learning + - Focus especially on small circles, partial circles, dashed circles, and cluttered drawings + - Do not propose a purely black-box end-to-end detector + + Please produce: + 1. Problem formulation + 2. Novelty statement + """ + exp_plan = """ +baselines: +- name: ExplicitArcVoteRuleCascade +- name: ImplicitHeatmapPeakVerifier +proposed_methods: +- name: ScaleBinnedRulePriorHeatmapCalibration +ablations: +- name: AntiEvidenceRerankerWithFixedPatchVerifier +metrics: + primary_metric: + name: hard_subset_miss_rate +objectives: + problem_formulation: Treat circle localization as a comparison between D(x) and H(x). + novelty_statement: Audit whether geometric and learned vote fields are complementary. + recommended_first_prototype: Start with cached diagnostics and two shallow trainable methods. + research_questions: + - Are D(x) and H(x) complementary? 
+ - Does rule density help more as calibration or anti-evidence? +""" + + compact_topic, anchor = _build_refine_prompt_context(topic, exp_plan) + + assert len(compact_topic) < len(topic) + assert "Important constraints" not in compact_topic + assert "Structured experiment plan summary" in anchor + assert "ScaleBinnedRulePriorHeatmapCalibration" in anchor + assert "hard_subset_miss_rate" in anchor + assert "Key research constraints to preserve" in anchor + assert "Keep the direction hybrid geometry + learning" in anchor + + # ===================================================================== # _validate_draft_quality tests # ===================================================================== diff --git a/tests/test_rc_runner.py b/tests/test_rc_runner.py index 0d178529..5e5d6238 100644 --- a/tests/test_rc_runner.py +++ b/tests/test_rc_runner.py @@ -53,6 +53,16 @@ def _failed(stage: Stage, msg: str = "boom") -> StageResult: return StageResult(stage=stage, status=StageStatus.FAILED, artifacts=(), error=msg) +def _paused(stage: Stage, msg: str = "resume needed") -> StageResult: + return StageResult( + stage=stage, + status=StageStatus.PAUSED, + artifacts=("refinement_log.json",), + error=msg, + decision="resume", + ) + + def _blocked(stage: Stage) -> StageResult: return StageResult( stage=stage, @@ -113,6 +123,37 @@ def mock_execute_stage(stage: Stage, **kwargs) -> StageResult: assert len(results) == int(fail_stage) +def test_execute_pipeline_stops_on_paused_stage( + monkeypatch: pytest.MonkeyPatch, + run_dir: Path, + rc_config: RCConfig, + adapters: AdapterBundle, +) -> None: + pause_stage = Stage.ITERATIVE_REFINE + + def mock_execute_stage(stage: Stage, **kwargs) -> StageResult: + _ = kwargs + if stage == pause_stage: + return _paused(stage, "ACP prompt timed out after 1800s") + return _done(stage) + + monkeypatch.setattr(rc_runner, "execute_stage", mock_execute_stage) + results = rc_runner.execute_pipeline( + run_dir=run_dir, + run_id="run-paused", + 
config=rc_config, + adapters=adapters, + ) + assert results[-1].stage == pause_stage + assert results[-1].status == StageStatus.PAUSED + assert len(results) == int(pause_stage) + checkpoint = json.loads((run_dir / "checkpoint.json").read_text(encoding="utf-8")) + assert checkpoint["last_completed_stage"] == int(Stage.EXPERIMENT_RUN) + summary = json.loads((run_dir / "pipeline_summary.json").read_text(encoding="utf-8")) + assert summary["stages_paused"] == 1 + assert summary["final_status"] == "paused" + + def test_execute_pipeline_stops_on_gate_when_stop_on_gate_enabled( monkeypatch: pytest.MonkeyPatch, run_dir: Path, @@ -217,6 +258,7 @@ def mock_execute_stage(stage: Stage, **kwargs) -> StageResult: assert summary["stages_done"] == sum( 1 for r in results if r.status == StageStatus.DONE ) + assert summary["stages_paused"] == 0 assert summary["stages_blocked"] == 1 assert summary["stages_failed"] == 1 assert summary["from_stage"] == 1 @@ -337,6 +379,7 @@ def test_should_start_logic(stage: Stage, started: bool, expected: bool) -> None [ ([], "no_stages", int(Stage.TOPIC_INIT)), ([_done(Stage.TOPIC_INIT)], "done", int(Stage.TOPIC_INIT)), + ([_done(Stage.TOPIC_INIT), _paused(Stage.PROBLEM_DECOMPOSE)], "paused", int(Stage.PROBLEM_DECOMPOSE)), ( [_done(Stage.TOPIC_INIT), _failed(Stage.PROBLEM_DECOMPOSE)], "failed", diff --git a/tests/test_ssh_and_colab_sandbox.py b/tests/test_ssh_and_colab_sandbox.py index d3436888..2132a459 100644 --- a/tests/test_ssh_and_colab_sandbox.py +++ b/tests/test_ssh_and_colab_sandbox.py @@ -508,7 +508,42 @@ def fail_with_other_error(acpx: str, prompt: str) -> str: import pytest with pytest.raises(RuntimeError, match="permission denied"): client._send_prompt("test prompt") - assert call_count == 1 # no retry + + def test_stateless_reconnect_on_session_died(self): + """Stateless mode retries with a fresh ephemeral session on reconnect errors.""" + from researchclaw.llm.acp_client import ACPClient, ACPConfig + + client = 
ACPClient(ACPConfig(agent="claude", stateless_prompt=True)) + client._acpx = "/usr/bin/true" + + sessions: list[str] = [] + closed: list[str] = [] + call_count = 0 + + def fake_new_ephemeral(acpx: str) -> str: + name = f"ephemeral-{len(sessions) + 1}" + sessions.append(name) + return name + + def fake_close_named(acpx: str, session_name: str) -> None: + closed.append(session_name) + + def fake_cli(acpx: str, prompt: str, *, session_name: str | None = None) -> str: + nonlocal call_count + call_count += 1 + if call_count == 1: + raise RuntimeError("ACP prompt failed (exit 1): agent needs reconnect") + return f"success via {session_name}" + + client._new_ephemeral_session = fake_new_ephemeral # type: ignore[assignment] + client._close_named_session = fake_close_named # type: ignore[assignment] + client._send_prompt_cli = fake_cli # type: ignore[assignment] + + result = client._send_prompt("test prompt") + assert result == "success via ephemeral-2" + assert call_count == 2 + assert sessions == ["ephemeral-1", "ephemeral-2"] + assert closed == ["ephemeral-1", "ephemeral-2"] # =========================================================================== From 18030fac1f6ad9ccc3f46fcbc8ab8cd0e1177162 Mon Sep 17 00:00:00 2001 From: CKwin26 <156837805+CKwin26@users.noreply.github.com> Date: Tue, 31 Mar 2026 01:41:39 -0400 Subject: [PATCH 2/2] add manual repair workflows --- README.md | 65 +- autoresearchclaw/__init__.py | 2 + autoresearchclaw/__main__.py | 6 + autoresearchclaw/cli.py | 254 ++++++ autoresearchclaw/paper_repair.py | 349 +++++++++ autoresearchclaw/research_repair.py | 1133 +++++++++++++++++++++++++++ pyproject.toml | 3 +- 7 files changed, 1810 insertions(+), 2 deletions(-) create mode 100644 autoresearchclaw/__init__.py create mode 100644 autoresearchclaw/__main__.py create mode 100644 autoresearchclaw/cli.py create mode 100644 autoresearchclaw/paper_repair.py create mode 100644 autoresearchclaw/research_repair.py diff --git a/README.md b/README.md index 
5d35ddb8..0ec39db7 100644
--- a/README.md
+++ b/README.md
@@ -124,7 +124,70 @@
 export OPENAI_API_KEY="sk-..."
 researchclaw run --config config.arc.yaml --topic "Your research idea" --auto-approve
 ```
-Output → `artifacts/rc-YYYYMMDD-HHMMSS-<slug>/deliverables/` — compile-ready LaTeX, BibTeX, experiment code, charts.
+Output → `artifacts/rc-YYYYMMDD-HHMMSS-<slug>/deliverables/` - compile-ready LaTeX, BibTeX, experiment code, charts.
+
+## Repair Workflows
+
+AutoResearchClaw already has in-pipeline rollback and auto-repair loops. This repo now also ships a **manual repair companion CLI** for cases where a human wants to take a completed run and:
+
+- patch exported paper artifacts without rerunning the pipeline
+- or create a repair child run that reuses early stages and reruns from a later authoritative stage such as Stage 9, 10, or 12
+
+These workflows are exposed through a second CLI:
+
+```bash
+autoresearchclaw --help
+```
+
+### Paper Repair
+
+Use paper repair when the research run is complete but the exported paper package still needs human cleanup.
+
+```bash
+autoresearchclaw paper-repair-init \
+  --run-dir artifacts/rc-YYYYMMDD-HHMMSS-<slug> \
+  --output-dir artifacts/paper-repair/my-run-v1
+```
+
+Edit files under `workspace/`, then publish them back into the source run:
+
+```bash
+autoresearchclaw paper-repair-apply \
+  --repair-json artifacts/paper-repair/my-run-v1/paper-repair.json \
+  --note "Clarify wording and fix paper packaging"
+```
+
+If needed, roll back the most recent publish:
+
+```bash
+autoresearchclaw paper-repair-rollback \
+  --repair-json artifacts/paper-repair/my-run-v1/paper-repair.json
+```
+
+### Research Repair
+
+Use research repair when the completed run needs more data, more seeds, stronger protocol coverage, or a return to earlier experiment stages.
+
+```bash
+autoresearchclaw research-repair-init \
+  --run-dir artifacts/rc-YYYYMMDD-HHMMSS-<slug> \
+  --output-dir artifacts/research-repair/my-run-v1 \
+  --config config.arc.yaml \
+  --target-stage EXPERIMENT_DESIGN \
+  --reason "Human review found insufficient experiment coverage." \
+  --feedback "Use real local assets only." \
+  --feedback "Increase experiment coverage before claiming results."
+```
+
+Then prepare a child run:
+
+```bash
+autoresearchclaw research-repair-run \
+  --repair-json artifacts/research-repair/my-run-v1/research-repair.json \
+  --skip-preflight
+```
+
+This creates a child-run config, launch script, repair metadata, and a compact repair brief. Add `--execute` when you are ready to launch the rerun.
📝 Minimum required config diff --git a/autoresearchclaw/__init__.py b/autoresearchclaw/__init__.py new file mode 100644 index 00000000..e5430bc9 --- /dev/null +++ b/autoresearchclaw/__init__.py @@ -0,0 +1,2 @@ +"""Companion repair workflows for completed AutoResearchClaw runs.""" + diff --git a/autoresearchclaw/__main__.py b/autoresearchclaw/__main__.py new file mode 100644 index 00000000..0b6ae7cd --- /dev/null +++ b/autoresearchclaw/__main__.py @@ -0,0 +1,6 @@ +from .cli import main + + +if __name__ == "__main__": + raise SystemExit(main()) + diff --git a/autoresearchclaw/cli.py b/autoresearchclaw/cli.py new file mode 100644 index 00000000..57de5005 --- /dev/null +++ b/autoresearchclaw/cli.py @@ -0,0 +1,254 @@ +from __future__ import annotations + +import argparse +import sys + +from .paper_repair import ( + PaperRepairError, + apply_paper_repair, + init_paper_repair, + rollback_paper_repair, +) +from .research_repair import ( + ResearchRepairError, + init_research_repair, + prepare_research_repair_run, +) + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + prog="autoresearchclaw", + description="Manual paper-level and research-level repair workflows for completed AutoResearchClaw runs.", + ) + subparsers = parser.add_subparsers(dest="command", required=True) + + paper_init = subparsers.add_parser( + "paper-repair-init", + help="Create an editable post-export paper repair workspace from a completed run.", + ) + paper_init.add_argument("--run-dir", required=True, help="Completed run directory with stage-22 or stage-23 paper artifacts.") + paper_init.add_argument( + "--output-dir", + default="artifacts/paper-repair", + help="Directory where the paper repair workspace and manifest will be written.", + ) + + paper_apply = subparsers.add_parser( + "paper-repair-apply", + help="Publish repaired paper artifacts back into the source run.", + ) + paper_apply.add_argument("--repair-json", required=True, help="Path to a 
paper-repair.json manifest.") + paper_apply.add_argument("--note", help="Optional note describing the published paper fix.") + + paper_rollback = subparsers.add_parser( + "paper-repair-rollback", + help="Restore the most recent published paper repair snapshot into the source run.", + ) + paper_rollback.add_argument("--repair-json", required=True, help="Path to a paper-repair.json manifest.") + paper_rollback.add_argument("--backup-id", help="Optional backup id to roll back to. Defaults to the most recent publish.") + + research_init = subparsers.add_parser( + "research-repair-init", + help="Create a run-level repair workspace that can send a completed run back to experiment stages.", + ) + research_init.add_argument("--run-dir", required=True, help="Existing AutoResearchClaw run directory to repair.") + research_init.add_argument( + "--output-dir", + default="artifacts/research-repair", + help="Directory where the research-repair workspace and manifest will be written.", + ) + research_init.add_argument( + "--config", + default="config.arc.yaml", + help="Base config to copy into the repair workspace for the child run.", + ) + research_init.add_argument( + "--target-stage", + default="EXPERIMENT_DESIGN", + help="Stage number or stage name to restart from, such as 9, CODE_GENERATION, or EXPERIMENT_RUN.", + ) + research_init.add_argument("--reason", help="Short human reason for why the completed run should be repaired.") + research_init.add_argument( + "--feedback", + action="append", + default=[], + help="Initial repair feedback bullet to seed into workspace/feedback.md. 
Repeatable.", + ) + research_init.add_argument( + "--upstream-root", + default=".", + help="Path to the AutoResearchClaw checkout used for child runs.", + ) + + research_run = subparsers.add_parser( + "research-repair-run", + help="Prepare, and optionally launch, a child run from a research-repair workspace.", + ) + research_run.add_argument("--repair-json", required=True, help="Path to a research-repair.json manifest.") + research_run.add_argument("--output-dir", help="Optional explicit child run output directory.") + research_run.add_argument( + "--feedback", + action="append", + default=[], + help="Additional repair feedback bullet to append before generating the child run. Repeatable.", + ) + research_run.add_argument( + "--auto-approve", + action="store_true", + help="Launch the child run with --auto-approve so the child pipeline will not stop at quality gates.", + ) + research_run.add_argument( + "--skip-preflight", + action="store_true", + help="Pass --skip-preflight to the child run command.", + ) + research_run.add_argument( + "--execute", + action="store_true", + help="Actually launch the child run. 
Without this flag, only launch metadata and scripts are prepared.", + ) + + return parser + + +def main(argv: list[str] | None = None) -> int: + parser = build_parser() + args = parser.parse_args(argv) + + if args.command == "paper-repair-init": + return _run_paper_repair_init(args.run_dir, args.output_dir) + if args.command == "paper-repair-apply": + return _run_paper_repair_apply(args.repair_json, args.note) + if args.command == "paper-repair-rollback": + return _run_paper_repair_rollback(args.repair_json, args.backup_id) + if args.command == "research-repair-init": + return _run_research_repair_init( + args.run_dir, + args.output_dir, + args.config, + args.target_stage, + args.reason, + list(args.feedback), + args.upstream_root, + ) + if args.command == "research-repair-run": + return _run_research_repair_run( + args.repair_json, + args.output_dir, + list(args.feedback), + bool(args.auto_approve), + bool(args.skip_preflight), + bool(args.execute), + ) + + parser.error(f"Unknown command: {args.command}") + return 2 + + +def _run_paper_repair_init(run_dir: str, output_dir: str) -> int: + try: + outputs = init_paper_repair(run_dir, output_dir) + except (PaperRepairError, OSError) as exc: + print(f"Paper repair init failed: {exc}", file=sys.stderr) + return 1 + + print("Paper repair workspace created") + print(f"Workspace: {outputs['workspace']}") + print(f"Session JSON: {outputs['session_json']}") + print(f"README: {outputs['readme']}") + return 0 + + +def _run_paper_repair_apply(repair_json: str, note: str | None) -> int: + try: + outputs = apply_paper_repair(repair_json, note=note) + except (PaperRepairError, OSError) as exc: + print(f"Paper repair publish failed: {exc}", file=sys.stderr) + return 1 + + print("Paper repair published") + print(f"Run dir: {outputs['published_run_dir']}") + print(f"Backup dir: {outputs['backup_dir']}") + print(f"Session JSON: {outputs['session_json']}") + return 0 + + +def _run_paper_repair_rollback(repair_json: str, backup_id: str 
| None) -> int: + try: + outputs = rollback_paper_repair(repair_json, backup_id=backup_id) + except (PaperRepairError, OSError) as exc: + print(f"Paper repair rollback failed: {exc}", file=sys.stderr) + return 1 + + print("Paper repair rolled back") + print(f"Run dir: {outputs['published_run_dir']}") + print(f"Rolled back backup: {outputs['rolled_back_backup']}") + print(f"Session JSON: {outputs['session_json']}") + return 0 + + +def _run_research_repair_init( + run_dir: str, + output_dir: str, + config_path: str, + target_stage: str, + reason: str | None, + feedback: list[str], + upstream_root: str, +) -> int: + try: + outputs = init_research_repair( + run_dir, + output_dir, + config_path=config_path, + target_stage=target_stage, + reason=reason, + feedback=feedback, + upstream_root=upstream_root, + ) + except (ResearchRepairError, OSError) as exc: + print(f"Research repair init failed: {exc}", file=sys.stderr) + return 1 + + print("Research repair workspace created") + print(f"Workspace: {outputs['workspace']}") + print(f"Session JSON: {outputs['session_json']}") + print(f"Feedback: {outputs['feedback']}") + print(f"Repair config: {outputs['repair_config']}") + print(f"README: {outputs['readme']}") + return 0 + + +def _run_research_repair_run( + repair_json: str, + output_dir: str | None, + feedback: list[str], + auto_approve: bool, + skip_preflight: bool, + execute: bool, +) -> int: + try: + outputs = prepare_research_repair_run( + repair_json, + output_dir=output_dir, + extra_feedback=feedback, + auto_approve=auto_approve, + skip_preflight=skip_preflight, + execute=execute, + ) + except (ResearchRepairError, OSError) as exc: + print(f"Research repair launch preparation failed: {exc}", file=sys.stderr) + return 1 + + print("Research repair child run prepared") + print(f"Child run dir: {outputs['child_run_dir']}") + print(f"Generated config: {outputs['generated_config']}") + print(f"Launch script: {outputs['launch_script']}") + print(f"Metadata: 
{outputs['metadata']}") + print("Command preview:") + print(outputs["command_preview"]) + if execute and outputs.get("pid"): + print(f"Process pid: {outputs['pid']}") + print(f"Session JSON: {outputs['session_json']}") + return 0 diff --git a/autoresearchclaw/paper_repair.py b/autoresearchclaw/paper_repair.py new file mode 100644 index 00000000..b039397c --- /dev/null +++ b/autoresearchclaw/paper_repair.py @@ -0,0 +1,349 @@ +from __future__ import annotations + +from dataclasses import asdict, dataclass, field +from datetime import datetime, timezone +from json import dumps, loads +from pathlib import Path +from shutil import copy2, copytree, rmtree +from typing import Any + + +class PaperRepairError(ValueError): + """Raised when a paper-repair session cannot be created or applied.""" + + +TRACKED_STAGE_PATHS: dict[str, tuple[str, ...]] = { + "stage-22": ( + "paper.tex", + "paper.pdf", + "paper_final.md", + "paper_final_latex.md", + "references.bib", + "references_verified.bib", + "neurips_2025.sty", + "charts", + ), + "stage-23": ( + "paper_final_verified.md", + "references_verified.bib", + "verification_report.json", + "charts", + ), +} + + +@dataclass(frozen=True) +class TrackedItem: + relative_path: str + kind: str + exists: bool + + def to_dict(self) -> dict[str, Any]: + return asdict(self) + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> TrackedItem: + return cls( + relative_path=str(data.get("relative_path", "")), + kind=str(data.get("kind", "file")), + exists=bool(data.get("exists", False)), + ) + + +@dataclass(frozen=True) +class ApplyEntry: + backup_id: str + applied_at: str + note: str + backup_dir: str + + def to_dict(self) -> dict[str, Any]: + return asdict(self) + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> ApplyEntry: + return cls( + backup_id=str(data.get("backup_id", "")), + applied_at=str(data.get("applied_at", "")), + note=str(data.get("note", "")), + backup_dir=str(data.get("backup_dir", "")), + ) + + 
+@dataclass(frozen=True) +class PaperRepairSession: + source_run_dir: str + session_dir: str + workspace_dir: str + created_at: str + tracked_items: tuple[TrackedItem, ...] + apply_history: tuple[ApplyEntry, ...] = field(default_factory=tuple) + + def to_dict(self) -> dict[str, Any]: + return { + "source_run_dir": self.source_run_dir, + "session_dir": self.session_dir, + "workspace_dir": self.workspace_dir, + "created_at": self.created_at, + "tracked_items": [item.to_dict() for item in self.tracked_items], + "apply_history": [entry.to_dict() for entry in self.apply_history], + } + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> PaperRepairSession: + return cls( + source_run_dir=str(data.get("source_run_dir", "")), + session_dir=str(data.get("session_dir", "")), + workspace_dir=str(data.get("workspace_dir", "")), + created_at=str(data.get("created_at", "")), + tracked_items=tuple( + TrackedItem.from_dict(item) + for item in data.get("tracked_items", []) + if isinstance(item, dict) + ), + apply_history=tuple( + ApplyEntry.from_dict(item) + for item in data.get("apply_history", []) + if isinstance(item, dict) + ), + ) + + +def init_paper_repair( + run_dir: str | Path, + output_dir: str | Path, +) -> dict[str, str]: + source_run_dir = Path(run_dir).resolve() + if not source_run_dir.exists(): + raise PaperRepairError(f"Run directory not found: {source_run_dir}") + + tracked_items = _collect_tracked_items(source_run_dir) + if not tracked_items: + raise PaperRepairError( + "No paper-export artifacts found under stage-22 or stage-23. " + "Expected files such as paper.tex or paper_final_verified.md." + ) + + session_dir = Path(output_dir).resolve() + session_dir.mkdir(parents=True, exist_ok=True) + workspace_dir = session_dir / "workspace" + if workspace_dir.exists(): + raise PaperRepairError( + f"Repair workspace already exists: {workspace_dir}. " + "Use a fresh output directory for each repair session." 
+ ) + workspace_dir.mkdir(parents=True, exist_ok=False) + + for item in tracked_items: + if not item.exists: + continue + source_path = source_run_dir / item.relative_path + target_path = workspace_dir / item.relative_path + _copy_path(source_path, target_path, item.kind) + + session = PaperRepairSession( + source_run_dir=str(source_run_dir), + session_dir=str(session_dir), + workspace_dir=str(workspace_dir), + created_at=_utc_now(), + tracked_items=tracked_items, + ) + + session_json = session_dir / "paper-repair.json" + session_json.write_text(dumps(session.to_dict(), indent=2) + "\n", encoding="utf-8") + readme_path = session_dir / "README.md" + readme_path.write_text(_render_repair_readme(session), encoding="utf-8") + + return { + "session_json": str(session_json), + "readme": str(readme_path), + "workspace": str(workspace_dir), + } + + +def apply_paper_repair( + session_json_path: str | Path, + *, + note: str | None = None, +) -> dict[str, str]: + session_path = Path(session_json_path).resolve() + session = _load_session(session_path) + source_run_dir = Path(session.source_run_dir) + workspace_dir = Path(session.workspace_dir) + if not workspace_dir.exists(): + raise PaperRepairError(f"Repair workspace not found: {workspace_dir}") + + backup_id = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ") + backup_dir = Path(session.session_dir) / "backups" / backup_id + backup_dir.mkdir(parents=True, exist_ok=False) + + for item in session.tracked_items: + target_path = source_run_dir / item.relative_path + backup_path = backup_dir / item.relative_path + if target_path.exists(): + _copy_path(target_path, backup_path, _path_kind(target_path)) + + workspace_path = workspace_dir / item.relative_path + if workspace_path.exists(): + _copy_path(workspace_path, target_path, _path_kind(workspace_path)) + + entry = ApplyEntry( + backup_id=backup_id, + applied_at=_utc_now(), + note=(note or "").strip(), + backup_dir=str(backup_dir), + ) + rewritten = PaperRepairSession( + 
source_run_dir=session.source_run_dir, + session_dir=session.session_dir, + workspace_dir=session.workspace_dir, + created_at=session.created_at, + tracked_items=session.tracked_items, + apply_history=session.apply_history + (entry,), + ) + session_path.write_text(dumps(rewritten.to_dict(), indent=2) + "\n", encoding="utf-8") + return { + "session_json": str(session_path), + "backup_dir": str(backup_dir), + "published_run_dir": str(source_run_dir), + } + + +def rollback_paper_repair( + session_json_path: str | Path, + *, + backup_id: str | None = None, +) -> dict[str, str]: + session_path = Path(session_json_path).resolve() + session = _load_session(session_path) + if not session.apply_history: + raise PaperRepairError("No published repair exists yet, so there is nothing to roll back.") + + entry = _select_backup_entry(session, backup_id) + backup_dir = Path(entry.backup_dir) + if not backup_dir.exists(): + raise PaperRepairError(f"Backup directory not found: {backup_dir}") + + source_run_dir = Path(session.source_run_dir) + for item in session.tracked_items: + source_path = backup_dir / item.relative_path + target_path = source_run_dir / item.relative_path + if source_path.exists(): + _copy_path(source_path, target_path, _path_kind(source_path)) + elif not item.exists and target_path.exists(): + _remove_path(target_path) + + remaining_history = tuple( + history_entry + for history_entry in session.apply_history + if history_entry.backup_id != entry.backup_id + ) + rewritten = PaperRepairSession( + source_run_dir=session.source_run_dir, + session_dir=session.session_dir, + workspace_dir=session.workspace_dir, + created_at=session.created_at, + tracked_items=session.tracked_items, + apply_history=remaining_history, + ) + session_path.write_text(dumps(rewritten.to_dict(), indent=2) + "\n", encoding="utf-8") + return { + "session_json": str(session_path), + "rolled_back_backup": entry.backup_id, + "published_run_dir": str(source_run_dir), + } + + +def 
_collect_tracked_items(run_dir: Path) -> tuple[TrackedItem, ...]: + items: list[TrackedItem] = [] + for stage_name, relative_paths in TRACKED_STAGE_PATHS.items(): + stage_dir = run_dir / stage_name + if not stage_dir.exists(): + continue + for relative_path in relative_paths: + full_path = stage_dir / relative_path + items.append( + TrackedItem( + relative_path=f"{stage_name}/{relative_path}", + kind="directory" if full_path.is_dir() else "file", + exists=full_path.exists(), + ) + ) + return tuple(items) + + +def _load_session(session_json_path: Path) -> PaperRepairSession: + if not session_json_path.exists(): + raise PaperRepairError(f"Paper repair JSON not found: {session_json_path}") + data = loads(session_json_path.read_text(encoding="utf-8")) + if not isinstance(data, dict): + raise PaperRepairError("Paper repair JSON must decode to a mapping.") + return PaperRepairSession.from_dict(data) + + +def _select_backup_entry( + session: PaperRepairSession, + backup_id: str | None, +) -> ApplyEntry: + if backup_id: + for entry in session.apply_history: + if entry.backup_id == backup_id: + return entry + raise PaperRepairError(f"Backup id not found in repair session: {backup_id}") + return session.apply_history[-1] + + +def _copy_path(source_path: Path, target_path: Path, kind: str) -> None: + target_path.parent.mkdir(parents=True, exist_ok=True) + if target_path.exists(): + _remove_path(target_path) + if kind == "directory": + copytree(source_path, target_path) + return + copy2(source_path, target_path) + + +def _remove_path(path: Path) -> None: + if path.is_dir(): + rmtree(path) + return + path.unlink() + + +def _path_kind(path: Path) -> str: + return "directory" if path.is_dir() else "file" + + +def _render_repair_readme(session: PaperRepairSession) -> str: + lines = [ + "# Paper Repair Workspace", + "", + "This workspace is a post-export repair lane for a completed AutoResearchClaw run.", + "", + f"- Source run: `{session.source_run_dir}`", + f"- Created at: 
`{session.created_at}`",
+        f"- Workspace root: `{session.workspace_dir}`",
+        "",
+        "## Tracked Artifacts",
+    ]
+    for item in session.tracked_items:
+        state = "present" if item.exists else "missing in source run"
+        lines.append(f"- `{item.relative_path}` ({item.kind}, {state})")
+    lines.extend(
+        [
+            "",
+            "## Workflow",
+            "1. Edit files under `workspace/`.",
+            "2. Publish repairs back to the source run with:",
+            "   `python -m autoresearchclaw paper-repair-apply --repair-json <session-dir>/paper-repair.json`",
+            "3. If needed, roll back the most recent publish with:",
+            "   `python -m autoresearchclaw paper-repair-rollback --repair-json <session-dir>/paper-repair.json`",
+            "",
+            "Each publish snapshots the original files under `backups/<backup-id>/` before overwriting them.",
+        ]
+    )
+    return "\n".join(lines).rstrip() + "\n"
+
+
+def _utc_now() -> str:
+    return datetime.now(timezone.utc).isoformat()
diff --git a/autoresearchclaw/research_repair.py b/autoresearchclaw/research_repair.py
new file mode 100644
index 00000000..5c7351de
--- /dev/null
+++ b/autoresearchclaw/research_repair.py
@@ -0,0 +1,1133 @@
+from __future__ import annotations
+
+from dataclasses import asdict, dataclass, field
+from datetime import datetime, timezone
+from json import dumps, loads
+import os
+from pathlib import Path
+from shutil import copy2, copytree
+import subprocess
+import sys
+from typing import Any
+
+import yaml
+
+
+class ResearchRepairError(ValueError):
+    """Raised when a research-repair session cannot be created or prepared."""
+
+
+STAGE_NAME_BY_NUMBER: dict[int, str] = {
+    1: "TOPIC_INIT",
+    2: "PROBLEM_DECOMPOSE",
+    3: "SEARCH_STRATEGY",
+    4: "LITERATURE_COLLECT",
+    5: "LITERATURE_SCREEN",
+    6: "KNOWLEDGE_EXTRACT",
+    7: "SYNTHESIS",
+    8: "HYPOTHESIS_GEN",
+    9: "EXPERIMENT_DESIGN",
+    10: "CODE_GENERATION",
+    11: "RESOURCE_PLANNING",
+    12: "EXPERIMENT_RUN",
+    13: "ITERATIVE_REFINE",
+    14: "RESULT_ANALYSIS",
+    15: "RESEARCH_DECISION",
+    16: "PAPER_OUTLINE",
+    17: "PAPER_DRAFT",
+    18: "PEER_REVIEW",
+    19: 
"PAPER_REVISION", + 20: "QUALITY_GATE", + 21: "KNOWLEDGE_ARCHIVE", + 22: "EXPORT_PUBLISH", + 23: "CITATION_VERIFY", +} +STAGE_NUMBER_BY_NAME: dict[str, int] = { + name.upper(): number for number, name in STAGE_NAME_BY_NUMBER.items() +} + +FIXED_CONTEXT_PATHS: tuple[str, ...] = ( + "checkpoint.json", + "pipeline_summary.json", + "experiment_diagnosis.json", + "repair_prompt.txt", + "quality_warning.txt", + "experiment_summary_best.json", + "analysis_best.md", + "stage-09/exp_plan.yaml", + "stage-12/runs/results.json", + "stage-20/quality_report.json", + "stage-23/paper_final_verified.md", + "stage-23/verification_report.json", +) +LATEST_GLOB_PATHS: tuple[str, ...] = ( + "stage-14*/experiment_summary.json", + "stage-14*/analysis.md", + "stage-15*/decision.md", +) + +WSL_PASSTHROUGH_ENV_VARS: tuple[str, ...] = ( + "OPENAI_API_KEY", + "OPENAI_API_BASE", + "OPENAI_BASE_URL", + "OPENAI_ORG_ID", + "OPENAI_PROJECT_ID", +) +REPAIR_RUN_ROOT_ENV_VAR = "AUTORESEARCHCLAW_REPAIR_RUN_ROOT" + + +@dataclass(frozen=True) +class ContextItem: + relative_path: str + kind: str + exists: bool + + def to_dict(self) -> dict[str, Any]: + return asdict(self) + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> ContextItem: + return cls( + relative_path=str(data.get("relative_path", "")), + kind=str(data.get("kind", "file")), + exists=bool(data.get("exists", False)), + ) + + +@dataclass(frozen=True) +class LaunchEntry: + launched_at: str + child_run_dir: str + generated_config_path: str + launch_script: str + command_preview: str + target_stage_name: str + target_stage_number: int + launch_log: str = "" + inherited_stage_dirs: tuple[str, ...] 
= field(default_factory=tuple) + executed: bool = False + pid: int | None = None + + def to_dict(self) -> dict[str, Any]: + return asdict(self) + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> LaunchEntry: + return cls( + launched_at=str(data.get("launched_at", "")), + child_run_dir=str(data.get("child_run_dir", "")), + generated_config_path=str(data.get("generated_config_path", "")), + launch_script=str(data.get("launch_script", "")), + launch_log=str(data.get("launch_log", "")), + command_preview=str(data.get("command_preview", "")), + target_stage_name=str(data.get("target_stage_name", "")), + target_stage_number=int(data.get("target_stage_number", 0)), + inherited_stage_dirs=tuple(data.get("inherited_stage_dirs") or ()), + executed=bool(data.get("executed", False)), + pid=int(data["pid"]) if data.get("pid") is not None else None, + ) + + +@dataclass(frozen=True) +class ReusePolicy: + hard_reuse_stage_dirs: tuple[str, ...] = field(default_factory=tuple) + soft_context_paths: tuple[str, ...] = field(default_factory=tuple) + rerun_from_stage_name: str = "" + rerun_from_stage_number: int = 0 + + def to_dict(self) -> dict[str, Any]: + return asdict(self) + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> ReusePolicy: + return cls( + hard_reuse_stage_dirs=tuple(data.get("hard_reuse_stage_dirs") or ()), + soft_context_paths=tuple(data.get("soft_context_paths") or ()), + rerun_from_stage_name=str(data.get("rerun_from_stage_name", "")), + rerun_from_stage_number=int(data.get("rerun_from_stage_number", 0)), + ) + + +@dataclass(frozen=True) +class ResearchRepairSession: + source_run_dir: str + source_run_id: str + session_dir: str + workspace_dir: str + created_at: str + base_config_path: str + upstream_root: str + target_stage_name: str + target_stage_number: int + repair_reason: str + context_items: tuple[ContextItem, ...] + feedback_path: str + reuse_policy: ReusePolicy + launch_history: tuple[LaunchEntry, ...] 
= field(default_factory=tuple) + + def to_dict(self) -> dict[str, Any]: + return { + "source_run_dir": self.source_run_dir, + "source_run_id": self.source_run_id, + "session_dir": self.session_dir, + "workspace_dir": self.workspace_dir, + "created_at": self.created_at, + "base_config_path": self.base_config_path, + "upstream_root": self.upstream_root, + "target_stage_name": self.target_stage_name, + "target_stage_number": self.target_stage_number, + "repair_reason": self.repair_reason, + "context_items": [item.to_dict() for item in self.context_items], + "feedback_path": self.feedback_path, + "reuse_policy": self.reuse_policy.to_dict(), + "launch_history": [item.to_dict() for item in self.launch_history], + } + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> ResearchRepairSession: + return cls( + source_run_dir=str(data.get("source_run_dir", "")), + source_run_id=str(data.get("source_run_id", "")), + session_dir=str(data.get("session_dir", "")), + workspace_dir=str(data.get("workspace_dir", "")), + created_at=str(data.get("created_at", "")), + base_config_path=str(data.get("base_config_path", "")), + upstream_root=str(data.get("upstream_root", "")), + target_stage_name=str(data.get("target_stage_name", "")), + target_stage_number=int(data.get("target_stage_number", 0)), + repair_reason=str(data.get("repair_reason", "")), + context_items=tuple( + ContextItem.from_dict(item) + for item in data.get("context_items", []) + if isinstance(item, dict) + ), + feedback_path=str(data.get("feedback_path", "")), + reuse_policy=ReusePolicy.from_dict( + data.get("reuse_policy") if isinstance(data.get("reuse_policy"), dict) else {} + ), + launch_history=tuple( + LaunchEntry.from_dict(item) + for item in data.get("launch_history", []) + if isinstance(item, dict) + ), + ) + + +def init_research_repair( + run_dir: str | Path, + output_dir: str | Path, + *, + config_path: str | Path = "config.arc.yaml", + target_stage: str = "EXPERIMENT_DESIGN", + reason: str | None = 
None, + feedback: list[str] | tuple[str, ...] = (), + upstream_root: str | Path = ".", +) -> dict[str, str]: + source_run_dir = Path(run_dir).resolve() + if not source_run_dir.exists(): + raise ResearchRepairError(f"Run directory not found: {source_run_dir}") + + base_config_path = Path(config_path).resolve() + if not base_config_path.exists(): + raise ResearchRepairError(f"Config not found: {base_config_path}") + + upstream_root_path = Path(upstream_root).resolve() + if not upstream_root_path.exists(): + raise ResearchRepairError(f"Upstream root not found: {upstream_root_path}") + + stage_number, stage_name = _normalize_stage_ref(target_stage) + source_run_id = _read_source_run_id(source_run_dir) + context_items = _collect_context_items(source_run_dir) + reuse_policy = _build_reuse_policy( + context_items=context_items, + target_stage_number=stage_number, + target_stage_name=stage_name, + ) + + session_dir = Path(output_dir).resolve() + session_dir.mkdir(parents=True, exist_ok=True) + workspace_dir = session_dir / "workspace" + if workspace_dir.exists(): + raise ResearchRepairError( + f"Research repair workspace already exists: {workspace_dir}. " + "Use a fresh output directory for each repair session." 
+ ) + workspace_dir.mkdir(parents=True, exist_ok=False) + + context_root = workspace_dir / "context" + for item in context_items: + if not item.exists: + continue + source_path = source_run_dir / item.relative_path + target_path = context_root / item.relative_path + _copy_path(source_path, target_path, item.kind) + + repair_config_path = workspace_dir / "repair-config.yaml" + copy2(base_config_path, repair_config_path) + + feedback_path = workspace_dir / "feedback.md" + feedback_path.write_text( + _render_feedback_template( + source_run_id=source_run_id, + target_stage_name=stage_name, + reason=(reason or "").strip(), + feedback=list(feedback), + ), + encoding="utf-8", + ) + + repair_reason = ( + (reason or "").strip() + or "Human review concluded that the completed run needs more data, more experiments, or stronger protocol coverage." + ) + session = ResearchRepairSession( + source_run_dir=str(source_run_dir), + source_run_id=source_run_id, + session_dir=str(session_dir), + workspace_dir=str(workspace_dir), + created_at=_utc_now(), + base_config_path=str(base_config_path), + upstream_root=str(upstream_root_path), + target_stage_name=stage_name, + target_stage_number=stage_number, + repair_reason=repair_reason, + context_items=context_items, + feedback_path=str(feedback_path), + reuse_policy=reuse_policy, + ) + + session_json = session_dir / "research-repair.json" + session_json.write_text(dumps(session.to_dict(), indent=2) + "\n", encoding="utf-8") + readme_path = session_dir / "README.md" + readme_path.write_text(_render_repair_readme(session), encoding="utf-8") + + return { + "session_json": str(session_json), + "readme": str(readme_path), + "workspace": str(workspace_dir), + "feedback": str(feedback_path), + "repair_config": str(repair_config_path), + } + + +def prepare_research_repair_run( + session_json_path: str | Path, + *, + output_dir: str | Path | None = None, + extra_feedback: list[str] | tuple[str, ...] 
= (), + auto_approve: bool = False, + skip_preflight: bool = False, + execute: bool = False, +) -> dict[str, str]: + session_path = Path(session_json_path).resolve() + session = _load_session(session_path) + workspace_dir = Path(session.workspace_dir) + repair_config_path = workspace_dir / "repair-config.yaml" + if not repair_config_path.exists(): + raise ResearchRepairError(f"Repair config not found: {repair_config_path}") + + feedback_text = _read_feedback(Path(session.feedback_path), extra_feedback) + timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ") + child_run_dir = ( + Path(output_dir).resolve() + if output_dir is not None + else _default_child_run_dir(session, timestamp=timestamp) + ) + if child_run_dir.exists() and any(child_run_dir.iterdir()): + raise ResearchRepairError( + f"Child run directory already exists and is not empty: {child_run_dir}" + ) + child_run_dir.mkdir(parents=True, exist_ok=True) + inherited_stage_dirs = _copy_prerequisite_stage_dirs( + source_run_dir=Path(session.source_run_dir), + child_run_dir=child_run_dir, + target_stage_number=session.target_stage_number, + ) + + generated_dir = Path(session.session_dir) / "generated-runs" / timestamp + generated_dir.mkdir(parents=True, exist_ok=False) + + config_data = _load_yaml(repair_config_path) + generated_config_path = generated_dir / "repair-config.generated.yaml" + _write_generated_config( + config_data, + generated_config_path, + session=session, + feedback_text=feedback_text, + child_run_dir=child_run_dir, + ) + + metadata = { + "generated_at": _utc_now(), + "parent_run_dir": session.source_run_dir, + "parent_run_id": session.source_run_id, + "target_stage_name": session.target_stage_name, + "target_stage_number": session.target_stage_number, + "repair_reason": session.repair_reason, + "feedback_path": session.feedback_path, + "feedback_excerpt": feedback_text[:1200], + "generated_config_path": str(generated_config_path), + "inherited_stage_dirs": 
list(inherited_stage_dirs), + "reuse_policy": session.reuse_policy.to_dict(), + "soft_context_note": ( + "Parent-run downstream analysis/draft artifacts are provided only as " + "reference context. They are not authoritative outputs for this child run." + ), + "compact_repair_brief": _build_compact_repair_brief( + session=session, + feedback_text=feedback_text, + ), + } + metadata_path = child_run_dir / "research_repair_parent.json" + metadata_path.write_text(dumps(metadata, indent=2) + "\n", encoding="utf-8") + + inner_command = _build_inner_launch_command( + upstream_root=Path(session.upstream_root), + generated_config_path=generated_config_path, + child_run_dir=child_run_dir, + stage_name=session.target_stage_name, + auto_approve=auto_approve, + skip_preflight=skip_preflight, + ) + command_preview = _wrap_launch_command_for_display(inner_command) + launch_script = generated_dir / "launch.sh" + launch_script.write_text(command_preview + "\n", encoding="utf-8") + launch_log = generated_dir / "launch.log" + + pid: int | None = None + if execute: + pid = _launch_command( + inner_command, + launch_log, + upstream_root=Path(session.upstream_root), + ) + + launch_entry = LaunchEntry( + launched_at=_utc_now(), + child_run_dir=str(child_run_dir), + generated_config_path=str(generated_config_path), + launch_script=str(launch_script), + launch_log=str(launch_log), + command_preview=command_preview, + target_stage_name=session.target_stage_name, + target_stage_number=session.target_stage_number, + inherited_stage_dirs=inherited_stage_dirs, + executed=execute, + pid=pid, + ) + rewritten = ResearchRepairSession( + source_run_dir=session.source_run_dir, + source_run_id=session.source_run_id, + session_dir=session.session_dir, + workspace_dir=session.workspace_dir, + created_at=session.created_at, + base_config_path=session.base_config_path, + upstream_root=session.upstream_root, + target_stage_name=session.target_stage_name, + target_stage_number=session.target_stage_number, + 
repair_reason=session.repair_reason, + context_items=session.context_items, + feedback_path=session.feedback_path, + reuse_policy=session.reuse_policy, + launch_history=session.launch_history + (launch_entry,), + ) + session_path.write_text(dumps(rewritten.to_dict(), indent=2) + "\n", encoding="utf-8") + + return { + "session_json": str(session_path), + "child_run_dir": str(child_run_dir), + "generated_config": str(generated_config_path), + "launch_script": str(launch_script), + "launch_log": str(launch_log), + "command_preview": command_preview, + "metadata": str(metadata_path), + "pid": "" if pid is None else str(pid), + } + + +def _normalize_stage_ref(stage_ref: str) -> tuple[int, str]: + raw = str(stage_ref).strip() + if not raw: + raise ResearchRepairError("Target stage must not be empty.") + if raw.isdigit(): + stage_number = int(raw) + stage_name = STAGE_NAME_BY_NUMBER.get(stage_number) + if stage_name is None: + raise ResearchRepairError(f"Unknown stage number: {stage_number}") + return stage_number, stage_name + stage_name = raw.upper() + stage_number = STAGE_NUMBER_BY_NAME.get(stage_name) + if stage_number is None: + valid = ", ".join(STAGE_NAME_BY_NUMBER.values()) + raise ResearchRepairError( + f"Unknown stage name '{raw}'. 
Valid stage names: {valid}" + ) + return stage_number, stage_name + + +def _read_source_run_id(run_dir: Path) -> str: + if run_dir.name.strip(): + return run_dir.name.strip() + summary_path = run_dir / "pipeline_summary.json" + if summary_path.exists(): + try: + data = loads(summary_path.read_text(encoding="utf-8")) + run_id = data.get("run_id") + if isinstance(run_id, str) and run_id.strip(): + return run_id.strip() + except (OSError, ValueError): + pass + return run_dir.name + + +def _collect_context_items(run_dir: Path) -> tuple[ContextItem, ...]: + relative_paths: list[str] = [] + for relative_path in FIXED_CONTEXT_PATHS: + relative_paths.append(relative_path) + for pattern in LATEST_GLOB_PATHS: + matches = sorted(run_dir.glob(pattern)) + if matches: + relative_paths.append(matches[-1].relative_to(run_dir).as_posix()) + + deduped: list[str] = [] + seen: set[str] = set() + for relative_path in relative_paths: + if relative_path in seen: + continue + seen.add(relative_path) + deduped.append(relative_path) + + items: list[ContextItem] = [] + for relative_path in deduped: + full_path = run_dir / relative_path + items.append( + ContextItem( + relative_path=relative_path, + kind="directory" if full_path.is_dir() else "file", + exists=full_path.exists(), + ) + ) + return tuple(items) + + +def _build_reuse_policy( + *, + context_items: tuple[ContextItem, ...], + target_stage_number: int, + target_stage_name: str, +) -> ReusePolicy: + hard_reuse = tuple( + f"stage-{number:02d}" for number in range(1, target_stage_number) + ) + soft_context: list[str] = [] + for item in context_items: + rel = item.relative_path + if rel.startswith("stage-"): + stage_prefix = rel.split("/", 1)[0] + number_text = stage_prefix.replace("stage-", "").split("_", 1)[0] + number_text = number_text.split("-", 1)[0] + try: + stage_number = int(number_text) + except ValueError: + stage_number = 0 + if stage_number >= target_stage_number: + soft_context.append(rel) + elif rel in { + 
"checkpoint.json", + "pipeline_summary.json", + "experiment_diagnosis.json", + "experiment_summary_best.json", + "analysis_best.md", + "repair_prompt.txt", + }: + soft_context.append(rel) + deduped_soft: list[str] = [] + seen: set[str] = set() + for rel in soft_context: + if rel in seen: + continue + seen.add(rel) + deduped_soft.append(rel) + return ReusePolicy( + hard_reuse_stage_dirs=hard_reuse, + soft_context_paths=tuple(deduped_soft), + rerun_from_stage_name=target_stage_name, + rerun_from_stage_number=target_stage_number, + ) + + +def _render_feedback_template( + *, + source_run_id: str, + target_stage_name: str, + reason: str, + feedback: list[str], +) -> str: + lines = [ + "# Research Repair Feedback", + "", + f"- Parent run: `{source_run_id}`", + f"- Target stage: `{target_stage_name}`", + f"- Reason: `{reason or 'Add the human repair reason here.'}`", + "", + "## Human Repair Request", + ] + if feedback: + for item in feedback: + item_text = str(item).strip() + if item_text: + lines.append(f"- {item_text}") + else: + lines.extend( + [ + "- State exactly what was insufficient in the completed run.", + "- Say what must be added: more data, more seeds, more conditions, or stronger protocol checks.", + "- If real local assets are required, say so explicitly.", + "- If the previous run should be considered invalid unless those changes happen, say that too.", + ] + ) + return "\n".join(lines).rstrip() + "\n" + + +def _render_repair_readme(session: ResearchRepairSession) -> str: + lines = [ + "# Research Repair Workspace", + "", + "This workspace is for run-level repair, not post-export paper cleanup.", + "", + f"- Source run: `{session.source_run_dir}`", + f"- Source run id: `{session.source_run_id}`", + f"- Target rollback stage: `{session.target_stage_name}`", + f"- Base config: `{session.base_config_path}`", + f"- Created at: `{session.created_at}`", + "", + "## Reuse Policy", + "- Hard reuse: parent stages before the target stage are copied directly into the 
child run.",
+        f"- Hard-reused stage dirs: `{', '.join(session.reuse_policy.hard_reuse_stage_dirs)}`",
+        "- Soft reuse: downstream analysis / decision / paper artifacts are copied into `workspace/context/` only as draft reference material.",
+        f"- Soft-context artifacts: `{', '.join(session.reuse_policy.soft_context_paths)}`",
+        "- Authoritative rerun boundary: all stages from the target stage onward must be regenerated from the new evidence.",
+        "",
+        "## What This Is For",
+        "- Human review says the completed run is not strong enough yet.",
+        "- Instead of only editing the exported paper, create a child run that goes back to the experiment stages.",
+        "- Typical reasons: not enough data, not enough seeds, wrong protocol, or real assets were not used.",
+        "",
+        "## Workspace Files",
+        "- `workspace/repair-config.yaml`: editable config seed for the child run.",
+        "- `workspace/feedback.md`: human repair instructions that will be preserved in child-run repair metadata and exposed as a compact repair brief.",
+        "- `workspace/context/`: copied reference artifacts from the parent run.",
+        "",
+        "## Workflow",
+        "1. Edit `workspace/feedback.md` and, if needed, `workspace/repair-config.yaml`.",
+        "2. Prepare a child run with:",
+        "   `python -m autoresearchclaw research-repair-run --repair-json <session-dir>/research-repair.json`",
+        "3.
Add `--execute` only when you explicitly want to launch the new upstream run.", + "", + "The prepared child run keeps a parent pointer via `research_repair_parent.json` so the repair lineage stays auditable.", + ] + return "\n".join(lines).rstrip() + "\n" + + +def _load_session(session_json_path: Path) -> ResearchRepairSession: + if not session_json_path.exists(): + raise ResearchRepairError(f"Research repair JSON not found: {session_json_path}") + data = loads(session_json_path.read_text(encoding="utf-8")) + if not isinstance(data, dict): + raise ResearchRepairError("Research repair JSON must decode to a mapping.") + return ResearchRepairSession.from_dict(data) + + +def _read_feedback(feedback_path: Path, extra_feedback: list[str] | tuple[str, ...]) -> str: + if not feedback_path.exists(): + raise ResearchRepairError(f"Feedback file not found: {feedback_path}") + feedback_text = feedback_path.read_text(encoding="utf-8").strip() + extras = [str(item).strip() for item in extra_feedback if str(item).strip()] + if extras: + feedback_text = feedback_text.rstrip() + "\n\n## CLI Additions\n" + "\n".join( + f"- {item}" for item in extras + ) + return feedback_text.strip() + + +def _load_yaml(path: Path) -> dict[str, Any]: + try: + data = yaml.safe_load(path.read_text(encoding="utf-8")) + except OSError as exc: + raise ResearchRepairError(f"Could not read YAML config: {path}") from exc + if not isinstance(data, dict): + raise ResearchRepairError(f"Config must decode to a mapping: {path}") + return data + + +def _write_generated_config( + config_data: dict[str, Any], + target_path: Path, + *, + session: ResearchRepairSession, + feedback_text: str, + child_run_dir: Path, +) -> None: + research = config_data.setdefault("research", {}) + if not isinstance(research, dict): + raise ResearchRepairError("Config field `research` must be a mapping.") + original_topic = str(research.get("topic", "")).strip() + research["topic"] = _build_repair_topic( + session=session, + 
original_topic=original_topic, + feedback_text=feedback_text, + ) + project = config_data.setdefault("project", {}) + if isinstance(project, dict): + project_name = str(project.get("name", "research-repair")).strip() or "research-repair" + if not project_name.endswith("-repair"): + project["name"] = f"{project_name}-repair" + _apply_repair_runtime_defaults( + config_data, + session=session, + generated_config_path=target_path, + child_run_dir=child_run_dir, + ) + + target_path.parent.mkdir(parents=True, exist_ok=True) + target_path.write_text( + yaml.safe_dump(config_data, sort_keys=False, allow_unicode=True), + encoding="utf-8", + ) + + +def _build_repair_topic( + *, + session: ResearchRepairSession, + original_topic: str, + feedback_text: str, +) -> str: + base_topic = _normalize_single_line(original_topic) + if not base_topic: + base_topic = ( + "Engineering-drawing circle localization with explicit rule evidence " + "and learned heatmaps." + ) + + repair_focus = _first_repair_focus_line(feedback_text) or _normalize_single_line( + session.repair_reason + ) + if repair_focus: + return ( + f"{base_topic}\n\n" + f"Repair focus: rerun from {session.target_stage_name} and strengthen " + f"{repair_focus}." + ).strip() + return ( + f"{base_topic}\n\n" + f"Repair focus: rerun from {session.target_stage_name} with stronger " + "real-data coverage, seeds, and experiment protocol." 
+ ).strip() + + +def _normalize_single_line(text: str) -> str: + normalized = " ".join(str(text).split()).strip() + return normalized + + +def _first_repair_focus_line(feedback_text: str) -> str: + for raw_line in feedback_text.splitlines(): + line = raw_line.strip() + if not line or line.startswith("#"): + continue + if not line.startswith("- "): + continue + payload = line[2:].strip() + lowered = payload.lower() + if lowered.startswith(("parent run:", "target stage:", "reason:")): + continue + return _normalize_single_line(payload) + return "" + + +def _apply_repair_runtime_defaults( + config_data: dict[str, Any], + *, + session: ResearchRepairSession, + generated_config_path: Path, + child_run_dir: Path, +) -> None: + llm = config_data.setdefault("llm", {}) + if isinstance(llm, dict): + acp = llm.setdefault("acp", {}) + if isinstance(acp, dict): + timestamp = generated_config_path.parent.name.strip() or "repair" + stage_slug = session.target_stage_name.lower().replace("_", "-") + acp["session_name"] = f"researchclaw-{stage_slug}-{timestamp}" + current_timeout = _safe_int(acp.get("timeout_sec"), 1800) + acp["timeout_sec"] = max(current_timeout, 3200) + current_retries = _safe_int(acp.get("reconnect_retries"), 2) + acp["reconnect_retries"] = max(current_retries, 6) + acp["reconnect_backoff_sec"] = 3.0 + acp["verbose"] = True + acp["capture_status_on_failure"] = True + acp["archive_failed_prompt_files"] = True + acp["debug_log_path"] = _to_wsl_path(child_run_dir / "acp_debug.jsonl") + if session.target_stage_number >= STAGE_NUMBER_BY_NAME["CODE_GENERATION"]: + acp["stateless_prompt"] = True + + experiment = config_data.setdefault("experiment", {}) + if isinstance(experiment, dict): + code_agent = experiment.setdefault("code_agent", {}) + if isinstance(code_agent, dict): + code_agent["architecture_planning"] = False + code_agent["review_max_rounds"] = 0 + if session.target_stage_number >= STAGE_NUMBER_BY_NAME["CODE_GENERATION"]: + 
code_agent["fallback_to_legacy_on_acp_failure"] = False + + +def _default_child_run_dir( + session: ResearchRepairSession, + *, + timestamp: str, +) -> Path: + suffix = f"{session.source_run_id}-repair-{timestamp}" + override = os.environ.get(REPAIR_RUN_ROOT_ENV_VAR, "").strip() + if override: + return Path(override).resolve() / suffix + + if sys.platform.startswith("win"): + detected_root = _detect_windows_wsl_run_root() + if detected_root is not None: + return detected_root / suffix + + return Path(session.source_run_dir).resolve().parent / suffix + + +def _detect_windows_wsl_run_root() -> Path | None: + if not sys.platform.startswith("win"): + return None + try: + probe = subprocess.run( + [ + "wsl", + "bash", + "-lc", + 'mkdir -p "$HOME/.autoresearchclaw/artifacts" && wslpath -w "$HOME/.autoresearchclaw/artifacts"', + ], + capture_output=True, + text=True, + encoding="utf-8", + errors="replace", + timeout=20, + check=False, + ) + except Exception: # noqa: BLE001 + return None + if probe.returncode != 0: + return None + output = (probe.stdout or "").strip().splitlines() + if not output: + return None + candidate = output[-1].strip() + if not candidate: + return None + return Path(candidate).resolve() + + +def _safe_int(value: Any, default: int) -> int: + if value is None: + return default + try: + return int(value) + except (TypeError, ValueError): + return default + + +def _build_inner_launch_command( + *, + upstream_root: Path, + generated_config_path: Path, + child_run_dir: Path, + stage_name: str, + auto_approve: bool, + skip_preflight: bool, +) -> str: + upstream_root_wsl = _to_wsl_path(upstream_root) + generated_config_wsl = _to_wsl_path(generated_config_path) + child_run_dir_wsl = _to_wsl_path(child_run_dir) + exe_wsl = _to_wsl_path(upstream_root / ".venv" / "bin" / "researchclaw") + command_parts = [f"cd { _sh_quote(upstream_root_wsl) }", 'export PATH="$HOME/bin:$PATH"'] + tmp_bin = upstream_root / ".tmp_bin" + if tmp_bin.exists(): + tmp_bin_wsl = 
_to_wsl_path(tmp_bin) + command_parts.append( + f"export PATH={_sh_quote(tmp_bin_wsl)}:\"$PATH\"" + ) + command_parts.append( + f"{_sh_quote(exe_wsl)} run --config {_sh_quote(generated_config_wsl)} --output {_sh_quote(child_run_dir_wsl)} --from-stage {stage_name}" + ) + if auto_approve: + command_parts[-1] += " --auto-approve" + if skip_preflight: + command_parts[-1] += " --skip-preflight" + inner = " && ".join(command_parts) + return inner + + +def _build_compact_repair_brief( + *, + session: ResearchRepairSession, + feedback_text: str, +) -> str: + feedback_lines: list[str] = [] + for raw_line in feedback_text.splitlines(): + line = raw_line.strip() + if not line.startswith("- "): + continue + payload = line[2:].strip() + lowered = payload.lower() + if lowered.startswith(("parent run:", "target stage:", "reason:")): + continue + feedback_lines.append(_normalize_single_line(payload)) + if len(feedback_lines) >= 5: + break + + lines = [ + "## Repair Context", + f"- Parent run: `{session.source_run_id}`", + f"- Authoritative rerun starts at: `{session.target_stage_name}`", + f"- Repair reason: {_normalize_single_line(session.repair_reason)}", + ] + if session.reuse_policy.hard_reuse_stage_dirs: + lines.append( + "- Hard reuse: " + + ", ".join(session.reuse_policy.hard_reuse_stage_dirs) + ) + lines.append( + "- Downstream parent analysis and paper artifacts are soft context only." 
+ ) + if feedback_lines: + lines.append("- Human requirements:") + lines.extend(f"- {item}" for item in feedback_lines) + return "\n".join(lines).strip() + + +def _wrap_launch_command_for_display(inner_command: str) -> str: + if sys.platform.startswith("win"): + return f"wsl bash -lc {_sh_quote(inner_command)}" + return f"bash -lc {_sh_quote(inner_command)}" + + +def _launch_command( + inner_command: str, + launch_log: Path, + *, + upstream_root: Path | None = None, +) -> int: + launch_env, forwarded_env = _build_launch_env(upstream_root=upstream_root) + launch_log.parent.mkdir(parents=True, exist_ok=True) + with launch_log.open("w", encoding="utf-8") as log_handle: + log_handle.write(f"$ {inner_command}\n\n") + if forwarded_env: + log_handle.write( + "# Forwarded to child process via WSLENV: " + + ", ".join(forwarded_env) + + "\n\n" + ) + log_handle.flush() + if sys.platform.startswith("win"): + process = subprocess.Popen( + ["wsl", "bash", "-lc", inner_command], + stdout=log_handle, + stderr=subprocess.STDOUT, + stdin=subprocess.DEVNULL, + env=launch_env, + ) + else: + process = subprocess.Popen( + ["bash", "-lc", inner_command], + stdout=log_handle, + stderr=subprocess.STDOUT, + stdin=subprocess.DEVNULL, + env=launch_env, + ) + return int(process.pid) + + +def _build_launch_env( + *, + upstream_root: Path | None = None, +) -> tuple[dict[str, str], tuple[str, ...]]: + env = dict(os.environ) + forwarded: list[str] = [] + for name in WSL_PASSTHROUGH_ENV_VARS: + value = env.get(name, "") + if value: + forwarded.append(name) + + asset_env = _discover_runtime_asset_env(upstream_root) + for name, value in asset_env.items(): + if value: + env[name] = value + forwarded.append(name) + + if not sys.platform.startswith("win"): + return env, tuple(_dedupe_preserve_order(forwarded)) + + forwarded = _dedupe_preserve_order(forwarded) + if not forwarded: + return env, () + + wslenv_entries = [item for item in env.get("WSLENV", "").split(":") if item] + existing_names = 
{item.split("/", 1)[0] for item in wslenv_entries} + for name in forwarded: + if name not in existing_names: + wslenv_entries.append(name) + env["WSLENV"] = ":".join(wslenv_entries) + return env, tuple(forwarded) + + +def _discover_runtime_asset_env(upstream_root: Path | None) -> dict[str, str]: + if upstream_root is None: + return {} + if sys.platform.startswith("win"): + return _discover_runtime_asset_env_via_wsl(upstream_root) + return {} + + +def _discover_runtime_asset_env_via_wsl(upstream_root: Path) -> dict[str, str]: + upstream_root_wsl = _to_wsl_path(upstream_root) + exe_wsl = _to_wsl_path(upstream_root / ".venv" / "bin" / "python") + script = r""" +from config import build_default_config +from pathlib import Path +import json + +cfg = build_default_config() +specs = cfg.build_dataset_specs() +payload = {"VECTRA_REPO_ROOT": str(Path.cwd())} + +simple = specs.get("engineering_primitives_simple_scenes_noslot_v1_local_20260326", {}) +if isinstance(simple, dict): + payload["VECTRA_SIMPLE_DATASET_ROOT"] = str(simple.get("dataset_root", "")) + payload["VECTRA_SIMPLE_ASSET_ROOT"] = str(simple.get("dataset_root", "")) + payload["VECTRA_SIMPLE_MANIFEST_PATH"] = str(simple.get("manifest_path", "")) + caches = simple.get("cache_roots", {}) + if isinstance(caches, dict): + payload["VECTRA_SIMPLE_HEATMAP_DIR"] = str(caches.get("learned", "")) + +page = specs.get("page_minus_titleblock", {}) +if isinstance(page, dict): + page_root = Path(str(page.get("dataset_root", ""))).expanduser() + payload["VECTRA_PAGE_DATASET_ROOT"] = str(page_root) + payload["VECTRA_PAGE_IMAGE_DIR"] = str(page_root / "train2017") + payload["VECTRA_PAGE_SIDECAR_DIR"] = str(page_root / "sidecars" / "train2017") + split_json = Path(str(page.get("split_manifest_path", ""))).expanduser() + payload["VECTRA_PAGE_SPLIT_JSON"] = str(split_json) + if str(split_json): + one_drive_png_root = split_json.parent.parent + payload["VECTRA_ONE_DRIVE_PNG_ROOT"] = str(one_drive_png_root) + 
payload["VECTRA_PAGE_GT_SOLID_CSV"] = str(split_json.parent / "gt" / "train2017_solid.csv") + payload["VECTRA_PAGE_GT_DASHED_CSV"] = str(split_json.parent / "gt" / "train2017_dashed.csv") + +probe = specs.get("DeepPatent2_negative_clutter_probe", {}) +if isinstance(probe, dict): + payload["VECTRA_DEEPPATENT_DATASET_ROOT"] = str(probe.get("dataset_root", "")) + +clean = {k: v for k, v in payload.items() if v and v != "."} +print(json.dumps(clean, ensure_ascii=False)) +""".strip() + + command = ( + f"cd {_sh_quote(upstream_root_wsl)} && " + f"{_sh_quote(exe_wsl)} - <<'PY'\n{script}\nPY" + ) + try: + probe = subprocess.run( + ["wsl", "bash", "-lc", command], + capture_output=True, + text=True, + encoding="utf-8", + errors="replace", + timeout=30, + check=False, + ) + except Exception: # noqa: BLE001 + return {} + if probe.returncode != 0: + return {} + lines = [line.strip() for line in (probe.stdout or "").splitlines() if line.strip()] + if not lines: + return {} + try: + payload = loads(lines[-1]) + except ValueError: + return {} + if not isinstance(payload, dict): + return {} + return { + str(key): str(value) + for key, value in payload.items() + if str(value).strip() + } + + +def _dedupe_preserve_order(values: list[str]) -> list[str]: + result: list[str] = [] + seen: set[str] = set() + for value in values: + if value in seen: + continue + seen.add(value) + result.append(value) + return result + + +def _copy_path(source_path: Path, target_path: Path, kind: str) -> None: + target_path.parent.mkdir(parents=True, exist_ok=True) + if kind == "directory": + copytree(source_path, target_path) + return + copy2(source_path, target_path) + + +def _copy_prerequisite_stage_dirs( + *, + source_run_dir: Path, + child_run_dir: Path, + target_stage_number: int, +) -> tuple[str, ...]: + inherited: list[str] = [] + for stage_number in range(1, target_stage_number): + stage_dir_name = f"stage-{stage_number:02d}" + source_stage_dir = source_run_dir / stage_dir_name + if not 
source_stage_dir.exists(): + continue + target_stage_dir = child_run_dir / stage_dir_name + if target_stage_dir.exists(): + continue + copytree(source_stage_dir, target_stage_dir) + inherited.append(stage_dir_name) + return tuple(inherited) + + +def _to_wsl_path(path: Path) -> str: + resolved = str(path.resolve()) + normalized = resolved.replace("/", "\\") + lowered = normalized.lower() + wsl_prefixes = ("\\\\wsl$\\", "\\\\wsl.localhost\\") + for prefix in wsl_prefixes: + if lowered.startswith(prefix): + parts = normalized.split("\\") + # UNC layout: \\wsl$\Distro\path\inside\wsl + if len(parts) >= 5: + remainder = "/".join(segment for segment in parts[4:] if segment) + return "/" + remainder if remainder else "/" + if ":" not in resolved: + return resolved.replace("\\", "/") + drive, rest = resolved.split(":", 1) + return f"/mnt/{drive.lower()}{rest.replace('\\', '/')}" + + +def _sh_quote(value: str) -> str: + return "'" + value.replace("'", "'\"'\"'") + "'" + + +def _utc_now() -> str: + return datetime.now(timezone.utc).isoformat() diff --git a/pyproject.toml b/pyproject.toml index d669a06d..b119c6d2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,9 +30,10 @@ dev = ["pytest>=7.0", "httpx>=0.24"] [project.scripts] researchclaw = "researchclaw.cli:main" +autoresearchclaw = "autoresearchclaw.cli:main" [tool.hatch.build.targets.wheel] -packages = ["researchclaw", "sibyl", "arc"] +packages = ["researchclaw", "sibyl", "arc", "autoresearchclaw"] [tool.hatch.build.targets.wheel.force-include] "researchclaw/templates/styles" = "researchclaw/templates/styles"