From 58911a81acb64ef6172eb49d255128e44cd0ca6a Mon Sep 17 00:00:00 2001 From: CKwin26 <156837805+CKwin26@users.noreply.github.com> Date: Tue, 31 Mar 2026 01:34:17 -0400 Subject: [PATCH 1/2] harden repair reruns --- config.researchclaw.example.yaml | 23 + researchclaw/config.py | 38 + researchclaw/docker/entrypoint.sh | 7 +- researchclaw/experiment/colab_sandbox.py | 2 + researchclaw/experiment/docker_sandbox.py | 30 +- researchclaw/experiment/sandbox.py | 20 +- researchclaw/experiment/ssh_sandbox.py | 64 +- researchclaw/llm/acp_client.py | 523 +++++++- researchclaw/pipeline/_helpers.py | 144 ++- researchclaw/pipeline/runner.py | 14 + .../pipeline/stage_impls/_code_generation.py | 1124 +++++++++++++++-- .../pipeline/stage_impls/_execution.py | 429 ++++++- tests/test_rc_cli.py | 51 + tests/test_rc_executor.py | 505 ++++++++ tests/test_rc_runner.py | 43 + tests/test_ssh_and_colab_sandbox.py | 37 +- 16 files changed, 2896 insertions(+), 158 deletions(-) diff --git a/config.researchclaw.example.yaml b/config.researchclaw.example.yaml index 5b8d43f2..da4f3158 100644 --- a/config.researchclaw.example.yaml +++ b/config.researchclaw.example.yaml @@ -50,6 +50,13 @@ llm: # primary_model: "MiniMax-M2.5" # fallback_models: # - "MiniMax-M2.5-highspeed" + # acp: + # reconnect_retries: 2 + # reconnect_backoff_sec: 2.0 + # verbose: true + # capture_status_on_failure: true + # debug_log_path: "artifacts/acp_debug.jsonl" + # archive_failed_prompt_files: true security: hitl_required_stages: [5, 9, 20] @@ -66,6 +73,22 @@ experiment: max_iterations: 10 metric_key: "primary_metric" metric_direction: "minimize" + # Optional hard guards for trust-first experiment runs. + # When enabled, generated experiments must use real local assets/caches, fail fast + # if those assets are missing, and emit structured machine-readable results. 
+ require_real_data: false + forbid_synthetic_proxy: false + fail_on_stdout_parsed_results: false + required_real_data_refs: [] + benchmark_agent: + enabled: true + preserve_existing_assets: true + pass_existing_assets_as_reference: true + code_agent: + enabled: true + # If the ACP transport drops during multi-round Stage 10 generation, + # fall back to the older one-shot generator instead of failing immediately. + fallback_to_legacy_on_acp_failure: false sandbox: # Use ".venv/Scripts/python.exe" on Windows python_path: ".venv/bin/python3" diff --git a/researchclaw/config.py b/researchclaw/config.py index 8b2173f2..c6154e5a 100644 --- a/researchclaw/config.py +++ b/researchclaw/config.py @@ -184,6 +184,13 @@ class AcpConfig: acpx_command: str = "" session_name: str = "researchclaw" timeout_sec: int = 1800 + verbose: bool = False + stateless_prompt: bool = False + reconnect_retries: int = 2 + reconnect_backoff_sec: float = 2.0 + capture_status_on_failure: bool = False + debug_log_path: str = "" + archive_failed_prompt_files: bool = False @dataclass(frozen=True) @@ -295,6 +302,7 @@ class CodeAgentConfig: """Configuration for the advanced multi-phase code generation agent.""" enabled: bool = True + fallback_to_legacy_on_acp_failure: bool = False # Phase 1: Blueprint planning (deep implementation blueprint) architecture_planning: bool = True # Phase 2: Sequential file generation (one-by-one following blueprint) @@ -347,6 +355,8 @@ class BenchmarkAgentConfig: min_benchmarks: int = 1 min_baselines: int = 2 prefer_cached: bool = True + preserve_existing_assets: bool = True + pass_existing_assets_as_reference: bool = True # Orchestrator max_iterations: int = 2 @@ -426,6 +436,10 @@ class ExperimentConfig: metric_key: str = "primary_metric" metric_direction: str = "minimize" keep_threshold: float = 0.0 + require_real_data: bool = False + forbid_synthetic_proxy: bool = False + fail_on_stdout_parsed_results: bool = False + required_real_data_refs: tuple[str, ...] 
= () sandbox: SandboxConfig = field(default_factory=SandboxConfig) docker: DockerSandboxConfig = field(default_factory=DockerSandboxConfig) agentic: AgenticConfig = field(default_factory=AgenticConfig) @@ -972,6 +986,17 @@ def _parse_llm_config(data: dict[str, Any]) -> LlmConfig: acpx_command=acp_data.get("acpx_command", ""), session_name=acp_data.get("session_name", "researchclaw"), timeout_sec=int(acp_data.get("timeout_sec", 1800)), + verbose=bool(acp_data.get("verbose", False)), + stateless_prompt=bool(acp_data.get("stateless_prompt", False)), + reconnect_retries=_safe_int(acp_data.get("reconnect_retries"), 2), + reconnect_backoff_sec=_safe_float(acp_data.get("reconnect_backoff_sec"), 2.0), + capture_status_on_failure=bool( + acp_data.get("capture_status_on_failure", False) + ), + debug_log_path=str(acp_data.get("debug_log_path", "")), + archive_failed_prompt_files=bool( + acp_data.get("archive_failed_prompt_files", False) + ), ), ) @@ -1008,6 +1033,12 @@ def _parse_experiment_config(data: dict[str, Any]) -> ExperimentConfig: metric_key=data.get("metric_key", "primary_metric"), metric_direction=data.get("metric_direction", "minimize"), keep_threshold=_safe_float(data.get("keep_threshold"), 0.0), + require_real_data=bool(data.get("require_real_data", False)), + forbid_synthetic_proxy=bool(data.get("forbid_synthetic_proxy", False)), + fail_on_stdout_parsed_results=bool( + data.get("fail_on_stdout_parsed_results", False) + ), + required_real_data_refs=tuple(data.get("required_real_data_refs") or ()), sandbox=SandboxConfig( python_path=sandbox_data.get("python_path", DEFAULT_PYTHON_PATH), gpu_required=bool(sandbox_data.get("gpu_required", False)), @@ -1086,6 +1117,10 @@ def _parse_benchmark_agent_config(data: dict[str, Any]) -> BenchmarkAgentConfig: min_benchmarks=_safe_int(data.get("min_benchmarks"), 1), min_baselines=_safe_int(data.get("min_baselines"), 2), prefer_cached=bool(data.get("prefer_cached", True)), + 
preserve_existing_assets=bool(data.get("preserve_existing_assets", True)), + pass_existing_assets_as_reference=bool( + data.get("pass_existing_assets_as_reference", True) + ), max_iterations=_safe_int(data.get("max_iterations"), 2), ) @@ -1142,6 +1177,9 @@ def _parse_code_agent_config(data: dict[str, Any]) -> CodeAgentConfig: return CodeAgentConfig() return CodeAgentConfig( enabled=bool(data.get("enabled", True)), + fallback_to_legacy_on_acp_failure=bool( + data.get("fallback_to_legacy_on_acp_failure", False) + ), architecture_planning=bool(data.get("architecture_planning", True)), sequential_generation=bool(data.get("sequential_generation", True)), hard_validation=bool(data.get("hard_validation", True)), diff --git a/researchclaw/docker/entrypoint.sh b/researchclaw/docker/entrypoint.sh index 316039c0..5104bd9c 100755 --- a/researchclaw/docker/entrypoint.sh +++ b/researchclaw/docker/entrypoint.sh @@ -11,7 +11,10 @@ set -e WORKSPACE="/workspace" -ENTRY_POINT="${1:-main.py}" +ENTRY_POINT="${RC_ENTRY_POINT:-${1:-main.py}}" +if [ "$#" -gt 0 ]; then + shift +fi # ---------------------------------------------------------------- # Phase 0: Install additional pip packages @@ -51,4 +54,4 @@ fi # Phase 2: Run experiment # ---------------------------------------------------------------- echo "[RC] Phase 2: Running experiment ($ENTRY_POINT)..." -exec python3 -u "$WORKSPACE/$ENTRY_POINT" +exec python3 -u "$WORKSPACE/$ENTRY_POINT" "$@" diff --git a/researchclaw/experiment/colab_sandbox.py b/researchclaw/experiment/colab_sandbox.py index b6a46542..eec6ad7e 100644 --- a/researchclaw/experiment/colab_sandbox.py +++ b/researchclaw/experiment/colab_sandbox.py @@ -158,6 +158,8 @@ def run_project( *, entry_point: str = "main.py", timeout_sec: int = 300, + args: list[str] | None = None, + env_overrides: dict[str, str] | None = None, ) -> SandboxResult: # BUG-DA8-07: Validate entry_point (path traversal, etc.) 
like other backends from researchclaw.experiment.sandbox import validate_entry_point diff --git a/researchclaw/experiment/docker_sandbox.py b/researchclaw/experiment/docker_sandbox.py index 3eda27c9..b45f21cd 100644 --- a/researchclaw/experiment/docker_sandbox.py +++ b/researchclaw/experiment/docker_sandbox.py @@ -138,6 +138,8 @@ def run_project( *, entry_point: str = "main.py", timeout_sec: int = 300, + args: list[str] | None = None, + env_overrides: dict[str, str] | None = None, ) -> SandboxResult: """Run a multi-file experiment project inside a container.""" self._run_counter += 1 @@ -189,7 +191,13 @@ def run_project( metrics={}, ) - return self._execute(staging, entry_point=entry_point, timeout_sec=timeout_sec) + return self._execute( + staging, + entry_point=entry_point, + timeout_sec=timeout_sec, + entry_args=args, + env_overrides=env_overrides, + ) # ------------------------------------------------------------------ # Static helpers @@ -254,7 +262,13 @@ def _inject_harness(target_dir: Path) -> None: # ------------------------------------------------------------------ def _execute( - self, staging_dir: Path, *, entry_point: str, timeout_sec: int + self, + staging_dir: Path, + *, + entry_point: str, + timeout_sec: int, + entry_args: list[str] | None = None, + env_overrides: dict[str, str] | None = None, ) -> SandboxResult: """Core execution: single container, three-phase via entrypoint.sh.""" cfg = self.config @@ -269,6 +283,8 @@ def _execute( staging_dir, entry_point=entry_point, container_name=container_name, + entry_args=entry_args, + env_overrides=env_overrides, ) start = time.monotonic() @@ -349,6 +365,8 @@ def _build_run_command( *, entry_point: str, container_name: str, + entry_args: list[str] | None = None, + env_overrides: dict[str, str] | None = None, ) -> list[str]: """Build the ``docker run`` command list. 
@@ -453,9 +471,17 @@ def _user_flag() -> list[str]: else: cmd.extend(["--gpus", "all"]) + if env_overrides: + for name, value in sorted(env_overrides.items()): + if not value: + continue + cmd.extend(["-e", f"{name}={value}"]) + # Image + entry point (passed as CMD arg to entrypoint.sh) cmd.append(cfg.image) cmd.append(entry_point) + if entry_args: + cmd.extend(entry_args) return cmd diff --git a/researchclaw/experiment/sandbox.py b/researchclaw/experiment/sandbox.py index 09ef276f..f8c0d3d1 100644 --- a/researchclaw/experiment/sandbox.py +++ b/researchclaw/experiment/sandbox.py @@ -297,6 +297,8 @@ def run_project( *, entry_point: str = "main.py", timeout_sec: int = 300, + args: list[str] | None = None, + env_overrides: dict[str, str] | None = None, ) -> SandboxResult: ... @@ -350,6 +352,8 @@ def run_project( *, entry_point: str = "main.py", timeout_sec: int = 300, + args: list[str] | None = None, + env_overrides: dict[str, str] | None = None, ) -> SandboxResult: """Run a multi-file experiment project in the sandbox. @@ -409,12 +413,14 @@ def run_project( ) start = time.monotonic() - command = self._build_command(entry) + command = self._build_command(entry, args=args) logger.debug("Running project sandbox command: %s (cwd=%s)", command, sandbox_project) result: SandboxResult try: env = {**os.environ, "PYTHONUNBUFFERED": "1"} + if env_overrides: + env.update(env_overrides) completed = subprocess.run( command, capture_output=True, @@ -457,7 +463,12 @@ def _next_script_path(self) -> Path: def _write_script(script_path: Path, code: str) -> None: _ = script_path.write_text(code, encoding="utf-8") - def _build_command(self, script_path: Path) -> list[str]: + def _build_command( + self, + script_path: Path, + *, + args: list[str] | None = None, + ) -> list[str]: # Convert relative python_path to absolute WITHOUT resolving symlinks. 
# Using .resolve() would follow venv symlinks to the system Python binary, # which loses the venv context (site-packages like numpy become unavailable). @@ -466,7 +477,10 @@ def _build_command(self, script_path: Path) -> list[str]: if not python_path.is_absolute(): python_path = Path.cwd() / python_path # -u: unbuffered stdout/stderr so subprocess.run captures all output - return [str(python_path), "-u", str(script_path)] + command = [str(python_path), "-u", str(script_path)] + if args: + command.extend(args) + return command @staticmethod def _result_from_completed( diff --git a/researchclaw/experiment/ssh_sandbox.py b/researchclaw/experiment/ssh_sandbox.py index aad97fca..ec5026da 100644 --- a/researchclaw/experiment/ssh_sandbox.py +++ b/researchclaw/experiment/ssh_sandbox.py @@ -71,6 +71,8 @@ def run_project( *, entry_point: str = "main.py", timeout_sec: int = 300, + args: list[str] | None = None, + env_overrides: dict[str, str] | None = None, ) -> SandboxResult: """Run a multi-file experiment project on the remote host.""" self._run_counter += 1 @@ -119,7 +121,13 @@ def run_project( metrics={}, ) - return self._execute(staging, entry_point=entry_point, timeout_sec=timeout_sec) + return self._execute( + staging, + entry_point=entry_point, + timeout_sec=timeout_sec, + entry_args=args, + env_overrides=env_overrides, + ) # ------------------------------------------------------------------ # Static helpers @@ -158,7 +166,13 @@ def _inject_harness(target_dir: Path) -> None: # ------------------------------------------------------------------ def _execute( - self, staging_dir: Path, *, entry_point: str, timeout_sec: int + self, + staging_dir: Path, + *, + entry_point: str, + timeout_sec: int, + entry_args: list[str] | None = None, + env_overrides: dict[str, str] | None = None, ) -> SandboxResult: """Core execution flow for remote experiments. @@ -213,11 +227,17 @@ def _execute( # 4. 
Execute experiment if cfg.use_docker: exec_cmd = self._build_docker_exec_cmd( - remote_dir, entry_point=entry_point, + remote_dir, + entry_point=entry_point, + args=entry_args, + env_overrides=env_overrides, ) else: exec_cmd = self._build_bare_exec_cmd( - remote_dir, entry_point=entry_point, + remote_dir, + entry_point=entry_point, + args=entry_args, + env_overrides=env_overrides, ) start = time.monotonic() @@ -242,13 +262,26 @@ def _execute( ) def _build_bare_exec_cmd( - self, remote_dir: str, *, entry_point: str, + self, + remote_dir: str, + *, + entry_point: str, + args: list[str] | None = None, + env_overrides: dict[str, str] | None = None, ) -> str: """Build command to run Python directly on remote host (with basic sandboxing).""" cfg = self.config rd = shlex.quote(remote_dir) ep = shlex.quote(entry_point) py = shlex.quote(cfg.remote_python) + arg_text = " ".join(shlex.quote(arg) for arg in (args or [])) + arg_suffix = f" {arg_text}" if arg_text else "" + env_parts = [ + f"{name}={shlex.quote(value)}" + for name, value in sorted((env_overrides or {}).items()) + if value + ] + env_prefix = (" ".join(env_parts) + " ") if env_parts else "" gpu_env = "" if cfg.gpu_ids: @@ -264,17 +297,24 @@ def _build_bare_exec_cmd( f"if command -v unshare >/dev/null 2>&1; then " f"HOME={rd} " f"{gpu_env}" - f"unshare --net {py} -u {ep}; " + f"{env_prefix}" + f"unshare --net {py} -u {ep}{arg_suffix}; " f"else " f"echo 'WARNING: unshare not available, running without network isolation' >&2; " f"HOME={rd} " f"{gpu_env}" - f"{py} -u {ep}; " + f"{env_prefix}" + f"{py} -u {ep}{arg_suffix}; " f"fi" ) def _build_docker_exec_cmd( - self, remote_dir: str, *, entry_point: str, + self, + remote_dir: str, + *, + entry_point: str, + args: list[str] | None = None, + env_overrides: dict[str, str] | None = None, ) -> str: """Build command to run inside a Docker container on the remote host. 
@@ -307,8 +347,16 @@ def _build_docker_exec_cmd( # Try to pass all GPUs; fails gracefully if none available parts.extend(["--gpus", "all"]) + if env_overrides: + for name, value in sorted(env_overrides.items()): + if not value: + continue + parts.extend(["-e", shlex.quote(f"{name}={value}")]) + parts.append(shlex.quote(cfg.docker_image)) parts.extend(["python3", "-u", shlex.quote(entry_point)]) + if args: + parts.extend(shlex.quote(arg) for arg in args) return " ".join(parts) diff --git a/researchclaw/llm/acp_client.py b/researchclaw/llm/acp_client.py index d5e13bc4..f2fdd1d6 100644 --- a/researchclaw/llm/acp_client.py +++ b/researchclaw/llm/acp_client.py @@ -10,13 +10,17 @@ from __future__ import annotations import atexit +import json import logging import os +from pathlib import Path import re import shutil import subprocess import sys import tempfile +import time +import uuid import weakref from dataclasses import dataclass from typing import Any @@ -41,6 +45,13 @@ class ACPConfig: acpx_command: str = "" # auto-detect if empty session_name: str = "researchclaw" timeout_sec: int = 1800 # per-prompt timeout + verbose: bool = False + stateless_prompt: bool = False + reconnect_retries: int = 2 + reconnect_backoff_sec: float = 2.0 + capture_status_on_failure: bool = False + debug_log_path: str = "" + archive_failed_prompt_files: bool = False def _find_acpx() -> str | None: @@ -90,6 +101,13 @@ def from_rc_config(cls, rc_config: Any) -> ACPClient: acpx_command=getattr(acp, "acpx_command", ""), session_name=getattr(acp, "session_name", "researchclaw"), timeout_sec=getattr(acp, "timeout_sec", 1800), + verbose=getattr(acp, "verbose", False), + stateless_prompt=getattr(acp, "stateless_prompt", False), + reconnect_retries=getattr(acp, "reconnect_retries", 2), + reconnect_backoff_sec=getattr(acp, "reconnect_backoff_sec", 2.0), + capture_status_on_failure=getattr(acp, "capture_status_on_failure", False), + debug_log_path=getattr(acp, "debug_log_path", ""), + 
archive_failed_prompt_files=getattr(acp, "archive_failed_prompt_files", False), )) # ------------------------------------------------------------------ @@ -137,6 +155,8 @@ def preflight(self) -> tuple[bool, str]: agent = self.config.agent if not shutil.which(agent): return False, f"ACP agent CLI not found: {agent!r} (not on PATH)" + if self.config.stateless_prompt: + return True, f"OK - ACP stateless prompt mode ready ({agent} via acpx)" # Create the session try: self._ensure_session() @@ -146,6 +166,9 @@ def preflight(self) -> tuple[bool, str]: def close(self) -> None: """Close the acpx session.""" + if self.config.stateless_prompt: + self._session_ready = False + return if not self._session_ready: return acpx = self._resolve_acpx() @@ -153,9 +176,12 @@ def close(self) -> None: return try: subprocess.run( - [acpx, "--ttl", "0", "--cwd", self._abs_cwd(), - self.config.agent, "sessions", "close", - self.config.session_name], + [ + *self._acpx_base_command(acpx, approve_all=False), + "sessions", + "close", + self.config.session_name, + ], capture_output=True, timeout=15, ) except Exception: # noqa: BLE001 @@ -195,6 +221,128 @@ def _resolve_acpx(self) -> str | None: def _abs_cwd(self) -> str: return os.path.abspath(self.config.cwd) + def _acpx_base_command(self, acpx: str, *, approve_all: bool) -> list[str]: + cmd = [acpx] + if self.config.verbose: + cmd.append("--verbose") + if approve_all: + cmd.append("--approve-all") + cmd.extend(["--ttl", "0", "--cwd", self._abs_cwd(), self.config.agent]) + return cmd + + def _debug_log_path(self) -> Path | None: + raw = str(getattr(self.config, "debug_log_path", "") or "").strip() + if not raw: + return None + return Path(raw) + + @staticmethod + def _debug_timestamp() -> str: + return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()) + + def _append_debug_event(self, event: str, **payload: Any) -> None: + record = { + "ts": self._debug_timestamp(), + "event": event, + **payload, + } + serialized = json.dumps(record, 
ensure_ascii=False, sort_keys=True) + logger.info("ACP_DEBUG %s", serialized) + debug_path = self._debug_log_path() + if debug_path is None: + return + try: + debug_path.parent.mkdir(parents=True, exist_ok=True) + with debug_path.open("a", encoding="utf-8") as handle: + handle.write(serialized + "\n") + except Exception as exc: # noqa: BLE001 + logger.warning("Failed to append ACP debug log %s: %s", debug_path, exc) + + def _archive_prompt_file(self, prompt_path: str, *, session_name: str) -> str: + if not self.config.archive_failed_prompt_files: + return "" + source_path = Path(prompt_path) + if not source_path.exists(): + return "" + debug_path = self._debug_log_path() + base_dir = debug_path.parent if debug_path is not None else Path(self._abs_cwd()) + archive_dir = base_dir / "acp_failed_prompts" + archive_dir.mkdir(parents=True, exist_ok=True) + target_name = f"{session_name}-{source_path.name}" + target_path = archive_dir / target_name + shutil.copy2(source_path, target_path) + return str(target_path) + + def _status_command(self, acpx: str, session_name: str) -> list[str]: + cmd = self._acpx_base_command(acpx, approve_all=False) + cmd.extend(["status", "-s", session_name]) + return cmd + + def _capture_session_status(self, acpx: str, session_name: str) -> str: + if not self.config.capture_status_on_failure: + return "" + try: + result = subprocess.run( + self._status_command(acpx, session_name), + capture_output=True, + text=True, + encoding="utf-8", + errors="replace", + timeout=15, + ) + except Exception as exc: # noqa: BLE001 + return f"" + + stdout = (result.stdout or "").strip() + stderr = (result.stderr or "").strip() + chunks = [f"exit={result.returncode}"] + if stdout: + chunks.append(f"stdout:\n{stdout}") + if stderr: + chunks.append(f"stderr:\n{stderr}") + return "\n".join(chunks) + + def _record_failure_context( + self, + *, + acpx: str, + session_name: str, + transport: str, + prompt_bytes: int, + prompt_limit: int, + use_file: bool, + 
error_text: str, + prompt_path: str | None = None, + returncode: int | None = None, + timed_out: bool = False, + ) -> None: + archived_prompt_path = "" + if prompt_path: + try: + archived_prompt_path = self._archive_prompt_file( + prompt_path, + session_name=session_name, + ) + except Exception as exc: # noqa: BLE001 + archived_prompt_path = "" + logger.warning("Failed to archive ACP prompt file %s: %s", prompt_path, exc) + + status_text = self._capture_session_status(acpx, session_name) + self._append_debug_event( + "prompt_failure", + session_name=session_name, + transport=transport, + prompt_bytes=prompt_bytes, + prompt_limit=prompt_limit, + use_file=use_file, + prompt_path=prompt_path or "", + archived_prompt_path=archived_prompt_path, + returncode=returncode, + timed_out=timed_out, + error=error_text, + session_status=status_text, + ) + def _ensure_session(self) -> None: """Find or create the named acpx session.""" if self._session_ready: @@ -202,28 +350,7 @@ def _ensure_session(self) -> None: acpx = self._resolve_acpx() if not acpx: raise RuntimeError("acpx not found") - - # Use 'ensure' which finds existing or creates new - result = subprocess.run( - [acpx, "--ttl", "0", "--cwd", self._abs_cwd(), - self.config.agent, "sessions", "ensure", - "--name", self.config.session_name], - capture_output=True, text=True, encoding="utf-8", - errors="replace", timeout=30, - ) - if result.returncode != 0: - # Fall back to 'new' - result = subprocess.run( - [acpx, "--ttl", "0", "--cwd", self._abs_cwd(), - self.config.agent, "sessions", "new", - "--name", self.config.session_name], - capture_output=True, text=True, encoding="utf-8", - errors="replace", timeout=30, - ) - if result.returncode != 0: - raise RuntimeError( - f"Failed to create ACP session: {result.stderr.strip()}" - ) + self._create_or_ensure_session(acpx, self.config.session_name, ensure=True) self._session_ready = True logger.info("ACP session '%s' ready (%s)", self.config.session_name, self.config.agent) @@ 
-231,7 +358,7 @@ def _ensure_session(self) -> None: # for the entire command line, not just the prompt payload. acpx adds # several fixed arguments plus quoting overhead, so leave generous headroom # on Windows and switch to temp-file transport earlier. - _MAX_CLI_PROMPT_BYTES = 20_000 if sys.platform == "win32" else 100_000 + _MAX_CLI_PROMPT_BYTES = 6_000 # On Windows, npm-installed CLIs usually resolve to ``.cmd`` launchers, # which are routed through ``cmd.exe`` and hit a much smaller practical # command-line limit (~8 KB). Use file transport much earlier there. @@ -250,9 +377,9 @@ def _ensure_session(self) -> None: _RECONNECT_ERRORS = ( "agent needs reconnect", "session not found", - "Query closed", + "query closed", + "queue owner disconnected before prompt completion", ) - _MAX_RECONNECT_ATTEMPTS = 2 @classmethod def _cli_prompt_limit(cls, acpx: str | None) -> int: @@ -264,6 +391,18 @@ def _cli_prompt_limit(cls, acpx: str | None) -> int: return min(limit, cls._MAX_CMD_WRAPPER_PROMPT_BYTES) return limit + @staticmethod + def _sanitize_prompt(prompt: str) -> str: + """Strip NUL bytes before subprocess transport. + + ``subprocess.run()`` rejects arguments containing ``\\x00`` with + ``ValueError: embedded null byte``. This can happen when upstream + scraping or artifact text accidentally carries NULs into the prompt. + """ + if "\x00" not in prompt: + return prompt + return prompt.replace("\x00", "") + def _send_prompt(self, prompt: str) -> str: """Send a prompt via acpx and return the response text. @@ -272,12 +411,14 @@ def _send_prompt(self, prompt: str) -> str: is asked to read it. If the session has died (common after long-running stages), retries - up to ``_MAX_RECONNECT_ATTEMPTS`` times with automatic reconnection. + up to the configured reconnect retry count with automatic reconnection. 
""" acpx = self._resolve_acpx() if not acpx: raise RuntimeError("acpx not found") + prompt = self._sanitize_prompt(prompt) + prompt_bytes = len(prompt.encode("utf-8")) prompt_limit = self._cli_prompt_limit(acpx) use_file = prompt_bytes > prompt_limit @@ -288,13 +429,123 @@ def _send_prompt(self, prompt: str) -> str: prompt_limit, ) + if self.config.stateless_prompt: + last_exc: RuntimeError | None = None + for attempt in range(1 + self.config.reconnect_retries): + session_name = self._new_ephemeral_session(acpx) + self._append_debug_event( + "prompt_attempt", + session_name=session_name, + stateless=True, + attempt=attempt + 1, + max_attempts=1 + self.config.reconnect_retries, + prompt_bytes=prompt_bytes, + prompt_limit=prompt_limit, + use_file=use_file, + ) + try: + if use_file: + return self._send_prompt_via_file( + acpx, + prompt, + session_name=session_name, + prompt_bytes=prompt_bytes, + prompt_limit=prompt_limit, + ) + return self._send_prompt_cli( + acpx, + prompt, + session_name=session_name, + prompt_bytes=prompt_bytes, + prompt_limit=prompt_limit, + ) + except OSError as os_exc: + if not use_file: + logger.warning( + "Stateless ACP subprocess raised OSError, " + "falling back to temp file: %s", + os_exc, + ) + use_file = True + return self._send_prompt_via_file( + acpx, + prompt, + session_name=session_name, + prompt_bytes=prompt_bytes, + prompt_limit=prompt_limit, + ) + raise RuntimeError( + f"ACP prompt failed: {os_exc}" + ) from os_exc + except RuntimeError as exc: + exc_lower = str(exc).lower() + if not use_file and any( + h in exc_lower for h in self._CMD_TOO_LONG_HINTS + ): + logger.warning( + "Stateless ACP prompt too long for OS, " + "falling back to temp file: %s", + exc, + ) + use_file = True + return self._send_prompt_via_file( + acpx, + prompt, + session_name=session_name, + ) + if not self._is_reconnect_error(exc): + raise + last_exc = exc + if attempt < self.config.reconnect_retries: + self._append_debug_event( + "prompt_retrying", + 
session_name=session_name, + stateless=True, + attempt=attempt + 1, + remaining_retries=self.config.reconnect_retries - attempt, + error=str(exc), + ) + logger.warning( + "Stateless ACP session died (%s), retrying " + "with a fresh ephemeral session (attempt %d/%d)...", + exc, + attempt + 1, + self.config.reconnect_retries, + ) + self._sleep_before_retry() + continue + finally: + self._close_named_session(acpx, session_name) + + raise last_exc # type: ignore[misc] + last_exc: RuntimeError | None = None - for attempt in range(1 + self._MAX_RECONNECT_ATTEMPTS): + for attempt in range(1 + self.config.reconnect_retries): self._ensure_session() + self._append_debug_event( + "prompt_attempt", + session_name=self.config.session_name, + stateless=False, + attempt=attempt + 1, + max_attempts=1 + self.config.reconnect_retries, + prompt_bytes=prompt_bytes, + prompt_limit=prompt_limit, + use_file=use_file, + ) try: if use_file: - return self._send_prompt_via_file(acpx, prompt) - return self._send_prompt_cli(acpx, prompt) + return self._send_prompt_via_file( + acpx, + prompt, + prompt_bytes=prompt_bytes, + prompt_limit=prompt_limit, + ) + return self._send_prompt_cli( + acpx, + prompt, + prompt_bytes=prompt_bytes, + prompt_limit=prompt_limit, + ) except OSError as os_exc: # OS-level failure (e.g., Windows CreateProcess arg limit). # Fall back to temp-file transport automatically. 
@@ -305,7 +556,12 @@ def _send_prompt(self, prompt: str) -> str: os_exc, ) use_file = True - return self._send_prompt_via_file(acpx, prompt) + return self._send_prompt_via_file( + acpx, + prompt, + prompt_bytes=prompt_bytes, + prompt_limit=prompt_limit, + ) raise RuntimeError( f"ACP prompt failed: {os_exc}" ) from os_exc @@ -322,17 +578,26 @@ def _send_prompt(self, prompt: str) -> str: ) use_file = True return self._send_prompt_via_file(acpx, prompt) - if not any(pat in str(exc) for pat in self._RECONNECT_ERRORS): + if not self._is_reconnect_error(exc): raise last_exc = exc - if attempt < self._MAX_RECONNECT_ATTEMPTS: + if attempt < self.config.reconnect_retries: + self._append_debug_event( + "prompt_retrying", + session_name=self.config.session_name, + stateless=False, + attempt=attempt + 1, + remaining_retries=self.config.reconnect_retries - attempt, + error=str(exc), + ) logger.warning( "ACP session died (%s), reconnecting (attempt %d/%d)...", exc, attempt + 1, - self._MAX_RECONNECT_ATTEMPTS, + self.config.reconnect_retries, ) self._force_reconnect() + self._sleep_before_retry() raise last_exc # type: ignore[misc] @@ -344,35 +609,97 @@ def _force_reconnect(self) -> None: pass self._session_ready = False - def _send_prompt_cli(self, acpx: str, prompt: str) -> str: + def _is_reconnect_error(self, exc: Exception) -> bool: + text = str(exc).lower() + return any(pattern in text for pattern in self._RECONNECT_ERRORS) + + def _sleep_before_retry(self) -> None: + delay = max(float(getattr(self.config, "reconnect_backoff_sec", 0.0) or 0.0), 0.0) + if delay > 0: + time.sleep(delay) + + def _send_prompt_cli( + self, + acpx: str, + prompt: str, + *, + session_name: str | None = None, + prompt_bytes: int, + prompt_limit: int, + ) -> str: """Send prompt as a CLI argument (original path).""" + active_session = session_name or self.config.session_name try: result = subprocess.run( - [acpx, "--approve-all", "--ttl", "0", "--cwd", self._abs_cwd(), - self.config.agent, "-s", 
self.config.session_name, - prompt], + self._prompt_command(acpx, prompt, session_name=active_session), capture_output=True, text=True, encoding="utf-8", errors="replace", timeout=self.config.timeout_sec, ) except subprocess.TimeoutExpired as exc: + self._record_failure_context( + acpx=acpx, + session_name=active_session, + transport="cli", + prompt_bytes=prompt_bytes, + prompt_limit=prompt_limit, + use_file=False, + error_text=f"ACP prompt timed out after {self.config.timeout_sec}s", + timed_out=True, + ) raise RuntimeError( f"ACP prompt timed out after {self.config.timeout_sec}s" ) from exc if result.returncode != 0: stderr = (result.stderr or "").strip() + self._record_failure_context( + acpx=acpx, + session_name=active_session, + transport="cli", + prompt_bytes=prompt_bytes, + prompt_limit=prompt_limit, + use_file=False, + error_text=stderr, + returncode=result.returncode, + ) raise RuntimeError(f"ACP prompt failed (exit {result.returncode}): {stderr}") - return self._extract_response(result.stdout) + response = self._extract_response(result.stdout) + self._append_debug_event( + "prompt_success", + session_name=active_session, + transport="cli", + prompt_bytes=prompt_bytes, + use_file=False, + response_bytes=len(response.encode("utf-8")), + ) + return response - def _send_prompt_via_file(self, acpx: str, prompt: str) -> str: + def _send_prompt_via_file( + self, + acpx: str, + prompt: str, + *, + session_name: str | None = None, + prompt_bytes: int, + prompt_limit: int, + ) -> str: """Write prompt to a temp file, ask the agent to read and respond.""" fd, prompt_path = tempfile.mkstemp( suffix=".md", prefix="rc_prompt_", ) + active_session = session_name or self.config.session_name try: with os.fdopen(fd, "w", encoding="utf-8") as f: f.write(prompt) + self._append_debug_event( + "prompt_file_written", + session_name=active_session, + transport="file", + prompt_bytes=prompt_bytes, + prompt_limit=prompt_limit, + prompt_path=prompt_path, + ) short_prompt = ( f"Read 
the file at {prompt_path} in its entirety. " @@ -383,30 +710,134 @@ def _send_prompt_via_file(self, acpx: str, prompt: str) -> str: try: result = subprocess.run( - [acpx, "--approve-all", "--ttl", "0", "--cwd", self._abs_cwd(), - self.config.agent, "-s", self.config.session_name, - short_prompt], + self._prompt_command(acpx, short_prompt, session_name=active_session), capture_output=True, text=True, encoding="utf-8", errors="replace", timeout=self.config.timeout_sec, ) except subprocess.TimeoutExpired as exc: + self._record_failure_context( + acpx=acpx, + session_name=active_session, + transport="file", + prompt_bytes=prompt_bytes, + prompt_limit=prompt_limit, + use_file=True, + error_text=f"ACP prompt timed out after {self.config.timeout_sec}s", + prompt_path=prompt_path, + timed_out=True, + ) raise RuntimeError( f"ACP prompt timed out after {self.config.timeout_sec}s" ) from exc if result.returncode != 0: stderr = (result.stderr or "").strip() + self._record_failure_context( + acpx=acpx, + session_name=active_session, + transport="file", + prompt_bytes=prompt_bytes, + prompt_limit=prompt_limit, + use_file=True, + error_text=stderr, + prompt_path=prompt_path, + returncode=result.returncode, + ) raise RuntimeError( f"ACP prompt failed (exit {result.returncode}): {stderr}" ) - return self._extract_response(result.stdout) + response = self._extract_response(result.stdout) + self._append_debug_event( + "prompt_success", + session_name=active_session, + transport="file", + prompt_bytes=prompt_bytes, + use_file=True, + prompt_path=prompt_path, + response_bytes=len(response.encode("utf-8")), + ) + return response finally: try: os.unlink(prompt_path) except OSError: pass + def _prompt_command( + self, + acpx: str, + prompt: str, + *, + session_name: str | None = None, + ) -> list[str]: + """Build the acpx prompt command for session or stateless mode.""" + cmd = self._acpx_base_command(acpx, approve_all=True) + cmd.append("prompt") + active_session = session_name or 
self.config.session_name + cmd.extend(["-s", active_session]) + cmd.append(prompt) + return cmd + + def _create_or_ensure_session( + self, + acpx: str, + session_name: str, + *, + ensure: bool, + ) -> None: + action = "ensure" if ensure else "new" + result = subprocess.run( + [ + *self._acpx_base_command(acpx, approve_all=False), + "sessions", + action, + "--name", + session_name, + ], + capture_output=True, text=True, encoding="utf-8", + errors="replace", timeout=30, + ) + if result.returncode == 0: + self._append_debug_event( + "session_ready", + session_name=session_name, + ensure=ensure, + stateless=self.config.stateless_prompt, + ) + return + if ensure: + self._create_or_ensure_session(acpx, session_name, ensure=False) + return + raise RuntimeError( + f"Failed to create ACP session: {(result.stderr or '').strip()}" + ) + + def _new_ephemeral_session(self, acpx: str) -> str: + session_name = f"{self.config.session_name}-{uuid.uuid4().hex[:8]}" + self._create_or_ensure_session(acpx, session_name, ensure=False) + logger.info("ACP ephemeral session '%s' ready (%s)", session_name, self.config.agent) + return session_name + + def _close_named_session(self, acpx: str, session_name: str) -> None: + try: + subprocess.run( + [ + *self._acpx_base_command(acpx, approve_all=False), + "sessions", + "close", + session_name, + ], + capture_output=True, timeout=15, + ) + self._append_debug_event( + "session_closed", + session_name=session_name, + stateless=self.config.stateless_prompt, + ) + except Exception: # noqa: BLE001 + pass + @staticmethod def _extract_response(raw_output: str | None) -> str: """Extract the agent's actual response from acpx output. diff --git a/researchclaw/pipeline/_helpers.py b/researchclaw/pipeline/_helpers.py index 74eda81d..ea49a2e2 100644 --- a/researchclaw/pipeline/_helpers.py +++ b/researchclaw/pipeline/_helpers.py @@ -42,6 +42,70 @@ class StageResult: evidence_refs: tuple[str, ...] 
= () +def detect_synthetic_proxy_signals(file_texts: dict[str, str]) -> list[str]: + """Heuristically detect toy/proxy dataset generation in experiment code.""" + if not file_texts: + return [] + + combined = "\n\n".join(file_texts.values()) + combined_lower = combined.lower() + signals: list[str] = [] + + if "class cachedevidencerepository" in combined_lower: + signals.append( + "contains `CachedEvidenceRepository`, a repository-local synthetic evidence scaffold" + ) + if re.search(r"def\s+_build_example\s*\(", combined): + signals.append("contains `_build_example(...)`, suggesting in-code sample synthesis") + if re.search(r"def\s+_build_splits\s*\(", combined): + signals.append("contains `_build_splits(...)`, suggesting in-code dataset assembly") + if re.search(r"def\s+_sample_circle\s*\(", combined): + signals.append("contains `_sample_circle(...)`, suggesting synthetic circle generation") + + split_match = re.search( + r"_SPLIT_SIZES\s*=\s*\{[^}]*['\"]train['\"]\s*:\s*(\d+)" + r"[^}]*['\"]val['\"]\s*:\s*(\d+)" + r"[^}]*['\"]test['\"]\s*:\s*(\d+)", + combined, + re.DOTALL, + ) + if split_match: + split_sizes = tuple(int(split_match.group(i)) for i in range(1, 4)) + if sum(split_sizes) <= 500: + signals.append( + "contains hard-coded tiny split sizes " + f"train/val/test={split_sizes[0]}/{split_sizes[1]}/{split_sizes[2]}" + ) + + for phrase in ( + "toy dataset", + "proxy dataset", + "synthetic benchmark", + "repository-local benchmark", + ): + if phrase in combined_lower: + signals.append(f"contains suspicious phrase `{phrase}`") + + return signals + + +def should_fail_synthetic_proxy_guard(signals: list[str]) -> bool: + """Return True when synthetic/proxy signals are strong enough to hard-fail.""" + if not signals: + return False + + strong_markers = ("CachedEvidenceRepository", "hard-coded tiny split sizes") + if any(any(marker in signal for marker in strong_markers) for signal in signals): + return True + + has_build_example = any("_build_example" in signal 
for signal in signals) + has_build_splits = any("_build_splits" in signal for signal in signals) + if has_build_example and has_build_splits: + return True + + return len(signals) >= 3 + + # --------------------------------------------------------------------------- # Constants # --------------------------------------------------------------------------- @@ -235,7 +299,12 @@ def _build_fallback_queries(topic: str) -> list[str]: def _write_stage_meta( stage_dir: Path, stage: Stage, run_id: str, result: "StageResult" ) -> None: - next_stage = NEXT_STAGE[stage] + if result.status is StageStatus.DONE: + next_stage = NEXT_STAGE[stage] + else: + # Failed / paused / blocked stages should point back to themselves so + # retry-resume tooling does not imply that the pipeline advanced. + next_stage = stage meta = { "stage_id": f"{int(stage):02d}-{stage.name.lower()}", "run_id": run_id, @@ -371,6 +440,79 @@ def _load_hardware_profile(run_dir: Path) -> dict[str, Any] | None: return None +def _load_research_repair_metadata(run_dir: Path) -> dict[str, Any] | None: + """Load child-run repair metadata when this run was created via research-repair.""" + metadata_path = run_dir / "research_repair_parent.json" + if not metadata_path.exists(): + return None + try: + data = json.loads(metadata_path.read_text(encoding="utf-8")) + except (json.JSONDecodeError, OSError, ValueError): + return None + return data if isinstance(data, dict) else None + + +def _build_research_repair_brief( + run_dir: Path, + *, + max_feedback_items: int = 5, +) -> str: + """Return a compact repair brief for prompt injection. + + The child-run repair metadata is authoritative for repair semantics. + Use this short block instead of repeating the full repair narrative in + `research.topic`. 
+ """ + metadata = _load_research_repair_metadata(run_dir) + if not metadata: + return "" + + compact = str(metadata.get("compact_repair_brief", "")).strip() + if compact: + return compact + + parent_run_id = str(metadata.get("parent_run_id", "")).strip() + target_stage_name = str(metadata.get("target_stage_name", "")).strip() + repair_reason = " ".join( + str(metadata.get("repair_reason", "")).split() + ).strip() + reuse_policy = metadata.get("reuse_policy") + hard_reuse: list[str] = [] + if isinstance(reuse_policy, dict): + hard_reuse = [ + str(item).strip() + for item in reuse_policy.get("hard_reuse_stage_dirs", []) + if str(item).strip() + ] + + feedback_excerpt = str(metadata.get("feedback_excerpt", "")).strip() + feedback_items: list[str] = [] + for raw_line in feedback_excerpt.splitlines(): + line = raw_line.strip() + if not line.startswith("- "): + continue + payload = line[2:].strip() + lowered = payload.lower() + if lowered.startswith(("parent run:", "target stage:", "reason:")): + continue + feedback_items.append(" ".join(payload.split())) + if len(feedback_items) >= max_feedback_items: + break + + lines = ["## Repair Context"] + if parent_run_id: + lines.append(f"- Parent run: `{parent_run_id}`") + if target_stage_name: + lines.append(f"- Authoritative rerun starts at: `{target_stage_name}`") + if repair_reason: + lines.append(f"- Repair reason: {repair_reason}") + if hard_reuse: + lines.append("- Hard reuse: " + ", ".join(hard_reuse)) + lines.append("- Downstream parent analysis and paper artifacts are soft context only.") + lines.extend(f"- {item}" for item in feedback_items) + return "\n".join(lines).strip() + + # --------------------------------------------------------------------------- # Parsing utilities # --------------------------------------------------------------------------- diff --git a/researchclaw/pipeline/runner.py b/researchclaw/pipeline/runner.py index b81ffdb9..558b026d 100644 --- a/researchclaw/pipeline/runner.py +++ 
b/researchclaw/pipeline/runner.py @@ -47,6 +47,9 @@ def _build_pipeline_summary( "run_id": run_id, "stages_executed": len(results), "stages_done": sum(1 for item in results if item.status == StageStatus.DONE), + "stages_paused": sum( + 1 for item in results if item.status == StageStatus.PAUSED + ), "stages_blocked": sum( 1 for item in results if item.status == StageStatus.BLOCKED_APPROVAL ), @@ -463,6 +466,9 @@ def execute_pipeline( elif result.status == StageStatus.FAILED: err = result.error or "unknown error" print(f"{prefix} {stage.name} — FAILED ({elapsed:.1f}s) — {err}") + elif result.status == StageStatus.PAUSED: + err = result.error or "paused" + print(f"{prefix} {stage.name} — PAUSED ({elapsed:.1f}s) — {err}") elif result.status == StageStatus.BLOCKED_APPROVAL: print(f"{prefix} {stage.name} — blocked (awaiting approval)") results.append(result) @@ -604,6 +610,14 @@ def execute_pipeline( logger.warning("Noncritical stage %s failed - skipping", stage.name) else: break + if result.status == StageStatus.PAUSED: + logger.warning( + "[%s] Pipeline paused at %s: %s", + run_id, + stage.name, + result.error or result.decision, + ) + break if result.status == StageStatus.BLOCKED_APPROVAL and stop_on_gate: break diff --git a/researchclaw/pipeline/stage_impls/_code_generation.py b/researchclaw/pipeline/stage_impls/_code_generation.py index e5f21ddc..2f65a90b 100644 --- a/researchclaw/pipeline/stage_impls/_code_generation.py +++ b/researchclaw/pipeline/stage_impls/_code_generation.py @@ -2,12 +2,16 @@ from __future__ import annotations +import ast +import importlib.util import json import logging import re from pathlib import Path from typing import Any +import yaml + from researchclaw.adapters import AdapterBundle from researchclaw.config import RCConfig from researchclaw.experiment.validator import ( @@ -19,7 +23,9 @@ from researchclaw.pipeline._domain import _detect_domain from researchclaw.pipeline._helpers import ( StageResult, + _build_research_repair_brief, 
_LIKELY_LOCAL_HELPER_MODULES = {
    # Top-level module names that, when imported by generated experiment
    # code, almost always refer to a sibling project file rather than a
    # pip-installed package. Consumed by _find_missing_local_module_imports.
    "backbone",
    "backbones",
    "config",
    "configs",
    "constants",
    "data_loader",
    "data_utils",
    "dataloader",
    "dataset",
    "datasets",
    "decoder",
    "decoders",
    "encoder",
    "encoders",
    "helper",
    "helpers",
    "layer",
    "layers",
    "loader",
    "loaders",
    "loss",
    "losses",
    "metric",
    "metrics",
    "model",
    "models",
    "module",
    "modules",
    "network",
    "networks",
    "postprocess",
    "postprocessing",
    "preprocess",
    "preprocessing",
    "train_utils",
    "trainer",
    "trainers",
    "transform",
    "transforms",
    "util",
    "utils",
}

# Phrases that openly advertise a stubbed-out / demonstration experiment.
_PLACEHOLDER_EXPERIMENT_PATTERNS = (
    "dummy implementation",
    "dummy implementations",
    "dummy placeholder",
    "placeholder implementation",
    "replace with actual implementation",
    "replace with actual implementations",
    "for standalone operation",
    "for demonstration",
)

# Class-name fragments that suggest a class is an experiment condition.
_EXPERIMENT_CLASS_NAME_HINTS = (
    "ablation",
    "baseline",
    "detector",
    "fusion",
    "model",
    "reranker",
    "verifier",
)

# Method names treated as the "core" computation of an experiment class.
_CORE_EXPERIMENT_METHODS = {
    "evaluate",
    "forward",
    "predict",
    "run",
    "score",
    "train_step",
}

# Label/class-name fragments that mark a condition entry as an ablation.
_ABLATION_NAME_HINTS = (
    "without",
    "ablation",
    "abl_",
    "no_",
    "minus",
)

# Function-name fragments identifying an ablation distinctness self-check.
_DISTINCTNESS_CHECK_NAME_HINTS = (
    "ablation_check",
    "condition_outputs_differ",
    "distinctness",
    "outputs_differ",
    "sanity_check_condition",
    "verify_condition",
)

# Substrings of deep-review warnings that are severe enough to hard-fail.
_CRITICAL_DEEP_KEYWORDS = (
    "unboundlocalerror",
    "unregistered",
    "does not exist",
    "empty or trivial subclass",
    "does not override",
    "import-usage mismatch",
    "nameerror",
    "was removed",
    "ptp()",
    "copy-paste",
    "identical method signatures",
    "identical ast",
    "not a real ablation",
    "shadows stdlib/pip",
    "placeholder experiment text found",
    "placeholder experiment implementation",
    "fixed-constant core method",
    "demonstration stub",
    "no ablation/condition distinctness self-check",
    "distinctness self-check",
    "does not call distinctness check",
)


def _find_missing_local_module_imports(files: dict[str, str]) -> list[str]:
    """Detect local helper-module imports that are not present in *files*.

    This is intentionally narrower than generic import validation: we only flag
    imports that strongly indicate an intra-project Python module dependency.

    Args:
        files: Mapping of generated file name -> file contents.

    Returns:
        Human-readable issue strings, one per unique (file, module, line).
    """
    known_modules = {
        fname[:-3]
        for fname in files
        if fname.endswith(".py")
    }
    issues: list[str] = []
    seen: set[tuple[str, str, int | None]] = set()

    def _record_issue(
        file_name: str,
        module_name: str,
        *,
        line: int | None,
    ) -> None:
        # Skip modules that were actually generated and private names.
        if (
            module_name in known_modules
            or module_name.startswith("_")
        ):
            return
        key = (file_name, module_name, line)
        if key in seen:
            return
        seen.add(key)
        # FIX: include "at" inside the conditional fragment so the message
        # stays grammatical when the line number is unknown (previously it
        # read "... is imported at but was not generated.").
        location = f" at line {line}" if line is not None else ""
        issues.append(
            f"[{file_name}] Local helper module '{module_name}.py' is imported"
            f"{location} but was not generated. The experiment project must be"
            f" self-contained: either return '{module_name}.py' or inline its"
            f" code and remove the import."
        )

    for fname, code in files.items():
        if not fname.endswith(".py"):
            continue
        try:
            tree = ast.parse(code)
        except SyntaxError:
            # Syntax problems are surfaced by other validators.
            continue
        for node in ast.walk(tree):
            if isinstance(node, ast.Import):
                for alias in node.names:
                    top = alias.name.split(".")[0]
                    if top in _LIKELY_LOCAL_HELPER_MODULES:
                        _record_issue(
                            fname,
                            top,
                            line=getattr(node, "lineno", None),
                        )
            elif isinstance(node, ast.ImportFrom):
                line = getattr(node, "lineno", None)
                if node.level > 0:
                    # Any relative import is local by definition.
                    if node.module:
                        top = node.module.split(".")[0]
                        _record_issue(fname, top, line=line)
                    else:
                        for alias in node.names:
                            top = alias.name.split(".")[0]
                            _record_issue(fname, top, line=line)
                elif node.module:
                    top = node.module.split(".")[0]
                    if top in _LIKELY_LOCAL_HELPER_MODULES:
                        _record_issue(fname, top, line=line)

    return issues


def _strip_docstring(body: list[ast.stmt]) -> list[ast.stmt]:
    """Return *body* without a leading string-literal docstring statement."""
    if (
        body
        and isinstance(body[0], ast.Expr)
        and isinstance(body[0].value, ast.Constant)
        and isinstance(body[0].value.value, str)
    ):
        return body[1:]
    return body


def _is_literal_constant(node: ast.AST | None) -> bool:
    """True when *node* is a compile-time literal (possibly nested/signed)."""
    if node is None:
        return True
    if isinstance(node, ast.Constant):
        return True
    if isinstance(node, ast.UnaryOp) and isinstance(node.op, (ast.UAdd, ast.USub)):
        return _is_literal_constant(node.operand)
    if isinstance(node, (ast.Tuple, ast.List, ast.Set)):
        return all(_is_literal_constant(elt) for elt in node.elts)
    if isinstance(node, ast.Dict):
        return all(
            (key is None or _is_literal_constant(key)) and _is_literal_constant(value)
            for key, value in zip(node.keys, node.values, strict=False)
        )
    return False


def _method_is_pass_only(node: ast.FunctionDef | ast.AsyncFunctionDef) -> bool:
    """True when the function body (docstring aside) is a lone `pass`."""
    body = _strip_docstring(list(node.body))
    return len(body) == 1 and isinstance(body[0], ast.Pass)


def _method_returns_fixed_constant(
    node: ast.FunctionDef | ast.AsyncFunctionDef,
) -> bool:
    """True when the function body is a single `return <literal>`."""
    body = _strip_docstring(list(node.body))
    return (
        len(body) == 1
        and isinstance(body[0], ast.Return)
        and _is_literal_constant(body[0].value)
    )


def _looks_like_experiment_class(node: ast.ClassDef) -> bool:
    """Heuristic: class name hints or core-method names mark experiment classes."""
    lowered = node.name.lower()
    if any(hint in lowered for hint in _EXPERIMENT_CLASS_NAME_HINTS):
        return True
    return any(
        isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef))
        and item.name in _CORE_EXPERIMENT_METHODS
        for item in node.body
    )


def _call_name(node: ast.AST) -> str:
    """Best-effort simple name of a call target (`f(...)` or `obj.f(...)`)."""
    if isinstance(node, ast.Name):
        return node.id
    if isinstance(node, ast.Attribute):
        return node.attr
    return ""


def _extract_condition_entries(tree: ast.AST) -> list[tuple[str, str | None]]:
    """Collect (label, class_name) pairs from list/tuple condition tables."""
    entries: list[tuple[str, str | None]] = []

    def _extract_label(node: ast.AST) -> str | None:
        if isinstance(node, ast.Constant) and isinstance(node.value, str):
            return node.value
        return None

    def _extract_class_name(node: ast.AST) -> str | None:
        if isinstance(node, ast.Name):
            return node.id
        if isinstance(node, ast.Call):
            return _extract_class_name(node.func)
        if isinstance(node, ast.Attribute):
            return node.attr
        return None

    for assign in ast.walk(tree):
        if not isinstance(assign, ast.Assign):
            continue
        if not isinstance(assign.value, (ast.List, ast.Tuple)):
            continue
        for elt in assign.value.elts:
            if not isinstance(elt, ast.Tuple) or len(elt.elts) < 2:
                continue
            label = _extract_label(elt.elts[0]) or ""
            class_name = _extract_class_name(elt.elts[1])
            if label or class_name:
                entries.append((label, class_name))
    return entries


def _looks_like_ablation_entry(label: str, class_name: str | None) -> bool:
    """True when a condition entry's label/class name reads like an ablation."""
    lowered = f"{label} {class_name or ''}".lower()
    return any(hint in lowered for hint in _ABLATION_NAME_HINTS)


def _function_has_distinctness_logic(
    node: ast.FunctionDef | ast.AsyncFunctionDef,
) -> bool:
    """True when the function contains comparison/assertion-style logic."""
    body = _strip_docstring(list(node.body))
    if not body:
        return False
    for sub in ast.walk(node):
        if isinstance(sub, ast.Assert):
            return True
        if isinstance(sub, ast.Compare):
            return True
        if isinstance(sub, ast.Call):
            call_name = _call_name(sub.func).lower()
            if call_name in {"allclose", "array_equal"}:
                return True
            if "assert" in call_name or "raise" in call_name:
                return True
    return False


def _find_placeholder_experiment_issues(files: dict[str, str]) -> list[str]:
    """Detect obviously placeholder experiment implementations.

    This is stricter than generic code-complexity warnings: it looks for
    generated experiments that openly advertise themselves as demonstrations,
    or condition classes whose core methods are pass-only / fixed-constant stubs.
    """
    issues: list[str] = []

    for fname, code in files.items():
        if not fname.endswith(".py"):
            continue
        lowered_code = code.lower()
        for pattern in _PLACEHOLDER_EXPERIMENT_PATTERNS:
            if pattern in lowered_code:
                issues.append(
                    f"[{fname}] Placeholder experiment text found ('{pattern}') — "
                    "generated experiment code must implement real logic, not "
                    "demonstration stubs."
                )
                break  # one textual report per file is enough

        try:
            tree = ast.parse(code)
        except SyntaxError:
            continue

        for node in ast.walk(tree):
            if not isinstance(node, ast.ClassDef) or not _looks_like_experiment_class(node):
                continue

            methods = [
                item
                for item in node.body
                if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef))
            ]
            if not methods:
                continue

            pass_only_init = any(
                method.name == "__init__" and _method_is_pass_only(method)
                for method in methods
            )
            constant_core_methods = [
                method.name
                for method in methods
                if method.name in _CORE_EXPERIMENT_METHODS
                and _method_returns_fixed_constant(method)
            ]
            trivial_core_methods = [
                method.name
                for method in methods
                if method.name in _CORE_EXPERIMENT_METHODS
                and (
                    _method_is_pass_only(method)
                    or _method_returns_fixed_constant(method)
                )
            ]

            if pass_only_init and constant_core_methods:
                issues.append(
                    f"[{fname}] Class '{node.name}' looks like a placeholder "
                    "experiment implementation: __init__ is pass-only and core "
                    "method(s) "
                    + ", ".join(sorted(constant_core_methods))
                    + " use fixed-constant core method returns. Ablation/condition "
                    "classes must exercise real differentiating logic."
                )
                continue

            # All core methods trivial => the whole class is a stub.
            if trivial_core_methods and len(trivial_core_methods) == len(
                [
                    method
                    for method in methods
                    if method.name in _CORE_EXPERIMENT_METHODS
                ]
            ):
                issues.append(
                    f"[{fname}] Class '{node.name}' is a demonstration stub: all "
                    "core experiment methods ("
                    + ", ".join(sorted(trivial_core_methods))
                    + ") are pass-only or fixed-constant. Generated ablations must "
                    "implement real computation."
                )

    return issues


def _find_condition_distinctness_issues(files: dict[str, str]) -> list[str]:
    """Detect missing or non-functional ablation distinctness self-checks."""
    issues: list[str] = []
    condition_entries: list[tuple[str, str | None]] = []
    distinctness_functions: dict[str, tuple[str, ast.FunctionDef | ast.AsyncFunctionDef]] = {}
    called_distinctness_functions: set[str] = set()

    for fname, code in files.items():
        if not fname.endswith(".py"):
            continue
        try:
            tree = ast.parse(code)
        except SyntaxError:
            continue

        condition_entries.extend(_extract_condition_entries(tree))

        for node in ast.walk(tree):
            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                lowered_name = node.name.lower()
                if any(hint in lowered_name for hint in _DISTINCTNESS_CHECK_NAME_HINTS):
                    distinctness_functions[node.name] = (fname, node)
            elif isinstance(node, ast.Call):
                call_name = _call_name(node.func)
                if any(hint in call_name.lower() for hint in _DISTINCTNESS_CHECK_NAME_HINTS):
                    called_distinctness_functions.add(call_name)

    if not condition_entries:
        return issues

    ablation_entries = [
        (label, class_name)
        for label, class_name in condition_entries
        if _looks_like_ablation_entry(label, class_name)
    ]
    # Only enforce the self-check for experiments with a real condition grid.
    if len(condition_entries) < 4 or len(ablation_entries) < 2:
        return issues

    if not distinctness_functions:
        issues.append(
            "No ablation/condition distinctness self-check found. Experiments with "
            "multiple ablation-like conditions must include a startup check that "
            "compares condition outputs on the same probe input and fails if they "
            "are identical."
        )
        return issues

    valid_function_names: set[str] = set()
    for func_name, (fname, func_node) in distinctness_functions.items():
        if _method_is_pass_only(func_node):
            issues.append(
                f"[{fname}] Distinctness self-check '{func_name}' is pass-only. It "
                "must actively compare condition outputs and fail loudly on "
                "identical behavior."
            )
            continue
        if _method_returns_fixed_constant(func_node):
            issues.append(
                f"[{fname}] Distinctness self-check '{func_name}' returns a fixed "
                "constant instead of validating ablation behavior."
            )
            continue
        if not _function_has_distinctness_logic(func_node):
            issues.append(
                f"[{fname}] Distinctness self-check '{func_name}' exists but does "
                "not contain comparison/assertion logic. It must compare outputs "
                "from multiple conditions on the same probe input."
            )
            continue
        valid_function_names.add(func_name)

    if valid_function_names and not any(
        called in valid_function_names for called in called_distinctness_functions
    ):
        issues.append(
            "Experiment defines a condition distinctness self-check but does not "
            "call it before running the main evaluation. Call the self-check at "
            "startup and fail fast on identical outputs."
        )

    return issues


def _is_critical_deep_warning(message: str) -> bool:
    """True when a deep-review warning matches a hard-fail keyword."""
    lowered = message.lower()
    return any(keyword in lowered for keyword in _CRITICAL_DEEP_KEYWORDS)


def _repair_self_contained_project(
    *,
    llm: LLMClient,
    prompt_manager: PromptManager,
    files: dict[str, str],
    issues: list[str],
    max_tokens: int,
    max_repair: int,
) -> tuple[dict[str, str], list[str]]:
    """Repair missing local helper-module files by asking for a full file set.

    Args:
        llm: Chat client used for the repair round-trips.
        prompt_manager: Source of the code-generation system prompt.
        files: Current project files (name -> contents); not mutated.
        issues: Missing-local-module issue strings from the last validation.
        max_tokens: Token budget per repair request.
        max_repair: Maximum number of repair attempts.

    Returns:
        Tuple of (possibly-updated files, remaining issues); the issue list is
        empty when the project became self-contained.
    """
    current_files = dict(files)
    current_issues = list(issues)

    for attempt in range(1, max_repair + 1):
        repair_prompt = (
            "SELF-CONTAINMENT REPAIR REQUIRED.\n\n"
            "The generated experiment project is not self-contained. Some files "
            "import local helper modules that were never returned.\n\n"
            "Missing local-module issues:\n"
            + "\n".join(f"- {issue}" for issue in current_issues)
            + "\n\nRULES:\n"
            "- If any file imports a local helper module such as models, utils, "
            "data_utils, metrics, or loaders, you MUST return that helper file "
            "too.\n"
            "- If you do not want a helper file, inline its code into an existing "
            "file and remove the import.\n"
            "- Preserve working files unless you are intentionally replacing them.\n"
            "- The final project must be runnable via `python main.py`.\n"
            "- Return ALL project files using ```filename:...``` blocks.\n\n"
            "Current files:\n"
            + "\n\n".join(
                f"```filename:{fname}\n{code}\n```"
                for fname, code in current_files.items()
            )
        )

        resp = _chat_with_prompt(
            llm,
            prompt_manager.system("code_generation"),
            repair_prompt,
            max_tokens=max_tokens,
        )
        repaired_files = _extract_multi_file_blocks(resp.content)
        if not repaired_files:
            logger.warning(
                "Stage 10: Self-containment repair attempt %d returned no files",
                attempt,
            )
            continue

        # Merge: repaired files overwrite, untouched files survive.
        merged = dict(current_files)
        merged.update(repaired_files)
        current_files = merged
        current_issues = _find_missing_local_module_imports(current_files)
        if not current_issues:
            logger.info(
                "Stage 10: Self-containment repair succeeded on attempt %d",
                attempt,
            )
            return current_files, []

    return current_files, current_issues
Some files " + "import local helper modules that were never returned.\n\n" + "Missing local-module issues:\n" + + "\n".join(f"- {issue}" for issue in current_issues) + + "\n\nRULES:\n" + "- If any file imports a local helper module such as models, utils, " + "data_utils, metrics, or loaders, you MUST return that helper file " + "too.\n" + "- If you do not want a helper file, inline its code into an existing " + "file and remove the import.\n" + "- Preserve working files unless you are intentionally replacing them.\n" + "- The final project must be runnable via `python main.py`.\n" + "- Return ALL project files using ```filename:...``` blocks.\n\n" + "Current files:\n" + + "\n\n".join( + f"```filename:{fname}\n{code}\n```" + for fname, code in current_files.items() + ) + ) + + resp = _chat_with_prompt( + llm, + prompt_manager.system("code_generation"), + repair_prompt, + max_tokens=max_tokens, + ) + repaired_files = _extract_multi_file_blocks(resp.content) + if not repaired_files: + logger.warning( + "Stage 10: Self-containment repair attempt %d returned no files", + attempt, + ) + continue + + merged = dict(current_files) + merged.update(repaired_files) + current_files = merged + current_issues = _find_missing_local_module_imports(current_files) + if not current_issues: + logger.info( + "Stage 10: Self-containment repair succeeded on attempt %d", + attempt, + ) + return current_files, [] + + return current_files, current_issues + + +def _build_real_data_guard_guidance(config: RCConfig) -> str: + exp_cfg = config.experiment + if not ( + getattr(exp_cfg, "require_real_data", False) + or getattr(exp_cfg, "forbid_synthetic_proxy", False) + or getattr(exp_cfg, "fail_on_stdout_parsed_results", False) + or getattr(exp_cfg, "required_real_data_refs", ()) + ): + return "" + + refs = tuple(getattr(exp_cfg, "required_real_data_refs", ()) or ()) + refs_block = "" + if refs: + refs_block = "Required local data references (use these, do not invent substitutes):\n" + refs_block 
+= "".join(f"- {ref}\n" for ref in refs) + + asset_paths_block = _build_resolved_local_asset_guidance() + + return ( + "\n\nREAL DATA ENFORCEMENT (HARD RULE):\n" + "- This project MUST use real local project assets/caches, not an internally " + "generated proxy benchmark.\n" + "- If the required local assets are unavailable, FAIL FAST with a clear " + "FileNotFoundError or RuntimeError. Do NOT silently degrade to a toy dataset.\n" + "- FORBIDDEN fallback patterns include: helper functions such as " + "`_build_example`, `_build_splits`, or `_sample_circle` that generate the " + "benchmark in code; hard-coded tiny train/val/test split dictionaries; " + "repository-local synthetic evidence repositories; or any results source that " + "exists only as stdout metric lines.\n" + "- main.py must write a structured `results.json`; stdout-only metrics are " + "insufficient for this run.\n" + "- The execution harness invokes `python main.py` directly. Do NOT require " + "dataset/asset CLI flags just to start the experiment. Asset path flags may " + "exist only as optional overrides; the default path resolution must come " + "from the VECTRA_* env vars or the authoritative absolute roots below.\n" + "- Emit machine-readable provenance where practical, including " + "`data_manifest.json` and `protocol_manifest.json`, so later stages can verify " + "which local assets were actually used.\n" + "- Resolve data from the authoritative absolute roots or env vars below. 
def _build_resolved_local_asset_guidance() -> str:
    """Expose authoritative local asset roots from the repo's experiment config.

    Reads the repository's dataset specs via ``_load_project_dataset_specs``
    and renders one ``- <dataset> <label>: <path> (env: <VAR>)`` bullet per
    resolved path, so prompt consumers know which absolute roots / env vars
    are canonical.  Returns "" when no specs load or no dataset-specific path
    resolved.
    """
    specs = _load_project_dataset_specs()
    if not specs:
        return ""

    def _path_text(value: Any) -> str:
        # Normalize to forward slashes and collapse internal whitespace so the
        # path renders as one clean token inside the prompt.
        return " ".join(str(value).replace("\\", "/").split()).strip()

    lines = ["Authoritative local asset roots for this repository:"]

    def _append_path(dataset_name: str, label: str, env_name: str, value: Any) -> None:
        # BUG FIX: a missing spec value arrived here as None and str(None)
        # rendered the literal text "None" into the prompt; reject it early.
        if value is None:
            return
        text = _path_text(value)
        if not text:
            return
        lines.append(f"- {dataset_name} {label}: {text} (env: {env_name})")

    repo_root = Path(__file__).resolve().parents[3]
    lines.append(f"- repo root: {repo_root.as_posix()} (env: VECTRA_REPO_ROOT)")

    simple_key = "engineering_primitives_simple_scenes_noslot_v1_local_20260326"
    simple_spec = specs.get(simple_key)
    if isinstance(simple_spec, dict):
        cache_roots = simple_spec.get("cache_roots")
        # The same dataset root is published under two env aliases on purpose.
        _append_path(simple_key, "dataset_root", "VECTRA_SIMPLE_DATASET_ROOT", simple_spec.get("dataset_root"))
        _append_path(simple_key, "dataset_root", "VECTRA_SIMPLE_ASSET_ROOT", simple_spec.get("dataset_root"))
        _append_path(simple_key, "manifest_path", "VECTRA_SIMPLE_MANIFEST_PATH", simple_spec.get("manifest_path"))
        if isinstance(cache_roots, dict):
            _append_path(simple_key, "learned_cache", "VECTRA_SIMPLE_HEATMAP_DIR", cache_roots.get("learned"))

    page_key = "page_minus_titleblock"
    page_spec = specs.get(page_key)
    if isinstance(page_spec, dict):
        dataset_root = page_spec.get("dataset_root")
        split_manifest = page_spec.get("split_manifest_path")
        _append_path(page_key, "dataset_root", "VECTRA_PAGE_DATASET_ROOT", dataset_root)
        if dataset_root:
            dataset_root_path = Path(str(dataset_root))
            _append_path(page_key, "image_dir", "VECTRA_PAGE_IMAGE_DIR", dataset_root_path / "train2017")
            _append_path(page_key, "sidecar_dir", "VECTRA_PAGE_SIDECAR_DIR", dataset_root_path / "sidecars" / "train2017")
        _append_path(page_key, "split_manifest", "VECTRA_PAGE_SPLIT_JSON", split_manifest)
        if split_manifest:
            split_manifest_path = Path(str(split_manifest))
            # The split manifest lives two levels under the OneDrive PNG root.
            one_drive_png_root = split_manifest_path.parent.parent
            _append_path(page_key, "png_root", "VECTRA_ONE_DRIVE_PNG_ROOT", one_drive_png_root)
            _append_path(page_key, "gt_solid_csv", "VECTRA_PAGE_GT_SOLID_CSV", split_manifest_path.parent / "gt" / "train2017_solid.csv")
            _append_path(page_key, "gt_dashed_csv", "VECTRA_PAGE_GT_DASHED_CSV", split_manifest_path.parent / "gt" / "train2017_dashed.csv")

    probe_key = "DeepPatent2_negative_clutter_probe"
    probe_spec = specs.get(probe_key)
    if isinstance(probe_spec, dict):
        _append_path(probe_key, "dataset_root", "VECTRA_DEEPPATENT_DATASET_ROOT", probe_spec.get("dataset_root"))

    # BUG FIX: the original guard compared against 1, but the repo-root bullet
    # is appended unconditionally, so "nothing resolved" was unreachable.
    # Header + repo-root alone (2 lines) means no dataset path resolved.
    if len(lines) <= 2:
        return ""
    lines.extend(
        [
            "- Loader rule: first read the env vars above if they are set, otherwise fall back to the exact absolute paths above.",
            "- If a required asset path does not exist, raise FileNotFoundError naming the env var/path that was missing.",
        ]
    )
    return "\n" + "\n".join(lines) + "\n"
"build_dataset_specs", None) + if not callable(build_dataset_specs): + return {} + dataset_specs = build_dataset_specs() + if not isinstance(dataset_specs, dict): + return {} + return dataset_specs + except Exception: # noqa: BLE001 + logger.debug("Resolved local asset guidance unavailable", exc_info=True) + return {} + + +def _extract_named_plan_items(value: Any, *, limit: int = 8) -> list[str]: + items: list[str] = [] + if value is None: + return items + if isinstance(value, dict): + if "name" in value: + candidate = " ".join(str(value.get("name", "")).split()).strip() + if candidate: + items.append(candidate) + else: + for key in value: + candidate = " ".join(str(key).split()).strip() + if candidate: + items.append(candidate) + elif isinstance(value, (list, tuple, set)): + for item in value: + if isinstance(item, dict): + candidate = " ".join(str(item.get("name", "")).split()).strip() + else: + candidate = " ".join(str(item).split()).strip() + if candidate: + items.append(candidate) + else: + candidate = " ".join(str(value).split()).strip() + if candidate: + items.append(candidate) + + deduped: list[str] = [] + seen: set[str] = set() + for item in items: + key = item.lower() + if key in seen: + continue + seen.add(key) + deduped.append(item) + if len(deduped) >= limit: + break + return deduped + + +def _build_codegen_plan_summary(exp_plan_text: str, config: RCConfig) -> str: + """Compress the experiment plan into the subset needed for Stage 10.""" + if not exp_plan_text.strip(): + return "" + + try: + plan_data = yaml.safe_load(exp_plan_text) + except yaml.YAMLError: + plan_data = None + if not isinstance(plan_data, dict): + excerpt = exp_plan_text[:2200].rstrip() + suffix = "\n...\n" if len(exp_plan_text) > 2200 else "\n" + return "PLAN EXCERPT:\n" + excerpt + suffix + + lines = ["## Experiment Plan Summary"] + + plan_topic = " ".join(str(plan_data.get("topic", "")).split()).strip() + if plan_topic: + if len(plan_topic) > 320: + plan_topic = 
plan_topic[:320].rstrip() + "..." + lines.append(f"- Topic anchor: {plan_topic}") + + datasets = _extract_named_plan_items(plan_data.get("datasets"), limit=6) + if datasets: + lines.append("- Datasets: " + ", ".join(datasets)) + + baselines = _extract_named_plan_items(plan_data.get("baselines"), limit=8) + if baselines: + lines.append("- Baselines: " + ", ".join(baselines)) + + methods = _extract_named_plan_items(plan_data.get("proposed_methods"), limit=8) + if methods: + lines.append("- Proposed methods: " + ", ".join(methods)) + + ablations = _extract_named_plan_items(plan_data.get("ablations"), limit=8) + if ablations: + lines.append("- Ablations: " + ", ".join(ablations)) + + metrics = plan_data.get("metrics") + if isinstance(metrics, dict): + primary = metrics.get("primary_metric") + if isinstance(primary, dict): + primary_name = " ".join(str(primary.get("name", "")).split()).strip() + direction = " ".join( + str(primary.get("direction", config.experiment.metric_direction)).split() + ).strip() + if primary_name: + lines.append( + f"- Primary metric: {primary_name} ({direction or config.experiment.metric_direction})" + ) + secondary = _extract_named_plan_items(metrics.get("secondary_metrics"), limit=8) + if secondary: + lines.append("- Secondary metrics: " + ", ".join(secondary)) + + compute_budget = plan_data.get("compute_budget") + if isinstance(compute_budget, dict): + total_seconds = compute_budget.get("total_time_budget_seconds") + seeded_conditions = compute_budget.get("seeded_condition_count") + budget_bits: list[str] = [] + if total_seconds is not None: + budget_bits.append(f"total_time_budget_seconds={total_seconds}") + if seeded_conditions is not None: + budget_bits.append(f"seeded_condition_count={seeded_conditions}") + if budget_bits: + lines.append("- Compute budget: " + ", ".join(budget_bits)) + + refs = tuple(getattr(config.experiment, "required_real_data_refs", ()) or ()) + if refs: + lines.append("- Required local asset refs:") + 
lines.extend(f" - {ref}" for ref in refs[:8]) + + return "\n".join(lines).strip() + + +def _is_acp_transport_failure(exc: Exception) -> bool: + """Return True when a Stage-10 failure came from the ACP transport layer.""" + parts = [str(exc)] + cause = getattr(exc, "__cause__", None) + context = getattr(exc, "__context__", None) + if cause: + parts.append(str(cause)) + if context: + parts.append(str(context)) + text = " ".join(part.strip() for part in parts if part).lower() + if not text: + return False + indicators = ( + "acp prompt failed", + "acp prompt timed out after", + "queue owner disconnected before prompt completion", + "agent needs reconnect", + ) + return any(indicator in text for indicator in indicators) + + def _execute_code_generation( stage_dir: Path, run_dir: Path, @@ -74,6 +906,7 @@ def _execute_code_generation( prompts: PromptManager | None = None, ) -> StageResult: exp_plan = _read_prior_artifact(run_dir, "exp_plan.yaml") or "" + exp_plan_prompt = _build_codegen_plan_summary(exp_plan, config) metric = config.experiment.metric_key max_repair = 5 # BUG-14: Increased from 3 to give more chances for critical bugs files: dict[str, str] = {} @@ -208,11 +1041,29 @@ def _execute_code_generation( ) _bp_block = _bp.to_prompt_block() if _bp_block: - extra_guidance += ( - "\n\n## BenchmarkAgent Selections (USE THESE)\n" + _has_existing_plan_assets = any( + item.get("origin") == "existing_plan" + for item in (_bp.selected_benchmarks + _bp.selected_baselines) + if isinstance(item, dict) + ) + _bp_heading = "## BenchmarkAgent Selections (USE THESE)" + _bp_instruction = ( "The following datasets, baselines, and code snippets were " "automatically selected and validated by the BenchmarkAgent. 
" "You MUST use these selections in your experiment code.\n\n" + ) + if _has_existing_plan_assets: + _bp_heading = "## BenchmarkAgent Selections (PRESERVE IN-PROJECT ASSETS)" + _bp_instruction = ( + "The following datasets and baselines include in-project " + "assets carried over from the existing experiment plan plus " + "BenchmarkAgent supplements. You MUST preserve the in-project " + "datasets/baselines and may use the extra BenchmarkAgent " + "selections only as supplemental additions.\n\n" + ) + extra_guidance += ( + f"\n\n{_bp_heading}\n" + + _bp_instruction + _bp_block ) logger.info( @@ -317,7 +1168,18 @@ def _execute_code_generation( "- Prefer lightweight CPU-friendly libraries (numpy, scipy, " "sklearn, pandas) unless deep learning is inherent to the topic.\n" "- The experiment MUST be self-contained and runnable without GPU.\n" + "- The returned experiment project must be self-contained at the file " + "level. If `main.py` or any other file imports a local helper module " + "(for example `models`, `utils`, `data_utils`, `metrics`, `loaders`), " + "you MUST return that helper file too.\n" + "- Never reference a local Python module that is absent from the " + "returned file set. 
If in doubt, inline the helper code into an " + "existing returned file instead of importing a missing module.\n" ) + repair_brief = _build_research_repair_brief(run_dir) + if repair_brief: + extra_guidance += "\n\n" + repair_brief + extra_guidance += _build_real_data_guard_guidance(config) # --- Code generation: Beast Mode → CodeAgent → Legacy single-shot --- _code_agent_active = False @@ -401,7 +1263,7 @@ def _execute_code_generation( _oc_result: OpenCodeResult = _bridge.generate( stage_dir=stage_dir, topic=config.research.topic, - exp_plan=exp_plan, + exp_plan=exp_plan_prompt, metric=metric, pkg_hint=pkg_hint + "\n" + compute_budget, extra_guidance=extra_guidance, @@ -503,52 +1365,75 @@ def _execute_code_generation( except Exception: # noqa: BLE001 logger.debug("Domain detection unavailable", exc_info=True) - _agent = _CodeAgent( - llm=llm, - prompts=_pm, - config=_ca_cfg, - stage_dir=stage_dir, - sandbox_factory=_sandbox_factory, - experiment_config=config.experiment, - domain_profile=_domain_profile, - code_search_result=_code_search_result, - ) - _agent_result = _agent.generate( - topic=config.research.topic, - exp_plan=exp_plan, - metric=metric, - pkg_hint=pkg_hint + "\n" + compute_budget + "\n" + extra_guidance, - max_tokens=_code_max_tokens, - ) - files = _agent_result.files - _code_agent_active = True - - # Write agent artifacts - (stage_dir / "code_agent_log.json").write_text( - json.dumps( - { - "log": _agent_result.validation_log, - "llm_calls": _agent_result.total_llm_calls, - "sandbox_runs": _agent_result.total_sandbox_runs, - "best_score": _agent_result.best_score, - "tree_nodes_explored": _agent_result.tree_nodes_explored, - "review_rounds": _agent_result.review_rounds, - }, - indent=2, - ), - encoding="utf-8", - ) - if _agent_result.architecture_spec: - (stage_dir / "architecture_spec.yaml").write_text( - _agent_result.architecture_spec, encoding="utf-8", + try: + _agent = _CodeAgent( + llm=llm, + prompts=_pm, + config=_ca_cfg, + 
stage_dir=stage_dir, + sandbox_factory=_sandbox_factory, + experiment_config=config.experiment, + domain_profile=_domain_profile, + code_search_result=_code_search_result, ) - logger.info( - "CodeAgent: %d LLM calls, %d sandbox runs, score=%.2f", - _agent_result.total_llm_calls, - _agent_result.total_sandbox_runs, - _agent_result.best_score, - ) - elif not _beast_mode_used and llm is not None: + _agent_result = _agent.generate( + topic=config.research.topic, + exp_plan=exp_plan_prompt, + metric=metric, + pkg_hint=pkg_hint + "\n" + compute_budget + "\n" + extra_guidance, + max_tokens=_code_max_tokens, + ) + files = _agent_result.files + _code_agent_active = True + + # Write agent artifacts + (stage_dir / "code_agent_log.json").write_text( + json.dumps( + { + "log": _agent_result.validation_log, + "llm_calls": _agent_result.total_llm_calls, + "sandbox_runs": _agent_result.total_sandbox_runs, + "best_score": _agent_result.best_score, + "tree_nodes_explored": _agent_result.tree_nodes_explored, + "review_rounds": _agent_result.review_rounds, + }, + indent=2, + ), + encoding="utf-8", + ) + if _agent_result.architecture_spec: + (stage_dir / "architecture_spec.yaml").write_text( + _agent_result.architecture_spec, encoding="utf-8", + ) + logger.info( + "CodeAgent: %d LLM calls, %d sandbox runs, score=%.2f", + _agent_result.total_llm_calls, + _agent_result.total_sandbox_runs, + _agent_result.best_score, + ) + except Exception as exc: + fallback_enabled = bool( + getattr(_ca_cfg, "fallback_to_legacy_on_acp_failure", False) + ) + if fallback_enabled and _is_acp_transport_failure(exc): + fallback_payload = { + "fallback_triggered": True, + "reason": "code_agent_acp_transport_failure", + "error": str(exc), + "triggered_at": _utcnow_iso(), + } + (stage_dir / "code_agent_fallback.json").write_text( + json.dumps(fallback_payload, indent=2), + encoding="utf-8", + ) + logger.warning( + "CodeAgent ACP transport failure detected; falling back to legacy single-shot generation: %s", + 
exc, + ) + else: + raise + + if not _beast_mode_used and llm is not None and not _code_agent_active: # ── Legacy single-shot generation ───────────────────────────────── topic = config.research.topic _md = config.experiment.metric_direction @@ -563,7 +1448,7 @@ def _execute_code_generation( topic=topic, metric=metric, pkg_hint=pkg_hint + "\n" + compute_budget + "\n" + extra_guidance, - exp_plan=exp_plan, + exp_plan=exp_plan_prompt, metric_direction_hint=_md_hint, ) # R13-3: Use higher max_tokens for reasoning models (they consume tokens @@ -817,20 +1702,16 @@ def _execute_code_generation( # --- P1.1+P1.2: Deep quality analysis (class quality, scoping, API) --- deep_warnings = deep_validate_files(files) + placeholder_impl_issues = _find_placeholder_experiment_issues(files) + distinctness_issues = _find_condition_distinctness_issues(files) + deep_warnings.extend(placeholder_impl_issues) + deep_warnings.extend(distinctness_issues) for w in deep_warnings: logger.warning("Stage 10 deep quality: %s", w) complexity_warnings.extend(deep_warnings) # --- P1.2: If critical deep issues found, attempt one repair cycle --- - critical_deep = [w for w in deep_warnings if any( - kw in w for kw in ("UnboundLocalError", "unregistered", "does not exist", - "empty or trivial subclass", "does NOT override", - "Import-usage mismatch", "NameError", - "was removed", "ptp()", - "copy-paste", "identical method signatures", - "identical AST", "NOT a real ablation", - "shadows stdlib/pip") - )] + critical_deep = [w for w in deep_warnings if _is_critical_deep_warning(w)] if critical_deep and llm is not None: logger.info( "Stage 10: %d critical code issues found — triggering repair cycle", @@ -850,6 +1731,14 @@ def _execute_code_generation( f"- Use scipy.special.erf, not np.erf\n" f"- Ablation/variant classes must have genuinely different logic\n" f"- Every class must have a real implementation, not just `pass`\n" + f"- Do NOT ship dummy/placeholder/demo experiment code or comments " + f"saying 
'replace with actual implementation'\n" + f"- Core experiment methods such as evaluate/predict/forward must " + f"NOT return fixed constants like 0.2 or 0.5 as a stand-in for " + f"real computation\n" + f"- Multi-condition experiments MUST include and CALL a startup " + f"ablation distinctness self-check that compares outputs on the " + f"same probe input and raises/asserts if conditions are identical\n" f"- Ablation classes MUST override the parent method that implements " f"the component being ablated (e.g., if ablating attention, override " f"the attention method with a simpler alternative like mean pooling)\n" @@ -883,17 +1772,11 @@ def _execute_code_generation( (exp_dir / fname).write_text(code, encoding="utf-8") # Re-check after repair deep_warnings_after = deep_validate_files(files) + deep_warnings_after.extend(_find_placeholder_experiment_issues(files)) + deep_warnings_after.extend(_find_condition_distinctness_issues(files)) fixed = len(critical_deep) - len([ w for w in deep_warnings_after - if any(kw in w for kw in ( - "UnboundLocalError", "unregistered", "does not exist", - "empty or trivial subclass", "does NOT override", - "Import-usage mismatch", "NameError", - "was removed", "ptp()", - "copy-paste", "identical method signatures", - "identical AST", "NOT a real ablation", - "shadows stdlib/pip", - )) + if _is_critical_deep_warning(w) ]) logger.info( "Stage 10: Deep repair fixed %d/%d critical issues", @@ -913,6 +1796,46 @@ def _execute_code_generation( json.dumps(health, indent=2), encoding="utf-8" ) + # --- Hard gate: reject placeholder/dummy experiment implementations --- + unresolved_placeholder_issues = _find_placeholder_experiment_issues(files) + if unresolved_placeholder_issues: + for issue in unresolved_placeholder_issues: + logger.warning("Stage 10 placeholder gate: %s", issue) + validation_log.append(f"PLACEHOLDER_IMPL: {issue}") + (stage_dir / "validation_report.md").write_text( + "# Code Validation Report\n\n" + "**Status**: BLOCKED — 
generated experiment code still contains " + "placeholder or demonstration-only implementations\n\n" + + "\n".join(f"- {issue}" for issue in unresolved_placeholder_issues), + encoding="utf-8", + ) + return StageResult( + stage=Stage.CODE_GENERATION, + status=StageStatus.FAILED, + artifacts=("validation_report.md",), + evidence_refs=(), + ) + + # --- Hard gate: require active condition-distinctness self-checks --- + unresolved_distinctness_issues = _find_condition_distinctness_issues(files) + if unresolved_distinctness_issues: + for issue in unresolved_distinctness_issues: + logger.warning("Stage 10 distinctness gate: %s", issue) + validation_log.append(f"DISTINCTNESS_IMPL: {issue}") + (stage_dir / "validation_report.md").write_text( + "# Code Validation Report\n\n" + "**Status**: BLOCKED — generated experiment does not prove condition " + "wiring is distinct\n\n" + + "\n".join(f"- {issue}" for issue in unresolved_distinctness_issues), + encoding="utf-8", + ) + return StageResult( + stage=Stage.CODE_GENERATION, + status=StageStatus.FAILED, + artifacts=("validation_report.md",), + evidence_refs=(), + ) + # --- P1.4: LLM Code Review (Stage 10.5) --- # Skip when CodeAgent is active — Phase 4 review already covers this. 
if llm is not None and not _code_agent_active: @@ -925,7 +1848,7 @@ def _execute_code_generation( f"You are a senior researcher reviewing experiment code for a " f"research submission.\n\n" f"TOPIC: {config.research.topic}\n" - f"EXPERIMENT PLAN:\n{exp_plan[:3000]}\n\n" + f"EXPERIMENT PLAN:\n{exp_plan_prompt[:3000]}\n\n" f"CODE:\n```python\n{all_code_review}\n```\n\n" f"Review the code and return JSON with this EXACT structure:\n" f'{{"score": <1-10>, "issues": [' @@ -1158,9 +2081,12 @@ def _execute_code_generation( f"when the topic describes a tabular, bandit, or game-theoretic method.\n" f"- Use ONLY lightweight CPU-friendly libraries (numpy, scipy, " f"sklearn) unless the topic EXPLICITLY requires deep learning.\n" - f"- The experiment must be self-contained and runnable without GPU.\n\n" + f"- The experiment must be self-contained and runnable without GPU.\n" + f"- If any file imports a local helper module, return that helper " + f"file too. Do not leave unresolved imports like `from models import ...` " + f"without a generated `models.py`.\n\n" f"{pkg_hint}\n{compute_budget}\n" - f"PLAN:\n{exp_plan}\n\n" + f"PLAN:\n{exp_plan_prompt}\n\n" f"Return multiple files using ```filename:xxx.py format." 
) regen_resp = _chat_with_prompt( @@ -1302,7 +2228,64 @@ def _execute_code_generation( except Exception as exc: logger.debug("Ablation validation skipped: %s", exc) + # --- Self-contained project gate --- + unresolved_local_imports = _find_missing_local_module_imports(files) + if unresolved_local_imports: + for issue in unresolved_local_imports: + logger.warning("Stage 10 self-containment: %s", issue) + validation_log.append(f"SELF_CONTAINED: {issue}") + if llm is not None: + files, unresolved_local_imports = _repair_self_contained_project( + llm=llm, + prompt_manager=_pm, + files=files, + issues=unresolved_local_imports, + max_tokens=_code_max_tokens, + max_repair=max_repair, + ) + for fname, code in files.items(): + (exp_dir / fname).write_text(code, encoding="utf-8") + if unresolved_local_imports: + (stage_dir / "validation_report.md").write_text( + "# Code Validation Report\n\n" + "**Status**: BLOCKED — generated experiment project is not self-contained\n\n" + + "\n".join(f"- {issue}" for issue in unresolved_local_imports), + encoding="utf-8", + ) + return StageResult( + stage=Stage.CODE_GENERATION, + status=StageStatus.FAILED, + artifacts=("validation_report.md",), + evidence_refs=(), + ) + # --- Write spec --- + if getattr(config.experiment, "forbid_synthetic_proxy", False): + _proxy_signals = detect_synthetic_proxy_signals( + {fname: code for fname, code in files.items() if fname.endswith(".py")} + ) + if should_fail_synthetic_proxy_guard(_proxy_signals): + guard_payload = { + "status": "failed", + "reason": "synthetic_proxy_detected", + "signals": _proxy_signals, + "timestamp": _utcnow_iso(), + } + (stage_dir / "real_data_guard.json").write_text( + json.dumps(guard_payload, indent=2), encoding="utf-8" + ) + logger.error( + "Stage 10: Real-data guard blocked generated experiment code: %s", + "; ".join(_proxy_signals), + ) + return StageResult( + stage=Stage.CODE_GENERATION, + status=StageStatus.FAILED, + artifacts=("experiment/", "real_data_guard.json"), + 
evidence_refs=("stage-10/experiment/", "stage-10/real_data_guard.json"), + error="Real-data guard blocked synthetic/proxy fallback code generation.", + ) + file_list = ", ".join(f"`{f}`" for f in sorted(files.keys())) main_validation = validate_code(files.get("main.py", "")) _align_status = "ALIGNED" if alignment_ok else f"MISALIGNED: {alignment_note}" @@ -1361,4 +2344,3 @@ def _execute_code_generation( artifacts=tuple(artifacts), evidence_refs=tuple(f"stage-10/{a}" for a in artifacts), ) - diff --git a/researchclaw/pipeline/stage_impls/_execution.py b/researchclaw/pipeline/stage_impls/_execution.py index 8858cc2d..b9a257cc 100644 --- a/researchclaw/pipeline/stage_impls/_execution.py +++ b/researchclaw/pipeline/stage_impls/_execution.py @@ -5,11 +5,14 @@ import json import logging import math +import os import re import time as _time from pathlib import Path from typing import Any +import yaml + from researchclaw.adapters import AdapterBundle from researchclaw.config import RCConfig from researchclaw.experiment.validator import ( @@ -21,6 +24,7 @@ from researchclaw.pipeline._domain import _detect_domain from researchclaw.pipeline._helpers import ( StageResult, + _build_research_repair_brief, _chat_with_prompt, _detect_runtime_issues, _ensure_sandbox_deps, @@ -32,6 +36,8 @@ _read_prior_artifact, _safe_filename, _safe_json_loads, + detect_synthetic_proxy_signals, + should_fail_synthetic_proxy_guard, _utcnow_iso, _write_stage_meta, ) @@ -41,6 +47,238 @@ logger = logging.getLogger(__name__) +_KNOWN_REAL_ASSET_ARG_MAP: tuple[tuple[str, str], ...] 
= ( + ("--simple_manifest", "VECTRA_SIMPLE_MANIFEST_PATH"), + ("--simple_heatmap_cache", "VECTRA_SIMPLE_HEATMAP_DIR"), + ("--simple_asset_root", "VECTRA_SIMPLE_ASSET_ROOT"), + ("--page_dataset_root", "VECTRA_PAGE_DATASET_ROOT"), + ("--page_image_dir", "VECTRA_PAGE_IMAGE_DIR"), + ("--page_sidecar_dir", "VECTRA_PAGE_SIDECAR_DIR"), + ("--page_split_json", "VECTRA_PAGE_SPLIT_JSON"), + ("--gt_solid_csv", "VECTRA_PAGE_GT_SOLID_CSV"), + ("--gt_dashed_csv", "VECTRA_PAGE_GT_DASHED_CSV"), + ("--deep_patent_root", "VECTRA_DEEPPATENT_DATASET_ROOT"), +) + + +def _collect_vectra_env_overrides() -> dict[str, str]: + """Return non-empty VECTRA_* variables from the current runtime.""" + overrides: dict[str, str] = {} + for name, value in os.environ.items(): + if not name.startswith("VECTRA_"): + continue + cleaned = str(value).strip() + if cleaned: + overrides[name] = cleaned + return overrides + + +def _build_project_entrypoint_runtime_overrides( + project_dir: Path, + *, + entry_point: str = "main.py", +) -> tuple[list[str], dict[str, str]]: + """Derive optional CLI args/env for generated projects that require local assets. + + The harness still calls ``python main.py`` by default, but some generated + experiments insist on asset-path flags. When those flags are present, use the + already-resolved VECTRA_* runtime variables as optional overrides. 
+ """ + env_overrides = _collect_vectra_env_overrides() + entry_path = project_dir / entry_point + if not entry_path.exists(): + return [], env_overrides + + try: + source = entry_path.read_text(encoding="utf-8") + except (OSError, UnicodeDecodeError): + return [], env_overrides + + args: list[str] = [] + missing: list[str] = [] + for flag, env_name in _KNOWN_REAL_ASSET_ARG_MAP: + if flag not in source: + continue + value = env_overrides.get(env_name, "").strip() + if value: + args.extend([flag, value]) + else: + missing.append(f"{flag} <= {env_name}") + + if args: + logger.info( + "Execution harness injecting %d real-data CLI args for %s", + len(args) // 2, + entry_path, + ) + if missing: + logger.warning( + "Execution harness detected asset CLI flags in %s but runtime vars were missing: %s", + entry_path, + ", ".join(missing), + ) + return args, env_overrides + + +def _shorten_prompt_text(text: str, *, max_chars: int) -> str: + """Collapse whitespace and trim *text* without cutting mid-word when possible.""" + compact = re.sub(r"\s+", " ", text).strip() + if len(compact) <= max_chars: + return compact + head = compact[: max_chars - 3].rsplit(" ", 1)[0].strip() + return f"{head or compact[: max_chars - 3]}..." 
+ + +def _compact_research_topic(topic: str) -> str: + """Return a short topic line for Stage 13 without discarding the plan.""" + if not topic.strip(): + return "" + lines = [line.strip() for line in topic.splitlines()] + summary_lines: list[str] = [] + stop_headers = ( + "important constraints:", + "please produce:", + "existing assets", + "existing baselines:", + "main source doc:", + "core code path:", + "key functions", + "existing result artifacts:", + "existing conclusions to preserve:", + ) + for line in lines: + if not line: + if summary_lines: + break + continue + lowered = line.lower() + if lowered.startswith("- ") or re.match(r"^\d+\.", line): + break + if any(lowered.startswith(header) for header in stop_headers): + break + summary_lines.append(line) + summary = " ".join(summary_lines) if summary_lines else topic + return _shorten_prompt_text(summary, max_chars=260) + + +def _extract_topic_constraints(topic: str, *, max_items: int = 8) -> list[str]: + """Pull compact bullet constraints out of long topic briefs.""" + lines = [line.strip() for line in topic.splitlines()] + bullets: list[str] = [] + capture = False + wanted_headers = { + "important constraints", + "existing conclusions to preserve", + } + for line in lines: + lowered = line.lower().rstrip(":") + if lowered in wanted_headers: + capture = True + continue + if not capture: + continue + if not line: + continue + if line.startswith("- "): + bullets.append(line[2:].strip()) + if len(bullets) >= max_items: + break + continue + if re.match(r"^\d+\.", line) or line.endswith(":"): + capture = False + return bullets + + +def _named_plan_entries(entries: Any, *, max_items: int = 8) -> list[str]: + """Extract entry names from list-shaped plan sections.""" + names: list[str] = [] + if not isinstance(entries, list): + return names + for entry in entries: + if not isinstance(entry, dict): + continue + name = entry.get("name") + if isinstance(name, str) and name.strip(): + names.append(name.strip()) + if 
len(names) >= max_items: + break + return names + + +def _build_refine_prompt_context( + topic: str, + exp_plan_text: str, + repair_brief: str = "", +) -> tuple[str, str]: + """Split long research briefs into a short topic and a structured anchor.""" + compact_topic = _compact_research_topic(topic) + plan_summary_lines: list[str] = [] + try: + plan_data = yaml.safe_load(exp_plan_text) if exp_plan_text.strip() else {} + except yaml.YAMLError: + plan_data = {} + if isinstance(plan_data, dict): + objectives = plan_data.get("objectives") + if isinstance(objectives, dict): + for label, key in ( + ("Problem formulation", "problem_formulation"), + ("Novelty statement", "novelty_statement"), + ("Recommended first prototype", "recommended_first_prototype"), + ): + value = objectives.get(key) + if isinstance(value, str) and value.strip(): + plan_summary_lines.append( + f"- {label}: {_shorten_prompt_text(value, max_chars=220)}" + ) + research_questions = objectives.get("research_questions") + if isinstance(research_questions, list) and research_questions: + rq_text = "; ".join( + str(item).strip() for item in research_questions[:4] if str(item).strip() + ) + if rq_text: + plan_summary_lines.append( + f"- Research questions: {_shorten_prompt_text(rq_text, max_chars=220)}" + ) + metrics = plan_data.get("metrics") + if isinstance(metrics, dict): + primary_metric = metrics.get("primary_metric") + if isinstance(primary_metric, dict): + metric_name = primary_metric.get("name") + if isinstance(metric_name, str) and metric_name.strip(): + plan_summary_lines.append(f"- Primary metric: {metric_name.strip()}") + for label, key in ( + ("Baselines", "baselines"), + ("Proposed methods", "proposed_methods"), + ("Ablations", "ablations"), + ): + names = _named_plan_entries(plan_data.get(key)) + if names: + plan_summary_lines.append(f"- {label}: {', '.join(names)}") + + anchor_parts: list[str] = [] + if repair_brief.strip(): + anchor_parts.append(repair_brief.strip()) + if plan_summary_lines: 
+ anchor_parts.append( + "Structured experiment plan summary:\n" + "\n".join(plan_summary_lines) + ) + topic_constraints = _extract_topic_constraints(topic) + if topic_constraints: + anchor_parts.append( + "Key research constraints to preserve:\n" + + "\n".join(f"- {item}" for item in topic_constraints) + ) + if exp_plan_text.strip(): + excerpt_limit = 900 if repair_brief.strip() else 1600 + excerpt = exp_plan_text[:excerpt_limit].rstrip() + suffix = "\n...\n" if len(exp_plan_text) > excerpt_limit else "\n" + anchor_parts.append( + "Original experiment plan excerpt:\n" + f"```yaml\n{excerpt}{suffix}```\n" + ) + return compact_topic or _shorten_prompt_text(topic, max_chars=260), "\n\n".join(anchor_parts) + + def _execute_resource_planning( stage_dir: Path, run_dir: Path, @@ -131,6 +369,8 @@ def _execute_experiment_run( runs_dir.mkdir(parents=True, exist_ok=True) mode = config.experiment.mode if mode in ("sandbox", "docker"): + stage_status = StageStatus.DONE + stage_error: str | None = None # P7: Auto-install missing dependencies before subprocess sandbox if mode == "sandbox": _all_code = code_text @@ -145,8 +385,14 @@ def _execute_experiment_run( sandbox = create_sandbox(config.experiment, runs_dir / "sandbox") # Use run_project for multi-file, run for single-file if exp_dir_path and Path(exp_dir_path).is_dir(): + entry_args, env_overrides = _build_project_entrypoint_runtime_overrides( + Path(exp_dir_path) + ) result = sandbox.run_project( - Path(exp_dir_path), timeout_sec=config.experiment.time_budget_sec + Path(exp_dir_path), + timeout_sec=config.experiment.time_budget_sec, + args=entry_args, + env_overrides=env_overrides, ) else: result = sandbox.run( @@ -220,6 +466,51 @@ def _execute_experiment_run( } if structured_results is not None: run_payload["structured_results"] = structured_results + + guard_issues: list[str] = [] + _structured_source = ( + structured_results.get("source") + if isinstance(structured_results, dict) + else None + ) + if 
getattr(config.experiment, "fail_on_stdout_parsed_results", False): + if structured_results is None and effective_metrics: + guard_issues.append( + "structured results.json was missing; metrics were only recoverable via stdout parsing" + ) + elif _structured_source == "stdout_parsed": + guard_issues.append( + "results.json declares source=stdout_parsed instead of a structured experiment output" + ) + + if getattr(config.experiment, "forbid_synthetic_proxy", False) and sandbox_project.exists(): + _project_texts: dict[str, str] = {} + for _pyf in sandbox_project.glob("*.py"): + try: + _project_texts[_pyf.name] = _pyf.read_text(encoding="utf-8") + except (OSError, UnicodeDecodeError): + continue + _proxy_signals = detect_synthetic_proxy_signals(_project_texts) + if should_fail_synthetic_proxy_guard(_proxy_signals): + guard_issues.extend(_proxy_signals) + + if guard_issues: + guard_payload = { + "status": "failed", + "issues": guard_issues, + "structured_results_present": structured_results is not None, + "structured_results_source": _structured_source, + "timestamp": _utcnow_iso(), + } + (stage_dir / "real_data_guard.json").write_text( + json.dumps(guard_payload, indent=2), encoding="utf-8" + ) + run_status = "failed" + run_payload["status"] = run_status + run_payload["real_data_guard"] = guard_payload + stage_status = StageStatus.FAILED + stage_error = "Real-data guard blocked proxy or stdout-only experiment results." 
+ # Auto-generate results.json from parsed metrics if sandbox didn't produce one if structured_results is None and effective_metrics: auto_results = {"source": "stdout_parsed", "metrics": effective_metrics} @@ -325,6 +616,18 @@ def _execute_experiment_run( (runs_dir / f"{_safe_filename(run_id)}.json").write_text( json.dumps(payload, indent=2), encoding="utf-8" ) + artifacts = ["runs/"] + evidence_refs = ["stage-12/runs/"] + if (stage_dir / "real_data_guard.json").exists(): + artifacts.append("real_data_guard.json") + evidence_refs.append("stage-12/real_data_guard.json") + return StageResult( + stage=Stage.EXPERIMENT_RUN, + status=stage_status, + artifacts=tuple(artifacts), + evidence_refs=tuple(evidence_refs), + error=stage_error, + ) return StageResult( stage=Stage.EXPERIMENT_RUN, status=StageStatus.DONE, @@ -652,6 +955,41 @@ def _files_to_context(project_files: dict[str, str]) -> str: parts.append(f"```filename:{fname}\n{code}\n```") return "\n\n".join(parts) + def _write_refinement_log() -> None: + (stage_dir / "refinement_log.json").write_text( + json.dumps(log, indent=2), encoding="utf-8" + ) + + def _pause_refinement( + *, + reason: str, + stop_reason: str, + iteration: int | None = None, + ) -> StageResult: + log.update( + { + "paused": True, + "converged": False, + "stop_reason": stop_reason, + "pause_reason": reason, + "best_metric": best_metric, + "best_version": best_version, + "iterations_completed": len(log["iterations"]), + } + ) + if iteration is not None: + log["pause_iteration"] = iteration + _write_refinement_log() + artifacts = ("refinement_log.json",) + return StageResult( + stage=Stage.ITERATIVE_REFINE, + status=StageStatus.PAUSED, + artifacts=artifacts, + error=reason, + decision="resume", + evidence_refs=tuple(f"stage-13/{a}" for a in artifacts), + ) + if llm is None: logger.info("Stage 13: LLM unavailable, saving original experiment as final") final_dir = stage_dir / "experiment_final" @@ -677,9 +1015,7 @@ def 
_files_to_context(project_files: dict[str, str]) -> str: ], } ) - (stage_dir / "refinement_log.json").write_text( - json.dumps(log, indent=2), encoding="utf-8" - ) + _write_refinement_log() artifacts = ("refinement_log.json", "experiment_final/") return StageResult( stage=Stage.ITERATIVE_REFINE, @@ -693,6 +1029,12 @@ def _files_to_context(project_files: dict[str, str]) -> str: # R7-3: Read experiment plan to detect condition coverage gaps _exp_plan_text = _read_prior_artifact(run_dir, "exp_plan.yaml") or "" + _repair_brief = _build_research_repair_brief(run_dir) + _refine_topic, _exp_plan_anchor = _build_refine_prompt_context( + config.research.topic, + _exp_plan_text, + _repair_brief, + ) _condition_coverage_hint = "" if _exp_plan_text and run_summaries: # Check if stdout contains condition labels @@ -764,14 +1106,6 @@ def _files_to_context(project_files: dict[str, str]) -> str: logger.warning("Stage 13: metric saturation detected, injecting difficulty upgrade hint") files_context = _files_to_context(best_files) - # BUG-10 fix: anchor refinement to original experiment plan - _exp_plan_anchor = "" - if _exp_plan_text.strip(): - _exp_plan_anchor = ( - "Original experiment plan (exp_plan.yaml):\n" - "```yaml\n" + _exp_plan_text[:4000] + "\n```\n" - "You MUST preserve ALL condition names from this plan.\n\n" - ) ip = _pm.sub_prompt( "iterative_improve", metric_key=metric_key, @@ -779,7 +1113,7 @@ def _files_to_context(project_files: dict[str, str]) -> str: files_context=files_context, run_summaries=chr(10).join(run_summaries[:20]), condition_coverage_hint=_condition_coverage_hint, - topic=config.research.topic, + topic=_refine_topic, exp_plan_anchor=_exp_plan_anchor, ) @@ -803,12 +1137,25 @@ def _files_to_context(project_files: dict[str, str]) -> str: timeout_refine_attempts, ) - response = _chat_with_prompt( - llm, - ip.system, - user_prompt, - max_tokens=ip.max_tokens or 8192, - ) + try: + response = _chat_with_prompt( + llm, + ip.system, + user_prompt, + 
max_tokens=ip.max_tokens or 8192, + ) + except RuntimeError as exc: + if "ACP prompt timed out after" in str(exc): + logger.warning( + "Stage 13: ACP prompt timed out during iteration %d; pausing for resume", + iteration, + ) + return _pause_refinement( + reason=str(exc), + stop_reason="acp_prompt_timeout", + iteration=iteration, + ) + raise extracted_files = _extract_multi_file_blocks(response.content) # If LLM returns only single block, treat as main.py update if not extracted_files: @@ -865,7 +1212,20 @@ def _files_to_context(project_files: dict[str, str]) -> str: issue_text=issue_text, all_files_ctx=_files_to_context(candidate_files), ) - repair_response = _chat_with_prompt(llm, irp.system, irp.user) + try: + repair_response = _chat_with_prompt(llm, irp.system, irp.user) + except RuntimeError as exc: + if "ACP prompt timed out after" in str(exc): + logger.warning( + "Stage 13: ACP repair prompt timed out during iteration %d; pausing for resume", + iteration, + ) + return _pause_refinement( + reason=str(exc), + stop_reason="acp_prompt_timeout", + iteration=iteration, + ) + raise candidate_files["main.py"] = _extract_code_block(repair_response.content) validation = validate_code(candidate_files["main.py"]) repaired = True @@ -898,9 +1258,14 @@ def _files_to_context(project_files: dict[str, str]) -> str: config.experiment, stage_dir / f"refine_sandbox_v{iteration}", ) + rerun_args, rerun_env = _build_project_entrypoint_runtime_overrides( + version_dir + ) rerun = sandbox.run_project( version_dir, timeout_sec=config.experiment.time_budget_sec, + args=rerun_args, + env_overrides=rerun_env, ) metric_val = _find_metric(rerun.metrics, metric_key) # R19-1: Store stdout (capped) so PAIRED lines survive for Stage 14 @@ -977,7 +1342,20 @@ def _files_to_context(project_files: dict[str, str]) -> str: issue_text=runtime_issues, all_files_ctx=_files_to_context(candidate_files), ) - repair_resp = _chat_with_prompt(llm, rrp.system, rrp.user) + try: + repair_resp = 
_chat_with_prompt(llm, rrp.system, rrp.user) + except RuntimeError as exc: + if "ACP prompt timed out after" in str(exc): + logger.warning( + "Stage 13: ACP runtime-repair prompt timed out during iteration %d; pausing for resume", + iteration, + ) + return _pause_refinement( + reason=str(exc), + stop_reason="acp_prompt_timeout", + iteration=iteration, + ) + raise repaired_files = _extract_multi_file_blocks(repair_resp.content) if not repaired_files: single = _extract_code_block(repair_resp.content) @@ -996,9 +1374,14 @@ def _files_to_context(project_files: dict[str, str]) -> str: config.experiment, stage_dir / f"refine_sandbox_v{iteration}_fix", ) + rerun2_args, rerun2_env = _build_project_entrypoint_runtime_overrides( + version_dir + ) rerun2 = sandbox2.run_project( version_dir, timeout_sec=config.experiment.time_budget_sec, + args=rerun2_args, + env_overrides=rerun2_env, ) metric_val = _find_metric(rerun2.metrics, metric_key) iter_record["sandbox_after_fix"] = { @@ -1067,9 +1450,7 @@ def _files_to_context(project_files: dict[str, str]) -> str: ) if _all_ablation_identical: log["ablation_identical_warning"] = True - (stage_dir / "refinement_log.json").write_text( - json.dumps(log, indent=2), encoding="utf-8" - ) + _write_refinement_log() artifacts = ["refinement_log.json", "experiment_final/"] artifacts.extend( diff --git a/tests/test_rc_cli.py b/tests/test_rc_cli.py index 3123ba82..4bf67e0d 100644 --- a/tests/test_rc_cli.py +++ b/tests/test_rc_cli.py @@ -9,6 +9,8 @@ from researchclaw import cli as rc_cli from researchclaw.config import resolve_config_path +from researchclaw.pipeline.executor import StageResult +from researchclaw.pipeline.stages import Stage, StageStatus def _write_valid_config(path: Path) -> None: @@ -100,6 +102,55 @@ def test_cmd_validate_valid_config_returns_zero( assert "Config validation passed" in capsys.readouterr().out +def test_cmd_run_reports_paused_pipeline( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, + capsys: 
pytest.CaptureFixture[str], +) -> None: + config_path = tmp_path / "config.yaml" + _write_valid_config(config_path) + output_dir = tmp_path / "artifacts" / "paused-run" + + from researchclaw.pipeline import runner as rc_runner + + monkeypatch.setattr( + rc_runner, + "execute_pipeline", + lambda **kwargs: [ + StageResult( + stage=Stage.TOPIC_INIT, + status=StageStatus.DONE, + artifacts=("goal.md",), + ), + StageResult( + stage=Stage.PROBLEM_DECOMPOSE, + status=StageStatus.PAUSED, + artifacts=("refinement_log.json",), + error="ACP prompt timed out after 1800s", + decision="resume", + ), + ], + ) + monkeypatch.setattr(rc_runner, "read_checkpoint", lambda run_dir: None) + + args = argparse.Namespace( + config=str(config_path), + topic=None, + output=str(output_dir), + from_stage=None, + auto_approve=False, + skip_preflight=True, + resume=False, + skip_noncritical_stage=False, + no_graceful_degradation=False, + ) + code = rc_cli.cmd_run(args) + captured = capsys.readouterr() + assert code == 0 + assert "Pipeline paused:" in captured.out + assert "1 paused" in captured.out + + def test_main_dispatches_run_command(monkeypatch: pytest.MonkeyPatch) -> None: captured = {} diff --git a/tests/test_rc_executor.py b/tests/test_rc_executor.py index 8554ad87..80b8046d 100644 --- a/tests/test_rc_executor.py +++ b/tests/test_rc_executor.py @@ -13,6 +13,7 @@ from researchclaw.adapters import AdapterBundle from researchclaw.config import RCConfig from researchclaw.pipeline import executor as rc_executor +from researchclaw.pipeline.stage_impls import _code_generation as code_generation from researchclaw.pipeline.stages import Stage, StageStatus @@ -37,6 +38,26 @@ def __init__(self, response_text: str = "mock response"): ) +class SequencedFakeLLMClient(FakeLLMClient): + def __init__(self, responses: list[str]): + super().__init__(response_text=responses[-1] if responses else "mock response") + self._responses = list(responses) + self._idx = 0 + + def chat(self, messages: list[dict[str, 
str]], **kwargs: object): + _ = kwargs + self.calls.append(messages) + from researchclaw.llm.client import LLMResponse + + if self._responses: + idx = min(self._idx, len(self._responses) - 1) + content = self._responses[idx] + self._idx += 1 + else: + content = self.response_text + return LLMResponse(content=content, model="fake-model") + + @pytest.fixture() def rc_config(tmp_path: Path) -> RCConfig: data = { @@ -272,6 +293,29 @@ def test_write_stage_meta_writes_expected_json(run_dir: Path) -> None: assert re.match(r"\d{4}-\d{2}-\d{2}T", payload["ts"]) +def test_write_stage_meta_keeps_paused_stage_as_next_stage(run_dir: Path) -> None: + stage_dir = run_dir / "stage-02" + stage_dir.mkdir() + result = rc_executor.StageResult( + stage=Stage.PROBLEM_DECOMPOSE, + status=StageStatus.PAUSED, + artifacts=("refinement_log.json",), + decision="resume", + error="ACP prompt timed out after 1800s", + evidence_refs=("stage-02/refinement_log.json",), + ) + rc_executor._write_stage_meta( + stage_dir, Stage.PROBLEM_DECOMPOSE, "run-paused", result + ) + payload = cast( + dict[str, Any], + json.loads((stage_dir / "decision.json").read_text(encoding="utf-8")), + ) + assert payload["status"] == "paused" + assert payload["decision"] == "resume" + assert payload["next_stage"] == int(Stage.PROBLEM_DECOMPOSE) + + def test_execute_stage_creates_stage_dir_writes_artifacts_and_meta( monkeypatch: pytest.MonkeyPatch, run_dir: Path, @@ -751,6 +795,45 @@ def test_refine_no_llm_saves_original_as_final( assert payload["stop_reason"] == "llm_unavailable" assert result.status == StageStatus.DONE + def test_refine_acp_timeout_pauses_for_resume( + self, + run_dir: Path, + rc_config: RCConfig, + adapters: AdapterBundle, + monkeypatch: pytest.MonkeyPatch, + ) -> None: + self._prepare_refine_inputs(run_dir) + stage_dir = run_dir / "stage-13" + stage_dir.mkdir(parents=True, exist_ok=True) + + from researchclaw.pipeline.stage_impls import _execution as execution_impl + + def _timeout(*args, **kwargs): + _ = 
args, kwargs + raise RuntimeError("ACP prompt timed out after 1800s") + + monkeypatch.setattr(execution_impl, "_chat_with_prompt", _timeout) + + result = rc_executor._execute_iterative_refine( + stage_dir, + run_dir, + rc_config, + adapters, + llm=FakeLLMClient("unused"), + ) + + payload = json.loads( + (stage_dir / "refinement_log.json").read_text(encoding="utf-8") + ) + assert result.status == StageStatus.PAUSED + assert result.decision == "resume" + assert result.artifacts == ("refinement_log.json",) + assert payload["paused"] is True + assert payload["stop_reason"] == "acp_prompt_timeout" + assert payload["pause_iteration"] == 1 + assert payload["best_version"] == "experiment/" + assert not (stage_dir / "experiment_final").exists() + def test_refine_with_llm_generates_improved_code( self, run_dir: Path, @@ -1911,6 +1994,379 @@ def test_compute_budget_injected_into_code_generation( ) assert "60" in all_user_msgs or "Compute Budget" in all_user_msgs + def test_code_generation_repairs_missing_local_helper_modules( + self, tmp_path: Path, run_dir: Path, adapters: AdapterBundle + ) -> None: + data = { + "project": {"name": "rc-test", "mode": "docs-first"}, + "research": { + "topic": "optimizer comparison", + "domains": ["ml"], + "daily_paper_count": 2, + "quality_threshold": 8.2, + }, + "runtime": {"timezone": "UTC"}, + "notifications": { + "channel": "local", + "on_stage_start": True, + "on_stage_fail": False, + "on_gate_required": True, + }, + "knowledge_base": {"backend": "markdown", "root": str(tmp_path / "kb")}, + "openclaw_bridge": {"use_memory": True, "use_message": True}, + "llm": { + "provider": "openai-compatible", + "base_url": "http://localhost:1234/v1", + "api_key_env": "RC_TEST_KEY", + "api_key": "inline-test-key", + "primary_model": "fake-model", + "fallback_models": [], + }, + "security": {"hitl_required_stages": [5, 9, 20]}, + "experiment": { + "mode": "sandbox", + "time_budget_sec": 30, + "metric_key": "primary_metric", + "metric_direction": 
"minimize", + "code_agent": {"enabled": False}, + "opencode": {"enabled": False}, + }, + } + cfg = RCConfig.from_dict(data, project_root=tmp_path, check_paths=False) + + _write_prior_artifact(run_dir, 10, "exp_plan.yaml", "objectives: test") + + initial_generation = ( + "```filename:main.py\n" + "from models import ToyModel\n\n" + "def main():\n" + " print('primary_metric: 0.3')\n\n" + "if __name__ == '__main__':\n" + " main()\n" + "```\n" + "```filename:requirements.txt\n" + "numpy\n" + "```" + ) + review_json = '{"score": 8, "issues": [], "verdict": "pass"}' + alignment_json = '{"aligned": true, "reason": "", "suggestions": ""}' + self_contained_fix = ( + "```filename:main.py\n" + "from models import ToyModel\n\n" + "def main():\n" + " _ = ToyModel()\n" + " print('primary_metric: 0.3')\n\n" + "if __name__ == '__main__':\n" + " main()\n" + "```\n" + "```filename:models.py\n" + "class ToyModel:\n" + " pass\n" + "```\n" + "```filename:requirements.txt\n" + "numpy\n" + "```" + ) + llm = SequencedFakeLLMClient( + [initial_generation, review_json, alignment_json, self_contained_fix] + ) + stage_dir = run_dir / "stage-11" + stage_dir.mkdir(parents=True, exist_ok=True) + + result = rc_executor._execute_code_generation( + stage_dir, run_dir, cfg, adapters, llm=llm + ) + + assert result.status == StageStatus.DONE + exp_dir = stage_dir / "experiment" + assert (exp_dir / "main.py").exists() + assert (exp_dir / "models.py").exists() + + def test_detects_placeholder_ablation_stubs(self) -> None: + issues = code_generation._find_placeholder_experiment_issues( + { + "main.py": ( + "# Dummy Implementations for Standalone Operation\n" + "class ClutterAwareDisagreementRadiusAdaptiveReranker:\n" + " def __init__(self, hparams):\n" + " pass\n" + " def evaluate(self, seed=None, regime=None):\n" + " return 0.22\n" + ) + } + ) + + assert any("Placeholder experiment text found" in issue for issue in issues) + assert any( + "placeholder experiment implementation" in issue + or 
"demonstration stub" in issue + for issue in issues + ) + + def test_detects_missing_condition_distinctness_check(self) -> None: + issues = code_generation._find_condition_distinctness_issues( + { + "main.py": ( + "class BaselineVerifier:\n" + " def predict(self, value):\n" + " return {'score': value}\n\n" + "class AblationWithoutRadius:\n" + " def predict(self, value):\n" + " return {'score': value + 1}\n\n" + "models = [\n" + " ('Baseline', BaselineVerifier()),\n" + " ('Abl_NoRadius', AblationWithoutRadius()),\n" + " ('Abl_NoVoteShape', AblationWithoutRadius()),\n" + " ('FusionWithoutScaleBins', AblationWithoutRadius()),\n" + "]\n" + ) + } + ) + + assert any( + "No ablation/condition distinctness self-check found" in issue + for issue in issues + ) + + def test_code_generation_repairs_placeholder_ablation_stubs( + self, tmp_path: Path, run_dir: Path, adapters: AdapterBundle + ) -> None: + data = { + "project": {"name": "rc-test", "mode": "docs-first"}, + "research": { + "topic": "geometry-learning fusion ablation study", + "domains": ["ml"], + "daily_paper_count": 2, + "quality_threshold": 8.2, + }, + "runtime": {"timezone": "UTC"}, + "notifications": { + "channel": "local", + "on_stage_start": True, + "on_stage_fail": False, + "on_gate_required": True, + }, + "knowledge_base": {"backend": "markdown", "root": str(tmp_path / "kb")}, + "openclaw_bridge": {"use_memory": True, "use_message": True}, + "llm": { + "provider": "openai-compatible", + "base_url": "http://localhost:1234/v1", + "api_key_env": "RC_TEST_KEY", + "api_key": "inline-test-key", + "primary_model": "fake-model", + "fallback_models": [], + }, + "security": {"hitl_required_stages": [5, 9, 20]}, + "experiment": { + "mode": "sandbox", + "time_budget_sec": 30, + "metric_key": "primary_metric", + "metric_direction": "minimize", + "code_agent": {"enabled": False}, + "opencode": {"enabled": False}, + }, + } + cfg = RCConfig.from_dict(data, project_root=tmp_path, check_paths=False) + 
_write_prior_artifact(run_dir, 10, "exp_plan.yaml", "objectives: test") + + initial_generation = ( + "```filename:main.py\n" + "# Dummy Implementations for Standalone Operation\n" + "class GeometryFusionVerifier:\n" + " def __init__(self, hparams):\n" + " pass\n\n" + " def evaluate(self, seed=None, regime=None):\n" + " return 0.2\n\n" + "class AblationWithoutRadius:\n" + " def __init__(self, hparams):\n" + " pass\n\n" + " def evaluate(self, seed=None, regime=None):\n" + " return 0.2\n\n" + "def main():\n" + " print('primary_metric: 0.3')\n\n" + "if __name__ == '__main__':\n" + " main()\n" + "```\n" + "```filename:requirements.txt\n" + "numpy\n" + "```" + ) + repaired_generation = ( + "```filename:main.py\n" + "class GeometryFusionVerifier:\n" + " def __init__(self, hparams):\n" + " self.bias = float(hparams.get('bias', 0.0))\n\n" + " def evaluate(self, seed=None, regime=None):\n" + " seed = 0 if seed is None else int(seed)\n" + " base = 0.10 + 0.02 * (seed % 3)\n" + " if regime == 'hard':\n" + " base += 0.05\n" + " return base + self.bias\n\n" + "class AblationWithoutRadius(GeometryFusionVerifier):\n" + " def evaluate(self, seed=None, regime=None):\n" + " base = super().evaluate(seed=seed, regime=regime)\n" + " return base + 0.07\n\n" + "def main():\n" + " verifier = GeometryFusionVerifier({'bias': 0.01})\n" + " ablation = AblationWithoutRadius({'bias': 0.01})\n" + " primary_metric = min(\n" + " verifier.evaluate(seed=1, regime='hard'),\n" + " ablation.evaluate(seed=1, regime='hard'),\n" + " )\n" + " print(f'primary_metric: {primary_metric:.3f}')\n\n" + "if __name__ == '__main__':\n" + " main()\n" + "```\n" + "```filename:requirements.txt\n" + "numpy\n" + "```" + ) + review_json = '{"score": 8, "issues": [], "verdict": "pass"}' + alignment_json = '{"aligned": true, "reason": "", "suggestions": ""}' + llm = SequencedFakeLLMClient( + [initial_generation, repaired_generation, review_json, alignment_json] + ) + stage_dir = run_dir / "stage-11" + 
stage_dir.mkdir(parents=True, exist_ok=True) + + result = rc_executor._execute_code_generation( + stage_dir, run_dir, cfg, adapters, llm=llm + ) + + assert result.status == StageStatus.DONE + main_text = (stage_dir / "experiment" / "main.py").read_text( + encoding="utf-8" + ) + assert "Dummy Implementations" not in main_text + assert "return 0.2" not in main_text + + def test_code_generation_repairs_missing_condition_distinctness_check( + self, tmp_path: Path, run_dir: Path, adapters: AdapterBundle + ) -> None: + data = { + "project": {"name": "rc-test", "mode": "docs-first"}, + "research": { + "topic": "geometry-learning fusion ablation study", + "domains": ["ml"], + "daily_paper_count": 2, + "quality_threshold": 8.2, + }, + "runtime": {"timezone": "UTC"}, + "notifications": { + "channel": "local", + "on_stage_start": True, + "on_stage_fail": False, + "on_gate_required": True, + }, + "knowledge_base": {"backend": "markdown", "root": str(tmp_path / "kb")}, + "openclaw_bridge": {"use_memory": True, "use_message": True}, + "llm": { + "provider": "openai-compatible", + "base_url": "http://localhost:1234/v1", + "api_key_env": "RC_TEST_KEY", + "api_key": "inline-test-key", + "primary_model": "fake-model", + "fallback_models": [], + }, + "security": {"hitl_required_stages": [5, 9, 20]}, + "experiment": { + "mode": "sandbox", + "time_budget_sec": 30, + "metric_key": "primary_metric", + "metric_direction": "minimize", + "code_agent": {"enabled": False}, + "opencode": {"enabled": False}, + }, + } + cfg = RCConfig.from_dict(data, project_root=tmp_path, check_paths=False) + _write_prior_artifact(run_dir, 10, "exp_plan.yaml", "objectives: test") + + initial_generation = ( + "```filename:main.py\n" + "class BaselineVerifier:\n" + " def predict(self, value):\n" + " return {'score': value}\n\n" + "class AblationWithoutRadius:\n" + " def predict(self, value):\n" + " return {'score': value + 1}\n\n" + "class AblationWithoutVoteShape:\n" + " def predict(self, value):\n" + " return 
{'score': value + 2}\n\n" + "class FusionWithoutScaleBins:\n" + " def predict(self, value):\n" + " return {'score': value + 3}\n\n" + "models = [\n" + " ('Baseline', BaselineVerifier()),\n" + " ('Abl_NoRadius', AblationWithoutRadius()),\n" + " ('Abl_NoVoteShape', AblationWithoutVoteShape()),\n" + " ('FusionWithoutScaleBins', FusionWithoutScaleBins()),\n" + "]\n\n" + "def sanity_check_condition_outputs_differ():\n" + " pass\n\n" + "def main():\n" + " print('primary_metric: 0.3')\n\n" + "if __name__ == '__main__':\n" + " main()\n" + "```\n" + "```filename:requirements.txt\n" + "numpy\n" + "```" + ) + repaired_generation = ( + "```filename:main.py\n" + "class BaselineVerifier:\n" + " def predict(self, value):\n" + " return {'score': value}\n\n" + "class AblationWithoutRadius:\n" + " def predict(self, value):\n" + " return {'score': value + 1}\n\n" + "class AblationWithoutVoteShape:\n" + " def predict(self, value):\n" + " return {'score': value + 2}\n\n" + "class FusionWithoutScaleBins:\n" + " def predict(self, value):\n" + " return {'score': value + 3}\n\n" + "models = [\n" + " ('Baseline', BaselineVerifier()),\n" + " ('Abl_NoRadius', AblationWithoutRadius()),\n" + " ('Abl_NoVoteShape', AblationWithoutVoteShape()),\n" + " ('FusionWithoutScaleBins', FusionWithoutScaleBins()),\n" + "]\n\n" + "def sanity_check_condition_outputs_differ():\n" + " probe = 5\n" + " outputs = {name: model.predict(probe)['score'] for name, model in models}\n" + " assert len(set(outputs.values())) == len(outputs), outputs\n" + " print('ABLATION_CHECK: outputs_differ=True')\n\n" + "def main():\n" + " sanity_check_condition_outputs_differ()\n" + " print('primary_metric: 0.3')\n\n" + "if __name__ == '__main__':\n" + " main()\n" + "```\n" + "```filename:requirements.txt\n" + "numpy\n" + "```" + ) + review_json = '{"score": 8, "issues": [], "verdict": "pass"}' + alignment_json = '{"aligned": true, "reason": "", "suggestions": ""}' + llm = SequencedFakeLLMClient( + [initial_generation, 
repaired_generation, review_json, alignment_json] + ) + stage_dir = run_dir / "stage-11" + stage_dir.mkdir(parents=True, exist_ok=True) + + result = rc_executor._execute_code_generation( + stage_dir, run_dir, cfg, adapters, llm=llm + ) + + assert result.status == StageStatus.DONE + main_text = (stage_dir / "experiment" / "main.py").read_text( + encoding="utf-8" + ) + assert "def sanity_check_condition_outputs_differ" in main_text + assert "pass" not in main_text + assert "sanity_check_condition_outputs_differ()" in main_text + class TestPartialTimeoutStatus: """Test partial status for timed-out experiments with data (R4-1c).""" @@ -3172,6 +3628,55 @@ def test_topic_alignment_in_refine_prompt(self) -> None: assert "NEVER rename" in sp.user +class TestRefinePromptCompaction: + def test_build_refine_prompt_context_preserves_constraints(self) -> None: + from researchclaw.pipeline.stage_impls._execution import ( + _build_refine_prompt_context, + ) + + topic = """ + Design a research project around hybrid circle localization for engineering drawings. + + Important constraints: + - Keep the direction hybrid geometry + learning + - Focus especially on small circles, partial circles, dashed circles, and cluttered drawings + - Do not propose a purely black-box end-to-end detector + + Please produce: + 1. Problem formulation + 2. Novelty statement + """ + exp_plan = """ +baselines: +- name: ExplicitArcVoteRuleCascade +- name: ImplicitHeatmapPeakVerifier +proposed_methods: +- name: ScaleBinnedRulePriorHeatmapCalibration +ablations: +- name: AntiEvidenceRerankerWithFixedPatchVerifier +metrics: + primary_metric: + name: hard_subset_miss_rate +objectives: + problem_formulation: Treat circle localization as a comparison between D(x) and H(x). + novelty_statement: Audit whether geometric and learned vote fields are complementary. + recommended_first_prototype: Start with cached diagnostics and two shallow trainable methods. + research_questions: + - Are D(x) and H(x) complementary? 
+ - Does rule density help more as calibration or anti-evidence? +""" + + compact_topic, anchor = _build_refine_prompt_context(topic, exp_plan) + + assert len(compact_topic) < len(topic) + assert "Important constraints" not in compact_topic + assert "Structured experiment plan summary" in anchor + assert "ScaleBinnedRulePriorHeatmapCalibration" in anchor + assert "hard_subset_miss_rate" in anchor + assert "Key research constraints to preserve" in anchor + assert "Keep the direction hybrid geometry + learning" in anchor + + # ===================================================================== # _validate_draft_quality tests # ===================================================================== diff --git a/tests/test_rc_runner.py b/tests/test_rc_runner.py index 0d178529..5e5d6238 100644 --- a/tests/test_rc_runner.py +++ b/tests/test_rc_runner.py @@ -53,6 +53,16 @@ def _failed(stage: Stage, msg: str = "boom") -> StageResult: return StageResult(stage=stage, status=StageStatus.FAILED, artifacts=(), error=msg) +def _paused(stage: Stage, msg: str = "resume needed") -> StageResult: + return StageResult( + stage=stage, + status=StageStatus.PAUSED, + artifacts=("refinement_log.json",), + error=msg, + decision="resume", + ) + + def _blocked(stage: Stage) -> StageResult: return StageResult( stage=stage, @@ -113,6 +123,37 @@ def mock_execute_stage(stage: Stage, **kwargs) -> StageResult: assert len(results) == int(fail_stage) +def test_execute_pipeline_stops_on_paused_stage( + monkeypatch: pytest.MonkeyPatch, + run_dir: Path, + rc_config: RCConfig, + adapters: AdapterBundle, +) -> None: + pause_stage = Stage.ITERATIVE_REFINE + + def mock_execute_stage(stage: Stage, **kwargs) -> StageResult: + _ = kwargs + if stage == pause_stage: + return _paused(stage, "ACP prompt timed out after 1800s") + return _done(stage) + + monkeypatch.setattr(rc_runner, "execute_stage", mock_execute_stage) + results = rc_runner.execute_pipeline( + run_dir=run_dir, + run_id="run-paused", + 
config=rc_config, + adapters=adapters, + ) + assert results[-1].stage == pause_stage + assert results[-1].status == StageStatus.PAUSED + assert len(results) == int(pause_stage) + checkpoint = json.loads((run_dir / "checkpoint.json").read_text(encoding="utf-8")) + assert checkpoint["last_completed_stage"] == int(Stage.EXPERIMENT_RUN) + summary = json.loads((run_dir / "pipeline_summary.json").read_text(encoding="utf-8")) + assert summary["stages_paused"] == 1 + assert summary["final_status"] == "paused" + + def test_execute_pipeline_stops_on_gate_when_stop_on_gate_enabled( monkeypatch: pytest.MonkeyPatch, run_dir: Path, @@ -217,6 +258,7 @@ def mock_execute_stage(stage: Stage, **kwargs) -> StageResult: assert summary["stages_done"] == sum( 1 for r in results if r.status == StageStatus.DONE ) + assert summary["stages_paused"] == 0 assert summary["stages_blocked"] == 1 assert summary["stages_failed"] == 1 assert summary["from_stage"] == 1 @@ -337,6 +379,7 @@ def test_should_start_logic(stage: Stage, started: bool, expected: bool) -> None [ ([], "no_stages", int(Stage.TOPIC_INIT)), ([_done(Stage.TOPIC_INIT)], "done", int(Stage.TOPIC_INIT)), + ([_done(Stage.TOPIC_INIT), _paused(Stage.PROBLEM_DECOMPOSE)], "paused", int(Stage.PROBLEM_DECOMPOSE)), ( [_done(Stage.TOPIC_INIT), _failed(Stage.PROBLEM_DECOMPOSE)], "failed", diff --git a/tests/test_ssh_and_colab_sandbox.py b/tests/test_ssh_and_colab_sandbox.py index d3436888..2132a459 100644 --- a/tests/test_ssh_and_colab_sandbox.py +++ b/tests/test_ssh_and_colab_sandbox.py @@ -508,7 +508,42 @@ def fail_with_other_error(acpx: str, prompt: str) -> str: import pytest with pytest.raises(RuntimeError, match="permission denied"): client._send_prompt("test prompt") - assert call_count == 1 # no retry + + def test_stateless_reconnect_on_session_died(self): + """Stateless mode retries with a fresh ephemeral session on reconnect errors.""" + from researchclaw.llm.acp_client import ACPClient, ACPConfig + + client = 
ACPClient(ACPConfig(agent="claude", stateless_prompt=True)) + client._acpx = "/usr/bin/true" + + sessions: list[str] = [] + closed: list[str] = [] + call_count = 0 + + def fake_new_ephemeral(acpx: str) -> str: + name = f"ephemeral-{len(sessions) + 1}" + sessions.append(name) + return name + + def fake_close_named(acpx: str, session_name: str) -> None: + closed.append(session_name) + + def fake_cli(acpx: str, prompt: str, *, session_name: str | None = None) -> str: + nonlocal call_count + call_count += 1 + if call_count == 1: + raise RuntimeError("ACP prompt failed (exit 1): agent needs reconnect") + return f"success via {session_name}" + + client._new_ephemeral_session = fake_new_ephemeral # type: ignore[assignment] + client._close_named_session = fake_close_named # type: ignore[assignment] + client._send_prompt_cli = fake_cli # type: ignore[assignment] + + result = client._send_prompt("test prompt") + assert result == "success via ephemeral-2" + assert call_count == 2 + assert sessions == ["ephemeral-1", "ephemeral-2"] + assert closed == ["ephemeral-1", "ephemeral-2"] # =========================================================================== From 18030fac1f6ad9ccc3f46fcbc8ab8cd0e1177162 Mon Sep 17 00:00:00 2001 From: CKwin26 <156837805+CKwin26@users.noreply.github.com> Date: Tue, 31 Mar 2026 01:41:39 -0400 Subject: [PATCH 2/2] add manual repair workflows --- README.md | 65 +- autoresearchclaw/__init__.py | 2 + autoresearchclaw/__main__.py | 6 + autoresearchclaw/cli.py | 254 ++++++ autoresearchclaw/paper_repair.py | 349 +++++++++ autoresearchclaw/research_repair.py | 1133 +++++++++++++++++++++++++++ pyproject.toml | 3 +- 7 files changed, 1810 insertions(+), 2 deletions(-) create mode 100644 autoresearchclaw/__init__.py create mode 100644 autoresearchclaw/__main__.py create mode 100644 autoresearchclaw/cli.py create mode 100644 autoresearchclaw/paper_repair.py create mode 100644 autoresearchclaw/research_repair.py diff --git a/README.md b/README.md index 
5d35ddb8..0ec39db7 100644
--- a/README.md
+++ b/README.md
@@ -124,7 +124,70 @@
 export OPENAI_API_KEY="sk-..."
 researchclaw run --config config.arc.yaml --topic "Your research idea" --auto-approve
 ```
-Output → `artifacts/rc-YYYYMMDD-HHMMSS-<slug>/deliverables/` — compile-ready LaTeX, BibTeX, experiment code, charts.
+Output → `artifacts/rc-YYYYMMDD-HHMMSS-<slug>/deliverables/` - compile-ready LaTeX, BibTeX, experiment code, charts.
+
+## Repair Workflows
+
+AutoResearchClaw already has in-pipeline rollback and auto-repair loops. This repo now also ships a **manual repair companion CLI** for cases where a human wants to take a completed run and:
+
+- patch exported paper artifacts without rerunning the pipeline
+- or create a repair child run that reuses early stages and reruns from a later authoritative stage such as Stage 9, 10, or 12
+
+These workflows are exposed through a second CLI:
+
+```bash
+autoresearchclaw --help
+```
+
+### Paper Repair
+
+Use paper repair when the research run is complete but the exported paper package still needs human cleanup.
+
+```bash
+autoresearchclaw paper-repair-init \
+  --run-dir artifacts/rc-YYYYMMDD-HHMMSS-<slug> \
+  --output-dir artifacts/paper-repair/my-run-v1
+```
+
+Edit files under `workspace/`, then publish them back into the source run:
+
+```bash
+autoresearchclaw paper-repair-apply \
+  --repair-json artifacts/paper-repair/my-run-v1/paper-repair.json \
+  --note "Clarify wording and fix paper packaging"
+```
+
+If needed, roll back the most recent publish:
+
+```bash
+autoresearchclaw paper-repair-rollback \
+  --repair-json artifacts/paper-repair/my-run-v1/paper-repair.json
+```
+
+### Research Repair
+
+Use research repair when the completed run needs more data, more seeds, stronger protocol coverage, or a return to earlier experiment stages.
+
+```bash
+autoresearchclaw research-repair-init \
+  --run-dir artifacts/rc-YYYYMMDD-HHMMSS-<slug> \
+  --output-dir artifacts/research-repair/my-run-v1 \
+  --config config.arc.yaml \
+  --target-stage EXPERIMENT_DESIGN \
+  --reason "Human review found insufficient experiment coverage." \
+  --feedback "Use real local assets only." \
+  --feedback "Increase experiment coverage before claiming results."
+```
+
+Then prepare a child run:
+
+```bash
+autoresearchclaw research-repair-run \
+  --repair-json artifacts/research-repair/my-run-v1/research-repair.json \
+  --skip-preflight
+```
+
+This creates a child-run config, launch script, repair metadata, and a compact repair brief. Add `--execute` when you are ready to launch the rerun.
📝 Minimum required config diff --git a/autoresearchclaw/__init__.py b/autoresearchclaw/__init__.py new file mode 100644 index 00000000..e5430bc9 --- /dev/null +++ b/autoresearchclaw/__init__.py @@ -0,0 +1,2 @@ +"""Companion repair workflows for completed AutoResearchClaw runs.""" + diff --git a/autoresearchclaw/__main__.py b/autoresearchclaw/__main__.py new file mode 100644 index 00000000..0b6ae7cd --- /dev/null +++ b/autoresearchclaw/__main__.py @@ -0,0 +1,6 @@ +from .cli import main + + +if __name__ == "__main__": + raise SystemExit(main()) + diff --git a/autoresearchclaw/cli.py b/autoresearchclaw/cli.py new file mode 100644 index 00000000..57de5005 --- /dev/null +++ b/autoresearchclaw/cli.py @@ -0,0 +1,254 @@ +from __future__ import annotations + +import argparse +import sys + +from .paper_repair import ( + PaperRepairError, + apply_paper_repair, + init_paper_repair, + rollback_paper_repair, +) +from .research_repair import ( + ResearchRepairError, + init_research_repair, + prepare_research_repair_run, +) + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + prog="autoresearchclaw", + description="Manual paper-level and research-level repair workflows for completed AutoResearchClaw runs.", + ) + subparsers = parser.add_subparsers(dest="command", required=True) + + paper_init = subparsers.add_parser( + "paper-repair-init", + help="Create an editable post-export paper repair workspace from a completed run.", + ) + paper_init.add_argument("--run-dir", required=True, help="Completed run directory with stage-22 or stage-23 paper artifacts.") + paper_init.add_argument( + "--output-dir", + default="artifacts/paper-repair", + help="Directory where the paper repair workspace and manifest will be written.", + ) + + paper_apply = subparsers.add_parser( + "paper-repair-apply", + help="Publish repaired paper artifacts back into the source run.", + ) + paper_apply.add_argument("--repair-json", required=True, help="Path to a 
paper-repair.json manifest.") + paper_apply.add_argument("--note", help="Optional note describing the published paper fix.") + + paper_rollback = subparsers.add_parser( + "paper-repair-rollback", + help="Restore the most recent published paper repair snapshot into the source run.", + ) + paper_rollback.add_argument("--repair-json", required=True, help="Path to a paper-repair.json manifest.") + paper_rollback.add_argument("--backup-id", help="Optional backup id to roll back to. Defaults to the most recent publish.") + + research_init = subparsers.add_parser( + "research-repair-init", + help="Create a run-level repair workspace that can send a completed run back to experiment stages.", + ) + research_init.add_argument("--run-dir", required=True, help="Existing AutoResearchClaw run directory to repair.") + research_init.add_argument( + "--output-dir", + default="artifacts/research-repair", + help="Directory where the research-repair workspace and manifest will be written.", + ) + research_init.add_argument( + "--config", + default="config.arc.yaml", + help="Base config to copy into the repair workspace for the child run.", + ) + research_init.add_argument( + "--target-stage", + default="EXPERIMENT_DESIGN", + help="Stage number or stage name to restart from, such as 9, CODE_GENERATION, or EXPERIMENT_RUN.", + ) + research_init.add_argument("--reason", help="Short human reason for why the completed run should be repaired.") + research_init.add_argument( + "--feedback", + action="append", + default=[], + help="Initial repair feedback bullet to seed into workspace/feedback.md. 
Repeatable.", + ) + research_init.add_argument( + "--upstream-root", + default=".", + help="Path to the AutoResearchClaw checkout used for child runs.", + ) + + research_run = subparsers.add_parser( + "research-repair-run", + help="Prepare, and optionally launch, a child run from a research-repair workspace.", + ) + research_run.add_argument("--repair-json", required=True, help="Path to a research-repair.json manifest.") + research_run.add_argument("--output-dir", help="Optional explicit child run output directory.") + research_run.add_argument( + "--feedback", + action="append", + default=[], + help="Additional repair feedback bullet to append before generating the child run. Repeatable.", + ) + research_run.add_argument( + "--auto-approve", + action="store_true", + help="Launch the child run with --auto-approve so the child pipeline will not stop at quality gates.", + ) + research_run.add_argument( + "--skip-preflight", + action="store_true", + help="Pass --skip-preflight to the child run command.", + ) + research_run.add_argument( + "--execute", + action="store_true", + help="Actually launch the child run. 
Without this flag, only launch metadata and scripts are prepared.", + ) + + return parser + + +def main(argv: list[str] | None = None) -> int: + parser = build_parser() + args = parser.parse_args(argv) + + if args.command == "paper-repair-init": + return _run_paper_repair_init(args.run_dir, args.output_dir) + if args.command == "paper-repair-apply": + return _run_paper_repair_apply(args.repair_json, args.note) + if args.command == "paper-repair-rollback": + return _run_paper_repair_rollback(args.repair_json, args.backup_id) + if args.command == "research-repair-init": + return _run_research_repair_init( + args.run_dir, + args.output_dir, + args.config, + args.target_stage, + args.reason, + list(args.feedback), + args.upstream_root, + ) + if args.command == "research-repair-run": + return _run_research_repair_run( + args.repair_json, + args.output_dir, + list(args.feedback), + bool(args.auto_approve), + bool(args.skip_preflight), + bool(args.execute), + ) + + parser.error(f"Unknown command: {args.command}") + return 2 + + +def _run_paper_repair_init(run_dir: str, output_dir: str) -> int: + try: + outputs = init_paper_repair(run_dir, output_dir) + except (PaperRepairError, OSError) as exc: + print(f"Paper repair init failed: {exc}", file=sys.stderr) + return 1 + + print("Paper repair workspace created") + print(f"Workspace: {outputs['workspace']}") + print(f"Session JSON: {outputs['session_json']}") + print(f"README: {outputs['readme']}") + return 0 + + +def _run_paper_repair_apply(repair_json: str, note: str | None) -> int: + try: + outputs = apply_paper_repair(repair_json, note=note) + except (PaperRepairError, OSError) as exc: + print(f"Paper repair publish failed: {exc}", file=sys.stderr) + return 1 + + print("Paper repair published") + print(f"Run dir: {outputs['published_run_dir']}") + print(f"Backup dir: {outputs['backup_dir']}") + print(f"Session JSON: {outputs['session_json']}") + return 0 + + +def _run_paper_repair_rollback(repair_json: str, backup_id: str 
| None) -> int: + try: + outputs = rollback_paper_repair(repair_json, backup_id=backup_id) + except (PaperRepairError, OSError) as exc: + print(f"Paper repair rollback failed: {exc}", file=sys.stderr) + return 1 + + print("Paper repair rolled back") + print(f"Run dir: {outputs['published_run_dir']}") + print(f"Rolled back backup: {outputs['rolled_back_backup']}") + print(f"Session JSON: {outputs['session_json']}") + return 0 + + +def _run_research_repair_init( + run_dir: str, + output_dir: str, + config_path: str, + target_stage: str, + reason: str | None, + feedback: list[str], + upstream_root: str, +) -> int: + try: + outputs = init_research_repair( + run_dir, + output_dir, + config_path=config_path, + target_stage=target_stage, + reason=reason, + feedback=feedback, + upstream_root=upstream_root, + ) + except (ResearchRepairError, OSError) as exc: + print(f"Research repair init failed: {exc}", file=sys.stderr) + return 1 + + print("Research repair workspace created") + print(f"Workspace: {outputs['workspace']}") + print(f"Session JSON: {outputs['session_json']}") + print(f"Feedback: {outputs['feedback']}") + print(f"Repair config: {outputs['repair_config']}") + print(f"README: {outputs['readme']}") + return 0 + + +def _run_research_repair_run( + repair_json: str, + output_dir: str | None, + feedback: list[str], + auto_approve: bool, + skip_preflight: bool, + execute: bool, +) -> int: + try: + outputs = prepare_research_repair_run( + repair_json, + output_dir=output_dir, + extra_feedback=feedback, + auto_approve=auto_approve, + skip_preflight=skip_preflight, + execute=execute, + ) + except (ResearchRepairError, OSError) as exc: + print(f"Research repair launch preparation failed: {exc}", file=sys.stderr) + return 1 + + print("Research repair child run prepared") + print(f"Child run dir: {outputs['child_run_dir']}") + print(f"Generated config: {outputs['generated_config']}") + print(f"Launch script: {outputs['launch_script']}") + print(f"Metadata: 
{outputs['metadata']}") + print("Command preview:") + print(outputs["command_preview"]) + if execute and outputs.get("pid"): + print(f"Process pid: {outputs['pid']}") + print(f"Session JSON: {outputs['session_json']}") + return 0 diff --git a/autoresearchclaw/paper_repair.py b/autoresearchclaw/paper_repair.py new file mode 100644 index 00000000..b039397c --- /dev/null +++ b/autoresearchclaw/paper_repair.py @@ -0,0 +1,349 @@ +from __future__ import annotations + +from dataclasses import asdict, dataclass, field +from datetime import datetime, timezone +from json import dumps, loads +from pathlib import Path +from shutil import copy2, copytree, rmtree +from typing import Any + + +class PaperRepairError(ValueError): + """Raised when a paper-repair session cannot be created or applied.""" + + +TRACKED_STAGE_PATHS: dict[str, tuple[str, ...]] = { + "stage-22": ( + "paper.tex", + "paper.pdf", + "paper_final.md", + "paper_final_latex.md", + "references.bib", + "references_verified.bib", + "neurips_2025.sty", + "charts", + ), + "stage-23": ( + "paper_final_verified.md", + "references_verified.bib", + "verification_report.json", + "charts", + ), +} + + +@dataclass(frozen=True) +class TrackedItem: + relative_path: str + kind: str + exists: bool + + def to_dict(self) -> dict[str, Any]: + return asdict(self) + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> TrackedItem: + return cls( + relative_path=str(data.get("relative_path", "")), + kind=str(data.get("kind", "file")), + exists=bool(data.get("exists", False)), + ) + + +@dataclass(frozen=True) +class ApplyEntry: + backup_id: str + applied_at: str + note: str + backup_dir: str + + def to_dict(self) -> dict[str, Any]: + return asdict(self) + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> ApplyEntry: + return cls( + backup_id=str(data.get("backup_id", "")), + applied_at=str(data.get("applied_at", "")), + note=str(data.get("note", "")), + backup_dir=str(data.get("backup_dir", "")), + ) + + 
+@dataclass(frozen=True) +class PaperRepairSession: + source_run_dir: str + session_dir: str + workspace_dir: str + created_at: str + tracked_items: tuple[TrackedItem, ...] + apply_history: tuple[ApplyEntry, ...] = field(default_factory=tuple) + + def to_dict(self) -> dict[str, Any]: + return { + "source_run_dir": self.source_run_dir, + "session_dir": self.session_dir, + "workspace_dir": self.workspace_dir, + "created_at": self.created_at, + "tracked_items": [item.to_dict() for item in self.tracked_items], + "apply_history": [entry.to_dict() for entry in self.apply_history], + } + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> PaperRepairSession: + return cls( + source_run_dir=str(data.get("source_run_dir", "")), + session_dir=str(data.get("session_dir", "")), + workspace_dir=str(data.get("workspace_dir", "")), + created_at=str(data.get("created_at", "")), + tracked_items=tuple( + TrackedItem.from_dict(item) + for item in data.get("tracked_items", []) + if isinstance(item, dict) + ), + apply_history=tuple( + ApplyEntry.from_dict(item) + for item in data.get("apply_history", []) + if isinstance(item, dict) + ), + ) + + +def init_paper_repair( + run_dir: str | Path, + output_dir: str | Path, +) -> dict[str, str]: + source_run_dir = Path(run_dir).resolve() + if not source_run_dir.exists(): + raise PaperRepairError(f"Run directory not found: {source_run_dir}") + + tracked_items = _collect_tracked_items(source_run_dir) + if not tracked_items: + raise PaperRepairError( + "No paper-export artifacts found under stage-22 or stage-23. " + "Expected files such as paper.tex or paper_final_verified.md." + ) + + session_dir = Path(output_dir).resolve() + session_dir.mkdir(parents=True, exist_ok=True) + workspace_dir = session_dir / "workspace" + if workspace_dir.exists(): + raise PaperRepairError( + f"Repair workspace already exists: {workspace_dir}. " + "Use a fresh output directory for each repair session." 
+ ) + workspace_dir.mkdir(parents=True, exist_ok=False) + + for item in tracked_items: + if not item.exists: + continue + source_path = source_run_dir / item.relative_path + target_path = workspace_dir / item.relative_path + _copy_path(source_path, target_path, item.kind) + + session = PaperRepairSession( + source_run_dir=str(source_run_dir), + session_dir=str(session_dir), + workspace_dir=str(workspace_dir), + created_at=_utc_now(), + tracked_items=tracked_items, + ) + + session_json = session_dir / "paper-repair.json" + session_json.write_text(dumps(session.to_dict(), indent=2) + "\n", encoding="utf-8") + readme_path = session_dir / "README.md" + readme_path.write_text(_render_repair_readme(session), encoding="utf-8") + + return { + "session_json": str(session_json), + "readme": str(readme_path), + "workspace": str(workspace_dir), + } + + +def apply_paper_repair( + session_json_path: str | Path, + *, + note: str | None = None, +) -> dict[str, str]: + session_path = Path(session_json_path).resolve() + session = _load_session(session_path) + source_run_dir = Path(session.source_run_dir) + workspace_dir = Path(session.workspace_dir) + if not workspace_dir.exists(): + raise PaperRepairError(f"Repair workspace not found: {workspace_dir}") + + backup_id = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ") + backup_dir = Path(session.session_dir) / "backups" / backup_id + backup_dir.mkdir(parents=True, exist_ok=False) + + for item in session.tracked_items: + target_path = source_run_dir / item.relative_path + backup_path = backup_dir / item.relative_path + if target_path.exists(): + _copy_path(target_path, backup_path, _path_kind(target_path)) + + workspace_path = workspace_dir / item.relative_path + if workspace_path.exists(): + _copy_path(workspace_path, target_path, _path_kind(workspace_path)) + + entry = ApplyEntry( + backup_id=backup_id, + applied_at=_utc_now(), + note=(note or "").strip(), + backup_dir=str(backup_dir), + ) + rewritten = PaperRepairSession( + 
source_run_dir=session.source_run_dir, + session_dir=session.session_dir, + workspace_dir=session.workspace_dir, + created_at=session.created_at, + tracked_items=session.tracked_items, + apply_history=session.apply_history + (entry,), + ) + session_path.write_text(dumps(rewritten.to_dict(), indent=2) + "\n", encoding="utf-8") + return { + "session_json": str(session_path), + "backup_dir": str(backup_dir), + "published_run_dir": str(source_run_dir), + } + + +def rollback_paper_repair( + session_json_path: str | Path, + *, + backup_id: str | None = None, +) -> dict[str, str]: + session_path = Path(session_json_path).resolve() + session = _load_session(session_path) + if not session.apply_history: + raise PaperRepairError("No published repair exists yet, so there is nothing to roll back.") + + entry = _select_backup_entry(session, backup_id) + backup_dir = Path(entry.backup_dir) + if not backup_dir.exists(): + raise PaperRepairError(f"Backup directory not found: {backup_dir}") + + source_run_dir = Path(session.source_run_dir) + for item in session.tracked_items: + source_path = backup_dir / item.relative_path + target_path = source_run_dir / item.relative_path + if source_path.exists(): + _copy_path(source_path, target_path, _path_kind(source_path)) + elif not item.exists and target_path.exists(): + _remove_path(target_path) + + remaining_history = tuple( + history_entry + for history_entry in session.apply_history + if history_entry.backup_id != entry.backup_id + ) + rewritten = PaperRepairSession( + source_run_dir=session.source_run_dir, + session_dir=session.session_dir, + workspace_dir=session.workspace_dir, + created_at=session.created_at, + tracked_items=session.tracked_items, + apply_history=remaining_history, + ) + session_path.write_text(dumps(rewritten.to_dict(), indent=2) + "\n", encoding="utf-8") + return { + "session_json": str(session_path), + "rolled_back_backup": entry.backup_id, + "published_run_dir": str(source_run_dir), + } + + +def 
_collect_tracked_items(run_dir: Path) -> tuple[TrackedItem, ...]: + items: list[TrackedItem] = [] + for stage_name, relative_paths in TRACKED_STAGE_PATHS.items(): + stage_dir = run_dir / stage_name + if not stage_dir.exists(): + continue + for relative_path in relative_paths: + full_path = stage_dir / relative_path + items.append( + TrackedItem( + relative_path=f"{stage_name}/{relative_path}", + kind="directory" if full_path.is_dir() else "file", + exists=full_path.exists(), + ) + ) + return tuple(items) + + +def _load_session(session_json_path: Path) -> PaperRepairSession: + if not session_json_path.exists(): + raise PaperRepairError(f"Paper repair JSON not found: {session_json_path}") + data = loads(session_json_path.read_text(encoding="utf-8")) + if not isinstance(data, dict): + raise PaperRepairError("Paper repair JSON must decode to a mapping.") + return PaperRepairSession.from_dict(data) + + +def _select_backup_entry( + session: PaperRepairSession, + backup_id: str | None, +) -> ApplyEntry: + if backup_id: + for entry in session.apply_history: + if entry.backup_id == backup_id: + return entry + raise PaperRepairError(f"Backup id not found in repair session: {backup_id}") + return session.apply_history[-1] + + +def _copy_path(source_path: Path, target_path: Path, kind: str) -> None: + target_path.parent.mkdir(parents=True, exist_ok=True) + if target_path.exists(): + _remove_path(target_path) + if kind == "directory": + copytree(source_path, target_path) + return + copy2(source_path, target_path) + + +def _remove_path(path: Path) -> None: + if path.is_dir(): + rmtree(path) + return + path.unlink() + + +def _path_kind(path: Path) -> str: + return "directory" if path.is_dir() else "file" + + +def _render_repair_readme(session: PaperRepairSession) -> str: + lines = [ + "# Paper Repair Workspace", + "", + "This workspace is a post-export repair lane for a completed AutoResearchClaw run.", + "", + f"- Source run: `{session.source_run_dir}`", + f"- Created at: 
`{session.created_at}`",
+        f"- Workspace root: `{session.workspace_dir}`",
+        "",
+        "## Tracked Artifacts",
+    ]
+    for item in session.tracked_items:
+        state = "present" if item.exists else "missing in source run"
+        lines.append(f"- `{item.relative_path}` ({item.kind}, {state})")
+    lines.extend(
+        [
+            "",
+            "## Workflow",
+            "1. Edit files under `workspace/`.",
+            "2. Publish repairs back to the source run with:",
+            "   `python -m autoresearchclaw paper-repair-apply --repair-json <session-dir>/paper-repair.json`",
+            "3. If needed, roll back the most recent publish with:",
+            "   `python -m autoresearchclaw paper-repair-rollback --repair-json <session-dir>/paper-repair.json`",
+            "",
+            "Each publish snapshots the original files under `backups/<backup-id>/` before overwriting them.",
+        ]
+    )
+    return "\n".join(lines).rstrip() + "\n"
+
+
+def _utc_now() -> str:
+    return datetime.now(timezone.utc).isoformat()
diff --git a/autoresearchclaw/research_repair.py b/autoresearchclaw/research_repair.py
new file mode 100644
index 00000000..5c7351de
--- /dev/null
+++ b/autoresearchclaw/research_repair.py
@@ -0,0 +1,1133 @@
+from __future__ import annotations
+
+from dataclasses import asdict, dataclass, field
+from datetime import datetime, timezone
+from json import dumps, loads
+import os
+from pathlib import Path
+from shutil import copy2, copytree
+import subprocess
+import sys
+from typing import Any
+
+import yaml
+
+
+class ResearchRepairError(ValueError):
+    """Raised when a research-repair session cannot be created or prepared."""
+
+
+STAGE_NAME_BY_NUMBER: dict[int, str] = {
+    1: "TOPIC_INIT",
+    2: "PROBLEM_DECOMPOSE",
+    3: "SEARCH_STRATEGY",
+    4: "LITERATURE_COLLECT",
+    5: "LITERATURE_SCREEN",
+    6: "KNOWLEDGE_EXTRACT",
+    7: "SYNTHESIS",
+    8: "HYPOTHESIS_GEN",
+    9: "EXPERIMENT_DESIGN",
+    10: "CODE_GENERATION",
+    11: "RESOURCE_PLANNING",
+    12: "EXPERIMENT_RUN",
+    13: "ITERATIVE_REFINE",
+    14: "RESULT_ANALYSIS",
+    15: "RESEARCH_DECISION",
+    16: "PAPER_OUTLINE",
+    17: "PAPER_DRAFT",
+    18: "PEER_REVIEW",
+    19: 
"PAPER_REVISION", + 20: "QUALITY_GATE", + 21: "KNOWLEDGE_ARCHIVE", + 22: "EXPORT_PUBLISH", + 23: "CITATION_VERIFY", +} +STAGE_NUMBER_BY_NAME: dict[str, int] = { + name.upper(): number for number, name in STAGE_NAME_BY_NUMBER.items() +} + +FIXED_CONTEXT_PATHS: tuple[str, ...] = ( + "checkpoint.json", + "pipeline_summary.json", + "experiment_diagnosis.json", + "repair_prompt.txt", + "quality_warning.txt", + "experiment_summary_best.json", + "analysis_best.md", + "stage-09/exp_plan.yaml", + "stage-12/runs/results.json", + "stage-20/quality_report.json", + "stage-23/paper_final_verified.md", + "stage-23/verification_report.json", +) +LATEST_GLOB_PATHS: tuple[str, ...] = ( + "stage-14*/experiment_summary.json", + "stage-14*/analysis.md", + "stage-15*/decision.md", +) + +WSL_PASSTHROUGH_ENV_VARS: tuple[str, ...] = ( + "OPENAI_API_KEY", + "OPENAI_API_BASE", + "OPENAI_BASE_URL", + "OPENAI_ORG_ID", + "OPENAI_PROJECT_ID", +) +REPAIR_RUN_ROOT_ENV_VAR = "AUTORESEARCHCLAW_REPAIR_RUN_ROOT" + + +@dataclass(frozen=True) +class ContextItem: + relative_path: str + kind: str + exists: bool + + def to_dict(self) -> dict[str, Any]: + return asdict(self) + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> ContextItem: + return cls( + relative_path=str(data.get("relative_path", "")), + kind=str(data.get("kind", "file")), + exists=bool(data.get("exists", False)), + ) + + +@dataclass(frozen=True) +class LaunchEntry: + launched_at: str + child_run_dir: str + generated_config_path: str + launch_script: str + command_preview: str + target_stage_name: str + target_stage_number: int + launch_log: str = "" + inherited_stage_dirs: tuple[str, ...] 
= field(default_factory=tuple) + executed: bool = False + pid: int | None = None + + def to_dict(self) -> dict[str, Any]: + return asdict(self) + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> LaunchEntry: + return cls( + launched_at=str(data.get("launched_at", "")), + child_run_dir=str(data.get("child_run_dir", "")), + generated_config_path=str(data.get("generated_config_path", "")), + launch_script=str(data.get("launch_script", "")), + launch_log=str(data.get("launch_log", "")), + command_preview=str(data.get("command_preview", "")), + target_stage_name=str(data.get("target_stage_name", "")), + target_stage_number=int(data.get("target_stage_number", 0)), + inherited_stage_dirs=tuple(data.get("inherited_stage_dirs") or ()), + executed=bool(data.get("executed", False)), + pid=int(data["pid"]) if data.get("pid") is not None else None, + ) + + +@dataclass(frozen=True) +class ReusePolicy: + hard_reuse_stage_dirs: tuple[str, ...] = field(default_factory=tuple) + soft_context_paths: tuple[str, ...] = field(default_factory=tuple) + rerun_from_stage_name: str = "" + rerun_from_stage_number: int = 0 + + def to_dict(self) -> dict[str, Any]: + return asdict(self) + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> ReusePolicy: + return cls( + hard_reuse_stage_dirs=tuple(data.get("hard_reuse_stage_dirs") or ()), + soft_context_paths=tuple(data.get("soft_context_paths") or ()), + rerun_from_stage_name=str(data.get("rerun_from_stage_name", "")), + rerun_from_stage_number=int(data.get("rerun_from_stage_number", 0)), + ) + + +@dataclass(frozen=True) +class ResearchRepairSession: + source_run_dir: str + source_run_id: str + session_dir: str + workspace_dir: str + created_at: str + base_config_path: str + upstream_root: str + target_stage_name: str + target_stage_number: int + repair_reason: str + context_items: tuple[ContextItem, ...] + feedback_path: str + reuse_policy: ReusePolicy + launch_history: tuple[LaunchEntry, ...] 
= field(default_factory=tuple) + + def to_dict(self) -> dict[str, Any]: + return { + "source_run_dir": self.source_run_dir, + "source_run_id": self.source_run_id, + "session_dir": self.session_dir, + "workspace_dir": self.workspace_dir, + "created_at": self.created_at, + "base_config_path": self.base_config_path, + "upstream_root": self.upstream_root, + "target_stage_name": self.target_stage_name, + "target_stage_number": self.target_stage_number, + "repair_reason": self.repair_reason, + "context_items": [item.to_dict() for item in self.context_items], + "feedback_path": self.feedback_path, + "reuse_policy": self.reuse_policy.to_dict(), + "launch_history": [item.to_dict() for item in self.launch_history], + } + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> ResearchRepairSession: + return cls( + source_run_dir=str(data.get("source_run_dir", "")), + source_run_id=str(data.get("source_run_id", "")), + session_dir=str(data.get("session_dir", "")), + workspace_dir=str(data.get("workspace_dir", "")), + created_at=str(data.get("created_at", "")), + base_config_path=str(data.get("base_config_path", "")), + upstream_root=str(data.get("upstream_root", "")), + target_stage_name=str(data.get("target_stage_name", "")), + target_stage_number=int(data.get("target_stage_number", 0)), + repair_reason=str(data.get("repair_reason", "")), + context_items=tuple( + ContextItem.from_dict(item) + for item in data.get("context_items", []) + if isinstance(item, dict) + ), + feedback_path=str(data.get("feedback_path", "")), + reuse_policy=ReusePolicy.from_dict( + data.get("reuse_policy") if isinstance(data.get("reuse_policy"), dict) else {} + ), + launch_history=tuple( + LaunchEntry.from_dict(item) + for item in data.get("launch_history", []) + if isinstance(item, dict) + ), + ) + + +def init_research_repair( + run_dir: str | Path, + output_dir: str | Path, + *, + config_path: str | Path = "config.arc.yaml", + target_stage: str = "EXPERIMENT_DESIGN", + reason: str | None = 
None, + feedback: list[str] | tuple[str, ...] = (), + upstream_root: str | Path = ".", +) -> dict[str, str]: + source_run_dir = Path(run_dir).resolve() + if not source_run_dir.exists(): + raise ResearchRepairError(f"Run directory not found: {source_run_dir}") + + base_config_path = Path(config_path).resolve() + if not base_config_path.exists(): + raise ResearchRepairError(f"Config not found: {base_config_path}") + + upstream_root_path = Path(upstream_root).resolve() + if not upstream_root_path.exists(): + raise ResearchRepairError(f"Upstream root not found: {upstream_root_path}") + + stage_number, stage_name = _normalize_stage_ref(target_stage) + source_run_id = _read_source_run_id(source_run_dir) + context_items = _collect_context_items(source_run_dir) + reuse_policy = _build_reuse_policy( + context_items=context_items, + target_stage_number=stage_number, + target_stage_name=stage_name, + ) + + session_dir = Path(output_dir).resolve() + session_dir.mkdir(parents=True, exist_ok=True) + workspace_dir = session_dir / "workspace" + if workspace_dir.exists(): + raise ResearchRepairError( + f"Research repair workspace already exists: {workspace_dir}. " + "Use a fresh output directory for each repair session." 
+ ) + workspace_dir.mkdir(parents=True, exist_ok=False) + + context_root = workspace_dir / "context" + for item in context_items: + if not item.exists: + continue + source_path = source_run_dir / item.relative_path + target_path = context_root / item.relative_path + _copy_path(source_path, target_path, item.kind) + + repair_config_path = workspace_dir / "repair-config.yaml" + copy2(base_config_path, repair_config_path) + + feedback_path = workspace_dir / "feedback.md" + feedback_path.write_text( + _render_feedback_template( + source_run_id=source_run_id, + target_stage_name=stage_name, + reason=(reason or "").strip(), + feedback=list(feedback), + ), + encoding="utf-8", + ) + + repair_reason = ( + (reason or "").strip() + or "Human review concluded that the completed run needs more data, more experiments, or stronger protocol coverage." + ) + session = ResearchRepairSession( + source_run_dir=str(source_run_dir), + source_run_id=source_run_id, + session_dir=str(session_dir), + workspace_dir=str(workspace_dir), + created_at=_utc_now(), + base_config_path=str(base_config_path), + upstream_root=str(upstream_root_path), + target_stage_name=stage_name, + target_stage_number=stage_number, + repair_reason=repair_reason, + context_items=context_items, + feedback_path=str(feedback_path), + reuse_policy=reuse_policy, + ) + + session_json = session_dir / "research-repair.json" + session_json.write_text(dumps(session.to_dict(), indent=2) + "\n", encoding="utf-8") + readme_path = session_dir / "README.md" + readme_path.write_text(_render_repair_readme(session), encoding="utf-8") + + return { + "session_json": str(session_json), + "readme": str(readme_path), + "workspace": str(workspace_dir), + "feedback": str(feedback_path), + "repair_config": str(repair_config_path), + } + + +def prepare_research_repair_run( + session_json_path: str | Path, + *, + output_dir: str | Path | None = None, + extra_feedback: list[str] | tuple[str, ...] 
= (), + auto_approve: bool = False, + skip_preflight: bool = False, + execute: bool = False, +) -> dict[str, str]: + session_path = Path(session_json_path).resolve() + session = _load_session(session_path) + workspace_dir = Path(session.workspace_dir) + repair_config_path = workspace_dir / "repair-config.yaml" + if not repair_config_path.exists(): + raise ResearchRepairError(f"Repair config not found: {repair_config_path}") + + feedback_text = _read_feedback(Path(session.feedback_path), extra_feedback) + timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ") + child_run_dir = ( + Path(output_dir).resolve() + if output_dir is not None + else _default_child_run_dir(session, timestamp=timestamp) + ) + if child_run_dir.exists() and any(child_run_dir.iterdir()): + raise ResearchRepairError( + f"Child run directory already exists and is not empty: {child_run_dir}" + ) + child_run_dir.mkdir(parents=True, exist_ok=True) + inherited_stage_dirs = _copy_prerequisite_stage_dirs( + source_run_dir=Path(session.source_run_dir), + child_run_dir=child_run_dir, + target_stage_number=session.target_stage_number, + ) + + generated_dir = Path(session.session_dir) / "generated-runs" / timestamp + generated_dir.mkdir(parents=True, exist_ok=False) + + config_data = _load_yaml(repair_config_path) + generated_config_path = generated_dir / "repair-config.generated.yaml" + _write_generated_config( + config_data, + generated_config_path, + session=session, + feedback_text=feedback_text, + child_run_dir=child_run_dir, + ) + + metadata = { + "generated_at": _utc_now(), + "parent_run_dir": session.source_run_dir, + "parent_run_id": session.source_run_id, + "target_stage_name": session.target_stage_name, + "target_stage_number": session.target_stage_number, + "repair_reason": session.repair_reason, + "feedback_path": session.feedback_path, + "feedback_excerpt": feedback_text[:1200], + "generated_config_path": str(generated_config_path), + "inherited_stage_dirs": 
list(inherited_stage_dirs), + "reuse_policy": session.reuse_policy.to_dict(), + "soft_context_note": ( + "Parent-run downstream analysis/draft artifacts are provided only as " + "reference context. They are not authoritative outputs for this child run." + ), + "compact_repair_brief": _build_compact_repair_brief( + session=session, + feedback_text=feedback_text, + ), + } + metadata_path = child_run_dir / "research_repair_parent.json" + metadata_path.write_text(dumps(metadata, indent=2) + "\n", encoding="utf-8") + + inner_command = _build_inner_launch_command( + upstream_root=Path(session.upstream_root), + generated_config_path=generated_config_path, + child_run_dir=child_run_dir, + stage_name=session.target_stage_name, + auto_approve=auto_approve, + skip_preflight=skip_preflight, + ) + command_preview = _wrap_launch_command_for_display(inner_command) + launch_script = generated_dir / "launch.sh" + launch_script.write_text(command_preview + "\n", encoding="utf-8") + launch_log = generated_dir / "launch.log" + + pid: int | None = None + if execute: + pid = _launch_command( + inner_command, + launch_log, + upstream_root=Path(session.upstream_root), + ) + + launch_entry = LaunchEntry( + launched_at=_utc_now(), + child_run_dir=str(child_run_dir), + generated_config_path=str(generated_config_path), + launch_script=str(launch_script), + launch_log=str(launch_log), + command_preview=command_preview, + target_stage_name=session.target_stage_name, + target_stage_number=session.target_stage_number, + inherited_stage_dirs=inherited_stage_dirs, + executed=execute, + pid=pid, + ) + rewritten = ResearchRepairSession( + source_run_dir=session.source_run_dir, + source_run_id=session.source_run_id, + session_dir=session.session_dir, + workspace_dir=session.workspace_dir, + created_at=session.created_at, + base_config_path=session.base_config_path, + upstream_root=session.upstream_root, + target_stage_name=session.target_stage_name, + target_stage_number=session.target_stage_number, + 
repair_reason=session.repair_reason, + context_items=session.context_items, + feedback_path=session.feedback_path, + reuse_policy=session.reuse_policy, + launch_history=session.launch_history + (launch_entry,), + ) + session_path.write_text(dumps(rewritten.to_dict(), indent=2) + "\n", encoding="utf-8") + + return { + "session_json": str(session_path), + "child_run_dir": str(child_run_dir), + "generated_config": str(generated_config_path), + "launch_script": str(launch_script), + "launch_log": str(launch_log), + "command_preview": command_preview, + "metadata": str(metadata_path), + "pid": "" if pid is None else str(pid), + } + + +def _normalize_stage_ref(stage_ref: str) -> tuple[int, str]: + raw = str(stage_ref).strip() + if not raw: + raise ResearchRepairError("Target stage must not be empty.") + if raw.isdigit(): + stage_number = int(raw) + stage_name = STAGE_NAME_BY_NUMBER.get(stage_number) + if stage_name is None: + raise ResearchRepairError(f"Unknown stage number: {stage_number}") + return stage_number, stage_name + stage_name = raw.upper() + stage_number = STAGE_NUMBER_BY_NAME.get(stage_name) + if stage_number is None: + valid = ", ".join(STAGE_NAME_BY_NUMBER.values()) + raise ResearchRepairError( + f"Unknown stage name '{raw}'. 
Valid stage names: {valid}" + ) + return stage_number, stage_name + + +def _read_source_run_id(run_dir: Path) -> str: + if run_dir.name.strip(): + return run_dir.name.strip() + summary_path = run_dir / "pipeline_summary.json" + if summary_path.exists(): + try: + data = loads(summary_path.read_text(encoding="utf-8")) + run_id = data.get("run_id") + if isinstance(run_id, str) and run_id.strip(): + return run_id.strip() + except (OSError, ValueError): + pass + return run_dir.name + + +def _collect_context_items(run_dir: Path) -> tuple[ContextItem, ...]: + relative_paths: list[str] = [] + for relative_path in FIXED_CONTEXT_PATHS: + relative_paths.append(relative_path) + for pattern in LATEST_GLOB_PATHS: + matches = sorted(run_dir.glob(pattern)) + if matches: + relative_paths.append(matches[-1].relative_to(run_dir).as_posix()) + + deduped: list[str] = [] + seen: set[str] = set() + for relative_path in relative_paths: + if relative_path in seen: + continue + seen.add(relative_path) + deduped.append(relative_path) + + items: list[ContextItem] = [] + for relative_path in deduped: + full_path = run_dir / relative_path + items.append( + ContextItem( + relative_path=relative_path, + kind="directory" if full_path.is_dir() else "file", + exists=full_path.exists(), + ) + ) + return tuple(items) + + +def _build_reuse_policy( + *, + context_items: tuple[ContextItem, ...], + target_stage_number: int, + target_stage_name: str, +) -> ReusePolicy: + hard_reuse = tuple( + f"stage-{number:02d}" for number in range(1, target_stage_number) + ) + soft_context: list[str] = [] + for item in context_items: + rel = item.relative_path + if rel.startswith("stage-"): + stage_prefix = rel.split("/", 1)[0] + number_text = stage_prefix.replace("stage-", "").split("_", 1)[0] + number_text = number_text.split("-", 1)[0] + try: + stage_number = int(number_text) + except ValueError: + stage_number = 0 + if stage_number >= target_stage_number: + soft_context.append(rel) + elif rel in { + 
"checkpoint.json", + "pipeline_summary.json", + "experiment_diagnosis.json", + "experiment_summary_best.json", + "analysis_best.md", + "repair_prompt.txt", + }: + soft_context.append(rel) + deduped_soft: list[str] = [] + seen: set[str] = set() + for rel in soft_context: + if rel in seen: + continue + seen.add(rel) + deduped_soft.append(rel) + return ReusePolicy( + hard_reuse_stage_dirs=hard_reuse, + soft_context_paths=tuple(deduped_soft), + rerun_from_stage_name=target_stage_name, + rerun_from_stage_number=target_stage_number, + ) + + +def _render_feedback_template( + *, + source_run_id: str, + target_stage_name: str, + reason: str, + feedback: list[str], +) -> str: + lines = [ + "# Research Repair Feedback", + "", + f"- Parent run: `{source_run_id}`", + f"- Target stage: `{target_stage_name}`", + f"- Reason: `{reason or 'Add the human repair reason here.'}`", + "", + "## Human Repair Request", + ] + if feedback: + for item in feedback: + item_text = str(item).strip() + if item_text: + lines.append(f"- {item_text}") + else: + lines.extend( + [ + "- State exactly what was insufficient in the completed run.", + "- Say what must be added: more data, more seeds, more conditions, or stronger protocol checks.", + "- If real local assets are required, say so explicitly.", + "- If the previous run should be considered invalid unless those changes happen, say that too.", + ] + ) + return "\n".join(lines).rstrip() + "\n" + + +def _render_repair_readme(session: ResearchRepairSession) -> str: + lines = [ + "# Research Repair Workspace", + "", + "This workspace is for run-level repair, not post-export paper cleanup.", + "", + f"- Source run: `{session.source_run_dir}`", + f"- Source run id: `{session.source_run_id}`", + f"- Target rollback stage: `{session.target_stage_name}`", + f"- Base config: `{session.base_config_path}`", + f"- Created at: `{session.created_at}`", + "", + "## Reuse Policy", + "- Hard reuse: parent stages before the target stage are copied directly into the 
child run.",
+        f"- Hard-reused stage dirs: `{', '.join(session.reuse_policy.hard_reuse_stage_dirs)}`",
+        "- Soft reuse: downstream analysis / decision / paper artifacts are copied into `workspace/context/` only as draft reference material.",
+        f"- Soft-context artifacts: `{', '.join(session.reuse_policy.soft_context_paths)}`",
+        "- Authoritative rerun boundary: all stages from the target stage onward must be regenerated from the new evidence.",
+        "",
+        "## What This Is For",
+        "- Human review says the completed run is not strong enough yet.",
+        "- Instead of only editing the exported paper, create a child run that goes back to the experiment stages.",
+        "- Typical reasons: not enough data, not enough seeds, wrong protocol, or real assets were not used.",
+        "",
+        "## Workspace Files",
+        "- `workspace/repair-config.yaml`: editable config seed for the child run.",
+        "- `workspace/feedback.md`: human repair instructions that will be preserved in child-run repair metadata and exposed as a compact repair brief.",
+        "- `workspace/context/`: copied reference artifacts from the parent run.",
+        "",
+        "## Workflow",
+        "1. Edit `workspace/feedback.md` and, if needed, `workspace/repair-config.yaml`.",
+        "2. Prepare a child run with:",
+        "   `python -m autoresearchclaw research-repair-run --repair-json <session-dir>/research-repair.json`",
+        "3.
Add `--execute` only when you explicitly want to launch the new upstream run.", + "", + "The prepared child run keeps a parent pointer via `research_repair_parent.json` so the repair lineage stays auditable.", + ] + return "\n".join(lines).rstrip() + "\n" + + +def _load_session(session_json_path: Path) -> ResearchRepairSession: + if not session_json_path.exists(): + raise ResearchRepairError(f"Research repair JSON not found: {session_json_path}") + data = loads(session_json_path.read_text(encoding="utf-8")) + if not isinstance(data, dict): + raise ResearchRepairError("Research repair JSON must decode to a mapping.") + return ResearchRepairSession.from_dict(data) + + +def _read_feedback(feedback_path: Path, extra_feedback: list[str] | tuple[str, ...]) -> str: + if not feedback_path.exists(): + raise ResearchRepairError(f"Feedback file not found: {feedback_path}") + feedback_text = feedback_path.read_text(encoding="utf-8").strip() + extras = [str(item).strip() for item in extra_feedback if str(item).strip()] + if extras: + feedback_text = feedback_text.rstrip() + "\n\n## CLI Additions\n" + "\n".join( + f"- {item}" for item in extras + ) + return feedback_text.strip() + + +def _load_yaml(path: Path) -> dict[str, Any]: + try: + data = yaml.safe_load(path.read_text(encoding="utf-8")) + except OSError as exc: + raise ResearchRepairError(f"Could not read YAML config: {path}") from exc + if not isinstance(data, dict): + raise ResearchRepairError(f"Config must decode to a mapping: {path}") + return data + + +def _write_generated_config( + config_data: dict[str, Any], + target_path: Path, + *, + session: ResearchRepairSession, + feedback_text: str, + child_run_dir: Path, +) -> None: + research = config_data.setdefault("research", {}) + if not isinstance(research, dict): + raise ResearchRepairError("Config field `research` must be a mapping.") + original_topic = str(research.get("topic", "")).strip() + research["topic"] = _build_repair_topic( + session=session, + 
original_topic=original_topic, + feedback_text=feedback_text, + ) + project = config_data.setdefault("project", {}) + if isinstance(project, dict): + project_name = str(project.get("name", "research-repair")).strip() or "research-repair" + if not project_name.endswith("-repair"): + project["name"] = f"{project_name}-repair" + _apply_repair_runtime_defaults( + config_data, + session=session, + generated_config_path=target_path, + child_run_dir=child_run_dir, + ) + + target_path.parent.mkdir(parents=True, exist_ok=True) + target_path.write_text( + yaml.safe_dump(config_data, sort_keys=False, allow_unicode=True), + encoding="utf-8", + ) + + +def _build_repair_topic( + *, + session: ResearchRepairSession, + original_topic: str, + feedback_text: str, +) -> str: + base_topic = _normalize_single_line(original_topic) + if not base_topic: + base_topic = ( + "Engineering-drawing circle localization with explicit rule evidence " + "and learned heatmaps." + ) + + repair_focus = _first_repair_focus_line(feedback_text) or _normalize_single_line( + session.repair_reason + ) + if repair_focus: + return ( + f"{base_topic}\n\n" + f"Repair focus: rerun from {session.target_stage_name} and strengthen " + f"{repair_focus}." + ).strip() + return ( + f"{base_topic}\n\n" + f"Repair focus: rerun from {session.target_stage_name} with stronger " + "real-data coverage, seeds, and experiment protocol." 
+ ).strip() + + +def _normalize_single_line(text: str) -> str: + normalized = " ".join(str(text).split()).strip() + return normalized + + +def _first_repair_focus_line(feedback_text: str) -> str: + for raw_line in feedback_text.splitlines(): + line = raw_line.strip() + if not line or line.startswith("#"): + continue + if not line.startswith("- "): + continue + payload = line[2:].strip() + lowered = payload.lower() + if lowered.startswith(("parent run:", "target stage:", "reason:")): + continue + return _normalize_single_line(payload) + return "" + + +def _apply_repair_runtime_defaults( + config_data: dict[str, Any], + *, + session: ResearchRepairSession, + generated_config_path: Path, + child_run_dir: Path, +) -> None: + llm = config_data.setdefault("llm", {}) + if isinstance(llm, dict): + acp = llm.setdefault("acp", {}) + if isinstance(acp, dict): + timestamp = generated_config_path.parent.name.strip() or "repair" + stage_slug = session.target_stage_name.lower().replace("_", "-") + acp["session_name"] = f"researchclaw-{stage_slug}-{timestamp}" + current_timeout = _safe_int(acp.get("timeout_sec"), 1800) + acp["timeout_sec"] = max(current_timeout, 3200) + current_retries = _safe_int(acp.get("reconnect_retries"), 2) + acp["reconnect_retries"] = max(current_retries, 6) + acp["reconnect_backoff_sec"] = 3.0 + acp["verbose"] = True + acp["capture_status_on_failure"] = True + acp["archive_failed_prompt_files"] = True + acp["debug_log_path"] = _to_wsl_path(child_run_dir / "acp_debug.jsonl") + if session.target_stage_number >= STAGE_NUMBER_BY_NAME["CODE_GENERATION"]: + acp["stateless_prompt"] = True + + experiment = config_data.setdefault("experiment", {}) + if isinstance(experiment, dict): + code_agent = experiment.setdefault("code_agent", {}) + if isinstance(code_agent, dict): + code_agent["architecture_planning"] = False + code_agent["review_max_rounds"] = 0 + if session.target_stage_number >= STAGE_NUMBER_BY_NAME["CODE_GENERATION"]: + 
code_agent["fallback_to_legacy_on_acp_failure"] = False + + +def _default_child_run_dir( + session: ResearchRepairSession, + *, + timestamp: str, +) -> Path: + suffix = f"{session.source_run_id}-repair-{timestamp}" + override = os.environ.get(REPAIR_RUN_ROOT_ENV_VAR, "").strip() + if override: + return Path(override).resolve() / suffix + + if sys.platform.startswith("win"): + detected_root = _detect_windows_wsl_run_root() + if detected_root is not None: + return detected_root / suffix + + return Path(session.source_run_dir).resolve().parent / suffix + + +def _detect_windows_wsl_run_root() -> Path | None: + if not sys.platform.startswith("win"): + return None + try: + probe = subprocess.run( + [ + "wsl", + "bash", + "-lc", + 'mkdir -p "$HOME/.autoresearchclaw/artifacts" && wslpath -w "$HOME/.autoresearchclaw/artifacts"', + ], + capture_output=True, + text=True, + encoding="utf-8", + errors="replace", + timeout=20, + check=False, + ) + except Exception: # noqa: BLE001 + return None + if probe.returncode != 0: + return None + output = (probe.stdout or "").strip().splitlines() + if not output: + return None + candidate = output[-1].strip() + if not candidate: + return None + return Path(candidate).resolve() + + +def _safe_int(value: Any, default: int) -> int: + if value is None: + return default + try: + return int(value) + except (TypeError, ValueError): + return default + + +def _build_inner_launch_command( + *, + upstream_root: Path, + generated_config_path: Path, + child_run_dir: Path, + stage_name: str, + auto_approve: bool, + skip_preflight: bool, +) -> str: + upstream_root_wsl = _to_wsl_path(upstream_root) + generated_config_wsl = _to_wsl_path(generated_config_path) + child_run_dir_wsl = _to_wsl_path(child_run_dir) + exe_wsl = _to_wsl_path(upstream_root / ".venv" / "bin" / "researchclaw") + command_parts = [f"cd { _sh_quote(upstream_root_wsl) }", 'export PATH="$HOME/bin:$PATH"'] + tmp_bin = upstream_root / ".tmp_bin" + if tmp_bin.exists(): + tmp_bin_wsl = 
_to_wsl_path(tmp_bin) + command_parts.append( + f"export PATH={_sh_quote(tmp_bin_wsl)}:\"$PATH\"" + ) + command_parts.append( + f"{_sh_quote(exe_wsl)} run --config {_sh_quote(generated_config_wsl)} --output {_sh_quote(child_run_dir_wsl)} --from-stage {stage_name}" + ) + if auto_approve: + command_parts[-1] += " --auto-approve" + if skip_preflight: + command_parts[-1] += " --skip-preflight" + inner = " && ".join(command_parts) + return inner + + +def _build_compact_repair_brief( + *, + session: ResearchRepairSession, + feedback_text: str, +) -> str: + feedback_lines: list[str] = [] + for raw_line in feedback_text.splitlines(): + line = raw_line.strip() + if not line.startswith("- "): + continue + payload = line[2:].strip() + lowered = payload.lower() + if lowered.startswith(("parent run:", "target stage:", "reason:")): + continue + feedback_lines.append(_normalize_single_line(payload)) + if len(feedback_lines) >= 5: + break + + lines = [ + "## Repair Context", + f"- Parent run: `{session.source_run_id}`", + f"- Authoritative rerun starts at: `{session.target_stage_name}`", + f"- Repair reason: {_normalize_single_line(session.repair_reason)}", + ] + if session.reuse_policy.hard_reuse_stage_dirs: + lines.append( + "- Hard reuse: " + + ", ".join(session.reuse_policy.hard_reuse_stage_dirs) + ) + lines.append( + "- Downstream parent analysis and paper artifacts are soft context only." 
+ ) + if feedback_lines: + lines.append("- Human requirements:") + lines.extend(f"- {item}" for item in feedback_lines) + return "\n".join(lines).strip() + + +def _wrap_launch_command_for_display(inner_command: str) -> str: + if sys.platform.startswith("win"): + return f"wsl bash -lc {_sh_quote(inner_command)}" + return f"bash -lc {_sh_quote(inner_command)}" + + +def _launch_command( + inner_command: str, + launch_log: Path, + *, + upstream_root: Path | None = None, +) -> int: + launch_env, forwarded_env = _build_launch_env(upstream_root=upstream_root) + launch_log.parent.mkdir(parents=True, exist_ok=True) + with launch_log.open("w", encoding="utf-8") as log_handle: + log_handle.write(f"$ {inner_command}\n\n") + if forwarded_env: + log_handle.write( + "# Forwarded to child process via WSLENV: " + + ", ".join(forwarded_env) + + "\n\n" + ) + log_handle.flush() + if sys.platform.startswith("win"): + process = subprocess.Popen( + ["wsl", "bash", "-lc", inner_command], + stdout=log_handle, + stderr=subprocess.STDOUT, + stdin=subprocess.DEVNULL, + env=launch_env, + ) + else: + process = subprocess.Popen( + ["bash", "-lc", inner_command], + stdout=log_handle, + stderr=subprocess.STDOUT, + stdin=subprocess.DEVNULL, + env=launch_env, + ) + return int(process.pid) + + +def _build_launch_env( + *, + upstream_root: Path | None = None, +) -> tuple[dict[str, str], tuple[str, ...]]: + env = dict(os.environ) + forwarded: list[str] = [] + for name in WSL_PASSTHROUGH_ENV_VARS: + value = env.get(name, "") + if value: + forwarded.append(name) + + asset_env = _discover_runtime_asset_env(upstream_root) + for name, value in asset_env.items(): + if value: + env[name] = value + forwarded.append(name) + + if not sys.platform.startswith("win"): + return env, tuple(_dedupe_preserve_order(forwarded)) + + forwarded = _dedupe_preserve_order(forwarded) + if not forwarded: + return env, () + + wslenv_entries = [item for item in env.get("WSLENV", "").split(":") if item] + existing_names = 
{item.split("/", 1)[0] for item in wslenv_entries} + for name in forwarded: + if name not in existing_names: + wslenv_entries.append(name) + env["WSLENV"] = ":".join(wslenv_entries) + return env, tuple(forwarded) + + +def _discover_runtime_asset_env(upstream_root: Path | None) -> dict[str, str]: + if upstream_root is None: + return {} + if sys.platform.startswith("win"): + return _discover_runtime_asset_env_via_wsl(upstream_root) + return {} + + +def _discover_runtime_asset_env_via_wsl(upstream_root: Path) -> dict[str, str]: + upstream_root_wsl = _to_wsl_path(upstream_root) + exe_wsl = _to_wsl_path(upstream_root / ".venv" / "bin" / "python") + script = r""" +from config import build_default_config +from pathlib import Path +import json + +cfg = build_default_config() +specs = cfg.build_dataset_specs() +payload = {"VECTRA_REPO_ROOT": str(Path.cwd())} + +simple = specs.get("engineering_primitives_simple_scenes_noslot_v1_local_20260326", {}) +if isinstance(simple, dict): + payload["VECTRA_SIMPLE_DATASET_ROOT"] = str(simple.get("dataset_root", "")) + payload["VECTRA_SIMPLE_ASSET_ROOT"] = str(simple.get("dataset_root", "")) + payload["VECTRA_SIMPLE_MANIFEST_PATH"] = str(simple.get("manifest_path", "")) + caches = simple.get("cache_roots", {}) + if isinstance(caches, dict): + payload["VECTRA_SIMPLE_HEATMAP_DIR"] = str(caches.get("learned", "")) + +page = specs.get("page_minus_titleblock", {}) +if isinstance(page, dict): + page_root = Path(str(page.get("dataset_root", ""))).expanduser() + payload["VECTRA_PAGE_DATASET_ROOT"] = str(page_root) + payload["VECTRA_PAGE_IMAGE_DIR"] = str(page_root / "train2017") + payload["VECTRA_PAGE_SIDECAR_DIR"] = str(page_root / "sidecars" / "train2017") + split_json = Path(str(page.get("split_manifest_path", ""))).expanduser() + payload["VECTRA_PAGE_SPLIT_JSON"] = str(split_json) + if str(split_json): + one_drive_png_root = split_json.parent.parent + payload["VECTRA_ONE_DRIVE_PNG_ROOT"] = str(one_drive_png_root) + 
payload["VECTRA_PAGE_GT_SOLID_CSV"] = str(split_json.parent / "gt" / "train2017_solid.csv") + payload["VECTRA_PAGE_GT_DASHED_CSV"] = str(split_json.parent / "gt" / "train2017_dashed.csv") + +probe = specs.get("DeepPatent2_negative_clutter_probe", {}) +if isinstance(probe, dict): + payload["VECTRA_DEEPPATENT_DATASET_ROOT"] = str(probe.get("dataset_root", "")) + +clean = {k: v for k, v in payload.items() if v and v != "."} +print(json.dumps(clean, ensure_ascii=False)) +""".strip() + + command = ( + f"cd {_sh_quote(upstream_root_wsl)} && " + f"{_sh_quote(exe_wsl)} - <<'PY'\n{script}\nPY" + ) + try: + probe = subprocess.run( + ["wsl", "bash", "-lc", command], + capture_output=True, + text=True, + encoding="utf-8", + errors="replace", + timeout=30, + check=False, + ) + except Exception: # noqa: BLE001 + return {} + if probe.returncode != 0: + return {} + lines = [line.strip() for line in (probe.stdout or "").splitlines() if line.strip()] + if not lines: + return {} + try: + payload = loads(lines[-1]) + except ValueError: + return {} + if not isinstance(payload, dict): + return {} + return { + str(key): str(value) + for key, value in payload.items() + if str(value).strip() + } + + +def _dedupe_preserve_order(values: list[str]) -> list[str]: + result: list[str] = [] + seen: set[str] = set() + for value in values: + if value in seen: + continue + seen.add(value) + result.append(value) + return result + + +def _copy_path(source_path: Path, target_path: Path, kind: str) -> None: + target_path.parent.mkdir(parents=True, exist_ok=True) + if kind == "directory": + copytree(source_path, target_path) + return + copy2(source_path, target_path) + + +def _copy_prerequisite_stage_dirs( + *, + source_run_dir: Path, + child_run_dir: Path, + target_stage_number: int, +) -> tuple[str, ...]: + inherited: list[str] = [] + for stage_number in range(1, target_stage_number): + stage_dir_name = f"stage-{stage_number:02d}" + source_stage_dir = source_run_dir / stage_dir_name + if not 
source_stage_dir.exists(): + continue + target_stage_dir = child_run_dir / stage_dir_name + if target_stage_dir.exists(): + continue + copytree(source_stage_dir, target_stage_dir) + inherited.append(stage_dir_name) + return tuple(inherited) + + +def _to_wsl_path(path: Path) -> str: + resolved = str(path.resolve()) + normalized = resolved.replace("/", "\\") + lowered = normalized.lower() + wsl_prefixes = ("\\\\wsl$\\", "\\\\wsl.localhost\\") + for prefix in wsl_prefixes: + if lowered.startswith(prefix): + parts = normalized.split("\\") + # UNC layout: \\wsl$\Distro\path\inside\wsl + if len(parts) >= 5: + remainder = "/".join(segment for segment in parts[4:] if segment) + return "/" + remainder if remainder else "/" + if ":" not in resolved: + return resolved.replace("\\", "/") + drive, rest = resolved.split(":", 1) + return f"/mnt/{drive.lower()}{rest.replace('\\', '/')}" + + +def _sh_quote(value: str) -> str: + return "'" + value.replace("'", "'\"'\"'") + "'" + + +def _utc_now() -> str: + return datetime.now(timezone.utc).isoformat() diff --git a/pyproject.toml b/pyproject.toml index d669a06d..b119c6d2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,9 +30,10 @@ dev = ["pytest>=7.0", "httpx>=0.24"] [project.scripts] researchclaw = "researchclaw.cli:main" +autoresearchclaw = "autoresearchclaw.cli:main" [tool.hatch.build.targets.wheel] -packages = ["researchclaw", "sibyl", "arc"] +packages = ["researchclaw", "sibyl", "arc", "autoresearchclaw"] [tool.hatch.build.targets.wheel.force-include] "researchclaw/templates/styles" = "researchclaw/templates/styles"