CKwin26 · CKwin26 · Mar 31, 2026
diff --git a/config.researchclaw.example.yaml b/config.researchclaw.example.yaml
@@ -50,6 +50,13 @@ llm:
   # primary_model: "MiniMax-M2.5"
   # fallback_models:
   #   - "MiniMax-M2.5-highspeed"
+  # acp:
+  #   reconnect_retries: 2
+  #   reconnect_backoff_sec: 2.0
+  #   verbose: true
+  #   capture_status_on_failure: true
+  #   debug_log_path: "artifacts/acp_debug.jsonl"
+  #   archive_failed_prompt_files: true
 
 security:
   hitl_required_stages: [5, 9, 20]
@@ -66,6 +73,22 @@ experiment:
   max_iterations: 10
   metric_key: "primary_metric"
   metric_direction: "minimize"
+  # Optional hard guards for trust-first experiment runs.
+  # When enabled, generated experiments must use real local assets/caches, fail fast
+  # if those assets are missing, and emit structured machine-readable results.
+  require_real_data: false
+  forbid_synthetic_proxy: false
+  fail_on_stdout_parsed_results: false
+  required_real_data_refs: []
+  benchmark_agent:
+    enabled: true
+    preserve_existing_assets: true
+    pass_existing_assets_as_reference: true
+  code_agent:
+    enabled: true
+    # When true, if the ACP transport drops during multi-round Stage 10 generation,
+    # fall back to the older one-shot generator instead of failing immediately.
+    fallback_to_legacy_on_acp_failure: false
   sandbox:
     # Use ".venv/Scripts/python.exe" on Windows
     python_path: ".venv/bin/python3"

diff --git a/researchclaw/config.py b/researchclaw/config.py
@@ -184,6 +184,13 @@ class AcpConfig:
     acpx_command: str = ""
     session_name: str = "researchclaw"
     timeout_sec: int = 1800
+    verbose: bool = False
+    stateless_prompt: bool = False
+    reconnect_retries: int = 2
+    reconnect_backoff_sec: float = 2.0
+    capture_status_on_failure: bool = False
+    debug_log_path: str = ""
+    archive_failed_prompt_files: bool = False
 
 
 @dataclass(frozen=True)
@@ -295,6 +302,7 @@ class CodeAgentConfig:
     """Configuration for the advanced multi-phase code generation agent."""
 
     enabled: bool = True
+    fallback_to_legacy_on_acp_failure: bool = False
     # Phase 1: Blueprint planning (deep implementation blueprint)
     architecture_planning: bool = True
     # Phase 2: Sequential file generation (one-by-one following blueprint)
@@ -347,6 +355,8 @@ class BenchmarkAgentConfig:
     min_benchmarks: int = 1
     min_baselines: int = 2
     prefer_cached: bool = True
+    preserve_existing_assets: bool = True
+    pass_existing_assets_as_reference: bool = True
     # Orchestrator
     max_iterations: int = 2
 
@@ -426,6 +436,10 @@ class ExperimentConfig:
     metric_key: str = "primary_metric"
     metric_direction: str = "minimize"
     keep_threshold: float = 0.0
+    require_real_data: bool = False
+    forbid_synthetic_proxy: bool = False
+    fail_on_stdout_parsed_results: bool = False
+    required_real_data_refs: tuple[str, ...] = ()
     sandbox: SandboxConfig = field(default_factory=SandboxConfig)
     docker: DockerSandboxConfig = field(default_factory=DockerSandboxConfig)
     agentic: AgenticConfig = field(default_factory=AgenticConfig)
@@ -972,6 +986,17 @@ def _parse_llm_config(data: dict[str, Any]) -> LlmConfig:
             acpx_command=acp_data.get("acpx_command", ""),
             session_name=acp_data.get("session_name", "researchclaw"),
             timeout_sec=int(acp_data.get("timeout_sec", 1800)),
+            verbose=bool(acp_data.get("verbose", False)),
+            stateless_prompt=bool(acp_data.get("stateless_prompt", False)),
+            reconnect_retries=_safe_int(acp_data.get("reconnect_retries"), 2),
+            reconnect_backoff_sec=_safe_float(acp_data.get("reconnect_backoff_sec"), 2.0),
+            capture_status_on_failure=bool(
+                acp_data.get("capture_status_on_failure", False)
+            ),
+            debug_log_path=str(acp_data.get("debug_log_path", "")),
+            archive_failed_prompt_files=bool(
+                acp_data.get("archive_failed_prompt_files", False)
+            ),
         ),
     )
 
@@ -1008,6 +1033,12 @@ def _parse_experiment_config(data: dict[str, Any]) -> ExperimentConfig:
         metric_key=data.get("metric_key", "primary_metric"),
         metric_direction=data.get("metric_direction", "minimize"),
         keep_threshold=_safe_float(data.get("keep_threshold"), 0.0),
+        require_real_data=bool(data.get("require_real_data", False)),
+        forbid_synthetic_proxy=bool(data.get("forbid_synthetic_proxy", False)),
+        fail_on_stdout_parsed_results=bool(
+            data.get("fail_on_stdout_parsed_results", False)
+        ),
+        required_real_data_refs=tuple(data.get("required_real_data_refs") or ()),
         sandbox=SandboxConfig(
             python_path=sandbox_data.get("python_path", DEFAULT_PYTHON_PATH),
             gpu_required=bool(sandbox_data.get("gpu_required", False)),
@@ -1086,6 +1117,10 @@ def _parse_benchmark_agent_config(data: dict[str, Any]) -> BenchmarkAgentConfig:
         min_benchmarks=_safe_int(data.get("min_benchmarks"), 1),
         min_baselines=_safe_int(data.get("min_baselines"), 2),
         prefer_cached=bool(data.get("prefer_cached", True)),
+        preserve_existing_assets=bool(data.get("preserve_existing_assets", True)),
+        pass_existing_assets_as_reference=bool(
+            data.get("pass_existing_assets_as_reference", True)
+        ),
         max_iterations=_safe_int(data.get("max_iterations"), 2),
     )
 
@@ -1142,6 +1177,9 @@ def _parse_code_agent_config(data: dict[str, Any]) -> CodeAgentConfig:
         return CodeAgentConfig()
     return CodeAgentConfig(
         enabled=bool(data.get("enabled", True)),
+        fallback_to_legacy_on_acp_failure=bool(
+            data.get("fallback_to_legacy_on_acp_failure", False)
+        ),
         architecture_planning=bool(data.get("architecture_planning", True)),
         sequential_generation=bool(data.get("sequential_generation", True)),
         hard_validation=bool(data.get("hard_validation", True)),

diff --git a/researchclaw/docker/entrypoint.sh b/researchclaw/docker/entrypoint.sh
@@ -11,7 +11,10 @@
 set -e
 
 WORKSPACE="/workspace"
-ENTRY_POINT="${1:-main.py}"
+ENTRY_POINT="${RC_ENTRY_POINT:-${1:-main.py}}"
+if [ "$#" -gt 0 ]; then
+    shift
+fi
 
 # ----------------------------------------------------------------
 # Phase 0: Install additional pip packages
@@ -51,4 +54,4 @@ fi
 # Phase 2: Run experiment
 # ----------------------------------------------------------------
 echo "[RC] Phase 2: Running experiment ($ENTRY_POINT)..."
-exec python3 -u "$WORKSPACE/$ENTRY_POINT"
+exec python3 -u "$WORKSPACE/$ENTRY_POINT" "$@"
diff --git a/researchclaw/experiment/colab_sandbox.py b/researchclaw/experiment/colab_sandbox.py
@@ -158,6 +158,8 @@ def run_project(
         *,
         entry_point: str = "main.py",
         timeout_sec: int = 300,
+        args: list[str] | None = None,
+        env_overrides: dict[str, str] | None = None,
     ) -> SandboxResult:
         # BUG-DA8-07: Validate entry_point (path traversal, etc.) like other backends
         from researchclaw.experiment.sandbox import validate_entry_point

diff --git a/researchclaw/experiment/docker_sandbox.py b/researchclaw/experiment/docker_sandbox.py
@@ -138,6 +138,8 @@ def run_project(
         *,
         entry_point: str = "main.py",
         timeout_sec: int = 300,
+        args: list[str] | None = None,
+        env_overrides: dict[str, str] | None = None,
     ) -> SandboxResult:
         """Run a multi-file experiment project inside a container."""
         self._run_counter += 1
@@ -189,7 +191,13 @@ def run_project(
                 metrics={},
             )
 
-        return self._execute(staging, entry_point=entry_point, timeout_sec=timeout_sec)
+        return self._execute(
+            staging,
+            entry_point=entry_point,
+            timeout_sec=timeout_sec,
+            entry_args=args,
+            env_overrides=env_overrides,
+        )
 
     # ------------------------------------------------------------------
     # Static helpers
@@ -254,7 +262,13 @@ def _inject_harness(target_dir: Path) -> None:
     # ------------------------------------------------------------------
 
     def _execute(
-        self, staging_dir: Path, *, entry_point: str, timeout_sec: int
+        self,
+        staging_dir: Path,
+        *,
+        entry_point: str,
+        timeout_sec: int,
+        entry_args: list[str] | None = None,
+        env_overrides: dict[str, str] | None = None,
     ) -> SandboxResult:
         """Core execution: single container, three-phase via entrypoint.sh."""
         cfg = self.config
@@ -269,6 +283,8 @@ def _execute(
             staging_dir,
             entry_point=entry_point,
             container_name=container_name,
+            entry_args=entry_args,
+            env_overrides=env_overrides,
         )
 
         start = time.monotonic()
@@ -349,6 +365,8 @@ def _build_run_command(
         *,
         entry_point: str,
         container_name: str,
+        entry_args: list[str] | None = None,
+        env_overrides: dict[str, str] | None = None,
     ) -> list[str]:
         """Build the ``docker run`` command list.
 
@@ -453,9 +471,17 @@ def _user_flag() -> list[str]:
             else:
                 cmd.extend(["--gpus", "all"])
 
+        if env_overrides:
+            for name, value in sorted(env_overrides.items()):
+                if not value:
+                    continue
+                cmd.extend(["-e", f"{name}={value}"])
+
         # Image + entry point (passed as CMD arg to entrypoint.sh)
         cmd.append(cfg.image)
         cmd.append(entry_point)
+        if entry_args:
+            cmd.extend(entry_args)
 
         return cmd
 

diff --git a/researchclaw/experiment/sandbox.py b/researchclaw/experiment/sandbox.py
@@ -297,6 +297,8 @@ def run_project(
         *,
         entry_point: str = "main.py",
         timeout_sec: int = 300,
+        args: list[str] | None = None,
+        env_overrides: dict[str, str] | None = None,
     ) -> SandboxResult: ...
 
 
@@ -350,6 +352,8 @@ def run_project(
         *,
         entry_point: str = "main.py",
         timeout_sec: int = 300,
+        args: list[str] | None = None,
+        env_overrides: dict[str, str] | None = None,
     ) -> SandboxResult:
         """Run a multi-file experiment project in the sandbox.
 
@@ -409,12 +413,14 @@ def run_project(
             )
 
         start = time.monotonic()
-        command = self._build_command(entry)
+        command = self._build_command(entry, args=args)
         logger.debug("Running project sandbox command: %s (cwd=%s)", command, sandbox_project)
 
         result: SandboxResult
         try:
             env = {**os.environ, "PYTHONUNBUFFERED": "1"}
+            if env_overrides:
+                env.update(env_overrides)
             completed = subprocess.run(
                 command,
                 capture_output=True,
@@ -457,7 +463,12 @@ def _next_script_path(self) -> Path:
     def _write_script(script_path: Path, code: str) -> None:
         _ = script_path.write_text(code, encoding="utf-8")
 
-    def _build_command(self, script_path: Path) -> list[str]:
+    def _build_command(
+        self,
+        script_path: Path,
+        *,
+        args: list[str] | None = None,
+    ) -> list[str]:
         # Convert relative python_path to absolute WITHOUT resolving symlinks.
         # Using .resolve() would follow venv symlinks to the system Python binary,
         # which loses the venv context (site-packages like numpy become unavailable).
@@ -466,7 +477,10 @@ def _build_command(self, script_path: Path) -> list[str]:
         if not python_path.is_absolute():
             python_path = Path.cwd() / python_path
         # -u: unbuffered stdout/stderr so subprocess.run captures all output
-        return [str(python_path), "-u", str(script_path)]
+        command = [str(python_path), "-u", str(script_path)]
+        if args:
+            command.extend(args)
+        return command
 
     @staticmethod
     def _result_from_completed(