Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions config.researchclaw.example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,13 @@ llm:
# primary_model: "MiniMax-M2.5"
# fallback_models:
# - "MiniMax-M2.5-highspeed"
# acp:
# reconnect_retries: 2
# reconnect_backoff_sec: 2.0
# verbose: true
# capture_status_on_failure: true
# debug_log_path: "artifacts/acp_debug.jsonl"
# archive_failed_prompt_files: true

security:
hitl_required_stages: [5, 9, 20]
Expand All @@ -66,6 +73,22 @@ experiment:
max_iterations: 10
metric_key: "primary_metric"
metric_direction: "minimize"
# Optional hard guards for trust-first experiment runs.
# When enabled, generated experiments must use real local assets/caches, fail fast
# if those assets are missing, and emit structured machine-readable results.
require_real_data: false
forbid_synthetic_proxy: false
fail_on_stdout_parsed_results: false
required_real_data_refs: []
benchmark_agent:
enabled: true
preserve_existing_assets: true
pass_existing_assets_as_reference: true
code_agent:
enabled: true
# If the ACP transport drops during multi-round Stage 10 generation,
# fall back to the older one-shot generator instead of failing immediately.
fallback_to_legacy_on_acp_failure: false
sandbox:
# Use ".venv/Scripts/python.exe" on Windows
python_path: ".venv/bin/python3"
Expand Down
38 changes: 38 additions & 0 deletions researchclaw/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,13 @@ class AcpConfig:
acpx_command: str = ""
session_name: str = "researchclaw"
timeout_sec: int = 1800
verbose: bool = False
stateless_prompt: bool = False
reconnect_retries: int = 2
reconnect_backoff_sec: float = 2.0
capture_status_on_failure: bool = False
debug_log_path: str = ""
archive_failed_prompt_files: bool = False


@dataclass(frozen=True)
Expand Down Expand Up @@ -295,6 +302,7 @@ class CodeAgentConfig:
"""Configuration for the advanced multi-phase code generation agent."""

enabled: bool = True
fallback_to_legacy_on_acp_failure: bool = False
# Phase 1: Blueprint planning (deep implementation blueprint)
architecture_planning: bool = True
# Phase 2: Sequential file generation (one-by-one following blueprint)
Expand Down Expand Up @@ -347,6 +355,8 @@ class BenchmarkAgentConfig:
min_benchmarks: int = 1
min_baselines: int = 2
prefer_cached: bool = True
preserve_existing_assets: bool = True
pass_existing_assets_as_reference: bool = True
# Orchestrator
max_iterations: int = 2

Expand Down Expand Up @@ -426,6 +436,10 @@ class ExperimentConfig:
metric_key: str = "primary_metric"
metric_direction: str = "minimize"
keep_threshold: float = 0.0
require_real_data: bool = False
forbid_synthetic_proxy: bool = False
fail_on_stdout_parsed_results: bool = False
required_real_data_refs: tuple[str, ...] = ()
sandbox: SandboxConfig = field(default_factory=SandboxConfig)
docker: DockerSandboxConfig = field(default_factory=DockerSandboxConfig)
agentic: AgenticConfig = field(default_factory=AgenticConfig)
Expand Down Expand Up @@ -972,6 +986,17 @@ def _parse_llm_config(data: dict[str, Any]) -> LlmConfig:
acpx_command=acp_data.get("acpx_command", ""),
session_name=acp_data.get("session_name", "researchclaw"),
timeout_sec=int(acp_data.get("timeout_sec", 1800)),
verbose=bool(acp_data.get("verbose", False)),
stateless_prompt=bool(acp_data.get("stateless_prompt", False)),
reconnect_retries=_safe_int(acp_data.get("reconnect_retries"), 2),
reconnect_backoff_sec=_safe_float(acp_data.get("reconnect_backoff_sec"), 2.0),
capture_status_on_failure=bool(
acp_data.get("capture_status_on_failure", False)
),
debug_log_path=str(acp_data.get("debug_log_path", "")),
archive_failed_prompt_files=bool(
acp_data.get("archive_failed_prompt_files", False)
),
),
)

Expand Down Expand Up @@ -1008,6 +1033,12 @@ def _parse_experiment_config(data: dict[str, Any]) -> ExperimentConfig:
metric_key=data.get("metric_key", "primary_metric"),
metric_direction=data.get("metric_direction", "minimize"),
keep_threshold=_safe_float(data.get("keep_threshold"), 0.0),
require_real_data=bool(data.get("require_real_data", False)),
forbid_synthetic_proxy=bool(data.get("forbid_synthetic_proxy", False)),
fail_on_stdout_parsed_results=bool(
data.get("fail_on_stdout_parsed_results", False)
),
required_real_data_refs=tuple(data.get("required_real_data_refs") or ()),
sandbox=SandboxConfig(
python_path=sandbox_data.get("python_path", DEFAULT_PYTHON_PATH),
gpu_required=bool(sandbox_data.get("gpu_required", False)),
Expand Down Expand Up @@ -1086,6 +1117,10 @@ def _parse_benchmark_agent_config(data: dict[str, Any]) -> BenchmarkAgentConfig:
min_benchmarks=_safe_int(data.get("min_benchmarks"), 1),
min_baselines=_safe_int(data.get("min_baselines"), 2),
prefer_cached=bool(data.get("prefer_cached", True)),
preserve_existing_assets=bool(data.get("preserve_existing_assets", True)),
pass_existing_assets_as_reference=bool(
data.get("pass_existing_assets_as_reference", True)
),
max_iterations=_safe_int(data.get("max_iterations"), 2),
)

Expand Down Expand Up @@ -1142,6 +1177,9 @@ def _parse_code_agent_config(data: dict[str, Any]) -> CodeAgentConfig:
return CodeAgentConfig()
return CodeAgentConfig(
enabled=bool(data.get("enabled", True)),
fallback_to_legacy_on_acp_failure=bool(
data.get("fallback_to_legacy_on_acp_failure", False)
),
architecture_planning=bool(data.get("architecture_planning", True)),
sequential_generation=bool(data.get("sequential_generation", True)),
hard_validation=bool(data.get("hard_validation", True)),
Expand Down
7 changes: 5 additions & 2 deletions researchclaw/docker/entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,10 @@
set -e

WORKSPACE="/workspace"
ENTRY_POINT="${1:-main.py}"
ENTRY_POINT="${RC_ENTRY_POINT:-${1:-main.py}}"
if [ "$#" -gt 0 ]; then
shift
fi

# ----------------------------------------------------------------
# Phase 0: Install additional pip packages
Expand Down Expand Up @@ -51,4 +54,4 @@ fi
# Phase 2: Run experiment
# ----------------------------------------------------------------
echo "[RC] Phase 2: Running experiment ($ENTRY_POINT)..."
exec python3 -u "$WORKSPACE/$ENTRY_POINT"
exec python3 -u "$WORKSPACE/$ENTRY_POINT" "$@"
2 changes: 2 additions & 0 deletions researchclaw/experiment/colab_sandbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,8 @@ def run_project(
*,
entry_point: str = "main.py",
timeout_sec: int = 300,
args: list[str] | None = None,
env_overrides: dict[str, str] | None = None,
) -> SandboxResult:
# BUG-DA8-07: Validate entry_point (path traversal, etc.) like other backends
from researchclaw.experiment.sandbox import validate_entry_point
Expand Down
30 changes: 28 additions & 2 deletions researchclaw/experiment/docker_sandbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,8 @@ def run_project(
*,
entry_point: str = "main.py",
timeout_sec: int = 300,
args: list[str] | None = None,
env_overrides: dict[str, str] | None = None,
) -> SandboxResult:
"""Run a multi-file experiment project inside a container."""
self._run_counter += 1
Expand Down Expand Up @@ -189,7 +191,13 @@ def run_project(
metrics={},
)

return self._execute(staging, entry_point=entry_point, timeout_sec=timeout_sec)
return self._execute(
staging,
entry_point=entry_point,
timeout_sec=timeout_sec,
entry_args=args,
env_overrides=env_overrides,
)

# ------------------------------------------------------------------
# Static helpers
Expand Down Expand Up @@ -254,7 +262,13 @@ def _inject_harness(target_dir: Path) -> None:
# ------------------------------------------------------------------

def _execute(
self, staging_dir: Path, *, entry_point: str, timeout_sec: int
self,
staging_dir: Path,
*,
entry_point: str,
timeout_sec: int,
entry_args: list[str] | None = None,
env_overrides: dict[str, str] | None = None,
) -> SandboxResult:
"""Core execution: single container, three-phase via entrypoint.sh."""
cfg = self.config
Expand All @@ -269,6 +283,8 @@ def _execute(
staging_dir,
entry_point=entry_point,
container_name=container_name,
entry_args=entry_args,
env_overrides=env_overrides,
)

start = time.monotonic()
Expand Down Expand Up @@ -349,6 +365,8 @@ def _build_run_command(
*,
entry_point: str,
container_name: str,
entry_args: list[str] | None = None,
env_overrides: dict[str, str] | None = None,
) -> list[str]:
"""Build the ``docker run`` command list.

Expand Down Expand Up @@ -453,9 +471,17 @@ def _user_flag() -> list[str]:
else:
cmd.extend(["--gpus", "all"])

if env_overrides:
for name, value in sorted(env_overrides.items()):
if not value:
continue
cmd.extend(["-e", f"{name}={value}"])

# Image + entry point, then any extra entry args (all passed as CMD args to entrypoint.sh)
cmd.append(cfg.image)
cmd.append(entry_point)
if entry_args:
cmd.extend(entry_args)

return cmd

Expand Down
20 changes: 17 additions & 3 deletions researchclaw/experiment/sandbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,8 @@ def run_project(
*,
entry_point: str = "main.py",
timeout_sec: int = 300,
args: list[str] | None = None,
env_overrides: dict[str, str] | None = None,
) -> SandboxResult: ...


Expand Down Expand Up @@ -350,6 +352,8 @@ def run_project(
*,
entry_point: str = "main.py",
timeout_sec: int = 300,
args: list[str] | None = None,
env_overrides: dict[str, str] | None = None,
) -> SandboxResult:
"""Run a multi-file experiment project in the sandbox.

Expand Down Expand Up @@ -409,12 +413,14 @@ def run_project(
)

start = time.monotonic()
command = self._build_command(entry)
command = self._build_command(entry, args=args)
logger.debug("Running project sandbox command: %s (cwd=%s)", command, sandbox_project)

result: SandboxResult
try:
env = {**os.environ, "PYTHONUNBUFFERED": "1"}
if env_overrides:
env.update(env_overrides)
completed = subprocess.run(
command,
capture_output=True,
Expand Down Expand Up @@ -457,7 +463,12 @@ def _next_script_path(self) -> Path:
def _write_script(script_path: Path, code: str) -> None:
_ = script_path.write_text(code, encoding="utf-8")

def _build_command(self, script_path: Path) -> list[str]:
def _build_command(
self,
script_path: Path,
*,
args: list[str] | None = None,
) -> list[str]:
# Convert relative python_path to absolute WITHOUT resolving symlinks.
# Using .resolve() would follow venv symlinks to the system Python binary,
# which loses the venv context (site-packages like numpy become unavailable).
Expand All @@ -466,7 +477,10 @@ def _build_command(self, script_path: Path) -> list[str]:
if not python_path.is_absolute():
python_path = Path.cwd() / python_path
# -u: unbuffered stdout/stderr so subprocess.run captures all output
return [str(python_path), "-u", str(script_path)]
command = [str(python_path), "-u", str(script_path)]
if args:
command.extend(args)
return command

@staticmethod
def _result_from_completed(
Expand Down
Loading