From 44c7cc46a9edb7d47686d60613ee6ab8cd905638 Mon Sep 17 00:00:00 2001
From: CKwin26 <156837805+CKwin26@users.noreply.github.com>
Date: Mon, 30 Mar 2026 18:24:43 -0400
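Subject: [PATCH 1/2] Allow sandboxes to pass optional entrypoint args and env

Add optional `args` and `env_overrides` keyword parameters to the
sandbox `run_project()` methods so callers can pass command-line
arguments and extra environment variables to the experiment entry point.

- Local sandbox: appends `args` to the Python command and merges
  `env_overrides` into the subprocess environment.
- Docker sandbox: passes each override as `-e NAME=value` and appends
  `args` after the entry point; entrypoint.sh forwards the remaining
  arguments to the script and lets RC_ENTRY_POINT override the entry
  point.
- SSH sandbox: shell-quotes `args` and override values in both the bare
  and the Docker command builders.
- Colab sandbox: the signature accepts the new parameters; this patch
  does not add any forwarding for them.

Notes:
- The Docker and SSH backends skip overrides whose value is empty,
  while the local sandbox applies them as given.
- In the SSH bare command, variable names are emitted unquoted, so they
  must be valid shell identifiers.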
---
 researchclaw/docker/entrypoint.sh         |  7 ++-
 researchclaw/experiment/colab_sandbox.py  |  2 +
 researchclaw/experiment/docker_sandbox.py | 30 ++++++++++-
 researchclaw/experiment/sandbox.py        | 20 +++++--
 researchclaw/experiment/ssh_sandbox.py    | 64 ++++++++++++++++++++---
 tests/test_entry_point_validation.py      | 35 ++++++++++++-
 tests/test_rc_docker_sandbox.py           | 16 ++++++
 tests/test_ssh_and_colab_sandbox.py       | 30 +++++++++++
 8 files changed, 188 insertions(+), 16 deletions(-)

diff --git a/researchclaw/docker/entrypoint.sh b/researchclaw/docker/entrypoint.sh
index 316039c0..5104bd9c 100755
--- a/researchclaw/docker/entrypoint.sh
+++ b/researchclaw/docker/entrypoint.sh
@@ -11,7 +11,10 @@
 set -e
 
 WORKSPACE="/workspace"
-ENTRY_POINT="${1:-main.py}"
+ENTRY_POINT="${RC_ENTRY_POINT:-${1:-main.py}}"
+if [ "$#" -gt 0 ]; then
+    shift
+fi
 
 # ----------------------------------------------------------------
 # Phase 0: Install additional pip packages
@@ -51,4 +54,4 @@ fi
 # Phase 2: Run experiment
 # ----------------------------------------------------------------
 echo "[RC] Phase 2: Running experiment ($ENTRY_POINT)..."
-exec python3 -u "$WORKSPACE/$ENTRY_POINT"
+exec python3 -u "$WORKSPACE/$ENTRY_POINT" "$@"
diff --git a/researchclaw/experiment/colab_sandbox.py b/researchclaw/experiment/colab_sandbox.py
index b6a46542..eec6ad7e 100644
--- a/researchclaw/experiment/colab_sandbox.py
+++ b/researchclaw/experiment/colab_sandbox.py
@@ -158,6 +158,8 @@ def run_project(
         *,
         entry_point: str = "main.py",
         timeout_sec: int = 300,
+        args: list[str] | None = None,
+        env_overrides: dict[str, str] | None = None,
     ) -> SandboxResult:
         # BUG-DA8-07: Validate entry_point (path traversal, etc.) like other backends
         from researchclaw.experiment.sandbox import validate_entry_point
diff --git a/researchclaw/experiment/docker_sandbox.py b/researchclaw/experiment/docker_sandbox.py
index 3eda27c9..b45f21cd 100644
--- a/researchclaw/experiment/docker_sandbox.py
+++ b/researchclaw/experiment/docker_sandbox.py
@@ -138,6 +138,8 @@ def run_project(
         *,
         entry_point: str = "main.py",
         timeout_sec: int = 300,
+        args: list[str] | None = None,
+        env_overrides: dict[str, str] | None = None,
     ) -> SandboxResult:
         """Run a multi-file experiment project inside a container."""
         self._run_counter += 1
@@ -189,7 +191,13 @@ def run_project(
                 metrics={},
             )
 
-        return self._execute(staging, entry_point=entry_point, timeout_sec=timeout_sec)
+        return self._execute(
+            staging,
+            entry_point=entry_point,
+            timeout_sec=timeout_sec,
+            entry_args=args,
+            env_overrides=env_overrides,
+        )
 
     # ------------------------------------------------------------------
     # Static helpers
@@ -254,7 +262,13 @@ def _inject_harness(target_dir: Path) -> None:
     # ------------------------------------------------------------------
 
     def _execute(
-        self, staging_dir: Path, *, entry_point: str, timeout_sec: int
+        self,
+        staging_dir: Path,
+        *,
+        entry_point: str,
+        timeout_sec: int,
+        entry_args: list[str] | None = None,
+        env_overrides: dict[str, str] | None = None,
     ) -> SandboxResult:
         """Core execution: single container, three-phase via entrypoint.sh."""
         cfg = self.config
@@ -269,6 +283,8 @@ def _execute(
             staging_dir,
             entry_point=entry_point,
             container_name=container_name,
+            entry_args=entry_args,
+            env_overrides=env_overrides,
         )
 
         start = time.monotonic()
@@ -349,6 +365,8 @@ def _build_run_command(
         *,
         entry_point: str,
         container_name: str,
+        entry_args: list[str] | None = None,
+        env_overrides: dict[str, str] | None = None,
     ) -> list[str]:
         """Build the ``docker run`` command list.
 
@@ -453,9 +471,17 @@ def _user_flag() -> list[str]:
             else:
                 cmd.extend(["--gpus", "all"])
 
+        if env_overrides:
+            for name, value in sorted(env_overrides.items()):
+                if not value:
+                    continue
+                cmd.extend(["-e", f"{name}={value}"])
+
         # Image + entry point (passed as CMD arg to entrypoint.sh)
         cmd.append(cfg.image)
         cmd.append(entry_point)
+        if entry_args:
+            cmd.extend(entry_args)
 
         return cmd
 
diff --git a/researchclaw/experiment/sandbox.py b/researchclaw/experiment/sandbox.py
index d54e0fcf..6b66dc3c 100644
--- a/researchclaw/experiment/sandbox.py
+++ b/researchclaw/experiment/sandbox.py
@@ -297,6 +297,8 @@ def run_project(
         *,
         entry_point: str = "main.py",
         timeout_sec: int = 300,
+        args: list[str] | None = None,
+        env_overrides: dict[str, str] | None = None,
     ) -> SandboxResult: ...
 
 
@@ -350,6 +352,8 @@ def run_project(
         *,
         entry_point: str = "main.py",
         timeout_sec: int = 300,
+        args: list[str] | None = None,
+        env_overrides: dict[str, str] | None = None,
     ) -> SandboxResult:
         """Run a multi-file experiment project in the sandbox.
 
@@ -409,12 +413,14 @@ def run_project(
             )
 
         start = time.monotonic()
-        command = self._build_command(entry)
+        command = self._build_command(entry, args=args)
         logger.debug("Running project sandbox command: %s (cwd=%s)", command, sandbox_project)
 
         result: SandboxResult
         try:
             env = {**os.environ, "PYTHONUNBUFFERED": "1"}
+            if env_overrides:
+                env.update(env_overrides)
             completed = subprocess.run(
                 command,
                 capture_output=True,
@@ -457,7 +463,12 @@ def _next_script_path(self) -> Path:
     def _write_script(script_path: Path, code: str) -> None:
         _ = script_path.write_text(code, encoding="utf-8")
 
-    def _build_command(self, script_path: Path) -> list[str]:
+    def _build_command(
+        self,
+        script_path: Path,
+        *,
+        args: list[str] | None = None,
+    ) -> list[str]:
         # Convert relative python_path to absolute WITHOUT resolving symlinks.
         # Using .resolve() would follow venv symlinks to the system Python binary,
         # which loses the venv context (site-packages like numpy become unavailable).
@@ -466,7 +477,10 @@ def _build_command(self, script_path: Path) -> list[str]:
         if not python_path.is_absolute() and python != "python":
             python_path = Path.cwd() / python_path
         # -u: unbuffered stdout/stderr so subprocess.run captures all output
-        return [str(python_path), "-u", str(script_path)]
+        command = [str(python_path), "-u", str(script_path)]
+        if args:
+            command.extend(args)
+        return command
 
     @staticmethod
     def _result_from_completed(
diff --git a/researchclaw/experiment/ssh_sandbox.py b/researchclaw/experiment/ssh_sandbox.py
index aad97fca..ec5026da 100644
--- a/researchclaw/experiment/ssh_sandbox.py
+++ b/researchclaw/experiment/ssh_sandbox.py
@@ -71,6 +71,8 @@ def run_project(
         *,
         entry_point: str = "main.py",
         timeout_sec: int = 300,
+        args: list[str] | None = None,
+        env_overrides: dict[str, str] | None = None,
     ) -> SandboxResult:
         """Run a multi-file experiment project on the remote host."""
         self._run_counter += 1
@@ -119,7 +121,13 @@ def run_project(
                 metrics={},
             )
 
-        return self._execute(staging, entry_point=entry_point, timeout_sec=timeout_sec)
+        return self._execute(
+            staging,
+            entry_point=entry_point,
+            timeout_sec=timeout_sec,
+            entry_args=args,
+            env_overrides=env_overrides,
+        )
 
     # ------------------------------------------------------------------
     # Static helpers
@@ -158,7 +166,13 @@ def _inject_harness(target_dir: Path) -> None:
     # ------------------------------------------------------------------
 
     def _execute(
-        self, staging_dir: Path, *, entry_point: str, timeout_sec: int
+        self,
+        staging_dir: Path,
+        *,
+        entry_point: str,
+        timeout_sec: int,
+        entry_args: list[str] | None = None,
+        env_overrides: dict[str, str] | None = None,
     ) -> SandboxResult:
         """Core execution flow for remote experiments.
 
@@ -213,11 +227,17 @@ def _execute(
         # 4. Execute experiment
         if cfg.use_docker:
             exec_cmd = self._build_docker_exec_cmd(
-                remote_dir, entry_point=entry_point,
+                remote_dir,
+                entry_point=entry_point,
+                args=entry_args,
+                env_overrides=env_overrides,
             )
         else:
             exec_cmd = self._build_bare_exec_cmd(
-                remote_dir, entry_point=entry_point,
+                remote_dir,
+                entry_point=entry_point,
+                args=entry_args,
+                env_overrides=env_overrides,
             )
 
         start = time.monotonic()
@@ -242,13 +262,26 @@ def _execute(
         )
 
     def _build_bare_exec_cmd(
-        self, remote_dir: str, *, entry_point: str,
+        self,
+        remote_dir: str,
+        *,
+        entry_point: str,
+        args: list[str] | None = None,
+        env_overrides: dict[str, str] | None = None,
     ) -> str:
         """Build command to run Python directly on remote host (with basic sandboxing)."""
         cfg = self.config
         rd = shlex.quote(remote_dir)
         ep = shlex.quote(entry_point)
         py = shlex.quote(cfg.remote_python)
+        arg_text = " ".join(shlex.quote(arg) for arg in (args or []))
+        arg_suffix = f" {arg_text}" if arg_text else ""
+        env_parts = [
+            f"{name}={shlex.quote(value)}"
+            for name, value in sorted((env_overrides or {}).items())
+            if value
+        ]
+        env_prefix = (" ".join(env_parts) + " ") if env_parts else ""
 
         gpu_env = ""
         if cfg.gpu_ids:
@@ -264,17 +297,24 @@ def _build_bare_exec_cmd(
             f"if command -v unshare >/dev/null 2>&1; then "
             f"HOME={rd} "
             f"{gpu_env}"
-            f"unshare --net {py} -u {ep}; "
+            f"{env_prefix}"
+            f"unshare --net {py} -u {ep}{arg_suffix}; "
             f"else "
             f"echo 'WARNING: unshare not available, running without network isolation' >&2; "
             f"HOME={rd} "
             f"{gpu_env}"
-            f"{py} -u {ep}; "
+            f"{env_prefix}"
+            f"{py} -u {ep}{arg_suffix}; "
             f"fi"
         )
 
     def _build_docker_exec_cmd(
-        self, remote_dir: str, *, entry_point: str,
+        self,
+        remote_dir: str,
+        *,
+        entry_point: str,
+        args: list[str] | None = None,
+        env_overrides: dict[str, str] | None = None,
     ) -> str:
         """Build command to run inside a Docker container on the remote host.
 
@@ -307,8 +347,16 @@ def _build_docker_exec_cmd(
             # Try to pass all GPUs; fails gracefully if none available
             parts.extend(["--gpus", "all"])
 
+        if env_overrides:
+            for name, value in sorted(env_overrides.items()):
+                if not value:
+                    continue
+                parts.extend(["-e", shlex.quote(f"{name}={value}")])
+
         parts.append(shlex.quote(cfg.docker_image))
         parts.extend(["python3", "-u", shlex.quote(entry_point)])
+        if args:
+            parts.extend(shlex.quote(arg) for arg in args)
 
         return " ".join(parts)
 
diff --git a/tests/test_entry_point_validation.py b/tests/test_entry_point_validation.py
index ca51d3b4..6e9b117e 100644
--- a/tests/test_entry_point_validation.py
+++ b/tests/test_entry_point_validation.py
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import sys
 from pathlib import Path
 from unittest.mock import patch
 
@@ -99,7 +100,7 @@ class TestExperimentSandboxEntryPointValidation:
     def _make_sandbox(self, tmp_path: Path) -> ExperimentSandbox:
         from researchclaw.config import SandboxConfig
 
-        cfg = SandboxConfig()
+        cfg = SandboxConfig(python_path=sys.executable)
         return ExperimentSandbox(cfg, tmp_path / "work")
 
     def test_rejects_path_traversal(self, tmp_path: Path) -> None:
@@ -140,3 +141,35 @@ def test_rejects_absolute_path(self, tmp_path: Path) -> None:
     # for future copy mechanism changes; see
     # TestValidateEntryPointResolved.test_symlink_escape_rejected for
     # the unit-level proof that the function catches symlink escapes.
+
+    def test_run_project_passes_args_and_env_overrides(self, tmp_path: Path) -> None:
+        project = tmp_path / "proj"
+        project.mkdir()
+        (project / "main.py").write_text(
+            "\n".join(
+                [
+                    "from __future__ import annotations",
+                    "import argparse",
+                    "import os",
+                    "",
+                    "parser = argparse.ArgumentParser()",
+                    "parser.add_argument('--value', required=True)",
+                    "args = parser.parse_args()",
+                    "if os.environ.get('RC_TEST_FLAG') != 'ok':",
+                    "    raise SystemExit('missing env override')",
+                    "print(f'metric: {float(args.value):.1f}')",
+                ]
+            ),
+            encoding="utf-8",
+        )
+
+        sandbox = self._make_sandbox(tmp_path)
+        result = sandbox.run_project(
+            project,
+            args=["--value", "1.0"],
+            env_overrides={"RC_TEST_FLAG": "ok"},
+            timeout_sec=10,
+        )
+
+        assert result.returncode == 0
+        assert result.metrics.get("metric") == 1.0
diff --git a/tests/test_rc_docker_sandbox.py b/tests/test_rc_docker_sandbox.py
index fc177988..3ac05dd7 100644
--- a/tests/test_rc_docker_sandbox.py
+++ b/tests/test_rc_docker_sandbox.py
@@ -118,6 +118,22 @@ def test_build_run_command_specific_gpus(tmp_path: Path):
     assert "0,2" in cmd[gpu_idx + 1]
 
 
+def test_build_run_command_forwards_entry_args_and_env(tmp_path: Path):
+    cfg = DockerSandboxConfig(network_policy="none")
+    sandbox = DockerSandbox(cfg, tmp_path / "work")
+    cmd = sandbox._build_run_command(
+        tmp_path / "staging",
+        entry_point="main.py",
+        container_name="rc-test-args",
+        entry_args=["--foo", "bar"],
+        env_overrides={"B_ENV": "2", "A_ENV": "1"},
+    )
+    env_values = [cmd[i + 1] for i, token in enumerate(cmd) if token == "-e"]
+    assert "A_ENV=1" in env_values
+    assert "B_ENV=2" in env_values
+    assert cmd[-3:] == ["main.py", "--foo", "bar"]
+
+
 # ── Harness injection ─────────────────────────────────────────────────
 
 
diff --git a/tests/test_ssh_and_colab_sandbox.py b/tests/test_ssh_and_colab_sandbox.py
index d3436888..21d7c8ee 100644
--- a/tests/test_ssh_and_colab_sandbox.py
+++ b/tests/test_ssh_and_colab_sandbox.py
@@ -104,6 +104,19 @@ def test_bare_exec_no_gpu(self, tmp_path: Path):
         cmd = sb._build_bare_exec_cmd("/tmp/rc-test", entry_point="main.py")
         assert "CUDA_VISIBLE_DEVICES" not in cmd
 
+    def test_bare_exec_cmd_forwards_args_and_env(self, tmp_path: Path):
+        cfg = SshRemoteConfig(host="server", user="test", remote_python="python3")
+        sb = SshRemoteSandbox(cfg, tmp_path)
+        cmd = sb._build_bare_exec_cmd(
+            "/tmp/rc-test",
+            entry_point="main.py",
+            args=["--foo", "bar baz"],
+            env_overrides={"A_ENV": "1", "B_ENV": "two words"},
+        )
+        assert "A_ENV=1" in cmd
+        assert "B_ENV='two words'" in cmd
+        assert "python3 -u main.py --foo 'bar baz'" in cmd
+
     def test_docker_exec_cmd(self, tmp_path: Path):
         cfg = SshRemoteConfig(
             host="server", user="test",
@@ -125,6 +138,23 @@ def test_docker_exec_cmd(self, tmp_path: Path):
         assert "myimage:latest" in cmd
         assert cmd.endswith("main.py")
 
+    def test_docker_exec_cmd_forwards_args_and_env(self, tmp_path: Path):
+        cfg = SshRemoteConfig(
+            host="server",
+            user="test",
+            use_docker=True,
+            docker_image="myimage:latest",
+        )
+        sb = SshRemoteSandbox(cfg, tmp_path)
+        cmd = sb._build_docker_exec_cmd(
+            "/tmp/rc-test",
+            entry_point="main.py",
+            args=["--foo", "bar"],
+            env_overrides={"A_ENV": "1"},
+        )
+        assert "-e A_ENV=1" in cmd
+        assert cmd.endswith("main.py --foo bar")
+
     def test_docker_exec_full_network(self, tmp_path: Path):
         cfg = SshRemoteConfig(
             host="server", use_docker=True,

From 0326b82b752f4ad0f421f120c31ade2b651494d8 Mon Sep 17 00:00:00 2001
From: CKwin26 <156837805+CKwin26@users.noreply.github.com>
Date: Tue, 31 Mar 2026 01:58:29 -0400
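Subject: [PATCH 2/2] Fix cross-platform absolute entry point checks

validate_entry_point() now rejects an entry point that is absolute under
either POSIX or Windows path rules, in addition to the host platform's
own rules, by also checking PurePosixPath and PureWindowsPath.

The '..' component check still uses the host platform's path parsing.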
---
 researchclaw/experiment/sandbox.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/researchclaw/experiment/sandbox.py b/researchclaw/experiment/sandbox.py
index 6b66dc3c..7bb5edd3 100644
--- a/researchclaw/experiment/sandbox.py
+++ b/researchclaw/experiment/sandbox.py
@@ -9,7 +9,7 @@
 import subprocess
 import time
 from dataclasses import dataclass
-from pathlib import Path
+from pathlib import Path, PurePosixPath, PureWindowsPath
 from typing import Protocol
 
 from researchclaw.config import SandboxConfig
@@ -27,7 +27,9 @@ def validate_entry_point(entry_point: str) -> str | None:
     if not entry_point or not entry_point.strip():
         return "Entry point is empty"
     ep = Path(entry_point)
-    if ep.is_absolute():
+    posix_ep = PurePosixPath(entry_point)
+    windows_ep = PureWindowsPath(entry_point)
+    if ep.is_absolute() or posix_ep.is_absolute() or windows_ep.is_absolute():
         return f"Entry point must be a relative path, got: {entry_point}"
     if ".." in ep.parts:
         return f"Entry point must not contain '..': {entry_point}"