From c993c112098fe9f6bd03ec2ec6e94063aac3dc7c Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 27 Jan 2026 18:10:46 +0000
Subject: [PATCH 1/3] refactor(commit0): consolidate hyperparameters in
 constants.py

This commit creates a single source of truth for all Commit0 benchmark
hyperparameters and constant values by introducing constants.py.

Changes:
- Create benchmarks/commit0/constants.py with all constant values:
  - Dataset configuration (DEFAULT_DATASET, DEFAULT_DATASET_SPLIT, DEFAULT_REPO_SPLIT)
  - Docker image configuration (DEFAULT_DOCKER_IMAGE_PREFIX, DEFAULT_IMAGE_TAG, CUSTOM_TAG_PREFIX)
  - Build configuration (BUILD_TARGET)
  - Workspace configuration (WORKSPACE_DIR)
  - Git configuration (GIT_BRANCH_NAME, AGENT_BRANCH_NAME)
  - Model configuration (DEFAULT_MODEL_NAME)
  - Runtime configuration (DEFAULT_RUNTIME_API_URL, DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT,
    DEFAULT_CONVERSATION_TIMEOUT, DEFAULT_COMMAND_TIMEOUT)
  - Evaluation configuration (TOTAL_INSTANCES)

- Update build_images.py to import constants from constants.py
- Update run_infer.py to import constants from constants.py
- Update eval_infer.py to import constants from constants.py
- Add comprehensive tests for constants in test_constants.py

Fixes #365

Co-authored-by: openhands <openhands@all-hands.dev>
---
 benchmarks/commit0/build_images.py         |  16 ++-
 benchmarks/commit0/constants.py            |  38 +++++
 benchmarks/commit0/eval_infer.py           |   9 +-
 benchmarks/commit0/run_infer.py            |  94 +++++++-----
 benchmarks/commit0/tests/test_constants.py | 160 +++++++++++++++++++++
 5 files changed, 273 insertions(+), 44 deletions(-)
 create mode 100644 benchmarks/commit0/constants.py
 create mode 100644 benchmarks/commit0/tests/test_constants.py

diff --git a/benchmarks/commit0/build_images.py b/benchmarks/commit0/build_images.py
index b59704ea..3f784b9c 100644
--- a/benchmarks/commit0/build_images.py
+++ b/benchmarks/commit0/build_images.py
@@ -13,6 +13,13 @@
 
 from commit0.harness.constants import SPLIT
 
+from benchmarks.commit0.constants import (
+    CUSTOM_TAG_PREFIX,
+    DEFAULT_DATASET,
+    DEFAULT_DOCKER_IMAGE_PREFIX,
+    DEFAULT_IMAGE_TAG,
+    DEFAULT_REPO_SPLIT,
+)
 from benchmarks.utils.build_utils import (
     build_all_images,
     default_build_output_dir,
@@ -22,7 +29,6 @@
 
 
 logger = get_logger(__name__)
-DEFAULT_DOCKER_IMAGE_PREFIX = "docker.io/wentingzhao/"
 
 
 def get_base_docker_image(
@@ -33,14 +39,14 @@ def get_base_docker_image(
     prefix = docker_image_prefix or os.getenv(
         "EVAL_DOCKER_IMAGE_PREFIX", DEFAULT_DOCKER_IMAGE_PREFIX
     )
-    return (prefix.rstrip("/") + "/" + repo_name).lower() + ":v0"
+    return (prefix.rstrip("/") + "/" + repo_name).lower() + f":{DEFAULT_IMAGE_TAG}"
 
 
 def extract_custom_tag(base_image: str) -> str:
     """Extract Commit0 custom tag from a base image name."""
     repo_tag = base_image.rsplit("/", 1)[-1]
     repo_name = repo_tag.split(":", 1)[0].lower()
-    return f"commit0-{repo_name}"
+    return f"{CUSTOM_TAG_PREFIX}{repo_name}"
 
 
 def _load_selected_instances(selected_instances_file: str) -> list[str]:
@@ -90,7 +96,7 @@ def main(argv: list[str]) -> int:
     parser.add_argument(
         "--repo-split",
         type=str,
-        default="lite",
+        default=DEFAULT_REPO_SPLIT,
         help="Commit0 repo split (lite, all, or repo name)",
     )
     parser.add_argument(
@@ -99,7 +105,7 @@ def main(argv: list[str]) -> int:
         default="",
         help="Override base image prefix (default: env EVAL_DOCKER_IMAGE_PREFIX)",
     )
-    parser.set_defaults(dataset="wentingzhao/commit0_combined")
+    parser.set_defaults(dataset=DEFAULT_DATASET)
     args = parser.parse_args(argv)
 
     docker_image_prefix = args.docker_image_prefix or None
diff --git a/benchmarks/commit0/constants.py b/benchmarks/commit0/constants.py
new file mode 100644
index 00000000..e75c76b5
--- /dev/null
+++ b/benchmarks/commit0/constants.py
@@ -0,0 +1,38 @@
+"""
+Commit0 Benchmark Constants
+
+This module serves as the single source of truth for all hyperparameters
+and constant values used in the Commit0 benchmark evaluation workflow.
+"""
+
+# Dataset configuration
+DEFAULT_DATASET = "wentingzhao/commit0_combined"
+DEFAULT_DATASET_SPLIT = "test"
+DEFAULT_REPO_SPLIT = "lite"
+
+# Docker image configuration
+DEFAULT_DOCKER_IMAGE_PREFIX = "docker.io/wentingzhao/"
+DEFAULT_IMAGE_TAG = "v0"
+CUSTOM_TAG_PREFIX = "commit0-"
+
+# Build configuration
+BUILD_TARGET = "source-minimal"
+
+# Workspace configuration
+WORKSPACE_DIR = "/workspace"
+
+# Git configuration
+GIT_BRANCH_NAME = "commit0_combined"
+AGENT_BRANCH_NAME = "openhands"
+
+# Model configuration
+DEFAULT_MODEL_NAME = "openhands"
+
+# Runtime configuration
+DEFAULT_RUNTIME_API_URL = "https://runtime.eval.all-hands.dev"
+DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT = 600
+DEFAULT_CONVERSATION_TIMEOUT = 3600
+DEFAULT_COMMAND_TIMEOUT = 600
+
+# Evaluation configuration
+TOTAL_INSTANCES = 16
diff --git a/benchmarks/commit0/eval_infer.py b/benchmarks/commit0/eval_infer.py
index f03e73f6..fcc7885f 100644
--- a/benchmarks/commit0/eval_infer.py
+++ b/benchmarks/commit0/eval_infer.py
@@ -15,6 +15,7 @@
 import sys
 from pathlib import Path
 
+from benchmarks.commit0.constants import DEFAULT_MODEL_NAME, TOTAL_INSTANCES
 from benchmarks.utils.laminar import LaminarService
 from benchmarks.utils.report_costs import generate_cost_report
 
@@ -27,7 +28,7 @@
 
 
 def process_commit0_results(
-    input_file: str, output_file: str, model_name: str = "openhands"
+    input_file: str, output_file: str, model_name: str = DEFAULT_MODEL_NAME
 ) -> None:
     """
     Process Commit0 output.jsonl and generate evaluation report.
@@ -123,7 +124,7 @@ def process_commit0_results(
     # Generate report
     report = {
         "model_name_or_path": model_name,
-        "total_instances": 16,  # Fixed as per requirement
+        "total_instances": TOTAL_INSTANCES,
         "submitted_instances": len(completed_ids),
         "completed_instances": len(completed_ids),
         "resolved_instances": len(resolved_ids),
@@ -174,8 +175,8 @@ def main() -> None:
 
     parser.add_argument(
         "--model-name",
-        default="openhands",
-        help="Model name to use in the model_name_or_path field (default: openhands)",
+        default=DEFAULT_MODEL_NAME,
+        help=f"Model name to use in the model_name_or_path field (default: {DEFAULT_MODEL_NAME})",
     )
 
     args = parser.parse_args()
diff --git a/benchmarks/commit0/run_infer.py b/benchmarks/commit0/run_infer.py
index 2e473669..c0fb83e5 100644
--- a/benchmarks/commit0/run_infer.py
+++ b/benchmarks/commit0/run_infer.py
@@ -12,6 +12,19 @@
     extract_custom_tag,
     get_base_docker_image,
 )
+from benchmarks.commit0.constants import (
+    AGENT_BRANCH_NAME,
+    BUILD_TARGET,
+    DEFAULT_COMMAND_TIMEOUT,
+    DEFAULT_CONVERSATION_TIMEOUT,
+    DEFAULT_DATASET,
+    DEFAULT_DATASET_SPLIT,
+    DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT,
+    DEFAULT_REPO_SPLIT,
+    DEFAULT_RUNTIME_API_URL,
+    GIT_BRANCH_NAME,
+    WORKSPACE_DIR,
+)
 from benchmarks.utils.args_parser import get_parser
 from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
 from benchmarks.utils.conversation import build_event_persistence_callback
@@ -110,9 +123,9 @@ def __init__(
         self,
         metadata: EvalMetadata,
         num_workers: int = 1,
-        repo_split: str = "lite",
-        dataset_name: str = "wentingzhao/commit0_combined",
-        dataset_split: str = "test",
+        repo_split: str = DEFAULT_REPO_SPLIT,
+        dataset_name: str = DEFAULT_DATASET,
+        dataset_split: str = DEFAULT_DATASET_SPLIT,
     ):
         super().__init__(metadata=metadata, num_workers=num_workers)
         # Store additional parameters in metadata.details for access in methods
@@ -130,9 +143,9 @@ def prepare_instances(self) -> List[EvalInstance]:
         logger.info("Setting up Commit0 evaluation data")
 
         details = self.metadata.details or {}
-        dataset_name = details.get("dataset_name", "wentingzhao/commit0_combined")
-        dataset_split = details.get("dataset_split", "test")
-        repo_split = details.get("repo_split", "lite")
+        dataset_name = details.get("dataset_name", DEFAULT_DATASET)
+        dataset_split = details.get("dataset_split", DEFAULT_DATASET_SPLIT)
+        repo_split = details.get("repo_split", DEFAULT_REPO_SPLIT)
 
         dataset = load_dataset(dataset_name, split=dataset_split)
         df = commit0_setup(dataset, repo_split)
@@ -180,14 +193,14 @@ def prepare_workspace(
         """
         repo_name = instance.data["repo"].split("/")[1]
         base_docker_image = get_base_docker_image(repo_name)
-        build_target = "source-minimal"
+        build_target = BUILD_TARGET
         logger.info(f"Using base docker image: {base_docker_image}")
 
         if self.metadata.workspace_type == "docker":
             # Build agent-server image from base commit0 image
             workspace = DockerDevWorkspace(
                 base_image=base_docker_image,
-                working_dir="/workspace",
+                working_dir=WORKSPACE_DIR,
                 target=build_target,
                 forward_env=forward_env or [],
             )
@@ -218,11 +231,14 @@ def prepare_workspace(
                 f"Using remote workspace with image {agent_server_image} "
                 f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})"
             )
-            startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600"))
+            startup_timeout = float(
+                os.getenv(
+                    "REMOTE_RUNTIME_STARTUP_TIMEOUT",
+                    str(DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT),
+                )
+            )
             workspace = APIRemoteWorkspace(
-                runtime_api_url=os.getenv(
-                    "RUNTIME_API_URL", "https://runtime.eval.all-hands.dev"
-                ),
+                runtime_api_url=os.getenv("RUNTIME_API_URL", DEFAULT_RUNTIME_API_URL),
                 runtime_api_key=runtime_api_key,
                 server_image=agent_server_image,
                 target_type="source" if "source" in build_target else "binary",
@@ -238,37 +254,39 @@ def prepare_workspace(
 
         # Clone the repository to the specific directory
         workspace_dir_name = instance.data["repo"].split("/")[1]
-        clone_cmd = f"cd /workspace/ && git clone -b commit0_combined https://github.com/{instance.data['repo']}.git {workspace_dir_name}"
-        res = workspace.execute_command(clone_cmd, timeout=600)
+        clone_cmd = f"cd {WORKSPACE_DIR}/ && git clone -b {GIT_BRANCH_NAME} https://github.com/{instance.data['repo']}.git {workspace_dir_name}"
+        res = workspace.execute_command(clone_cmd, timeout=DEFAULT_COMMAND_TIMEOUT)
         if res.exit_code != 0:
             raise RuntimeError(f"Failed to clone repo: {res.stderr}")
         logger.info(f"Cloned repository: {instance.data['repo']}")
 
         # Create new branch
-        branch_cmd = f"cd /workspace/{workspace_dir_name} && git checkout -b openhands"
-        res = workspace.execute_command(branch_cmd, timeout=600)
+        branch_cmd = f"cd {WORKSPACE_DIR}/{workspace_dir_name} && git checkout -b {AGENT_BRANCH_NAME}"
+        res = workspace.execute_command(branch_cmd, timeout=DEFAULT_COMMAND_TIMEOUT)
         if res.exit_code != 0:
             raise RuntimeError(f"Failed to create branch: {res.stderr}")
-        logger.info("Created new branch: openhands")
+        logger.info(f"Created new branch: {AGENT_BRANCH_NAME}")
 
         # Install commit0
         # Try uv first, fall back to pip if uv is not available
-        install_cmd = f"cd /workspace/{workspace_dir_name} && (uv pip install commit0 || pip install commit0)"
-        res = workspace.execute_command(install_cmd, timeout=600)
+        install_cmd = f"cd {WORKSPACE_DIR}/{workspace_dir_name} && (uv pip install commit0 || pip install commit0)"
+        res = workspace.execute_command(install_cmd, timeout=DEFAULT_COMMAND_TIMEOUT)
         if res.exit_code != 0:
             raise RuntimeError(f"Failed to install commit0: {res.stderr}")
         logger.info("Installed commit0")
 
         # Install pytest and required plugins for test reporting
-        plugin_install_cmd = f"cd /workspace/{workspace_dir_name} && (uv pip install pytest pytest-json-report pytest-cov || pip install pytest pytest-json-report pytest-cov)"
-        res = workspace.execute_command(plugin_install_cmd, timeout=600)
+        plugin_install_cmd = f"cd {WORKSPACE_DIR}/{workspace_dir_name} && (uv pip install pytest pytest-json-report pytest-cov || pip install pytest pytest-json-report pytest-cov)"
+        res = workspace.execute_command(
+            plugin_install_cmd, timeout=DEFAULT_COMMAND_TIMEOUT
+        )
         if res.exit_code != 0:
             raise RuntimeError(f"Failed to install pytest and plugins: {res.stderr}")
         logger.info("Installed pytest and required plugins")
 
         # Verify pytest and plugin installation
         verify_pytest_cmd = (
-            f"cd /workspace/{workspace_dir_name} && python -m pytest --version"
+            f"cd {WORKSPACE_DIR}/{workspace_dir_name} && python -m pytest --version"
         )
         verify_pytest_res = workspace.execute_command(verify_pytest_cmd, timeout=60)
         logger.info(f"Pytest verification exit code: {verify_pytest_res.exit_code}")
@@ -277,7 +295,7 @@ def prepare_workspace(
         else:
             logger.warning(f"Pytest verification failed: {verify_pytest_res.stderr}")
 
-        verify_plugin_cmd = f"cd /workspace/{workspace_dir_name} && python -c 'import pytest_jsonreport; print(\"Plugin available\")'"
+        verify_plugin_cmd = f"cd {WORKSPACE_DIR}/{workspace_dir_name} && python -c 'import pytest_jsonreport; print(\"Plugin available\")'"
         verify_plugin_res = workspace.execute_command(verify_plugin_cmd, timeout=60)
         logger.info(f"Plugin verification exit code: {verify_plugin_res.exit_code}")
         if verify_plugin_res.exit_code == 0:
@@ -294,7 +312,7 @@ def evaluate_instance(
         Run agent, collect history, git patch, and test results.
         """
         workspace_dir_name = instance.data["repo"].split("/")[1]
-        repo_path = f"/workspace/{workspace_dir_name}"
+        repo_path = f"{WORKSPACE_DIR}/{workspace_dir_name}"
 
         tools = get_default_tools(enable_browser=False)
         agent = Agent(
@@ -323,20 +341,24 @@ def evaluate_instance(
             metadata=self.metadata,
         )
         conversation.send_message(instruction)
-        run_timeout = int(os.getenv("CONVERSATION_TIMEOUT", "3600"))
+        run_timeout = int(
+            os.getenv("CONVERSATION_TIMEOUT", str(DEFAULT_CONVERSATION_TIMEOUT))
+        )
         conversation.run(timeout=run_timeout)
 
         history = list(conversation.state.events)
 
         # Complete runtime: git add, commit, diff, run tests
-        workspace.execute_command(f"cd {repo_path} && git add .", timeout=600)
+        workspace.execute_command(
+            f"cd {repo_path} && git add .", timeout=DEFAULT_COMMAND_TIMEOUT
+        )
         # Use --no-verify to bypass pre-commit hooks (e.g., husky) that can fail
         workspace.execute_command(
             f"cd {repo_path} && "
             'git config --global user.email "evaluation@openhands.dev" && '
             'git config --global user.name "OpenHands Evaluation" && '
-            'git commit --no-verify -m "openhands edits"',
-            timeout=600,
+            f'git commit --no-verify -m "{AGENT_BRANCH_NAME} edits"',
+            timeout=DEFAULT_COMMAND_TIMEOUT,
         )
 
         # Get git patch
@@ -345,7 +367,7 @@ def evaluate_instance(
         for retry in range(5):
             patch_result = workspace.execute_command(
                 f"cd {repo_path} && git diff {base_commit} HEAD -- . ':(exclude)spec.pdf.bz2'",
-                timeout=600 + 100 * retry,
+                timeout=DEFAULT_COMMAND_TIMEOUT + 100 * retry,
             )
             if patch_result.exit_code == 0:
                 git_patch = patch_result.stdout.strip()
@@ -363,7 +385,9 @@ def evaluate_instance(
             test_cmd = "python -m pytest"
         full_test_cmd = f"cd {repo_path} && {test_cmd} --json-report --json-report-file=report.json --continue-on-collection-errors {test_dir} > test_output.txt 2>&1"
         logger.info(f"Running test command: {full_test_cmd}")
-        test_result = workspace.execute_command(full_test_cmd, timeout=600)
+        test_result = workspace.execute_command(
+            full_test_cmd, timeout=DEFAULT_COMMAND_TIMEOUT
+        )
         logger.info(f"Test command exit code: {test_result.exit_code}")
         if test_result.exit_code != 0:
             logger.warning(f"Test command failed with stderr: {test_result.stderr}")
@@ -372,7 +396,7 @@ def evaluate_instance(
         # Read test output
         test_output_result = workspace.execute_command(
             f"cd {repo_path} && cat test_output.txt",
-            timeout=600,
+            timeout=DEFAULT_COMMAND_TIMEOUT,
         )
         test_output = (
             test_output_result.stdout.strip()
@@ -388,7 +412,7 @@ def evaluate_instance(
         repo_name_normalized = repo_name.replace(".", "-")
         test_ids_result = workspace.execute_command(
             f"cd {repo_path} && commit0 get-tests {repo_name_normalized}",
-            timeout=600,
+            timeout=DEFAULT_COMMAND_TIMEOUT,
         )
         test_ids = (
             test_ids_result.stdout.strip().split("\n")
@@ -405,7 +429,7 @@ def evaluate_instance(
         # Read test report
         report_result = workspace.execute_command(
             f"cd {repo_path} && cat report.json",
-            timeout=600,
+            timeout=DEFAULT_COMMAND_TIMEOUT,
         )
 
         # Debug logging for report
@@ -593,11 +617,11 @@ def main() -> None:
     parser.add_argument(
         "--repo-split",
         type=str,
-        default="lite",
+        default=DEFAULT_REPO_SPLIT,
         help="all, lite, or each repo name",
     )
     # Override the default dataset for commit0
-    parser.set_defaults(dataset="wentingzhao/commit0_combined")
+    parser.set_defaults(dataset=DEFAULT_DATASET)
     args = parser.parse_args()
 
     # Validate max_attempts
diff --git a/benchmarks/commit0/tests/test_constants.py b/benchmarks/commit0/tests/test_constants.py
new file mode 100644
index 00000000..73da46f5
--- /dev/null
+++ b/benchmarks/commit0/tests/test_constants.py
@@ -0,0 +1,160 @@
+"""Tests for commit0 constants.py."""
+
+from benchmarks.commit0.constants import (
+    AGENT_BRANCH_NAME,
+    BUILD_TARGET,
+    CUSTOM_TAG_PREFIX,
+    DEFAULT_COMMAND_TIMEOUT,
+    DEFAULT_CONVERSATION_TIMEOUT,
+    DEFAULT_DATASET,
+    DEFAULT_DATASET_SPLIT,
+    DEFAULT_DOCKER_IMAGE_PREFIX,
+    DEFAULT_IMAGE_TAG,
+    DEFAULT_MODEL_NAME,
+    DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT,
+    DEFAULT_REPO_SPLIT,
+    DEFAULT_RUNTIME_API_URL,
+    GIT_BRANCH_NAME,
+    TOTAL_INSTANCES,
+    WORKSPACE_DIR,
+)
+
+
+class TestDatasetConstants:
+    """Tests for dataset-related constants."""
+
+    def test_default_dataset_is_valid_huggingface_path(self):
+        """Test that DEFAULT_DATASET follows HuggingFace dataset path format."""
+        assert "/" in DEFAULT_DATASET
+        assert DEFAULT_DATASET == "wentingzhao/commit0_combined"
+
+    def test_default_dataset_split(self):
+        """Test that DEFAULT_DATASET_SPLIT is a valid split name."""
+        assert DEFAULT_DATASET_SPLIT == "test"
+
+    def test_default_repo_split(self):
+        """Test that DEFAULT_REPO_SPLIT is a valid repo split."""
+        assert DEFAULT_REPO_SPLIT in ["lite", "all"]
+
+
+class TestDockerConstants:
+    """Tests for Docker-related constants."""
+
+    def test_default_docker_image_prefix_format(self):
+        """Test that DEFAULT_DOCKER_IMAGE_PREFIX is a valid Docker registry prefix."""
+        assert DEFAULT_DOCKER_IMAGE_PREFIX.endswith("/")
+        assert "docker.io" in DEFAULT_DOCKER_IMAGE_PREFIX
+
+    def test_default_image_tag(self):
+        """Test that DEFAULT_IMAGE_TAG is a valid tag format."""
+        assert DEFAULT_IMAGE_TAG == "v0"
+        assert not DEFAULT_IMAGE_TAG.startswith(":")
+
+    def test_custom_tag_prefix(self):
+        """Test that CUSTOM_TAG_PREFIX is a valid prefix."""
+        assert CUSTOM_TAG_PREFIX == "commit0-"
+        assert CUSTOM_TAG_PREFIX.endswith("-")
+
+
+class TestBuildConstants:
+    """Tests for build-related constants."""
+
+    def test_build_target(self):
+        """Test that BUILD_TARGET is a valid build target."""
+        assert BUILD_TARGET == "source-minimal"
+
+
+class TestWorkspaceConstants:
+    """Tests for workspace-related constants."""
+
+    def test_workspace_dir_is_absolute_path(self):
+        """Test that WORKSPACE_DIR is an absolute path."""
+        assert WORKSPACE_DIR.startswith("/")
+        assert WORKSPACE_DIR == "/workspace"
+
+
+class TestGitConstants:
+    """Tests for Git-related constants."""
+
+    def test_git_branch_name(self):
+        """Test that GIT_BRANCH_NAME is a valid branch name."""
+        assert GIT_BRANCH_NAME == "commit0_combined"
+        assert " " not in GIT_BRANCH_NAME
+
+    def test_agent_branch_name(self):
+        """Test that AGENT_BRANCH_NAME is a valid branch name."""
+        assert AGENT_BRANCH_NAME == "openhands"
+        assert " " not in AGENT_BRANCH_NAME
+
+
+class TestModelConstants:
+    """Tests for model-related constants."""
+
+    def test_default_model_name(self):
+        """Test that DEFAULT_MODEL_NAME is set."""
+        assert DEFAULT_MODEL_NAME == "openhands"
+
+
+class TestRuntimeConstants:
+    """Tests for runtime-related constants."""
+
+    def test_default_runtime_api_url_is_valid_url(self):
+        """Test that DEFAULT_RUNTIME_API_URL is a valid URL."""
+        assert DEFAULT_RUNTIME_API_URL.startswith("https://")
+        assert "runtime" in DEFAULT_RUNTIME_API_URL
+
+    def test_default_remote_runtime_startup_timeout_is_positive(self):
+        """Test that DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT is positive."""
+        assert DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT > 0
+        assert DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT == 600
+
+    def test_default_conversation_timeout_is_positive(self):
+        """Test that DEFAULT_CONVERSATION_TIMEOUT is positive."""
+        assert DEFAULT_CONVERSATION_TIMEOUT > 0
+        assert DEFAULT_CONVERSATION_TIMEOUT == 3600
+
+    def test_default_command_timeout_is_positive(self):
+        """Test that DEFAULT_COMMAND_TIMEOUT is positive."""
+        assert DEFAULT_COMMAND_TIMEOUT > 0
+        assert DEFAULT_COMMAND_TIMEOUT == 600
+
+
+class TestEvaluationConstants:
+    """Tests for evaluation-related constants."""
+
+    def test_total_instances_is_positive(self):
+        """Test that TOTAL_INSTANCES is positive."""
+        assert TOTAL_INSTANCES > 0
+        assert TOTAL_INSTANCES == 16
+
+
+class TestConstantsIntegration:
+    """Integration tests for constants usage."""
+
+    def test_docker_image_can_be_constructed(self):
+        """Test that a valid Docker image name can be constructed from constants."""
+        repo_name = "test-repo"
+        image = f"{DEFAULT_DOCKER_IMAGE_PREFIX}{repo_name}:{DEFAULT_IMAGE_TAG}"
+        assert image == "docker.io/wentingzhao/test-repo:v0"
+
+    def test_custom_tag_can_be_constructed(self):
+        """Test that a valid custom tag can be constructed from constants."""
+        repo_name = "test-repo"
+        custom_tag = f"{CUSTOM_TAG_PREFIX}{repo_name}"
+        assert custom_tag == "commit0-test-repo"
+
+    def test_workspace_path_can_be_constructed(self):
+        """Test that a valid workspace path can be constructed from constants."""
+        repo_name = "test-repo"
+        workspace_path = f"{WORKSPACE_DIR}/{repo_name}"
+        assert workspace_path == "/workspace/test-repo"
+
+    def test_clone_command_can_be_constructed(self):
+        """Test that a valid git clone command can be constructed from constants."""
+        repo = "owner/test-repo"
+        repo_name = repo.split("/")[1]
+        clone_cmd = f"cd {WORKSPACE_DIR}/ && git clone -b {GIT_BRANCH_NAME} https://github.com/{repo}.git {repo_name}"
+        assert (
+            clone_cmd
+            == "cd /workspace/ && git clone -b commit0_combined https://github.com/owner/test-repo.git test-repo"
+        )

From 08844b2ee8785a5ca9964612fa73324708c7d401 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Wed, 28 Jan 2026 10:04:40 +0000
Subject: [PATCH 2/3] Remove test_constants.py and revert WORKSPACE_DIR changes

Per user request:
- Remove benchmarks/commit0/tests/test_constants.py
- Remove WORKSPACE_DIR constant from constants.py
- Revert all WORKSPACE_DIR usages in run_infer.py back to the hardcoded '/workspace' path

Co-authored-by: openhands <openhands@all-hands.dev>
---
 benchmarks/commit0/constants.py            |   3 -
 benchmarks/commit0/run_infer.py            |  17 ++-
 benchmarks/commit0/tests/test_constants.py | 160 ---------------------
 3 files changed, 8 insertions(+), 172 deletions(-)
 delete mode 100644 benchmarks/commit0/tests/test_constants.py

diff --git a/benchmarks/commit0/constants.py b/benchmarks/commit0/constants.py
index e75c76b5..14d044d4 100644
--- a/benchmarks/commit0/constants.py
+++ b/benchmarks/commit0/constants.py
@@ -18,9 +18,6 @@
 # Build configuration
 BUILD_TARGET = "source-minimal"
 
-# Workspace configuration
-WORKSPACE_DIR = "/workspace"
-
 # Git configuration
 GIT_BRANCH_NAME = "commit0_combined"
 AGENT_BRANCH_NAME = "openhands"
diff --git a/benchmarks/commit0/run_infer.py b/benchmarks/commit0/run_infer.py
index c0fb83e5..145c9971 100644
--- a/benchmarks/commit0/run_infer.py
+++ b/benchmarks/commit0/run_infer.py
@@ -23,7 +23,6 @@
     DEFAULT_REPO_SPLIT,
     DEFAULT_RUNTIME_API_URL,
     GIT_BRANCH_NAME,
-    WORKSPACE_DIR,
 )
 from benchmarks.utils.args_parser import get_parser
 from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
@@ -200,7 +199,7 @@ def prepare_workspace(
             # Build agent-server image from base commit0 image
             workspace = DockerDevWorkspace(
                 base_image=base_docker_image,
-                working_dir=WORKSPACE_DIR,
+                working_dir="/workspace",
                 target=build_target,
                 forward_env=forward_env or [],
             )
@@ -254,14 +253,14 @@ def prepare_workspace(
 
         # Clone the repository to the specific directory
         workspace_dir_name = instance.data["repo"].split("/")[1]
-        clone_cmd = f"cd {WORKSPACE_DIR}/ && git clone -b {GIT_BRANCH_NAME} https://github.com/{instance.data['repo']}.git {workspace_dir_name}"
+        clone_cmd = f"cd /workspace/ && git clone -b {GIT_BRANCH_NAME} https://github.com/{instance.data['repo']}.git {workspace_dir_name}"
         res = workspace.execute_command(clone_cmd, timeout=DEFAULT_COMMAND_TIMEOUT)
         if res.exit_code != 0:
             raise RuntimeError(f"Failed to clone repo: {res.stderr}")
         logger.info(f"Cloned repository: {instance.data['repo']}")
 
         # Create new branch
-        branch_cmd = f"cd {WORKSPACE_DIR}/{workspace_dir_name} && git checkout -b {AGENT_BRANCH_NAME}"
+        branch_cmd = f"cd /workspace/{workspace_dir_name} && git checkout -b {AGENT_BRANCH_NAME}"
         res = workspace.execute_command(branch_cmd, timeout=DEFAULT_COMMAND_TIMEOUT)
         if res.exit_code != 0:
             raise RuntimeError(f"Failed to create branch: {res.stderr}")
@@ -269,14 +268,14 @@ def prepare_workspace(
 
         # Install commit0
         # Try uv first, fall back to pip if uv is not available
-        install_cmd = f"cd {WORKSPACE_DIR}/{workspace_dir_name} && (uv pip install commit0 || pip install commit0)"
+        install_cmd = f"cd /workspace/{workspace_dir_name} && (uv pip install commit0 || pip install commit0)"
         res = workspace.execute_command(install_cmd, timeout=DEFAULT_COMMAND_TIMEOUT)
         if res.exit_code != 0:
             raise RuntimeError(f"Failed to install commit0: {res.stderr}")
         logger.info("Installed commit0")
 
         # Install pytest and required plugins for test reporting
-        plugin_install_cmd = f"cd {WORKSPACE_DIR}/{workspace_dir_name} && (uv pip install pytest pytest-json-report pytest-cov || pip install pytest pytest-json-report pytest-cov)"
+        plugin_install_cmd = f"cd /workspace/{workspace_dir_name} && (uv pip install pytest pytest-json-report pytest-cov || pip install pytest pytest-json-report pytest-cov)"
         res = workspace.execute_command(
             plugin_install_cmd, timeout=DEFAULT_COMMAND_TIMEOUT
         )
@@ -286,7 +285,7 @@ def prepare_workspace(
 
         # Verify pytest and plugin installation
         verify_pytest_cmd = (
-            f"cd {WORKSPACE_DIR}/{workspace_dir_name} && python -m pytest --version"
+            f"cd /workspace/{workspace_dir_name} && python -m pytest --version"
         )
         verify_pytest_res = workspace.execute_command(verify_pytest_cmd, timeout=60)
         logger.info(f"Pytest verification exit code: {verify_pytest_res.exit_code}")
@@ -295,7 +294,7 @@ def prepare_workspace(
         else:
             logger.warning(f"Pytest verification failed: {verify_pytest_res.stderr}")
 
-        verify_plugin_cmd = f"cd {WORKSPACE_DIR}/{workspace_dir_name} && python -c 'import pytest_jsonreport; print(\"Plugin available\")'"
+        verify_plugin_cmd = f"cd /workspace/{workspace_dir_name} && python -c 'import pytest_jsonreport; print(\"Plugin available\")'"
         verify_plugin_res = workspace.execute_command(verify_plugin_cmd, timeout=60)
         logger.info(f"Plugin verification exit code: {verify_plugin_res.exit_code}")
         if verify_plugin_res.exit_code == 0:
@@ -312,7 +311,7 @@ def evaluate_instance(
         Run agent, collect history, git patch, and test results.
         """
         workspace_dir_name = instance.data["repo"].split("/")[1]
-        repo_path = f"{WORKSPACE_DIR}/{workspace_dir_name}"
+        repo_path = f"/workspace/{workspace_dir_name}"
 
         tools = get_default_tools(enable_browser=False)
         agent = Agent(
diff --git a/benchmarks/commit0/tests/test_constants.py b/benchmarks/commit0/tests/test_constants.py
deleted file mode 100644
index 73da46f5..00000000
--- a/benchmarks/commit0/tests/test_constants.py
+++ /dev/null
@@ -1,160 +0,0 @@
-"""Tests for commit0 constants.py."""
-
-from benchmarks.commit0.constants import (
-    AGENT_BRANCH_NAME,
-    BUILD_TARGET,
-    CUSTOM_TAG_PREFIX,
-    DEFAULT_COMMAND_TIMEOUT,
-    DEFAULT_CONVERSATION_TIMEOUT,
-    DEFAULT_DATASET,
-    DEFAULT_DATASET_SPLIT,
-    DEFAULT_DOCKER_IMAGE_PREFIX,
-    DEFAULT_IMAGE_TAG,
-    DEFAULT_MODEL_NAME,
-    DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT,
-    DEFAULT_REPO_SPLIT,
-    DEFAULT_RUNTIME_API_URL,
-    GIT_BRANCH_NAME,
-    TOTAL_INSTANCES,
-    WORKSPACE_DIR,
-)
-
-
-class TestDatasetConstants:
-    """Tests for dataset-related constants."""
-
-    def test_default_dataset_is_valid_huggingface_path(self):
-        """Test that DEFAULT_DATASET follows HuggingFace dataset path format."""
-        assert "/" in DEFAULT_DATASET
-        assert DEFAULT_DATASET == "wentingzhao/commit0_combined"
-
-    def test_default_dataset_split(self):
-        """Test that DEFAULT_DATASET_SPLIT is a valid split name."""
-        assert DEFAULT_DATASET_SPLIT == "test"
-
-    def test_default_repo_split(self):
-        """Test that DEFAULT_REPO_SPLIT is a valid repo split."""
-        assert DEFAULT_REPO_SPLIT in ["lite", "all"]
-
-
-class TestDockerConstants:
-    """Tests for Docker-related constants."""
-
-    def test_default_docker_image_prefix_format(self):
-        """Test that DEFAULT_DOCKER_IMAGE_PREFIX is a valid Docker registry prefix."""
-        assert DEFAULT_DOCKER_IMAGE_PREFIX.endswith("/")
-        assert "docker.io" in DEFAULT_DOCKER_IMAGE_PREFIX
-
-    def test_default_image_tag(self):
-        """Test that DEFAULT_IMAGE_TAG is a valid tag format."""
-        assert DEFAULT_IMAGE_TAG == "v0"
-        assert not DEFAULT_IMAGE_TAG.startswith(":")
-
-    def test_custom_tag_prefix(self):
-        """Test that CUSTOM_TAG_PREFIX is a valid prefix."""
-        assert CUSTOM_TAG_PREFIX == "commit0-"
-        assert CUSTOM_TAG_PREFIX.endswith("-")
-
-
-class TestBuildConstants:
-    """Tests for build-related constants."""
-
-    def test_build_target(self):
-        """Test that BUILD_TARGET is a valid build target."""
-        assert BUILD_TARGET == "source-minimal"
-
-
-class TestWorkspaceConstants:
-    """Tests for workspace-related constants."""
-
-    def test_workspace_dir_is_absolute_path(self):
-        """Test that WORKSPACE_DIR is an absolute path."""
-        assert WORKSPACE_DIR.startswith("/")
-        assert WORKSPACE_DIR == "/workspace"
-
-
-class TestGitConstants:
-    """Tests for Git-related constants."""
-
-    def test_git_branch_name(self):
-        """Test that GIT_BRANCH_NAME is a valid branch name."""
-        assert GIT_BRANCH_NAME == "commit0_combined"
-        assert " " not in GIT_BRANCH_NAME
-
-    def test_agent_branch_name(self):
-        """Test that AGENT_BRANCH_NAME is a valid branch name."""
-        assert AGENT_BRANCH_NAME == "openhands"
-        assert " " not in AGENT_BRANCH_NAME
-
-
-class TestModelConstants:
-    """Tests for model-related constants."""
-
-    def test_default_model_name(self):
-        """Test that DEFAULT_MODEL_NAME is set."""
-        assert DEFAULT_MODEL_NAME == "openhands"
-
-
-class TestRuntimeConstants:
-    """Tests for runtime-related constants."""
-
-    def test_default_runtime_api_url_is_valid_url(self):
-        """Test that DEFAULT_RUNTIME_API_URL is a valid URL."""
-        assert DEFAULT_RUNTIME_API_URL.startswith("https://")
-        assert "runtime" in DEFAULT_RUNTIME_API_URL
-
-    def test_default_remote_runtime_startup_timeout_is_positive(self):
-        """Test that DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT is positive."""
-        assert DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT > 0
-        assert DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT == 600
-
-    def test_default_conversation_timeout_is_positive(self):
-        """Test that DEFAULT_CONVERSATION_TIMEOUT is positive."""
-        assert DEFAULT_CONVERSATION_TIMEOUT > 0
-        assert DEFAULT_CONVERSATION_TIMEOUT == 3600
-
-    def test_default_command_timeout_is_positive(self):
-        """Test that DEFAULT_COMMAND_TIMEOUT is positive."""
-        assert DEFAULT_COMMAND_TIMEOUT > 0
-        assert DEFAULT_COMMAND_TIMEOUT == 600
-
-
-class TestEvaluationConstants:
-    """Tests for evaluation-related constants."""
-
-    def test_total_instances_is_positive(self):
-        """Test that TOTAL_INSTANCES is positive."""
-        assert TOTAL_INSTANCES > 0
-        assert TOTAL_INSTANCES == 16
-
-
-class TestConstantsIntegration:
-    """Integration tests for constants usage."""
-
-    def test_docker_image_can_be_constructed(self):
-        """Test that a valid Docker image name can be constructed from constants."""
-        repo_name = "test-repo"
-        image = f"{DEFAULT_DOCKER_IMAGE_PREFIX}{repo_name}:{DEFAULT_IMAGE_TAG}"
-        assert image == "docker.io/wentingzhao/test-repo:v0"
-
-    def test_custom_tag_can_be_constructed(self):
-        """Test that a valid custom tag can be constructed from constants."""
-        repo_name = "test-repo"
-        custom_tag = f"{CUSTOM_TAG_PREFIX}{repo_name}"
-        assert custom_tag == "commit0-test-repo"
-
-    def test_workspace_path_can_be_constructed(self):
-        """Test that a valid workspace path can be constructed from constants."""
-        repo_name = "test-repo"
-        workspace_path = f"{WORKSPACE_DIR}/{repo_name}"
-        assert workspace_path == "/workspace/test-repo"
-
-    def test_clone_command_can_be_constructed(self):
-        """Test that a valid git clone command can be constructed from constants."""
-        repo = "owner/test-repo"
-        repo_name = repo.split("/")[1]
-        clone_cmd = f"cd {WORKSPACE_DIR}/ && git clone -b {GIT_BRANCH_NAME} https://github.com/{repo}.git {repo_name}"
-        assert (
-            clone_cmd
-            == "cd /workspace/ && git clone -b commit0_combined https://github.com/owner/test-repo.git test-repo"
-        )

From 0cc47a97ef5a0e12d419e9dc59d221400e72592a Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Wed, 28 Jan 2026 13:32:35 +0000
Subject: [PATCH 3/3] style: fix ruff formatting in run_infer.py

Co-authored-by: openhands <openhands@all-hands.dev>
---
 benchmarks/commit0/run_infer.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/benchmarks/commit0/run_infer.py b/benchmarks/commit0/run_infer.py
index 145c9971..0ab93f4a 100644
--- a/benchmarks/commit0/run_infer.py
+++ b/benchmarks/commit0/run_infer.py
@@ -260,7 +260,9 @@ def prepare_workspace(
         logger.info(f"Cloned repository: {instance.data['repo']}")
 
         # Create new branch
-        branch_cmd = f"cd /workspace/{workspace_dir_name} && git checkout -b {AGENT_BRANCH_NAME}"
+        branch_cmd = (
+            f"cd /workspace/{workspace_dir_name} && git checkout -b {AGENT_BRANCH_NAME}"
+        )
         res = workspace.execute_command(branch_cmd, timeout=DEFAULT_COMMAND_TIMEOUT)
         if res.exit_code != 0:
             raise RuntimeError(f"Failed to create branch: {res.stderr}")