From c993c112098fe9f6bd03ec2ec6e94063aac3dc7c Mon Sep 17 00:00:00 2001 From: openhands Date: Tue, 27 Jan 2026 18:10:46 +0000 Subject: [PATCH 1/3] refactor(commit0): consolidate hyperparameters in constants.py This commit creates a single source of truth for all Commit0 benchmark hyperparameters and constant values by introducing constants.py. Changes: - Create benchmarks/commit0/constants.py with all constant values: - Dataset configuration (DEFAULT_DATASET, DEFAULT_DATASET_SPLIT, DEFAULT_REPO_SPLIT) - Docker image configuration (DEFAULT_DOCKER_IMAGE_PREFIX, DEFAULT_IMAGE_TAG, CUSTOM_TAG_PREFIX) - Build configuration (BUILD_TARGET) - Workspace configuration (WORKSPACE_DIR) - Git configuration (GIT_BRANCH_NAME, AGENT_BRANCH_NAME) - Model configuration (DEFAULT_MODEL_NAME) - Runtime configuration (DEFAULT_RUNTIME_API_URL, timeouts) - Evaluation configuration (TOTAL_INSTANCES) - Update build_images.py to import constants from constants.py - Update run_infer.py to import constants from constants.py - Update eval_infer.py to import constants from constants.py - Add comprehensive tests for constants in test_constants.py Fixes #365 Co-authored-by: openhands --- benchmarks/commit0/build_images.py | 16 ++- benchmarks/commit0/constants.py | 38 +++++ benchmarks/commit0/eval_infer.py | 9 +- benchmarks/commit0/run_infer.py | 94 +++++++----- benchmarks/commit0/tests/test_constants.py | 160 +++++++++++++++++++++ 5 files changed, 273 insertions(+), 44 deletions(-) create mode 100644 benchmarks/commit0/constants.py create mode 100644 benchmarks/commit0/tests/test_constants.py diff --git a/benchmarks/commit0/build_images.py b/benchmarks/commit0/build_images.py index b59704ea..3f784b9c 100644 --- a/benchmarks/commit0/build_images.py +++ b/benchmarks/commit0/build_images.py @@ -13,6 +13,13 @@ from commit0.harness.constants import SPLIT +from benchmarks.commit0.constants import ( + CUSTOM_TAG_PREFIX, + DEFAULT_DATASET, + DEFAULT_DOCKER_IMAGE_PREFIX, + DEFAULT_IMAGE_TAG, + DEFAULT_REPO_SPLIT, +) from benchmarks.utils.build_utils import ( build_all_images, default_build_output_dir, @@ -22,7 +29,6 @@ logger = get_logger(__name__) -DEFAULT_DOCKER_IMAGE_PREFIX = "docker.io/wentingzhao/" def get_base_docker_image( @@ -33,14 +39,14 @@ def get_base_docker_image( prefix = docker_image_prefix or os.getenv( "EVAL_DOCKER_IMAGE_PREFIX", DEFAULT_DOCKER_IMAGE_PREFIX ) - return (prefix.rstrip("/") + "/" + repo_name).lower() + ":v0" + return (prefix.rstrip("/") + "/" + repo_name).lower() + f":{DEFAULT_IMAGE_TAG}" def extract_custom_tag(base_image: str) -> str: """Extract Commit0 custom tag from a base image name.""" repo_tag = base_image.rsplit("/", 1)[-1] repo_name = repo_tag.split(":", 1)[0].lower() - return f"commit0-{repo_name}" + return f"{CUSTOM_TAG_PREFIX}{repo_name}" def _load_selected_instances(selected_instances_file: str) -> list[str]: @@ -90,7 +96,7 @@ def main(argv: list[str]) -> int: parser.add_argument( "--repo-split", type=str, - default="lite", + default=DEFAULT_REPO_SPLIT, help="Commit0 repo split (lite, all, or repo name)", ) parser.add_argument( @@ -99,7 +105,7 @@ def main(argv: list[str]) -> int: default="", help="Override base image prefix (default: env EVAL_DOCKER_IMAGE_PREFIX)", ) - parser.set_defaults(dataset="wentingzhao/commit0_combined") + parser.set_defaults(dataset=DEFAULT_DATASET) args = parser.parse_args(argv) docker_image_prefix = args.docker_image_prefix or None diff --git a/benchmarks/commit0/constants.py b/benchmarks/commit0/constants.py new file mode 100644 index 00000000..e75c76b5 --- /dev/null +++ b/benchmarks/commit0/constants.py @@ -0,0 +1,38 @@ +""" +Commit0 Benchmark Constants + +This module serves as the single source of truth for all hyperparameters +and constant values used in the Commit0 benchmark evaluation workflow. +""" + +# Dataset configuration +DEFAULT_DATASET = "wentingzhao/commit0_combined" +DEFAULT_DATASET_SPLIT = "test" +DEFAULT_REPO_SPLIT = "lite" + +# Docker image configuration +DEFAULT_DOCKER_IMAGE_PREFIX = "docker.io/wentingzhao/" +DEFAULT_IMAGE_TAG = "v0" +CUSTOM_TAG_PREFIX = "commit0-" + +# Build configuration +BUILD_TARGET = "source-minimal" + +# Workspace configuration +WORKSPACE_DIR = "/workspace" + +# Git configuration +GIT_BRANCH_NAME = "commit0_combined" +AGENT_BRANCH_NAME = "openhands" + +# Model configuration +DEFAULT_MODEL_NAME = "openhands" + +# Runtime configuration +DEFAULT_RUNTIME_API_URL = "https://runtime.eval.all-hands.dev" +DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT = 600 +DEFAULT_CONVERSATION_TIMEOUT = 3600 +DEFAULT_COMMAND_TIMEOUT = 600 + +# Evaluation configuration +TOTAL_INSTANCES = 16 diff --git a/benchmarks/commit0/eval_infer.py b/benchmarks/commit0/eval_infer.py index f03e73f6..fcc7885f 100644 --- a/benchmarks/commit0/eval_infer.py +++ b/benchmarks/commit0/eval_infer.py @@ -15,6 +15,7 @@ import sys from pathlib import Path +from benchmarks.commit0.constants import DEFAULT_MODEL_NAME, TOTAL_INSTANCES from benchmarks.utils.laminar import LaminarService from benchmarks.utils.report_costs import generate_cost_report @@ -27,7 +28,7 @@ def process_commit0_results( - input_file: str, output_file: str, model_name: str = "openhands" + input_file: str, output_file: str, model_name: str = DEFAULT_MODEL_NAME ) -> None: """ Process Commit0 output.jsonl and generate evaluation report. @@ -123,7 +124,7 @@ def process_commit0_results( # Generate report report = { "model_name_or_path": model_name, - "total_instances": 16, # Fixed as per requirement + "total_instances": TOTAL_INSTANCES, "submitted_instances": len(completed_ids), "completed_instances": len(completed_ids), "resolved_instances": len(resolved_ids), @@ -174,8 +175,8 @@ def main() -> None: parser.add_argument( "--model-name", - default="openhands", - help="Model name to use in the model_name_or_path field (default: openhands)", + default=DEFAULT_MODEL_NAME, + help=f"Model name to use in the model_name_or_path field (default: {DEFAULT_MODEL_NAME})", ) args = parser.parse_args() diff --git a/benchmarks/commit0/run_infer.py b/benchmarks/commit0/run_infer.py index 2e473669..c0fb83e5 100644 --- a/benchmarks/commit0/run_infer.py +++ b/benchmarks/commit0/run_infer.py @@ -12,6 +12,19 @@ extract_custom_tag, get_base_docker_image, ) +from benchmarks.commit0.constants import ( + AGENT_BRANCH_NAME, + BUILD_TARGET, + DEFAULT_COMMAND_TIMEOUT, + DEFAULT_CONVERSATION_TIMEOUT, + DEFAULT_DATASET, + DEFAULT_DATASET_SPLIT, + DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT, + DEFAULT_REPO_SPLIT, + DEFAULT_RUNTIME_API_URL, + GIT_BRANCH_NAME, + WORKSPACE_DIR, +) from benchmarks.utils.args_parser import get_parser from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE from benchmarks.utils.conversation import build_event_persistence_callback @@ -110,9 +123,9 @@ def __init__( self, metadata: EvalMetadata, num_workers: int = 1, - repo_split: str = "lite", - dataset_name: str = "wentingzhao/commit0_combined", - dataset_split: str = "test", + repo_split: str = DEFAULT_REPO_SPLIT, + dataset_name: str = DEFAULT_DATASET, + dataset_split: str = DEFAULT_DATASET_SPLIT, ): super().__init__(metadata=metadata, num_workers=num_workers) # Store additional parameters in metadata.details for access in methods @@ -130,9 +143,9 @@ def prepare_instances(self) -> List[EvalInstance]: logger.info("Setting up Commit0 evaluation data") details = self.metadata.details or {} - dataset_name = details.get("dataset_name", "wentingzhao/commit0_combined") - dataset_split = details.get("dataset_split", "test") - repo_split = details.get("repo_split", "lite") + dataset_name = details.get("dataset_name", DEFAULT_DATASET) + dataset_split = details.get("dataset_split", DEFAULT_DATASET_SPLIT) + repo_split = details.get("repo_split", DEFAULT_REPO_SPLIT) dataset = load_dataset(dataset_name, split=dataset_split) df = commit0_setup(dataset, repo_split) @@ -180,14 +193,14 @@ def prepare_workspace( """ repo_name = instance.data["repo"].split("/")[1] base_docker_image = get_base_docker_image(repo_name) - build_target = "source-minimal" + build_target = BUILD_TARGET logger.info(f"Using base docker image: {base_docker_image}") if self.metadata.workspace_type == "docker": # Build agent-server image from base commit0 image workspace = DockerDevWorkspace( base_image=base_docker_image, - working_dir="/workspace", + working_dir=WORKSPACE_DIR, target=build_target, forward_env=forward_env or [], ) @@ -218,11 +231,14 @@ def prepare_workspace( f"Using remote workspace with image {agent_server_image} " f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})" ) - startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600")) + startup_timeout = float( + os.getenv( + "REMOTE_RUNTIME_STARTUP_TIMEOUT", + str(DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT), + ) + ) workspace = APIRemoteWorkspace( - runtime_api_url=os.getenv( - "RUNTIME_API_URL", "https://runtime.eval.all-hands.dev" - ), + runtime_api_url=os.getenv("RUNTIME_API_URL", DEFAULT_RUNTIME_API_URL), runtime_api_key=runtime_api_key, server_image=agent_server_image, target_type="source" if "source" in build_target else "binary", @@ -238,37 +254,39 @@ def prepare_workspace( # Clone the repository to the specific directory workspace_dir_name = instance.data["repo"].split("/")[1] - clone_cmd = f"cd /workspace/ && git clone -b commit0_combined https://github.com/{instance.data['repo']}.git {workspace_dir_name}" - res = workspace.execute_command(clone_cmd, timeout=600) + clone_cmd = f"cd {WORKSPACE_DIR}/ && git clone -b {GIT_BRANCH_NAME} https://github.com/{instance.data['repo']}.git {workspace_dir_name}" + res = workspace.execute_command(clone_cmd, timeout=DEFAULT_COMMAND_TIMEOUT) if res.exit_code != 0: raise RuntimeError(f"Failed to clone repo: {res.stderr}") logger.info(f"Cloned repository: {instance.data['repo']}") # Create new branch - branch_cmd = f"cd /workspace/{workspace_dir_name} && git checkout -b openhands" - res = workspace.execute_command(branch_cmd, timeout=600) + branch_cmd = f"cd {WORKSPACE_DIR}/{workspace_dir_name} && git checkout -b {AGENT_BRANCH_NAME}" + res = workspace.execute_command(branch_cmd, timeout=DEFAULT_COMMAND_TIMEOUT) if res.exit_code != 0: raise RuntimeError(f"Failed to create branch: {res.stderr}") - logger.info("Created new branch: openhands") + logger.info(f"Created new branch: {AGENT_BRANCH_NAME}") # Install commit0 # Try uv first, fall back to pip if uv is not available - install_cmd = f"cd /workspace/{workspace_dir_name} && (uv pip install commit0 || pip install commit0)" - res = workspace.execute_command(install_cmd, timeout=600) + install_cmd = f"cd {WORKSPACE_DIR}/{workspace_dir_name} && (uv pip install commit0 || pip install commit0)" + res = workspace.execute_command(install_cmd, timeout=DEFAULT_COMMAND_TIMEOUT) if res.exit_code != 0: raise RuntimeError(f"Failed to install commit0: {res.stderr}") logger.info("Installed commit0") # Install pytest and required plugins for test reporting - plugin_install_cmd = f"cd /workspace/{workspace_dir_name} && (uv pip install pytest pytest-json-report pytest-cov || pip install pytest pytest-json-report pytest-cov)" - res = workspace.execute_command(plugin_install_cmd, timeout=600) + plugin_install_cmd = f"cd {WORKSPACE_DIR}/{workspace_dir_name} && (uv pip install pytest pytest-json-report pytest-cov || pip install pytest pytest-json-report pytest-cov)" + res = workspace.execute_command( + plugin_install_cmd, timeout=DEFAULT_COMMAND_TIMEOUT + ) if res.exit_code != 0: raise RuntimeError(f"Failed to install pytest and plugins: {res.stderr}") logger.info("Installed pytest and required plugins") # Verify pytest and plugin installation verify_pytest_cmd = ( - f"cd /workspace/{workspace_dir_name} && python -m pytest --version" + f"cd {WORKSPACE_DIR}/{workspace_dir_name} && python -m pytest --version" ) verify_pytest_res = workspace.execute_command(verify_pytest_cmd, timeout=60) logger.info(f"Pytest verification exit code: {verify_pytest_res.exit_code}") @@ -277,7 +295,7 @@ def prepare_workspace( else: logger.warning(f"Pytest verification failed: {verify_pytest_res.stderr}") - verify_plugin_cmd = f"cd /workspace/{workspace_dir_name} && python -c 'import pytest_jsonreport; print(\"Plugin available\")'" + verify_plugin_cmd = f"cd {WORKSPACE_DIR}/{workspace_dir_name} && python -c 'import pytest_jsonreport; print(\"Plugin available\")'" verify_plugin_res = workspace.execute_command(verify_plugin_cmd, timeout=60) logger.info(f"Plugin verification exit code: {verify_plugin_res.exit_code}") if verify_plugin_res.exit_code == 0: @@ -294,7 +312,7 @@ def evaluate_instance( Run agent, collect history, git patch, and test results. """ workspace_dir_name = instance.data["repo"].split("/")[1] - repo_path = f"/workspace/{workspace_dir_name}" + repo_path = f"{WORKSPACE_DIR}/{workspace_dir_name}" tools = get_default_tools(enable_browser=False) agent = Agent( @@ -323,20 +341,24 @@ def evaluate_instance( metadata=self.metadata, ) conversation.send_message(instruction) - run_timeout = int(os.getenv("CONVERSATION_TIMEOUT", "3600")) + run_timeout = int( + os.getenv("CONVERSATION_TIMEOUT", str(DEFAULT_CONVERSATION_TIMEOUT)) + ) conversation.run(timeout=run_timeout) history = list(conversation.state.events) # Complete runtime: git add, commit, diff, run tests - workspace.execute_command(f"cd {repo_path} && git add .", timeout=600) + workspace.execute_command( + f"cd {repo_path} && git add .", timeout=DEFAULT_COMMAND_TIMEOUT + ) # Use --no-verify to bypass pre-commit hooks (e.g., husky) that can fail workspace.execute_command( f"cd {repo_path} && " 'git config --global user.email "evaluation@openhands.dev" && ' 'git config --global user.name "OpenHands Evaluation" && ' - 'git commit --no-verify -m "openhands edits"', - timeout=600, + f'git commit --no-verify -m "{AGENT_BRANCH_NAME} edits"', + timeout=DEFAULT_COMMAND_TIMEOUT, ) # Get git patch @@ -345,7 +367,7 @@ def evaluate_instance( for retry in range(5): patch_result = workspace.execute_command( f"cd {repo_path} && git diff {base_commit} HEAD -- . ':(exclude)spec.pdf.bz2'", - timeout=600 + 100 * retry, + timeout=DEFAULT_COMMAND_TIMEOUT + 100 * retry, ) if patch_result.exit_code == 0: git_patch = patch_result.stdout.strip() @@ -363,7 +385,9 @@ def evaluate_instance( test_cmd = "python -m pytest" full_test_cmd = f"cd {repo_path} && {test_cmd} --json-report --json-report-file=report.json --continue-on-collection-errors {test_dir} > test_output.txt 2>&1" logger.info(f"Running test command: {full_test_cmd}") - test_result = workspace.execute_command(full_test_cmd, timeout=600) + test_result = workspace.execute_command( + full_test_cmd, timeout=DEFAULT_COMMAND_TIMEOUT + ) logger.info(f"Test command exit code: {test_result.exit_code}") if test_result.exit_code != 0: logger.warning(f"Test command failed with stderr: {test_result.stderr}") @@ -372,7 +396,7 @@ def evaluate_instance( # Read test output test_output_result = workspace.execute_command( f"cd {repo_path} && cat test_output.txt", - timeout=600, + timeout=DEFAULT_COMMAND_TIMEOUT, ) test_output = ( test_output_result.stdout.strip() @@ -388,7 +412,7 @@ def evaluate_instance( repo_name_normalized = repo_name.replace(".", "-") test_ids_result = workspace.execute_command( f"cd {repo_path} && commit0 get-tests {repo_name_normalized}", - timeout=600, + timeout=DEFAULT_COMMAND_TIMEOUT, ) test_ids = ( test_ids_result.stdout.strip().split("\n") @@ -405,7 +429,7 @@ def evaluate_instance( # Read test report report_result = workspace.execute_command( f"cd {repo_path} && cat report.json", - timeout=600, + timeout=DEFAULT_COMMAND_TIMEOUT, ) # Debug logging for report @@ -593,11 +617,11 @@ def main() -> None: parser.add_argument( "--repo-split", type=str, - default="lite", + default=DEFAULT_REPO_SPLIT, help="all, lite, or each repo name", ) # Override the default dataset for commit0 - parser.set_defaults(dataset="wentingzhao/commit0_combined") + parser.set_defaults(dataset=DEFAULT_DATASET) args = parser.parse_args() # Validate max_attempts diff --git a/benchmarks/commit0/tests/test_constants.py b/benchmarks/commit0/tests/test_constants.py new file mode 100644 index 00000000..73da46f5 --- /dev/null +++ b/benchmarks/commit0/tests/test_constants.py @@ -0,0 +1,160 @@ +"""Tests for commit0 constants.py.""" + +from benchmarks.commit0.constants import ( + AGENT_BRANCH_NAME, + BUILD_TARGET, + CUSTOM_TAG_PREFIX, + DEFAULT_COMMAND_TIMEOUT, + DEFAULT_CONVERSATION_TIMEOUT, + DEFAULT_DATASET, + DEFAULT_DATASET_SPLIT, + DEFAULT_DOCKER_IMAGE_PREFIX, + DEFAULT_IMAGE_TAG, + DEFAULT_MODEL_NAME, + DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT, + DEFAULT_REPO_SPLIT, + DEFAULT_RUNTIME_API_URL, + GIT_BRANCH_NAME, + TOTAL_INSTANCES, + WORKSPACE_DIR, +) + + +class TestDatasetConstants: + """Tests for dataset-related constants.""" + + def test_default_dataset_is_valid_huggingface_path(self): + """Test that DEFAULT_DATASET follows HuggingFace dataset path format.""" + assert "/" in DEFAULT_DATASET + assert DEFAULT_DATASET == "wentingzhao/commit0_combined" + + def test_default_dataset_split(self): + """Test that DEFAULT_DATASET_SPLIT is a valid split name.""" + assert DEFAULT_DATASET_SPLIT == "test" + + def test_default_repo_split(self): + """Test that DEFAULT_REPO_SPLIT is a valid repo split.""" + assert DEFAULT_REPO_SPLIT in ["lite", "all"] + + +class TestDockerConstants: + """Tests for Docker-related constants.""" + + def test_default_docker_image_prefix_format(self): + """Test that DEFAULT_DOCKER_IMAGE_PREFIX is a valid Docker registry prefix.""" + assert DEFAULT_DOCKER_IMAGE_PREFIX.endswith("/") + assert "docker.io" in DEFAULT_DOCKER_IMAGE_PREFIX + + def test_default_image_tag(self): + """Test that DEFAULT_IMAGE_TAG is a valid tag format.""" + assert DEFAULT_IMAGE_TAG == "v0" + assert not DEFAULT_IMAGE_TAG.startswith(":") + + def test_custom_tag_prefix(self): + """Test that CUSTOM_TAG_PREFIX is a valid prefix.""" + assert CUSTOM_TAG_PREFIX == "commit0-" + assert CUSTOM_TAG_PREFIX.endswith("-") + + +class TestBuildConstants: + """Tests for build-related constants.""" + + def test_build_target(self): + """Test that BUILD_TARGET is a valid build target.""" + assert BUILD_TARGET == "source-minimal" + + +class TestWorkspaceConstants: + """Tests for workspace-related constants.""" + + def test_workspace_dir_is_absolute_path(self): + """Test that WORKSPACE_DIR is an absolute path.""" + assert WORKSPACE_DIR.startswith("/") + assert WORKSPACE_DIR == "/workspace" + + +class TestGitConstants: + """Tests for Git-related constants.""" + + def test_git_branch_name(self): + """Test that GIT_BRANCH_NAME is a valid branch name.""" + assert GIT_BRANCH_NAME == "commit0_combined" + assert " " not in GIT_BRANCH_NAME + + def test_agent_branch_name(self): + """Test that AGENT_BRANCH_NAME is a valid branch name.""" + assert AGENT_BRANCH_NAME == "openhands" + assert " " not in AGENT_BRANCH_NAME + + +class TestModelConstants: + """Tests for model-related constants.""" + + def test_default_model_name(self): + """Test that DEFAULT_MODEL_NAME is set.""" + assert DEFAULT_MODEL_NAME == "openhands" + + +class TestRuntimeConstants: + """Tests for runtime-related constants.""" + + def test_default_runtime_api_url_is_valid_url(self): + """Test that DEFAULT_RUNTIME_API_URL is a valid URL.""" + assert DEFAULT_RUNTIME_API_URL.startswith("https://") + assert "runtime" in DEFAULT_RUNTIME_API_URL + + def test_default_remote_runtime_startup_timeout_is_positive(self): + """Test that DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT is positive.""" + assert DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT > 0 + assert DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT == 600 + + def test_default_conversation_timeout_is_positive(self): + """Test that DEFAULT_CONVERSATION_TIMEOUT is positive.""" + assert DEFAULT_CONVERSATION_TIMEOUT > 0 + assert DEFAULT_CONVERSATION_TIMEOUT == 3600 + + def test_default_command_timeout_is_positive(self): + """Test that DEFAULT_COMMAND_TIMEOUT is positive.""" + assert DEFAULT_COMMAND_TIMEOUT > 0 + assert DEFAULT_COMMAND_TIMEOUT == 600 + + +class TestEvaluationConstants: + """Tests for evaluation-related constants.""" + + def test_total_instances_is_positive(self): + """Test that TOTAL_INSTANCES is positive.""" + assert TOTAL_INSTANCES > 0 + assert TOTAL_INSTANCES == 16 + + +class TestConstantsIntegration: + """Integration tests for constants usage.""" + + def test_docker_image_can_be_constructed(self): + """Test that a valid Docker image name can be constructed from constants.""" + repo_name = "test-repo" + image = f"{DEFAULT_DOCKER_IMAGE_PREFIX}{repo_name}:{DEFAULT_IMAGE_TAG}" + assert image == "docker.io/wentingzhao/test-repo:v0" + + def test_custom_tag_can_be_constructed(self): + """Test that a valid custom tag can be constructed from constants.""" + repo_name = "test-repo" + custom_tag = f"{CUSTOM_TAG_PREFIX}{repo_name}" + assert custom_tag == "commit0-test-repo" + + def test_workspace_path_can_be_constructed(self): + """Test that a valid workspace path can be constructed from constants.""" + repo_name = "test-repo" + workspace_path = f"{WORKSPACE_DIR}/{repo_name}" + assert workspace_path == "/workspace/test-repo" + + def test_clone_command_can_be_constructed(self): + """Test that a valid git clone command can be constructed from constants.""" + repo = "owner/test-repo" + repo_name = repo.split("/")[1] + clone_cmd = f"cd {WORKSPACE_DIR}/ && git clone -b {GIT_BRANCH_NAME} https://github.com/{repo}.git {repo_name}" + assert ( + clone_cmd + == "cd /workspace/ && git clone -b commit0_combined https://github.com/owner/test-repo.git test-repo" + ) From 08844b2ee8785a5ca9964612fa73324708c7d401 Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 28 Jan 2026 10:04:40 +0000 Subject: [PATCH 2/3] Remove test_constants.py and revert WORKSPACE_DIR changes Per user request: - Remove benchmarks/commit0/tests/test_constants.py - Remove WORKSPACE_DIR constant from constants.py - Revert all WORKSPACE_DIR usages back to hardcoded '/workspace' Co-authored-by: openhands --- benchmarks/commit0/constants.py | 3 - benchmarks/commit0/run_infer.py | 17 ++- benchmarks/commit0/tests/test_constants.py | 160 --------------------- 3 files changed, 8 insertions(+), 172 deletions(-) delete mode 100644 benchmarks/commit0/tests/test_constants.py diff --git a/benchmarks/commit0/constants.py b/benchmarks/commit0/constants.py index e75c76b5..14d044d4 100644 --- a/benchmarks/commit0/constants.py +++ b/benchmarks/commit0/constants.py @@ -18,9 +18,6 @@ # Build configuration BUILD_TARGET = "source-minimal" -# Workspace configuration -WORKSPACE_DIR = "/workspace" - # Git configuration GIT_BRANCH_NAME = "commit0_combined" AGENT_BRANCH_NAME = "openhands" diff --git a/benchmarks/commit0/run_infer.py b/benchmarks/commit0/run_infer.py index c0fb83e5..145c9971 100644 --- a/benchmarks/commit0/run_infer.py +++ b/benchmarks/commit0/run_infer.py @@ -23,7 +23,6 @@ DEFAULT_REPO_SPLIT, DEFAULT_RUNTIME_API_URL, GIT_BRANCH_NAME, - WORKSPACE_DIR, ) from benchmarks.utils.args_parser import get_parser from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE @@ -200,7 +199,7 @@ def prepare_workspace( # Build agent-server image from base commit0 image workspace = DockerDevWorkspace( base_image=base_docker_image, - working_dir=WORKSPACE_DIR, + working_dir="/workspace", target=build_target, forward_env=forward_env or [], ) @@ -254,14 +253,14 @@ def prepare_workspace( # Clone the repository to the specific directory workspace_dir_name = instance.data["repo"].split("/")[1] - clone_cmd = f"cd {WORKSPACE_DIR}/ && git clone -b {GIT_BRANCH_NAME} https://github.com/{instance.data['repo']}.git {workspace_dir_name}" + clone_cmd = f"cd /workspace/ && git clone -b {GIT_BRANCH_NAME} https://github.com/{instance.data['repo']}.git {workspace_dir_name}" res = workspace.execute_command(clone_cmd, timeout=DEFAULT_COMMAND_TIMEOUT) if res.exit_code != 0: raise RuntimeError(f"Failed to clone repo: {res.stderr}") logger.info(f"Cloned repository: {instance.data['repo']}") # Create new branch - branch_cmd = f"cd {WORKSPACE_DIR}/{workspace_dir_name} && git checkout -b {AGENT_BRANCH_NAME}" + branch_cmd = f"cd /workspace/{workspace_dir_name} && git checkout -b {AGENT_BRANCH_NAME}" res = workspace.execute_command(branch_cmd, timeout=DEFAULT_COMMAND_TIMEOUT) if res.exit_code != 0: raise RuntimeError(f"Failed to create branch: {res.stderr}") @@ -269,14 +268,14 @@ def prepare_workspace( # Install commit0 # Try uv first, fall back to pip if uv is not available - install_cmd = f"cd {WORKSPACE_DIR}/{workspace_dir_name} && (uv pip install commit0 || pip install commit0)" + install_cmd = f"cd /workspace/{workspace_dir_name} && (uv pip install commit0 || pip install commit0)" res = workspace.execute_command(install_cmd, timeout=DEFAULT_COMMAND_TIMEOUT) if res.exit_code != 0: raise RuntimeError(f"Failed to install commit0: {res.stderr}") logger.info("Installed commit0") # Install pytest and required plugins for test reporting - plugin_install_cmd = f"cd {WORKSPACE_DIR}/{workspace_dir_name} && (uv pip install pytest pytest-json-report pytest-cov || pip install pytest pytest-json-report pytest-cov)" + plugin_install_cmd = f"cd /workspace/{workspace_dir_name} && (uv pip install pytest pytest-json-report pytest-cov || pip install pytest pytest-json-report pytest-cov)" res = workspace.execute_command( plugin_install_cmd, timeout=DEFAULT_COMMAND_TIMEOUT ) @@ -286,7 +285,7 @@ def prepare_workspace( # Verify pytest and plugin installation verify_pytest_cmd = ( - f"cd {WORKSPACE_DIR}/{workspace_dir_name} && python -m pytest --version" + f"cd /workspace/{workspace_dir_name} && python -m pytest --version" ) verify_pytest_res = workspace.execute_command(verify_pytest_cmd, timeout=60) logger.info(f"Pytest verification exit code: {verify_pytest_res.exit_code}") @@ -295,7 +294,7 @@ def prepare_workspace( else: logger.warning(f"Pytest verification failed: {verify_pytest_res.stderr}") - verify_plugin_cmd = f"cd {WORKSPACE_DIR}/{workspace_dir_name} && python -c 'import pytest_jsonreport; print(\"Plugin available\")'" + verify_plugin_cmd = f"cd /workspace/{workspace_dir_name} && python -c 'import pytest_jsonreport; print(\"Plugin available\")'" verify_plugin_res = workspace.execute_command(verify_plugin_cmd, timeout=60) logger.info(f"Plugin verification exit code: {verify_plugin_res.exit_code}") if verify_plugin_res.exit_code == 0: @@ -312,7 +311,7 @@ def evaluate_instance( Run agent, collect history, git patch, and test results. """ workspace_dir_name = instance.data["repo"].split("/")[1] - repo_path = f"{WORKSPACE_DIR}/{workspace_dir_name}" + repo_path = f"/workspace/{workspace_dir_name}" tools = get_default_tools(enable_browser=False) agent = Agent( diff --git a/benchmarks/commit0/tests/test_constants.py b/benchmarks/commit0/tests/test_constants.py deleted file mode 100644 index 73da46f5..00000000 --- a/benchmarks/commit0/tests/test_constants.py +++ /dev/null @@ -1,160 +0,0 @@ -"""Tests for commit0 constants.py.""" - -from benchmarks.commit0.constants import ( - AGENT_BRANCH_NAME, - BUILD_TARGET, - CUSTOM_TAG_PREFIX, - DEFAULT_COMMAND_TIMEOUT, - DEFAULT_CONVERSATION_TIMEOUT, - DEFAULT_DATASET, - DEFAULT_DATASET_SPLIT, - DEFAULT_DOCKER_IMAGE_PREFIX, - DEFAULT_IMAGE_TAG, - DEFAULT_MODEL_NAME, - DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT, - DEFAULT_REPO_SPLIT, - DEFAULT_RUNTIME_API_URL, - GIT_BRANCH_NAME, - TOTAL_INSTANCES, - WORKSPACE_DIR, -) - - -class TestDatasetConstants: - """Tests for dataset-related constants.""" - - def test_default_dataset_is_valid_huggingface_path(self): - """Test that DEFAULT_DATASET follows HuggingFace dataset path format.""" - assert "/" in DEFAULT_DATASET - assert DEFAULT_DATASET == "wentingzhao/commit0_combined" - - def test_default_dataset_split(self): - """Test that DEFAULT_DATASET_SPLIT is a valid split name.""" - assert DEFAULT_DATASET_SPLIT == "test" - - def test_default_repo_split(self): - """Test that DEFAULT_REPO_SPLIT is a valid repo split.""" - assert DEFAULT_REPO_SPLIT in ["lite", "all"] - - -class TestDockerConstants: - """Tests for Docker-related constants.""" - - def test_default_docker_image_prefix_format(self): - """Test that DEFAULT_DOCKER_IMAGE_PREFIX is a valid Docker registry prefix.""" - assert DEFAULT_DOCKER_IMAGE_PREFIX.endswith("/") - assert "docker.io" in DEFAULT_DOCKER_IMAGE_PREFIX - - def test_default_image_tag(self): - """Test that DEFAULT_IMAGE_TAG is a valid tag format.""" - assert DEFAULT_IMAGE_TAG == "v0" - assert not DEFAULT_IMAGE_TAG.startswith(":") - - def test_custom_tag_prefix(self): - """Test that CUSTOM_TAG_PREFIX is a valid prefix.""" - assert CUSTOM_TAG_PREFIX == "commit0-" - assert CUSTOM_TAG_PREFIX.endswith("-") - - -class TestBuildConstants: - """Tests for build-related constants.""" - - def test_build_target(self): - """Test that BUILD_TARGET is a valid build target.""" - assert BUILD_TARGET == "source-minimal" - - -class TestWorkspaceConstants: - """Tests for workspace-related constants.""" - - def test_workspace_dir_is_absolute_path(self): - """Test that WORKSPACE_DIR is an absolute path.""" - assert WORKSPACE_DIR.startswith("/") - assert WORKSPACE_DIR == "/workspace" - - -class TestGitConstants: - """Tests for Git-related constants.""" - - def test_git_branch_name(self): - """Test that GIT_BRANCH_NAME is a valid branch name.""" - assert GIT_BRANCH_NAME == "commit0_combined" - assert " " not in GIT_BRANCH_NAME - - def test_agent_branch_name(self): - """Test that AGENT_BRANCH_NAME is a valid branch name.""" - assert AGENT_BRANCH_NAME == "openhands" - assert " " not in AGENT_BRANCH_NAME - - -class TestModelConstants: - """Tests for model-related constants.""" - - def test_default_model_name(self): - """Test that DEFAULT_MODEL_NAME is set.""" - assert DEFAULT_MODEL_NAME == "openhands" - - -class TestRuntimeConstants: - """Tests for runtime-related constants.""" - - def test_default_runtime_api_url_is_valid_url(self): - """Test that DEFAULT_RUNTIME_API_URL is a valid URL.""" - assert DEFAULT_RUNTIME_API_URL.startswith("https://") - assert "runtime" in DEFAULT_RUNTIME_API_URL - - def test_default_remote_runtime_startup_timeout_is_positive(self): - """Test that DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT is positive.""" - assert DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT > 0 - assert DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT == 600 - - def test_default_conversation_timeout_is_positive(self): - """Test that DEFAULT_CONVERSATION_TIMEOUT is positive.""" - assert DEFAULT_CONVERSATION_TIMEOUT > 0 - assert DEFAULT_CONVERSATION_TIMEOUT == 3600 - - def test_default_command_timeout_is_positive(self): - """Test that DEFAULT_COMMAND_TIMEOUT is positive.""" - assert DEFAULT_COMMAND_TIMEOUT > 0 - assert DEFAULT_COMMAND_TIMEOUT == 600 - - -class TestEvaluationConstants: - """Tests for evaluation-related constants.""" - - def test_total_instances_is_positive(self): - """Test that TOTAL_INSTANCES is positive.""" - assert TOTAL_INSTANCES > 0 - assert TOTAL_INSTANCES == 16 - - -class TestConstantsIntegration: - """Integration tests for constants usage.""" - - def test_docker_image_can_be_constructed(self): - """Test that a valid Docker image name can be constructed from constants.""" - repo_name = "test-repo" - image = f"{DEFAULT_DOCKER_IMAGE_PREFIX}{repo_name}:{DEFAULT_IMAGE_TAG}" - assert image == "docker.io/wentingzhao/test-repo:v0" - - def test_custom_tag_can_be_constructed(self): - """Test that a valid custom tag can be constructed from constants.""" - repo_name = "test-repo" - custom_tag = f"{CUSTOM_TAG_PREFIX}{repo_name}" - assert custom_tag == "commit0-test-repo" - - def test_workspace_path_can_be_constructed(self): - """Test that a valid workspace path can be constructed from constants.""" - repo_name = "test-repo" - workspace_path = f"{WORKSPACE_DIR}/{repo_name}" - assert workspace_path == "/workspace/test-repo" - - def test_clone_command_can_be_constructed(self): - """Test that a valid git clone command can be constructed from constants.""" - repo = "owner/test-repo" - repo_name = repo.split("/")[1] - clone_cmd = f"cd {WORKSPACE_DIR}/ && git clone -b {GIT_BRANCH_NAME} https://github.com/{repo}.git {repo_name}" - assert ( - clone_cmd - == "cd /workspace/ && git clone -b commit0_combined https://github.com/owner/test-repo.git test-repo" - ) From 0cc47a97ef5a0e12d419e9dc59d221400e72592a Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 28 Jan 2026 13:32:35 +0000 Subject: [PATCH 3/3] style: fix ruff formatting in run_infer.py Co-authored-by: openhands --- benchmarks/commit0/run_infer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/benchmarks/commit0/run_infer.py b/benchmarks/commit0/run_infer.py index 145c9971..0ab93f4a 100644 --- a/benchmarks/commit0/run_infer.py +++ b/benchmarks/commit0/run_infer.py @@ -260,7 +260,9 @@ def prepare_workspace( logger.info(f"Cloned repository: {instance.data['repo']}") # Create new branch - branch_cmd = f"cd /workspace/{workspace_dir_name} && git checkout -b {AGENT_BRANCH_NAME}" + branch_cmd = ( + f"cd /workspace/{workspace_dir_name} && git checkout -b {AGENT_BRANCH_NAME}" + ) res = workspace.execute_command(branch_cmd, timeout=DEFAULT_COMMAND_TIMEOUT) if res.exit_code != 0: raise RuntimeError(f"Failed to create branch: {res.stderr}")