diff --git a/benchmarks/commit0/build_images.py b/benchmarks/commit0/build_images.py index b59704ea..3f784b9c 100644 --- a/benchmarks/commit0/build_images.py +++ b/benchmarks/commit0/build_images.py @@ -13,6 +13,13 @@ from commit0.harness.constants import SPLIT +from benchmarks.commit0.constants import ( + CUSTOM_TAG_PREFIX, + DEFAULT_DATASET, + DEFAULT_DOCKER_IMAGE_PREFIX, + DEFAULT_IMAGE_TAG, + DEFAULT_REPO_SPLIT, +) from benchmarks.utils.build_utils import ( build_all_images, default_build_output_dir, @@ -22,7 +29,6 @@ logger = get_logger(__name__) -DEFAULT_DOCKER_IMAGE_PREFIX = "docker.io/wentingzhao/" def get_base_docker_image( @@ -33,14 +39,14 @@ def get_base_docker_image( prefix = docker_image_prefix or os.getenv( "EVAL_DOCKER_IMAGE_PREFIX", DEFAULT_DOCKER_IMAGE_PREFIX ) - return (prefix.rstrip("/") + "/" + repo_name).lower() + ":v0" + return (prefix.rstrip("/") + "/" + repo_name).lower() + f":{DEFAULT_IMAGE_TAG}" def extract_custom_tag(base_image: str) -> str: """Extract Commit0 custom tag from a base image name.""" repo_tag = base_image.rsplit("/", 1)[-1] repo_name = repo_tag.split(":", 1)[0].lower() - return f"commit0-{repo_name}" + return f"{CUSTOM_TAG_PREFIX}{repo_name}" def _load_selected_instances(selected_instances_file: str) -> list[str]: @@ -90,7 +96,7 @@ def main(argv: list[str]) -> int: parser.add_argument( "--repo-split", type=str, - default="lite", + default=DEFAULT_REPO_SPLIT, help="Commit0 repo split (lite, all, or repo name)", ) parser.add_argument( @@ -99,7 +105,7 @@ def main(argv: list[str]) -> int: default="", help="Override base image prefix (default: env EVAL_DOCKER_IMAGE_PREFIX)", ) - parser.set_defaults(dataset="wentingzhao/commit0_combined") + parser.set_defaults(dataset=DEFAULT_DATASET) args = parser.parse_args(argv) docker_image_prefix = args.docker_image_prefix or None diff --git a/benchmarks/commit0/constants.py b/benchmarks/commit0/constants.py new file mode 100644 index 00000000..14d044d4 --- /dev/null +++ b/benchmarks/commit0/constants.py @@ -0,0 +1,35 @@ +""" +Commit0 Benchmark Constants + +This module serves as the single source of truth for all hyperparameters +and constant values used in the Commit0 benchmark evaluation workflow. +""" + +# Dataset configuration +DEFAULT_DATASET = "wentingzhao/commit0_combined" +DEFAULT_DATASET_SPLIT = "test" +DEFAULT_REPO_SPLIT = "lite" + +# Docker image configuration +DEFAULT_DOCKER_IMAGE_PREFIX = "docker.io/wentingzhao/" +DEFAULT_IMAGE_TAG = "v0" +CUSTOM_TAG_PREFIX = "commit0-" + +# Build configuration +BUILD_TARGET = "source-minimal" + +# Git configuration +GIT_BRANCH_NAME = "commit0_combined" +AGENT_BRANCH_NAME = "openhands" + +# Model configuration +DEFAULT_MODEL_NAME = "openhands" + +# Runtime configuration +DEFAULT_RUNTIME_API_URL = "https://runtime.eval.all-hands.dev" +DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT = 600 +DEFAULT_CONVERSATION_TIMEOUT = 3600 +DEFAULT_COMMAND_TIMEOUT = 600 + +# Evaluation configuration +TOTAL_INSTANCES = 16 diff --git a/benchmarks/commit0/eval_infer.py b/benchmarks/commit0/eval_infer.py index f03e73f6..fcc7885f 100644 --- a/benchmarks/commit0/eval_infer.py +++ b/benchmarks/commit0/eval_infer.py @@ -15,6 +15,7 @@ import sys from pathlib import Path +from benchmarks.commit0.constants import DEFAULT_MODEL_NAME, TOTAL_INSTANCES from benchmarks.utils.laminar import LaminarService from benchmarks.utils.report_costs import generate_cost_report @@ -27,7 +28,7 @@ def process_commit0_results( - input_file: str, output_file: str, model_name: str = "openhands" + input_file: str, output_file: str, model_name: str = DEFAULT_MODEL_NAME ) -> None: """ Process Commit0 output.jsonl and generate evaluation report. @@ -123,7 +124,7 @@ def process_commit0_results( # Generate report report = { "model_name_or_path": model_name, - "total_instances": 16, # Fixed as per requirement + "total_instances": TOTAL_INSTANCES, "submitted_instances": len(completed_ids), "completed_instances": len(completed_ids), "resolved_instances": len(resolved_ids), @@ -174,8 +175,8 @@ def main() -> None: parser.add_argument( "--model-name", - default="openhands", - help="Model name to use in the model_name_or_path field (default: openhands)", + default=DEFAULT_MODEL_NAME, + help=f"Model name to use in the model_name_or_path field (default: {DEFAULT_MODEL_NAME})", ) args = parser.parse_args() diff --git a/benchmarks/commit0/run_infer.py b/benchmarks/commit0/run_infer.py index 2e473669..0ab93f4a 100644 --- a/benchmarks/commit0/run_infer.py +++ b/benchmarks/commit0/run_infer.py @@ -12,6 +12,18 @@ extract_custom_tag, get_base_docker_image, ) +from benchmarks.commit0.constants import ( + AGENT_BRANCH_NAME, + BUILD_TARGET, + DEFAULT_COMMAND_TIMEOUT, + DEFAULT_CONVERSATION_TIMEOUT, + DEFAULT_DATASET, + DEFAULT_DATASET_SPLIT, + DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT, + DEFAULT_REPO_SPLIT, + DEFAULT_RUNTIME_API_URL, + GIT_BRANCH_NAME, +) from benchmarks.utils.args_parser import get_parser from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE from benchmarks.utils.conversation import build_event_persistence_callback @@ -110,9 +122,9 @@ def __init__( self, metadata: EvalMetadata, num_workers: int = 1, - repo_split: str = "lite", - dataset_name: str = "wentingzhao/commit0_combined", - dataset_split: str = "test", + repo_split: str = DEFAULT_REPO_SPLIT, + dataset_name: str = DEFAULT_DATASET, + dataset_split: str = DEFAULT_DATASET_SPLIT, ): super().__init__(metadata=metadata, num_workers=num_workers) # Store additional parameters in metadata.details for access in methods @@ -130,9 +142,9 @@ def prepare_instances(self) -> List[EvalInstance]: logger.info("Setting up Commit0 evaluation data") details = self.metadata.details or {} - dataset_name = details.get("dataset_name", "wentingzhao/commit0_combined") - dataset_split = details.get("dataset_split", "test") - repo_split = details.get("repo_split", "lite") + dataset_name = details.get("dataset_name", DEFAULT_DATASET) + dataset_split = details.get("dataset_split", DEFAULT_DATASET_SPLIT) + repo_split = details.get("repo_split", DEFAULT_REPO_SPLIT) dataset = load_dataset(dataset_name, split=dataset_split) df = commit0_setup(dataset, repo_split) @@ -180,7 +192,7 @@ def prepare_workspace( """ repo_name = instance.data["repo"].split("/")[1] base_docker_image = get_base_docker_image(repo_name) - build_target = "source-minimal" + build_target = BUILD_TARGET logger.info(f"Using base docker image: {base_docker_image}") if self.metadata.workspace_type == "docker": @@ -218,11 +230,14 @@ def prepare_workspace( f"Using remote workspace with image {agent_server_image} " f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})" ) - startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600")) + startup_timeout = float( + os.getenv( + "REMOTE_RUNTIME_STARTUP_TIMEOUT", + str(DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT), + ) + ) workspace = APIRemoteWorkspace( - runtime_api_url=os.getenv( - "RUNTIME_API_URL", "https://runtime.eval.all-hands.dev" - ), + runtime_api_url=os.getenv("RUNTIME_API_URL", DEFAULT_RUNTIME_API_URL), runtime_api_key=runtime_api_key, server_image=agent_server_image, target_type="source" if "source" in build_target else "binary", @@ -238,30 +253,34 @@ def prepare_workspace( # Clone the repository to the specific directory workspace_dir_name = instance.data["repo"].split("/")[1] - clone_cmd = f"cd /workspace/ && git clone -b commit0_combined https://github.com/{instance.data['repo']}.git {workspace_dir_name}" - res = workspace.execute_command(clone_cmd, timeout=600) + clone_cmd = f"cd /workspace/ && git clone -b {GIT_BRANCH_NAME} https://github.com/{instance.data['repo']}.git {workspace_dir_name}" + res = workspace.execute_command(clone_cmd, timeout=DEFAULT_COMMAND_TIMEOUT) if res.exit_code != 0: raise RuntimeError(f"Failed to clone repo: {res.stderr}") logger.info(f"Cloned repository: {instance.data['repo']}") # Create new branch - branch_cmd = f"cd /workspace/{workspace_dir_name} && git checkout -b openhands" - res = workspace.execute_command(branch_cmd, timeout=600) + branch_cmd = ( + f"cd /workspace/{workspace_dir_name} && git checkout -b {AGENT_BRANCH_NAME}" + ) + res = workspace.execute_command(branch_cmd, timeout=DEFAULT_COMMAND_TIMEOUT) if res.exit_code != 0: raise RuntimeError(f"Failed to create branch: {res.stderr}") - logger.info("Created new branch: openhands") + logger.info(f"Created new branch: {AGENT_BRANCH_NAME}") # Install commit0 # Try uv first, fall back to pip if uv is not available install_cmd = f"cd /workspace/{workspace_dir_name} && (uv pip install commit0 || pip install commit0)" - res = workspace.execute_command(install_cmd, timeout=600) + res = workspace.execute_command(install_cmd, timeout=DEFAULT_COMMAND_TIMEOUT) if res.exit_code != 0: raise RuntimeError(f"Failed to install commit0: {res.stderr}") logger.info("Installed commit0") # Install pytest and required plugins for test reporting plugin_install_cmd = f"cd /workspace/{workspace_dir_name} && (uv pip install pytest pytest-json-report pytest-cov || pip install pytest pytest-json-report pytest-cov)" - res = workspace.execute_command(plugin_install_cmd, timeout=600) + res = workspace.execute_command( + plugin_install_cmd, timeout=DEFAULT_COMMAND_TIMEOUT + ) if res.exit_code != 0: raise RuntimeError(f"Failed to install pytest and plugins: {res.stderr}") logger.info("Installed pytest and required plugins") @@ -323,20 +342,24 @@ def evaluate_instance( metadata=self.metadata, ) conversation.send_message(instruction) - run_timeout = int(os.getenv("CONVERSATION_TIMEOUT", "3600")) + run_timeout = int( + os.getenv("CONVERSATION_TIMEOUT", str(DEFAULT_CONVERSATION_TIMEOUT)) + ) conversation.run(timeout=run_timeout) history = list(conversation.state.events) # Complete runtime: git add, commit, diff, run tests - workspace.execute_command(f"cd {repo_path} && git add .", timeout=600) + workspace.execute_command( + f"cd {repo_path} && git add .", timeout=DEFAULT_COMMAND_TIMEOUT + ) # Use --no-verify to bypass pre-commit hooks (e.g., husky) that can fail workspace.execute_command( f"cd {repo_path} && " 'git config --global user.email "evaluation@openhands.dev" && ' 'git config --global user.name "OpenHands Evaluation" && ' - 'git commit --no-verify -m "openhands edits"', - timeout=600, + f'git commit --no-verify -m "{AGENT_BRANCH_NAME} edits"', + timeout=DEFAULT_COMMAND_TIMEOUT, ) # Get git patch @@ -345,7 +368,7 @@ def evaluate_instance( for retry in range(5): patch_result = workspace.execute_command( f"cd {repo_path} && git diff {base_commit} HEAD -- . ':(exclude)spec.pdf.bz2'", - timeout=600 + 100 * retry, + timeout=DEFAULT_COMMAND_TIMEOUT + 100 * retry, ) if patch_result.exit_code == 0: git_patch = patch_result.stdout.strip() @@ -363,7 +386,9 @@ def evaluate_instance( test_cmd = "python -m pytest" full_test_cmd = f"cd {repo_path} && {test_cmd} --json-report --json-report-file=report.json --continue-on-collection-errors {test_dir} > test_output.txt 2>&1" logger.info(f"Running test command: {full_test_cmd}") - test_result = workspace.execute_command(full_test_cmd, timeout=600) + test_result = workspace.execute_command( + full_test_cmd, timeout=DEFAULT_COMMAND_TIMEOUT + ) logger.info(f"Test command exit code: {test_result.exit_code}") if test_result.exit_code != 0: logger.warning(f"Test command failed with stderr: {test_result.stderr}") @@ -372,7 +397,7 @@ def evaluate_instance( # Read test output test_output_result = workspace.execute_command( f"cd {repo_path} && cat test_output.txt", - timeout=600, + timeout=DEFAULT_COMMAND_TIMEOUT, ) test_output = ( test_output_result.stdout.strip() @@ -388,7 +413,7 @@ def evaluate_instance( repo_name_normalized = repo_name.replace(".", "-") test_ids_result = workspace.execute_command( f"cd {repo_path} && commit0 get-tests {repo_name_normalized}", - timeout=600, + timeout=DEFAULT_COMMAND_TIMEOUT, ) test_ids = ( test_ids_result.stdout.strip().split("\n") @@ -405,7 +430,7 @@ def evaluate_instance( # Read test report report_result = workspace.execute_command( f"cd {repo_path} && cat report.json", - timeout=600, + timeout=DEFAULT_COMMAND_TIMEOUT, ) # Debug logging for report @@ -593,11 +618,11 @@ def main() -> None: parser.add_argument( "--repo-split", type=str, - default="lite", + default=DEFAULT_REPO_SPLIT, help="all, lite, or each repo name", ) # Override the default dataset for commit0 - parser.set_defaults(dataset="wentingzhao/commit0_combined") + parser.set_defaults(dataset=DEFAULT_DATASET) args = parser.parse_args() # Validate max_attempts