diff --git a/benchmarks/multiswebench/build_images.py b/benchmarks/multiswebench/build_images.py index 3ecdeeb6..38b8823d 100644 --- a/benchmarks/multiswebench/build_images.py +++ b/benchmarks/multiswebench/build_images.py @@ -11,6 +11,12 @@ import os from pathlib import Path +from benchmarks.multiswebench.constants import ( + DEFAULT_DOCKER_IMAGE_PREFIX, + DEFAULT_LANGUAGE, + DOCKER_IMAGE_PREFIX_ENV_VAR, + LANGUAGE_ENV_VAR, +) from benchmarks.utils.build_utils import ( build_all_images, default_build_output_dir, @@ -23,8 +29,10 @@ logger = get_logger(__name__) # Environment variables for multi-language support -DOCKER_IMAGE_PREFIX = os.environ.get("EVAL_DOCKER_IMAGE_PREFIX", "mswebench") -LANGUAGE = os.environ.get("LANGUAGE", "java") +DOCKER_IMAGE_PREFIX = os.environ.get( + DOCKER_IMAGE_PREFIX_ENV_VAR, DEFAULT_DOCKER_IMAGE_PREFIX +) +LANGUAGE = os.environ.get(LANGUAGE_ENV_VAR, DEFAULT_LANGUAGE) def get_official_docker_image( diff --git a/benchmarks/multiswebench/constants.py b/benchmarks/multiswebench/constants.py new file mode 100644 index 00000000..f90c158e --- /dev/null +++ b/benchmarks/multiswebench/constants.py @@ -0,0 +1,98 @@ +""" +Constants and hyperparameters for Multi-SWE-Bench evaluation. + +This module serves as the single source of truth for all constant values +used throughout the Multi-SWE-Bench benchmark implementation. 
+""" + +# ============================================================================= +# Dataset Configuration +# ============================================================================= + +# Default dataset name on HuggingFace +DEFAULT_DATASET = "bytedance-research/Multi-SWE-Bench" + +# Default dataset split +DEFAULT_SPLIT = "test" + +# Default programming language +DEFAULT_LANGUAGE = "java" + +# Default model name for predictions +DEFAULT_MODEL_NAME = "OpenHands" + +# ============================================================================= +# Docker/Image Configuration +# ============================================================================= + +# Default Docker image prefix for Multi-SWE-Bench +DEFAULT_DOCKER_IMAGE_PREFIX = "mswebench" + +# Default build target for agent server images +DEFAULT_BUILD_TARGET = "source-minimal" + +# Environment variable names +DOCKER_IMAGE_PREFIX_ENV_VAR = "EVAL_DOCKER_IMAGE_PREFIX" +LANGUAGE_ENV_VAR = "LANGUAGE" +SKIP_BUILD_ENV_VAR = "MULTI_SWE_BENCH_SKIP_BUILD" + +# ============================================================================= +# Runtime Configuration +# ============================================================================= + +# Default runtime API URL for remote workspace +DEFAULT_RUNTIME_API_URL = "https://runtime.eval.all-hands.dev" + +# Default startup timeout in seconds +DEFAULT_STARTUP_TIMEOUT = 600 + +# Environment variable names for runtime configuration +USE_HINT_TEXT_ENV_VAR = "USE_HINT_TEXT" +USE_INSTANCE_IMAGE_ENV_VAR = "USE_INSTANCE_IMAGE" +RUN_WITH_BROWSING_ENV_VAR = "RUN_WITH_BROWSING" +RUNTIME_API_KEY_ENV_VAR = "RUNTIME_API_KEY" +RUNTIME_API_URL_ENV_VAR = "RUNTIME_API_URL" +SDK_SHORT_SHA_ENV_VAR = "SDK_SHORT_SHA" +REMOTE_RUNTIME_STARTUP_TIMEOUT_ENV_VAR = "REMOTE_RUNTIME_STARTUP_TIMEOUT" + +# Default values for boolean environment variables +DEFAULT_USE_HINT_TEXT = False +DEFAULT_USE_INSTANCE_IMAGE = True +DEFAULT_RUN_WITH_BROWSING = False + +# 
============================================================================= +# Evaluation Harness Configuration +# ============================================================================= + +# Default configuration template for Multi-SWE-Bench evaluation harness. +# Dynamic values (paths) are added at runtime. +DEFAULT_EVAL_HARNESS_CONFIG = { + "mode": "evaluation", + "force_build": True, + "need_clone": True, + "clear_env": True, + "stop_on_error": False, + "max_workers": 5, + "max_workers_build_image": 5, + "max_workers_run_instance": 5, + "log_level": "DEBUG", + "fix_patch_run_cmd": ( + 'bash -c "apt update ; apt install -y patch ; ' + "sed -i 's@git apply.*@patch --batch --fuzz=5 -p1 -i /home/test.patch;" + "patch --batch --fuzz=5 -p1 -i /home/fix.patch@g' /home/fix-run.sh ; " + 'chmod +x /home/*.sh ; /home/fix-run.sh"' + ), + "specifics": [], + "skips": [], + "global_env": [], +} + +# ============================================================================= +# Workspace Configuration +# ============================================================================= + +# Default working directory in container +DEFAULT_WORKING_DIR = "/workspace" + +# Default environment setup commands +DEFAULT_ENV_SETUP_COMMANDS = ["export PIP_CACHE_DIR=~/.cache/pip"] diff --git a/benchmarks/multiswebench/eval_infer.py b/benchmarks/multiswebench/eval_infer.py index 3bb88cf1..2b509f4d 100644 --- a/benchmarks/multiswebench/eval_infer.py +++ b/benchmarks/multiswebench/eval_infer.py @@ -14,6 +14,12 @@ import subprocess from pathlib import Path +from benchmarks.multiswebench.constants import ( + DEFAULT_DATASET, + DEFAULT_LANGUAGE, + DEFAULT_MODEL_NAME, + DEFAULT_SPLIT, +) from benchmarks.multiswebench.download_dataset import download_and_concat_dataset from benchmarks.multiswebench.scripts.eval.update_multi_swe_bench_config import ( update_multi_swe_config, @@ -29,7 +35,7 @@ def run_multi_swebench_evaluation( dataset_name: str | None = None, split: str | None = None, 
input_file: str | None = None, - lang: str = "java", + lang: str = DEFAULT_LANGUAGE, ): """ Run Multi-SWE-Bench evaluation using the predictions file. @@ -46,9 +52,9 @@ def run_multi_swebench_evaluation( # Default dataset and split if not provided if dataset_name is None: - dataset_name = "bytedance-research/Multi-SWE-Bench" + dataset_name = DEFAULT_DATASET if split is None: - split = "test" + split = DEFAULT_SPLIT try: if input_file is None: @@ -108,14 +114,12 @@ def main(): parser = argparse.ArgumentParser(description="Multi-SWE-Bench Evaluation") parser.add_argument("input_file", help="Path to OpenHands output.jsonl file") parser.add_argument( - "--model-name", default="OpenHands", help="Model name for predictions" - ) - parser.add_argument( - "--dataset", default="bytedance-research/Multi-SWE-Bench", help="Dataset name" + "--model-name", default=DEFAULT_MODEL_NAME, help="Model name for predictions" ) - parser.add_argument("--split", default="test", help="Dataset split") + parser.add_argument("--dataset", default=DEFAULT_DATASET, help="Dataset name") + parser.add_argument("--split", default=DEFAULT_SPLIT, help="Dataset split") parser.add_argument( - "--lang", default="java", help="Language for Multi-SWE-bench dataset" + "--lang", default=DEFAULT_LANGUAGE, help="Language for Multi-SWE-bench dataset" ) parser.add_argument( "--skip-evaluation", diff --git a/benchmarks/multiswebench/run_infer.py b/benchmarks/multiswebench/run_infer.py index 7eae1c6c..a5ec9edd 100644 --- a/benchmarks/multiswebench/run_infer.py +++ b/benchmarks/multiswebench/run_infer.py @@ -11,6 +11,27 @@ extract_custom_tag, get_official_docker_image, ) +from benchmarks.multiswebench.constants import ( + DEFAULT_BUILD_TARGET, + DEFAULT_DOCKER_IMAGE_PREFIX, + DEFAULT_ENV_SETUP_COMMANDS, + DEFAULT_LANGUAGE, + DEFAULT_RUN_WITH_BROWSING, + DEFAULT_RUNTIME_API_URL, + DEFAULT_STARTUP_TIMEOUT, + DEFAULT_USE_HINT_TEXT, + DEFAULT_USE_INSTANCE_IMAGE, + DEFAULT_WORKING_DIR, + DOCKER_IMAGE_PREFIX_ENV_VAR, + 
REMOTE_RUNTIME_STARTUP_TIMEOUT_ENV_VAR, + RUN_WITH_BROWSING_ENV_VAR, + RUNTIME_API_KEY_ENV_VAR, + RUNTIME_API_URL_ENV_VAR, + SDK_SHORT_SHA_ENV_VAR, + SKIP_BUILD_ENV_VAR, + USE_HINT_TEXT_ENV_VAR, + USE_INSTANCE_IMAGE_ENV_VAR, +) from benchmarks.multiswebench.download_dataset import download_and_concat_dataset from benchmarks.multiswebench.scripts.data.data_change import format_data_for_inference from benchmarks.utils.args_parser import get_parser @@ -42,18 +63,33 @@ class MultiSWEBenchEvalMetadata(EvalMetadata): """Extended metadata for Multi-SWE-bench evaluation with language support.""" lang: str = Field( - default="java", description="Language for Multi-SWE-bench dataset" + default=DEFAULT_LANGUAGE, description="Language for Multi-SWE-bench dataset" ) logger = get_logger(__name__) # Environment variables for Multi-SWE-Bench configuration -USE_HINT_TEXT = os.environ.get("USE_HINT_TEXT", "false").lower() == "true" -USE_INSTANCE_IMAGE = os.environ.get("USE_INSTANCE_IMAGE", "true").lower() == "true" -RUN_WITH_BROWSING = os.environ.get("RUN_WITH_BROWSING", "false").lower() == "true" +USE_HINT_TEXT = ( + os.environ.get(USE_HINT_TEXT_ENV_VAR, str(DEFAULT_USE_HINT_TEXT).lower()).lower() + == "true" +) +USE_INSTANCE_IMAGE = ( + os.environ.get( + USE_INSTANCE_IMAGE_ENV_VAR, str(DEFAULT_USE_INSTANCE_IMAGE).lower() + ).lower() + == "true" +) +RUN_WITH_BROWSING = ( + os.environ.get( + RUN_WITH_BROWSING_ENV_VAR, str(DEFAULT_RUN_WITH_BROWSING).lower() + ).lower() + == "true" +) # For Multi-SWE-Bench, force mswebench prefix instead of the general SWE-Bench prefix -DOCKER_IMAGE_PREFIX = os.environ.get("EVAL_DOCKER_IMAGE_PREFIX", "mswebench") +DOCKER_IMAGE_PREFIX = os.environ.get( + DOCKER_IMAGE_PREFIX_ENV_VAR, DEFAULT_DOCKER_IMAGE_PREFIX +) logger.info(f"Using docker image prefix: {DOCKER_IMAGE_PREFIX}") @@ -200,7 +236,7 @@ def prepare_workspace( instance.data, docker_image_prefix=DOCKER_IMAGE_PREFIX ) logger.info(f"Using official docker image: {official_docker_image}") - 
build_target = "source-minimal" + build_target = DEFAULT_BUILD_TARGET custom_tag = extract_custom_tag(official_docker_image) # For non-binary targets, append target suffix suffix = f"-{build_target}" if build_target != "binary" else "" @@ -209,7 +245,7 @@ def prepare_workspace( agent_server_image = ( f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}" ) - SKIP_BUILD = os.getenv("MULTI_SWE_BENCH_SKIP_BUILD", "0").lower() in ( + SKIP_BUILD = os.getenv(SKIP_BUILD_ENV_VAR, "0").lower() in ( "1", "true", "yes", @@ -241,15 +277,15 @@ def prepare_workspace( workspace = DockerWorkspace( server_image=agent_server_image, - working_dir="/workspace", + working_dir=DEFAULT_WORKING_DIR, forward_env=forward_env or [], ) elif self.metadata.workspace_type == "remote": - runtime_api_key = os.getenv("RUNTIME_API_KEY") - sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA) + runtime_api_key = os.getenv(RUNTIME_API_KEY_ENV_VAR) + sdk_short_sha = os.getenv(SDK_SHORT_SHA_ENV_VAR, SDK_SHORT_SHA) if not runtime_api_key: raise ValueError( - "RUNTIME_API_KEY environment variable is not set for remote workspace" + f"{RUNTIME_API_KEY_ENV_VAR} environment variable is not set for remote workspace" ) agent_server_image = ( @@ -264,10 +300,14 @@ def prepare_workspace( f"Using remote workspace with image {agent_server_image} " f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})" ) - startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600")) + startup_timeout = float( + os.getenv( + REMOTE_RUNTIME_STARTUP_TIMEOUT_ENV_VAR, str(DEFAULT_STARTUP_TIMEOUT) + ) + ) workspace = APIRemoteWorkspace( runtime_api_url=os.getenv( - "RUNTIME_API_URL", "https://runtime.eval.all-hands.dev" + RUNTIME_API_URL_ENV_VAR, DEFAULT_RUNTIME_API_URL ), runtime_api_key=runtime_api_key, server_image=agent_server_image, @@ -432,7 +472,7 @@ def main() -> None: parser.add_argument( "--lang", type=str, - default="java", + default=DEFAULT_LANGUAGE, help="Language for Multi-SWE-bench 
dataset", ) args = parser.parse_args() @@ -475,7 +515,7 @@ def main() -> None: details={}, prompt_path=args.prompt_path, eval_limit=args.n_limit, - env_setup_commands=["export PIP_CACHE_DIR=~/.cache/pip"], + env_setup_commands=DEFAULT_ENV_SETUP_COMMANDS, max_attempts=args.max_attempts, critic=critic, selected_instances_file=args.select, diff --git a/benchmarks/multiswebench/scripts/eval/update_multi_swe_bench_config.py b/benchmarks/multiswebench/scripts/eval/update_multi_swe_bench_config.py index c18ca6d7..4ec4cdb8 100644 --- a/benchmarks/multiswebench/scripts/eval/update_multi_swe_bench_config.py +++ b/benchmarks/multiswebench/scripts/eval/update_multi_swe_bench_config.py @@ -2,6 +2,7 @@ import json import os +from benchmarks.multiswebench.constants import DEFAULT_EVAL_HARNESS_CONFIG from benchmarks.multiswebench.scripts.eval.convert import convert_to_eval_format @@ -18,32 +19,14 @@ def update_multi_swe_config(output_jsonl_path, config_path, dataset): os.makedirs(os.path.join(path_to_parent, "eval_files", "repos"), exist_ok=True) os.makedirs(os.path.join(path_to_parent, "eval_files", "logs"), exist_ok=True) - # Prepare config dict - config = { - "mode": "evaluation", - "workdir": os.path.join(path_to_parent, "eval_files", "workdir"), - "patch_files": [converted_path], - "dataset_files": [dataset], - "force_build": True, - "output_dir": os.path.join(path_to_parent, "eval_files", "dataset"), - "specifics": [], - "skips": [], - "repo_dir": os.path.join(path_to_parent, "eval_files", "repos"), - "need_clone": True, - "global_env": [], - "clear_env": True, - "stop_on_error": False, - "max_workers": 5, - "max_workers_build_image": 5, - "max_workers_run_instance": 5, - "log_dir": os.path.join(path_to_parent, "eval_files", "logs"), - "log_level": "DEBUG", - "fix_patch_run_cmd": ( - 'bash -c "apt update ; apt install -y patch ; ' - "sed -i 's@git apply.*@patch --batch --fuzz=5 -p1 -i /home/test.patch;" - "patch --batch --fuzz=5 -p1 -i /home/fix.patch@g' /home/fix-run.sh ; 
chmod +x /home/*.sh ; /home/fix-run.sh\"" - ), - } + # Start with a defensive copy of the default config and add dynamic paths + config = {k: (v.copy() if isinstance(v, list) else v) for k, v in DEFAULT_EVAL_HARNESS_CONFIG.items()} + config["workdir"] = os.path.join(path_to_parent, "eval_files", "workdir") + config["patch_files"] = [converted_path] + config["dataset_files"] = [dataset] + config["output_dir"] = os.path.join(path_to_parent, "eval_files", "dataset") + config["repo_dir"] = os.path.join(path_to_parent, "eval_files", "repos") + config["log_dir"] = os.path.join(path_to_parent, "eval_files", "logs") # Save to multibench.config os.makedirs(os.path.dirname(config_path), exist_ok=True)