From bd1e60e70cd98bed74663053170b24e2378a7fb1 Mon Sep 17 00:00:00 2001 From: openhands Date: Tue, 27 Jan 2026 18:09:43 +0000 Subject: [PATCH 1/3] Regroup all multiswebench hyperparameters in constants.py This commit creates a single source of truth for all Multi-SWE-Bench constant values and hyperparameters by: 1. Creating benchmarks/multiswebench/constants.py with all constants: - Dataset configuration (DEFAULT_DATASET, DEFAULT_SPLIT, DEFAULT_LANGUAGE, etc.) - Docker/Image configuration (DEFAULT_DOCKER_IMAGE_PREFIX, DEFAULT_BUILD_TARGET, etc.) - Runtime configuration (DEFAULT_RUNTIME_API_URL, DEFAULT_STARTUP_TIMEOUT, etc.) - Evaluation configuration (DEFAULT_EVAL_MODE, DEFAULT_MAX_WORKERS, etc.) - Path configuration (DATASET_CACHE_DIR, DEFAULT_WORKING_DIR, etc.) - Environment variable names for all configurable values 2. Updating all multiswebench modules to import from constants.py: - build_images.py - download_dataset.py - eval_infer.py - run_infer.py - scripts/data/data_change.py - scripts/eval/update_multi_swe_bench_config.py 3. Adding comprehensive tests in tests/test_multiswebench_constants.py Fixes #366 Co-authored-by: openhands --- benchmarks/multiswebench/build_images.py | 12 +- benchmarks/multiswebench/constants.py | 111 ++++++++ benchmarks/multiswebench/download_dataset.py | 5 +- benchmarks/multiswebench/eval_infer.py | 22 +- benchmarks/multiswebench/run_infer.py | 70 ++++- .../multiswebench/scripts/data/data_change.py | 6 +- .../eval/update_multi_swe_bench_config.py | 36 ++- tests/test_multiswebench_constants.py | 263 ++++++++++++++++++ 8 files changed, 479 insertions(+), 46 deletions(-) create mode 100644 benchmarks/multiswebench/constants.py create mode 100644 tests/test_multiswebench_constants.py diff --git a/benchmarks/multiswebench/build_images.py b/benchmarks/multiswebench/build_images.py index 3ecdeeb6..38b8823d 100644 --- a/benchmarks/multiswebench/build_images.py +++ b/benchmarks/multiswebench/build_images.py @@ -11,6 +11,12 @@ import os from pathlib import Path +from benchmarks.multiswebench.constants import ( + DEFAULT_DOCKER_IMAGE_PREFIX, + DEFAULT_LANGUAGE, + DOCKER_IMAGE_PREFIX_ENV_VAR, + LANGUAGE_ENV_VAR, +) from benchmarks.utils.build_utils import ( build_all_images, default_build_output_dir, @@ -23,8 +29,10 @@ logger = get_logger(__name__) # Environment variables for multi-language support -DOCKER_IMAGE_PREFIX = os.environ.get("EVAL_DOCKER_IMAGE_PREFIX", "mswebench") -LANGUAGE = os.environ.get("LANGUAGE", "java") +DOCKER_IMAGE_PREFIX = os.environ.get( + DOCKER_IMAGE_PREFIX_ENV_VAR, DEFAULT_DOCKER_IMAGE_PREFIX +) +LANGUAGE = os.environ.get(LANGUAGE_ENV_VAR, DEFAULT_LANGUAGE) def get_official_docker_image( diff --git a/benchmarks/multiswebench/constants.py b/benchmarks/multiswebench/constants.py new file mode 100644 index 00000000..3d1f2441 --- /dev/null +++ b/benchmarks/multiswebench/constants.py @@ -0,0 +1,111 @@ +""" +Constants and hyperparameters for Multi-SWE-Bench evaluation. + +This module serves as the single source of truth for all constant values +used throughout the Multi-SWE-Bench benchmark implementation. 
+""" + +from pathlib import Path + + +# ============================================================================= +# Dataset Configuration +# ============================================================================= + +# Default dataset name on HuggingFace +DEFAULT_DATASET = "bytedance-research/Multi-SWE-Bench" + +# Default dataset split +DEFAULT_SPLIT = "test" + +# Default programming language +DEFAULT_LANGUAGE = "java" + +# Default model name for predictions +DEFAULT_MODEL_NAME = "OpenHands" + +# Default version for formatted data +DEFAULT_VERSION = "0.1" + +# ============================================================================= +# Docker/Image Configuration +# ============================================================================= + +# Default Docker image prefix for Multi-SWE-Bench +DEFAULT_DOCKER_IMAGE_PREFIX = "mswebench" + +# Default build target for agent server images +DEFAULT_BUILD_TARGET = "source-minimal" + +# Environment variable names +DOCKER_IMAGE_PREFIX_ENV_VAR = "EVAL_DOCKER_IMAGE_PREFIX" +LANGUAGE_ENV_VAR = "LANGUAGE" +SKIP_BUILD_ENV_VAR = "MULTI_SWE_BENCH_SKIP_BUILD" + +# ============================================================================= +# Runtime Configuration +# ============================================================================= + +# Default runtime API URL for remote workspace +DEFAULT_RUNTIME_API_URL = "https://runtime.eval.all-hands.dev" + +# Default startup timeout in seconds +DEFAULT_STARTUP_TIMEOUT = 600 + +# Environment variable names for runtime configuration +USE_HINT_TEXT_ENV_VAR = "USE_HINT_TEXT" +USE_INSTANCE_IMAGE_ENV_VAR = "USE_INSTANCE_IMAGE" +RUN_WITH_BROWSING_ENV_VAR = "RUN_WITH_BROWSING" +RUNTIME_API_KEY_ENV_VAR = "RUNTIME_API_KEY" +RUNTIME_API_URL_ENV_VAR = "RUNTIME_API_URL" +SDK_SHORT_SHA_ENV_VAR = "SDK_SHORT_SHA" +REMOTE_RUNTIME_STARTUP_TIMEOUT_ENV_VAR = "REMOTE_RUNTIME_STARTUP_TIMEOUT" + +# Default values for boolean environment variables +DEFAULT_USE_HINT_TEXT = False +DEFAULT_USE_INSTANCE_IMAGE = True +DEFAULT_RUN_WITH_BROWSING = False + +# ============================================================================= +# Evaluation Configuration +# ============================================================================= + +# Default evaluation mode +DEFAULT_EVAL_MODE = "evaluation" + +# Default evaluation config values +DEFAULT_FORCE_BUILD = True +DEFAULT_NEED_CLONE = True +DEFAULT_CLEAR_ENV = True +DEFAULT_STOP_ON_ERROR = False +DEFAULT_MAX_WORKERS = 5 +DEFAULT_MAX_WORKERS_BUILD_IMAGE = 5 +DEFAULT_MAX_WORKERS_RUN_INSTANCE = 5 +DEFAULT_LOG_LEVEL = "DEBUG" + +# Fix patch run command for evaluation harness +FIX_PATCH_RUN_CMD = ( + 'bash -c "apt update ; apt install -y patch ; ' + "sed -i 's@git apply.*@patch --batch --fuzz=5 -p1 -i /home/test.patch;" + "patch --batch --fuzz=5 -p1 -i /home/fix.patch@g' /home/fix-run.sh ; chmod +x /home/*.sh ; /home/fix-run.sh\"" +) + +# ============================================================================= +# Paths +# ============================================================================= + +# Cache directory for downloaded datasets (relative to module location) +DATASET_CACHE_DIR_NAME = "data" + +# Get the absolute path to the dataset cache directory +DATASET_CACHE_DIR = Path(__file__).parent / DATASET_CACHE_DIR_NAME + +# ============================================================================= +# Workspace Configuration +# ============================================================================= + +# Default working directory in container 
+DEFAULT_WORKING_DIR = "/workspace" + +# Default environment setup commands +DEFAULT_ENV_SETUP_COMMANDS = ["export PIP_CACHE_DIR=~/.cache/pip"] diff --git a/benchmarks/multiswebench/download_dataset.py b/benchmarks/multiswebench/download_dataset.py index 95c9044b..5b546848 100644 --- a/benchmarks/multiswebench/download_dataset.py +++ b/benchmarks/multiswebench/download_dataset.py @@ -6,18 +6,15 @@ """ import json -from pathlib import Path from huggingface_hub import hf_hub_download, list_repo_files +from benchmarks.multiswebench.constants import DATASET_CACHE_DIR from openhands.sdk import get_logger logger = get_logger(__name__) -# Cache directory for downloaded datasets -DATASET_CACHE_DIR = Path(__file__).parent / "data" - def download_and_concat_dataset(dataset_path: str, language: str) -> str: """ diff --git a/benchmarks/multiswebench/eval_infer.py b/benchmarks/multiswebench/eval_infer.py index 3bb88cf1..2b509f4d 100644 --- a/benchmarks/multiswebench/eval_infer.py +++ b/benchmarks/multiswebench/eval_infer.py @@ -14,6 +14,12 @@ import subprocess from pathlib import Path +from benchmarks.multiswebench.constants import ( + DEFAULT_DATASET, + DEFAULT_LANGUAGE, + DEFAULT_MODEL_NAME, + DEFAULT_SPLIT, +) from benchmarks.multiswebench.download_dataset import download_and_concat_dataset from benchmarks.multiswebench.scripts.eval.update_multi_swe_bench_config import ( update_multi_swe_config, @@ -29,7 +35,7 @@ def run_multi_swebench_evaluation( dataset_name: str | None = None, split: str | None = None, input_file: str | None = None, - lang: str = "java", + lang: str = DEFAULT_LANGUAGE, ): """ Run Multi-SWE-Bench evaluation using the predictions file. @@ -46,9 +52,9 @@ def run_multi_swebench_evaluation( # Default dataset and split if not provided if dataset_name is None: - dataset_name = "bytedance-research/Multi-SWE-Bench" + dataset_name = DEFAULT_DATASET if split is None: - split = "test" + split = DEFAULT_SPLIT try: if input_file is None: @@ -108,14 +114,12 @@ def main(): parser = argparse.ArgumentParser(description="Multi-SWE-Bench Evaluation") parser.add_argument("input_file", help="Path to OpenHands output.jsonl file") parser.add_argument( - "--model-name", default="OpenHands", help="Model name for predictions" - ) - parser.add_argument( - "--dataset", default="bytedance-research/Multi-SWE-Bench", help="Dataset name" + "--model-name", default=DEFAULT_MODEL_NAME, help="Model name for predictions" ) - parser.add_argument("--split", default="test", help="Dataset split") + parser.add_argument("--dataset", default=DEFAULT_DATASET, help="Dataset name") + parser.add_argument("--split", default=DEFAULT_SPLIT, help="Dataset split") parser.add_argument( - "--lang", default="java", help="Language for Multi-SWE-bench dataset" + "--lang", default=DEFAULT_LANGUAGE, help="Language for Multi-SWE-bench dataset" ) parser.add_argument( "--skip-evaluation", diff --git a/benchmarks/multiswebench/run_infer.py b/benchmarks/multiswebench/run_infer.py index 7eae1c6c..a5ec9edd 100644 --- a/benchmarks/multiswebench/run_infer.py +++ b/benchmarks/multiswebench/run_infer.py @@ -11,6 +11,27 @@ extract_custom_tag, get_official_docker_image, ) +from benchmarks.multiswebench.constants import ( + DEFAULT_BUILD_TARGET, + DEFAULT_DOCKER_IMAGE_PREFIX, + DEFAULT_ENV_SETUP_COMMANDS, + DEFAULT_LANGUAGE, + DEFAULT_RUN_WITH_BROWSING, + DEFAULT_RUNTIME_API_URL, + DEFAULT_STARTUP_TIMEOUT, + DEFAULT_USE_HINT_TEXT, + DEFAULT_USE_INSTANCE_IMAGE, + DEFAULT_WORKING_DIR, + DOCKER_IMAGE_PREFIX_ENV_VAR, + 
REMOTE_RUNTIME_STARTUP_TIMEOUT_ENV_VAR, + RUN_WITH_BROWSING_ENV_VAR, + RUNTIME_API_KEY_ENV_VAR, + RUNTIME_API_URL_ENV_VAR, + SDK_SHORT_SHA_ENV_VAR, + SKIP_BUILD_ENV_VAR, + USE_HINT_TEXT_ENV_VAR, + USE_INSTANCE_IMAGE_ENV_VAR, +) from benchmarks.multiswebench.download_dataset import download_and_concat_dataset from benchmarks.multiswebench.scripts.data.data_change import format_data_for_inference from benchmarks.utils.args_parser import get_parser @@ -42,18 +63,33 @@ class MultiSWEBenchEvalMetadata(EvalMetadata): """Extended metadata for Multi-SWE-bench evaluation with language support.""" lang: str = Field( - default="java", description="Language for Multi-SWE-bench dataset" + default=DEFAULT_LANGUAGE, description="Language for Multi-SWE-bench dataset" ) logger = get_logger(__name__) # Environment variables for Multi-SWE-Bench configuration -USE_HINT_TEXT = os.environ.get("USE_HINT_TEXT", "false").lower() == "true" -USE_INSTANCE_IMAGE = os.environ.get("USE_INSTANCE_IMAGE", "true").lower() == "true" -RUN_WITH_BROWSING = os.environ.get("RUN_WITH_BROWSING", "false").lower() == "true" +USE_HINT_TEXT = ( + os.environ.get(USE_HINT_TEXT_ENV_VAR, str(DEFAULT_USE_HINT_TEXT).lower()).lower() + == "true" +) +USE_INSTANCE_IMAGE = ( + os.environ.get( + USE_INSTANCE_IMAGE_ENV_VAR, str(DEFAULT_USE_INSTANCE_IMAGE).lower() + ).lower() + == "true" +) +RUN_WITH_BROWSING = ( + os.environ.get( + RUN_WITH_BROWSING_ENV_VAR, str(DEFAULT_RUN_WITH_BROWSING).lower() + ).lower() + == "true" +) # For Multi-SWE-Bench, force mswebench prefix instead of the general SWE-Bench prefix -DOCKER_IMAGE_PREFIX = os.environ.get("EVAL_DOCKER_IMAGE_PREFIX", "mswebench") +DOCKER_IMAGE_PREFIX = os.environ.get( + DOCKER_IMAGE_PREFIX_ENV_VAR, DEFAULT_DOCKER_IMAGE_PREFIX +) logger.info(f"Using docker image prefix: {DOCKER_IMAGE_PREFIX}") @@ -200,7 +236,7 @@ def prepare_workspace( instance.data, docker_image_prefix=DOCKER_IMAGE_PREFIX ) logger.info(f"Using official docker image: {official_docker_image}") - build_target = "source-minimal" + build_target = DEFAULT_BUILD_TARGET custom_tag = extract_custom_tag(official_docker_image) # For non-binary targets, append target suffix suffix = f"-{build_target}" if build_target != "binary" else "" @@ -209,7 +245,7 @@ def prepare_workspace( agent_server_image = ( f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}" ) - SKIP_BUILD = os.getenv("MULTI_SWE_BENCH_SKIP_BUILD", "0").lower() in ( + SKIP_BUILD = os.getenv(SKIP_BUILD_ENV_VAR, "0").lower() in ( "1", "true", "yes", @@ -241,15 +277,15 @@ def prepare_workspace( workspace = DockerWorkspace( server_image=agent_server_image, - working_dir="/workspace", + working_dir=DEFAULT_WORKING_DIR, forward_env=forward_env or [], ) elif self.metadata.workspace_type == "remote": - runtime_api_key = os.getenv("RUNTIME_API_KEY") - sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA) + runtime_api_key = os.getenv(RUNTIME_API_KEY_ENV_VAR) + sdk_short_sha = os.getenv(SDK_SHORT_SHA_ENV_VAR, SDK_SHORT_SHA) if not runtime_api_key: raise ValueError( - "RUNTIME_API_KEY environment variable is not set for remote workspace" + f"{RUNTIME_API_KEY_ENV_VAR} environment variable is not set for remote workspace" ) agent_server_image = ( @@ -264,10 +300,14 @@ def prepare_workspace( f"Using remote workspace with image {agent_server_image} " f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})" ) - startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600")) + startup_timeout = float( + os.getenv( + REMOTE_RUNTIME_STARTUP_TIMEOUT_ENV_VAR, 
str(DEFAULT_STARTUP_TIMEOUT) + ) + ) workspace = APIRemoteWorkspace( runtime_api_url=os.getenv( - "RUNTIME_API_URL", "https://runtime.eval.all-hands.dev" + RUNTIME_API_URL_ENV_VAR, DEFAULT_RUNTIME_API_URL ), runtime_api_key=runtime_api_key, server_image=agent_server_image, @@ -432,7 +472,7 @@ def main() -> None: parser.add_argument( "--lang", type=str, - default="java", + default=DEFAULT_LANGUAGE, help="Language for Multi-SWE-bench dataset", ) args = parser.parse_args() @@ -475,7 +515,7 @@ def main() -> None: details={}, prompt_path=args.prompt_path, eval_limit=args.n_limit, - env_setup_commands=["export PIP_CACHE_DIR=~/.cache/pip"], + env_setup_commands=DEFAULT_ENV_SETUP_COMMANDS, max_attempts=args.max_attempts, critic=critic, selected_instances_file=args.select, diff --git a/benchmarks/multiswebench/scripts/data/data_change.py b/benchmarks/multiswebench/scripts/data/data_change.py index 5be0691a..c1356ed3 100644 --- a/benchmarks/multiswebench/scripts/data/data_change.py +++ b/benchmarks/multiswebench/scripts/data/data_change.py @@ -1,5 +1,7 @@ import json +from benchmarks.multiswebench.constants import DEFAULT_VERSION + def format_data_for_inference(input_file, output_file): with ( @@ -21,7 +23,7 @@ def format_data_for_inference(input_file, output_file): ) continue - # 提取原始数据 + # Extract original data org = item.get("org", "") repo = item.get("repo", "") number = str(item.get("number", "")) @@ -39,7 +41,7 @@ def format_data_for_inference(input_file, output_file): new_item["FAIL_TO_PASS"] = [] new_item["PASS_TO_PASS"] = [] new_item["base_commit"] = item["base"].get("sha", "") - new_item["version"] = "0.1" # depends + new_item["version"] = DEFAULT_VERSION output_data = new_item fout.write(json.dumps(output_data, ensure_ascii=False) + "\n") diff --git a/benchmarks/multiswebench/scripts/eval/update_multi_swe_bench_config.py b/benchmarks/multiswebench/scripts/eval/update_multi_swe_bench_config.py index c18ca6d7..9e7613d9 100644 --- a/benchmarks/multiswebench/scripts/eval/update_multi_swe_bench_config.py +++ b/benchmarks/multiswebench/scripts/eval/update_multi_swe_bench_config.py @@ -2,6 +2,18 @@ import json import os +from benchmarks.multiswebench.constants import ( + DEFAULT_CLEAR_ENV, + DEFAULT_EVAL_MODE, + DEFAULT_FORCE_BUILD, + DEFAULT_LOG_LEVEL, + DEFAULT_MAX_WORKERS, + DEFAULT_MAX_WORKERS_BUILD_IMAGE, + DEFAULT_MAX_WORKERS_RUN_INSTANCE, + DEFAULT_NEED_CLONE, + DEFAULT_STOP_ON_ERROR, + FIX_PATCH_RUN_CMD, +) from benchmarks.multiswebench.scripts.eval.convert import convert_to_eval_format @@ -20,29 +32,25 @@ def update_multi_swe_config(output_jsonl_path, config_path, dataset): # Prepare config dict config = { - "mode": "evaluation", + "mode": DEFAULT_EVAL_MODE, "workdir": os.path.join(path_to_parent, "eval_files", "workdir"), "patch_files": [converted_path], "dataset_files": [dataset], - "force_build": True, + "force_build": DEFAULT_FORCE_BUILD, "output_dir": os.path.join(path_to_parent, "eval_files", "dataset"), "specifics": [], "skips": [], "repo_dir": os.path.join(path_to_parent, "eval_files", "repos"), - "need_clone": True, + "need_clone": DEFAULT_NEED_CLONE, "global_env": [], - "clear_env": True, - "stop_on_error": False, - "max_workers": 5, - "max_workers_build_image": 5, - "max_workers_run_instance": 5, + "clear_env": DEFAULT_CLEAR_ENV, + "stop_on_error": DEFAULT_STOP_ON_ERROR, + "max_workers": DEFAULT_MAX_WORKERS, + "max_workers_build_image": DEFAULT_MAX_WORKERS_BUILD_IMAGE, + "max_workers_run_instance": DEFAULT_MAX_WORKERS_RUN_INSTANCE, "log_dir": os.path.join(path_to_parent, 
"eval_files", "logs"), - "log_level": "DEBUG", - "fix_patch_run_cmd": ( - 'bash -c "apt update ; apt install -y patch ; ' - "sed -i 's@git apply.*@patch --batch --fuzz=5 -p1 -i /home/test.patch;" - "patch --batch --fuzz=5 -p1 -i /home/fix.patch@g' /home/fix-run.sh ; chmod +x /home/*.sh ; /home/fix-run.sh\"" - ), + "log_level": DEFAULT_LOG_LEVEL, + "fix_patch_run_cmd": FIX_PATCH_RUN_CMD, } # Save to multibench.config diff --git a/tests/test_multiswebench_constants.py b/tests/test_multiswebench_constants.py new file mode 100644 index 00000000..89726586 --- /dev/null +++ b/tests/test_multiswebench_constants.py @@ -0,0 +1,263 @@ +"""Tests for Multi-SWE-Bench constants module. + +This test suite verifies that: +1. All constants are properly defined and accessible +2. Constants have the expected types and values +3. Constants are correctly imported and used in other modules +""" + +from pathlib import Path + + +class TestDatasetConstants: + """Tests for dataset-related constants.""" + + def test_default_dataset(self): + from benchmarks.multiswebench.constants import DEFAULT_DATASET + + assert DEFAULT_DATASET == "bytedance-research/Multi-SWE-Bench" + assert isinstance(DEFAULT_DATASET, str) + + def test_default_split(self): + from benchmarks.multiswebench.constants import DEFAULT_SPLIT + + assert DEFAULT_SPLIT == "test" + assert isinstance(DEFAULT_SPLIT, str) + + def test_default_language(self): + from benchmarks.multiswebench.constants import DEFAULT_LANGUAGE + + assert DEFAULT_LANGUAGE == "java" + assert isinstance(DEFAULT_LANGUAGE, str) + + def test_default_model_name(self): + from benchmarks.multiswebench.constants import DEFAULT_MODEL_NAME + + assert DEFAULT_MODEL_NAME == "OpenHands" + assert isinstance(DEFAULT_MODEL_NAME, str) + + def test_default_version(self): + from benchmarks.multiswebench.constants import DEFAULT_VERSION + + assert DEFAULT_VERSION == "0.1" + assert isinstance(DEFAULT_VERSION, str) + + +class TestDockerImageConstants: + """Tests for Docker/image-related constants.""" + + def test_default_docker_image_prefix(self): + from benchmarks.multiswebench.constants import DEFAULT_DOCKER_IMAGE_PREFIX + + assert DEFAULT_DOCKER_IMAGE_PREFIX == "mswebench" + assert isinstance(DEFAULT_DOCKER_IMAGE_PREFIX, str) + + def test_default_build_target(self): + from benchmarks.multiswebench.constants import DEFAULT_BUILD_TARGET + + assert DEFAULT_BUILD_TARGET == "source-minimal" + assert isinstance(DEFAULT_BUILD_TARGET, str) + + def test_env_var_names(self): + from benchmarks.multiswebench.constants import ( + DOCKER_IMAGE_PREFIX_ENV_VAR, + LANGUAGE_ENV_VAR, + SKIP_BUILD_ENV_VAR, + ) + + assert DOCKER_IMAGE_PREFIX_ENV_VAR == "EVAL_DOCKER_IMAGE_PREFIX" + assert LANGUAGE_ENV_VAR == "LANGUAGE" + assert SKIP_BUILD_ENV_VAR == "MULTI_SWE_BENCH_SKIP_BUILD" + + +class TestRuntimeConstants: + """Tests for runtime-related constants.""" + + def test_default_runtime_api_url(self): + from benchmarks.multiswebench.constants import DEFAULT_RUNTIME_API_URL + + assert DEFAULT_RUNTIME_API_URL == "https://runtime.eval.all-hands.dev" + assert isinstance(DEFAULT_RUNTIME_API_URL, str) + + def test_default_startup_timeout(self): + from benchmarks.multiswebench.constants import DEFAULT_STARTUP_TIMEOUT + + assert DEFAULT_STARTUP_TIMEOUT == 600 + assert isinstance(DEFAULT_STARTUP_TIMEOUT, int) + + def test_runtime_env_var_names(self): + from benchmarks.multiswebench.constants import ( + REMOTE_RUNTIME_STARTUP_TIMEOUT_ENV_VAR, + RUNTIME_API_KEY_ENV_VAR, + RUNTIME_API_URL_ENV_VAR, + SDK_SHORT_SHA_ENV_VAR, + ) + + assert 
RUNTIME_API_KEY_ENV_VAR == "RUNTIME_API_KEY" + assert RUNTIME_API_URL_ENV_VAR == "RUNTIME_API_URL" + assert SDK_SHORT_SHA_ENV_VAR == "SDK_SHORT_SHA" + assert ( + REMOTE_RUNTIME_STARTUP_TIMEOUT_ENV_VAR == "REMOTE_RUNTIME_STARTUP_TIMEOUT" + ) + + def test_boolean_defaults(self): + from benchmarks.multiswebench.constants import ( + DEFAULT_RUN_WITH_BROWSING, + DEFAULT_USE_HINT_TEXT, + DEFAULT_USE_INSTANCE_IMAGE, + ) + + assert DEFAULT_USE_HINT_TEXT is False + assert DEFAULT_USE_INSTANCE_IMAGE is True + assert DEFAULT_RUN_WITH_BROWSING is False + + +class TestEvaluationConstants: + """Tests for evaluation-related constants.""" + + def test_default_eval_mode(self): + from benchmarks.multiswebench.constants import DEFAULT_EVAL_MODE + + assert DEFAULT_EVAL_MODE == "evaluation" + assert isinstance(DEFAULT_EVAL_MODE, str) + + def test_default_config_values(self): + from benchmarks.multiswebench.constants import ( + DEFAULT_CLEAR_ENV, + DEFAULT_FORCE_BUILD, + DEFAULT_NEED_CLONE, + DEFAULT_STOP_ON_ERROR, + ) + + assert DEFAULT_FORCE_BUILD is True + assert DEFAULT_NEED_CLONE is True + assert DEFAULT_CLEAR_ENV is True + assert DEFAULT_STOP_ON_ERROR is False + + def test_default_worker_counts(self): + from benchmarks.multiswebench.constants import ( + DEFAULT_MAX_WORKERS, + DEFAULT_MAX_WORKERS_BUILD_IMAGE, + DEFAULT_MAX_WORKERS_RUN_INSTANCE, + ) + + assert DEFAULT_MAX_WORKERS == 5 + assert DEFAULT_MAX_WORKERS_BUILD_IMAGE == 5 + assert DEFAULT_MAX_WORKERS_RUN_INSTANCE == 5 + + def test_default_log_level(self): + from benchmarks.multiswebench.constants import DEFAULT_LOG_LEVEL + + assert DEFAULT_LOG_LEVEL == "DEBUG" + assert isinstance(DEFAULT_LOG_LEVEL, str) + + def test_fix_patch_run_cmd(self): + from benchmarks.multiswebench.constants import FIX_PATCH_RUN_CMD + + assert isinstance(FIX_PATCH_RUN_CMD, str) + assert "bash -c" in FIX_PATCH_RUN_CMD + assert "patch" in FIX_PATCH_RUN_CMD + + +class TestPathConstants: + """Tests for path-related constants.""" + + def test_dataset_cache_dir_name(self): + from benchmarks.multiswebench.constants import DATASET_CACHE_DIR_NAME + + assert DATASET_CACHE_DIR_NAME == "data" + assert isinstance(DATASET_CACHE_DIR_NAME, str) + + def test_dataset_cache_dir(self): + from benchmarks.multiswebench.constants import DATASET_CACHE_DIR + + assert isinstance(DATASET_CACHE_DIR, Path) + assert DATASET_CACHE_DIR.name == "data" + # Verify it's relative to the constants module + assert "multiswebench" in str(DATASET_CACHE_DIR) + + +class TestWorkspaceConstants: + """Tests for workspace-related constants.""" + + def test_default_working_dir(self): + from benchmarks.multiswebench.constants import DEFAULT_WORKING_DIR + + assert DEFAULT_WORKING_DIR == "/workspace" + assert isinstance(DEFAULT_WORKING_DIR, str) + + def test_default_env_setup_commands(self): + from benchmarks.multiswebench.constants import DEFAULT_ENV_SETUP_COMMANDS + + assert isinstance(DEFAULT_ENV_SETUP_COMMANDS, list) + assert len(DEFAULT_ENV_SETUP_COMMANDS) > 0 + assert "export PIP_CACHE_DIR=~/.cache/pip" in DEFAULT_ENV_SETUP_COMMANDS + + +class TestConstantsUsageInModules: + """Tests to verify constants are properly used in other modules.""" + + def test_build_images_uses_constants(self): + """Verify build_images.py imports and uses constants.""" + from benchmarks.multiswebench import build_images + + # Check that the module uses the constants + assert hasattr(build_images, "DOCKER_IMAGE_PREFIX") + assert hasattr(build_images, "LANGUAGE") + + def test_download_dataset_uses_constants(self): + """Verify download_dataset.py 
imports and uses constants.""" + # The module should import DATASET_CACHE_DIR from constants + # We can verify by checking the module's imports + import inspect + + from benchmarks.multiswebench import download_dataset + + source = inspect.getsource(download_dataset) + assert "from benchmarks.multiswebench.constants import" in source + + def test_eval_infer_uses_constants(self): + """Verify eval_infer.py imports and uses constants.""" + import inspect + + from benchmarks.multiswebench import eval_infer + + source = inspect.getsource(eval_infer) + assert "from benchmarks.multiswebench.constants import" in source + + def test_run_infer_uses_constants(self): + """Verify run_infer.py imports and uses constants.""" + import inspect + + from benchmarks.multiswebench import run_infer + + source = inspect.getsource(run_infer) + assert "from benchmarks.multiswebench.constants import" in source + + def test_data_change_uses_constants(self): + """Verify data_change.py imports and uses constants.""" + import inspect + + from benchmarks.multiswebench.scripts.data import data_change + + source = inspect.getsource(data_change) + assert "from benchmarks.multiswebench.constants import" in source + + def test_update_multi_swe_bench_config_uses_constants(self): + """Verify update_multi_swe_bench_config.py imports and uses constants.""" + import inspect + + from benchmarks.multiswebench.scripts.eval import update_multi_swe_bench_config + + source = inspect.getsource(update_multi_swe_bench_config) + assert "from benchmarks.multiswebench.constants import" in source + + +class TestAllConstantsExported: + """Test that all expected constants are exported from the module.""" + + def test_all_constants_importable(self): + """Verify all constants can be imported from the module.""" + + # If we get here without ImportError, all constants are importable + assert True From d9fa9bd17bb0a72389305ac080b3229e14fbbc7a Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 28 Jan 2026 10:34:48 +0000 Subject: [PATCH 2/3] Remove test file and revert changes to data_change.py and download_dataset.py - Removed tests/test_multiswebench_constants.py - Reverted benchmarks/multiswebench/scripts/data/data_change.py to original - Reverted benchmarks/multiswebench/download_dataset.py to original - Removed DEFAULT_VERSION and DATASET_CACHE_DIR from constants.py Co-authored-by: openhands --- benchmarks/multiswebench/constants.py | 16 -- benchmarks/multiswebench/download_dataset.py | 5 +- .../multiswebench/scripts/data/data_change.py | 6 +- tests/test_multiswebench_constants.py | 263 ------------------ 4 files changed, 6 insertions(+), 284 deletions(-) delete mode 100644 tests/test_multiswebench_constants.py diff --git a/benchmarks/multiswebench/constants.py b/benchmarks/multiswebench/constants.py index 3d1f2441..197bb91d 100644 --- a/benchmarks/multiswebench/constants.py +++ b/benchmarks/multiswebench/constants.py @@ -5,9 +5,6 @@ used throughout the Multi-SWE-Bench benchmark implementation. 
""" -from pathlib import Path - - # ============================================================================= # Dataset Configuration # ============================================================================= @@ -24,9 +21,6 @@ # Default model name for predictions DEFAULT_MODEL_NAME = "OpenHands" -# Default version for formatted data -DEFAULT_VERSION = "0.1" - # ============================================================================= # Docker/Image Configuration # ============================================================================= @@ -90,16 +84,6 @@ "patch --batch --fuzz=5 -p1 -i /home/fix.patch@g' /home/fix-run.sh ; chmod +x /home/*.sh ; /home/fix-run.sh\"" ) -# ============================================================================= -# Paths -# ============================================================================= - -# Cache directory for downloaded datasets (relative to module location) -DATASET_CACHE_DIR_NAME = "data" - -# Get the absolute path to the dataset cache directory -DATASET_CACHE_DIR = Path(__file__).parent / DATASET_CACHE_DIR_NAME - # ============================================================================= # Workspace Configuration # ============================================================================= diff --git a/benchmarks/multiswebench/download_dataset.py b/benchmarks/multiswebench/download_dataset.py index 5b546848..95c9044b 100644 --- a/benchmarks/multiswebench/download_dataset.py +++ b/benchmarks/multiswebench/download_dataset.py @@ -6,15 +6,18 @@ """ import json +from pathlib import Path from huggingface_hub import hf_hub_download, list_repo_files -from benchmarks.multiswebench.constants import DATASET_CACHE_DIR from openhands.sdk import get_logger logger = get_logger(__name__) +# Cache directory for downloaded datasets +DATASET_CACHE_DIR = Path(__file__).parent / "data" + def download_and_concat_dataset(dataset_path: str, language: str) -> str: """ diff --git a/benchmarks/multiswebench/scripts/data/data_change.py b/benchmarks/multiswebench/scripts/data/data_change.py index c1356ed3..5be0691a 100644 --- a/benchmarks/multiswebench/scripts/data/data_change.py +++ b/benchmarks/multiswebench/scripts/data/data_change.py @@ -1,7 +1,5 @@ import json -from benchmarks.multiswebench.constants import DEFAULT_VERSION - def format_data_for_inference(input_file, output_file): with ( @@ -23,7 +21,7 @@ def format_data_for_inference(input_file, output_file): ) continue - # Extract original data + # 提取原始数据 org = item.get("org", "") repo = item.get("repo", "") number = str(item.get("number", "")) @@ -41,7 +39,7 @@ def format_data_for_inference(input_file, output_file): new_item["FAIL_TO_PASS"] = [] new_item["PASS_TO_PASS"] = [] new_item["base_commit"] = item["base"].get("sha", "") - new_item["version"] = DEFAULT_VERSION + new_item["version"] = "0.1" # depends output_data = new_item fout.write(json.dumps(output_data, ensure_ascii=False) + "\n") diff --git a/tests/test_multiswebench_constants.py b/tests/test_multiswebench_constants.py deleted file mode 100644 index 89726586..00000000 --- a/tests/test_multiswebench_constants.py +++ /dev/null @@ -1,263 +0,0 @@ -"""Tests for Multi-SWE-Bench constants module. - -This test suite verifies that: -1. All constants are properly defined and accessible -2. Constants have the expected types and values -3. 
Constants are correctly imported and used in other modules -""" - -from pathlib import Path - - -class TestDatasetConstants: - """Tests for dataset-related constants.""" - - def test_default_dataset(self): - from benchmarks.multiswebench.constants import DEFAULT_DATASET - - assert DEFAULT_DATASET == "bytedance-research/Multi-SWE-Bench" - assert isinstance(DEFAULT_DATASET, str) - - def test_default_split(self): - from benchmarks.multiswebench.constants import DEFAULT_SPLIT - - assert DEFAULT_SPLIT == "test" - assert isinstance(DEFAULT_SPLIT, str) - - def test_default_language(self): - from benchmarks.multiswebench.constants import DEFAULT_LANGUAGE - - assert DEFAULT_LANGUAGE == "java" - assert isinstance(DEFAULT_LANGUAGE, str) - - def test_default_model_name(self): - from benchmarks.multiswebench.constants import DEFAULT_MODEL_NAME - - assert DEFAULT_MODEL_NAME == "OpenHands" - assert isinstance(DEFAULT_MODEL_NAME, str) - - def test_default_version(self): - from benchmarks.multiswebench.constants import DEFAULT_VERSION - - assert DEFAULT_VERSION == "0.1" - assert isinstance(DEFAULT_VERSION, str) - - -class TestDockerImageConstants: - """Tests for Docker/image-related constants.""" - - def test_default_docker_image_prefix(self): - from benchmarks.multiswebench.constants import DEFAULT_DOCKER_IMAGE_PREFIX - - assert DEFAULT_DOCKER_IMAGE_PREFIX == "mswebench" - assert isinstance(DEFAULT_DOCKER_IMAGE_PREFIX, str) - - def test_default_build_target(self): - from benchmarks.multiswebench.constants import DEFAULT_BUILD_TARGET - - assert DEFAULT_BUILD_TARGET == "source-minimal" - assert isinstance(DEFAULT_BUILD_TARGET, str) - - def test_env_var_names(self): - from benchmarks.multiswebench.constants import ( - DOCKER_IMAGE_PREFIX_ENV_VAR, - LANGUAGE_ENV_VAR, - SKIP_BUILD_ENV_VAR, - ) - - assert DOCKER_IMAGE_PREFIX_ENV_VAR == "EVAL_DOCKER_IMAGE_PREFIX" - assert LANGUAGE_ENV_VAR == "LANGUAGE" - assert SKIP_BUILD_ENV_VAR == "MULTI_SWE_BENCH_SKIP_BUILD" - - -class TestRuntimeConstants: - """Tests for runtime-related constants.""" - - def test_default_runtime_api_url(self): - from benchmarks.multiswebench.constants import DEFAULT_RUNTIME_API_URL - - assert DEFAULT_RUNTIME_API_URL == "https://runtime.eval.all-hands.dev" - assert isinstance(DEFAULT_RUNTIME_API_URL, str) - - def test_default_startup_timeout(self): - from benchmarks.multiswebench.constants import DEFAULT_STARTUP_TIMEOUT - - assert DEFAULT_STARTUP_TIMEOUT == 600 - assert isinstance(DEFAULT_STARTUP_TIMEOUT, int) - - def test_runtime_env_var_names(self): - from benchmarks.multiswebench.constants import ( - REMOTE_RUNTIME_STARTUP_TIMEOUT_ENV_VAR, - RUNTIME_API_KEY_ENV_VAR, - RUNTIME_API_URL_ENV_VAR, - SDK_SHORT_SHA_ENV_VAR, - ) - - assert RUNTIME_API_KEY_ENV_VAR == "RUNTIME_API_KEY" - assert RUNTIME_API_URL_ENV_VAR == "RUNTIME_API_URL" - assert SDK_SHORT_SHA_ENV_VAR == "SDK_SHORT_SHA" - assert ( - REMOTE_RUNTIME_STARTUP_TIMEOUT_ENV_VAR == "REMOTE_RUNTIME_STARTUP_TIMEOUT" - ) - - def test_boolean_defaults(self): - from benchmarks.multiswebench.constants import ( - DEFAULT_RUN_WITH_BROWSING, - DEFAULT_USE_HINT_TEXT, - DEFAULT_USE_INSTANCE_IMAGE, - ) - - assert DEFAULT_USE_HINT_TEXT is False - assert DEFAULT_USE_INSTANCE_IMAGE is True - assert DEFAULT_RUN_WITH_BROWSING is False - - -class TestEvaluationConstants: - """Tests for evaluation-related constants.""" - - def test_default_eval_mode(self): - from benchmarks.multiswebench.constants import DEFAULT_EVAL_MODE - - assert DEFAULT_EVAL_MODE == "evaluation" - assert isinstance(DEFAULT_EVAL_MODE, 
str) - - def test_default_config_values(self): - from benchmarks.multiswebench.constants import ( - DEFAULT_CLEAR_ENV, - DEFAULT_FORCE_BUILD, - DEFAULT_NEED_CLONE, - DEFAULT_STOP_ON_ERROR, - ) - - assert DEFAULT_FORCE_BUILD is True - assert DEFAULT_NEED_CLONE is True - assert DEFAULT_CLEAR_ENV is True - assert DEFAULT_STOP_ON_ERROR is False - - def test_default_worker_counts(self): - from benchmarks.multiswebench.constants import ( - DEFAULT_MAX_WORKERS, - DEFAULT_MAX_WORKERS_BUILD_IMAGE, - DEFAULT_MAX_WORKERS_RUN_INSTANCE, - ) - - assert DEFAULT_MAX_WORKERS == 5 - assert DEFAULT_MAX_WORKERS_BUILD_IMAGE == 5 - assert DEFAULT_MAX_WORKERS_RUN_INSTANCE == 5 - - def test_default_log_level(self): - from benchmarks.multiswebench.constants import DEFAULT_LOG_LEVEL - - assert DEFAULT_LOG_LEVEL == "DEBUG" - assert isinstance(DEFAULT_LOG_LEVEL, str) - - def test_fix_patch_run_cmd(self): - from benchmarks.multiswebench.constants import FIX_PATCH_RUN_CMD - - assert isinstance(FIX_PATCH_RUN_CMD, str) - assert "bash -c" in FIX_PATCH_RUN_CMD - assert "patch" in FIX_PATCH_RUN_CMD - - -class TestPathConstants: - """Tests for path-related constants.""" - - def test_dataset_cache_dir_name(self): - from benchmarks.multiswebench.constants import DATASET_CACHE_DIR_NAME - - assert DATASET_CACHE_DIR_NAME == "data" - assert isinstance(DATASET_CACHE_DIR_NAME, str) - - def test_dataset_cache_dir(self): - from benchmarks.multiswebench.constants import DATASET_CACHE_DIR - - assert isinstance(DATASET_CACHE_DIR, Path) - assert DATASET_CACHE_DIR.name == "data" - # Verify it's relative to the constants module - assert "multiswebench" in str(DATASET_CACHE_DIR) - - -class TestWorkspaceConstants: - """Tests for workspace-related constants.""" - - def test_default_working_dir(self): - from benchmarks.multiswebench.constants import DEFAULT_WORKING_DIR - - assert DEFAULT_WORKING_DIR == "/workspace" - assert isinstance(DEFAULT_WORKING_DIR, str) - - def test_default_env_setup_commands(self): - from benchmarks.multiswebench.constants import DEFAULT_ENV_SETUP_COMMANDS - - assert isinstance(DEFAULT_ENV_SETUP_COMMANDS, list) - assert len(DEFAULT_ENV_SETUP_COMMANDS) > 0 - assert "export PIP_CACHE_DIR=~/.cache/pip" in DEFAULT_ENV_SETUP_COMMANDS - - -class TestConstantsUsageInModules: - """Tests to verify constants are properly used in other modules.""" - - def test_build_images_uses_constants(self): - """Verify build_images.py imports and uses constants.""" - from benchmarks.multiswebench import build_images - - # Check that the module uses the constants - assert hasattr(build_images, "DOCKER_IMAGE_PREFIX") - assert hasattr(build_images, "LANGUAGE") - - def test_download_dataset_uses_constants(self): - """Verify download_dataset.py imports and uses constants.""" - # The module should import DATASET_CACHE_DIR from constants - # We can verify by checking the module's imports - import inspect - - from benchmarks.multiswebench import download_dataset - - source = inspect.getsource(download_dataset) - assert "from benchmarks.multiswebench.constants import" in source - - def test_eval_infer_uses_constants(self): - """Verify eval_infer.py imports and uses constants.""" - import inspect - - from benchmarks.multiswebench import eval_infer - - source = inspect.getsource(eval_infer) - assert "from benchmarks.multiswebench.constants import" in source - - def test_run_infer_uses_constants(self): - """Verify run_infer.py imports and uses constants.""" - import inspect - - from benchmarks.multiswebench import run_infer - - source = 
inspect.getsource(run_infer) - assert "from benchmarks.multiswebench.constants import" in source - - def test_data_change_uses_constants(self): - """Verify data_change.py imports and uses constants.""" - import inspect - - from benchmarks.multiswebench.scripts.data import data_change - - source = inspect.getsource(data_change) - assert "from benchmarks.multiswebench.constants import" in source - - def test_update_multi_swe_bench_config_uses_constants(self): - """Verify update_multi_swe_bench_config.py imports and uses constants.""" - import inspect - - from benchmarks.multiswebench.scripts.eval import update_multi_swe_bench_config - - source = inspect.getsource(update_multi_swe_bench_config) - assert "from benchmarks.multiswebench.constants import" in source - - -class TestAllConstantsExported: - """Test that all expected constants are exported from the module.""" - - def test_all_constants_importable(self): - """Verify all constants can be imported from the module.""" - - # If we get here without ImportError, all constants are importable - assert True From 6cea0d8366b2ddaa26d9d84cd321bf69a505db6e Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 28 Jan 2026 10:41:24 +0000 Subject: [PATCH 3/3] Refactor eval harness config to use single dictionary constant Replace individual constants (DEFAULT_EVAL_MODE, DEFAULT_FORCE_BUILD, etc.) with a single DEFAULT_EVAL_HARNESS_CONFIG dictionary that serves as a template for the Multi-SWE-Bench evaluation harness configuration. This is cleaner because: - All config values are used together in one place - The complete config structure is visible at a glance - Single import instead of 10 individual imports - Easier to maintain Co-authored-by: openhands --- benchmarks/multiswebench/constants.py | 43 ++++++++++--------- .../eval/update_multi_swe_bench_config.py | 43 ++++--------------- 2 files changed, 32 insertions(+), 54 deletions(-) diff --git a/benchmarks/multiswebench/constants.py b/benchmarks/multiswebench/constants.py index 197bb91d..f90c158e 100644 --- a/benchmarks/multiswebench/constants.py +++ b/benchmarks/multiswebench/constants.py @@ -61,28 +61,31 @@ DEFAULT_RUN_WITH_BROWSING = False # ============================================================================= -# Evaluation Configuration +# Evaluation Harness Configuration # ============================================================================= -# Default evaluation mode -DEFAULT_EVAL_MODE = "evaluation" - -# Default evaluation config values -DEFAULT_FORCE_BUILD = True -DEFAULT_NEED_CLONE = True -DEFAULT_CLEAR_ENV = True -DEFAULT_STOP_ON_ERROR = False -DEFAULT_MAX_WORKERS = 5 -DEFAULT_MAX_WORKERS_BUILD_IMAGE = 5 -DEFAULT_MAX_WORKERS_RUN_INSTANCE = 5 -DEFAULT_LOG_LEVEL = "DEBUG" - -# Fix patch run command for evaluation harness -FIX_PATCH_RUN_CMD = ( - 'bash -c "apt update ; apt install -y patch ; ' - "sed -i 's@git apply.*@patch --batch --fuzz=5 -p1 -i /home/test.patch;" - "patch --batch --fuzz=5 -p1 -i /home/fix.patch@g' /home/fix-run.sh ; chmod +x /home/*.sh ; /home/fix-run.sh\"" -) +# Default configuration template for Multi-SWE-Bench evaluation harness. +# Dynamic values (paths) are added at runtime. 
+DEFAULT_EVAL_HARNESS_CONFIG = { + "mode": "evaluation", + "force_build": True, + "need_clone": True, + "clear_env": True, + "stop_on_error": False, + "max_workers": 5, + "max_workers_build_image": 5, + "max_workers_run_instance": 5, + "log_level": "DEBUG", + "fix_patch_run_cmd": ( + 'bash -c "apt update ; apt install -y patch ; ' + "sed -i 's@git apply.*@patch --batch --fuzz=5 -p1 -i /home/test.patch;" + "patch --batch --fuzz=5 -p1 -i /home/fix.patch@g' /home/fix-run.sh ; " + 'chmod +x /home/*.sh ; /home/fix-run.sh"' + ), + "specifics": [], + "skips": [], + "global_env": [], +} # ============================================================================= # Workspace Configuration diff --git a/benchmarks/multiswebench/scripts/eval/update_multi_swe_bench_config.py b/benchmarks/multiswebench/scripts/eval/update_multi_swe_bench_config.py index 9e7613d9..4ec4cdb8 100644 --- a/benchmarks/multiswebench/scripts/eval/update_multi_swe_bench_config.py +++ b/benchmarks/multiswebench/scripts/eval/update_multi_swe_bench_config.py @@ -2,18 +2,7 @@ import json import os -from benchmarks.multiswebench.constants import ( - DEFAULT_CLEAR_ENV, - DEFAULT_EVAL_MODE, - DEFAULT_FORCE_BUILD, - DEFAULT_LOG_LEVEL, - DEFAULT_MAX_WORKERS, - DEFAULT_MAX_WORKERS_BUILD_IMAGE, - DEFAULT_MAX_WORKERS_RUN_INSTANCE, - DEFAULT_NEED_CLONE, - DEFAULT_STOP_ON_ERROR, - FIX_PATCH_RUN_CMD, -) +from benchmarks.multiswebench.constants import DEFAULT_EVAL_HARNESS_CONFIG from benchmarks.multiswebench.scripts.eval.convert import convert_to_eval_format @@ -30,28 +19,14 @@ def update_multi_swe_config(output_jsonl_path, config_path, dataset): os.makedirs(os.path.join(path_to_parent, "eval_files", "repos"), exist_ok=True) os.makedirs(os.path.join(path_to_parent, "eval_files", "logs"), exist_ok=True) - # Prepare config dict - config = { - "mode": DEFAULT_EVAL_MODE, - "workdir": os.path.join(path_to_parent, "eval_files", "workdir"), - "patch_files": [converted_path], - "dataset_files": [dataset], - "force_build": DEFAULT_FORCE_BUILD, - "output_dir": os.path.join(path_to_parent, "eval_files", "dataset"), - "specifics": [], - "skips": [], - "repo_dir": os.path.join(path_to_parent, "eval_files", "repos"), - "need_clone": DEFAULT_NEED_CLONE, - "global_env": [], - "clear_env": DEFAULT_CLEAR_ENV, - "stop_on_error": DEFAULT_STOP_ON_ERROR, - "max_workers": DEFAULT_MAX_WORKERS, - "max_workers_build_image": DEFAULT_MAX_WORKERS_BUILD_IMAGE, - "max_workers_run_instance": DEFAULT_MAX_WORKERS_RUN_INSTANCE, - "log_dir": os.path.join(path_to_parent, "eval_files", "logs"), - "log_level": DEFAULT_LOG_LEVEL, - "fix_patch_run_cmd": FIX_PATCH_RUN_CMD, - } + # Start with default config and add dynamic paths + config = DEFAULT_EVAL_HARNESS_CONFIG.copy() + config["workdir"] = os.path.join(path_to_parent, "eval_files", "workdir") + config["patch_files"] = [converted_path] + config["dataset_files"] = [dataset] + config["output_dir"] = os.path.join(path_to_parent, "eval_files", "dataset") + config["repo_dir"] = os.path.join(path_to_parent, "eval_files", "repos") + config["log_dir"] = os.path.join(path_to_parent, "eval_files", "logs") # Save to multibench.config os.makedirs(os.path.dirname(config_path), exist_ok=True)
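
A note on the .copy() call in the final refactor above: dict.copy() is shallow, so the list values inside DEFAULT_EVAL_HARNESS_CONFIG ("specifics", "skips", "global_env") are shared between the template and every copy. update_multi_swe_config only rebinds top-level keys, so the template stays intact as written, but any caller that later mutates one of those lists in place would silently change the shared default for all subsequent runs. A minimal defensive sketch, assuming the constants module from this patch series is importable; the appended instance id is hypothetical:

    import copy

    from benchmarks.multiswebench.constants import DEFAULT_EVAL_HARNESS_CONFIG

    # copy.deepcopy also duplicates the nested lists, so in-place mutation
    # of the copy cannot leak back into the shared template.
    config = copy.deepcopy(DEFAULT_EVAL_HARNESS_CONFIG)
    config["skips"].append("example-instance-id")  # hypothetical id
    assert DEFAULT_EVAL_HARNESS_CONFIG["skips"] == []  # template unchanged

With the current assignment-only usage the shallow copy is safe, so this is a hardening option rather than a required fix.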