diff --git a/benchmarks/swebenchmultimodal/build_images.py b/benchmarks/swebenchmultimodal/build_images.py
index d32b5dc6..57add1d6 100644
--- a/benchmarks/swebenchmultimodal/build_images.py
+++ b/benchmarks/swebenchmultimodal/build_images.py
@@ -10,6 +10,7 @@
 import sys
 
+from benchmarks.swebenchmultimodal.constants import DOCKER_IMAGE_PREFIX
 from benchmarks.utils.build_utils import (
     build_all_images,
     default_build_output_dir,
@@ -24,7 +25,7 @@
 
 def get_official_docker_image(
     instance_id: str,
-    docker_image_prefix="docker.io/swebench/",
+    docker_image_prefix: str = DOCKER_IMAGE_PREFIX,
 ) -> str:
     # For multimodal benchmark, we use regular SWE-bench images as base
     # since multimodal-specific images (sweb.mm.eval.*) are not available
diff --git a/benchmarks/swebenchmultimodal/constants.py b/benchmarks/swebenchmultimodal/constants.py
new file mode 100644
index 00000000..f5baa087
--- /dev/null
+++ b/benchmarks/swebenchmultimodal/constants.py
@@ -0,0 +1,55 @@
+"""
+Constants for SWE-Bench Multimodal benchmark.
+
+This module serves as the single source of truth for all hyperparameters
+and constant values used in the SWE-Bench Multimodal evaluation workflow.
+"""
+
+# Dataset configuration
+DEFAULT_DATASET = "princeton-nlp/SWE-bench_Multimodal"
+DEFAULT_SPLIT = "dev"
+
+# Docker image configuration
+DOCKER_IMAGE_PREFIX = "docker.io/swebench/"
+
+# Build configuration
+BUILD_TARGET = "source-minimal"
+
+# Workspace configuration
+WORKSPACE_DIR = "/workspace"
+
+# Environment variable names
+ENV_SKIP_BUILD = "SKIP_BUILD"
+ENV_RUNTIME_API_KEY = "RUNTIME_API_KEY"
+ENV_SDK_SHORT_SHA = "SDK_SHORT_SHA"
+ENV_REMOTE_RUNTIME_STARTUP_TIMEOUT = "REMOTE_RUNTIME_STARTUP_TIMEOUT"
+ENV_RUNTIME_API_URL = "RUNTIME_API_URL"
+
+# Default values for environment variables
+DEFAULT_SKIP_BUILD = "1"
+DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT = "600"
+DEFAULT_RUNTIME_API_URL = "https://runtime.eval.all-hands.dev"
+
+# Git configuration
+GIT_USER_EMAIL = "evaluation@openhands.dev"
+GIT_USER_NAME = "OpenHands Evaluation"
+GIT_COMMIT_MESSAGE = "patch"
+
+# Environment setup commands
+ENV_SETUP_COMMANDS = ["export PIP_CACHE_DIR=~/.cache/pip"]
+
+# Image validation
+ALLOWED_IMAGE_TYPES = ["image/jpeg", "image/png", "image/gif", "image/webp"]
+
+# Evaluation configuration
+DEFAULT_EVAL_WORKERS = "12"
+DEFAULT_MODEL_NAME = "openhands"
+
+# Annotation keywords
+SOLVEABLE_KEYWORD = "SOLVEABLE"
+
+# Files to remove from patches during evaluation
+SETUP_FILES_TO_REMOVE = ["pyproject.toml", "tox.ini", "setup.py"]
+
+# Annotations file name
+ANNOTATIONS_FILENAME = "ambiguity_annotations.json"
diff --git a/benchmarks/swebenchmultimodal/eval_infer.py b/benchmarks/swebenchmultimodal/eval_infer.py
index 0984b3e5..050f02d2 100644
--- a/benchmarks/swebenchmultimodal/eval_infer.py
+++ b/benchmarks/swebenchmultimodal/eval_infer.py
@@ -16,6 +16,15 @@
 from pathlib import Path
 from typing import Any
 
+from benchmarks.swebenchmultimodal.constants import (
+    ANNOTATIONS_FILENAME,
+    DEFAULT_DATASET,
+    DEFAULT_EVAL_WORKERS,
+    DEFAULT_MODEL_NAME,
+    DEFAULT_SPLIT,
+    SETUP_FILES_TO_REMOVE,
+    SOLVEABLE_KEYWORD,
+)
 from benchmarks.utils.patch_utils import remove_files_from_patch
 from benchmarks.utils.report_costs import generate_cost_report
 from openhands.sdk import get_logger
@@ -24,7 +33,7 @@
 logger = get_logger(__name__)
 
 # Path to ambiguity annotations relative to this file
-ANNOTATIONS_FILE = Path(__file__).parent / "ambiguity_annotations.json"
+ANNOTATIONS_FILE = Path(__file__).parent / ANNOTATIONS_FILENAME
 
 
 def load_ambiguity_annotations() -> dict[str, Any]:
@@ -77,7 +86,7 @@ def calculate_component_scores(
     for instance_id, annotation in annotations.items():
         keywords = annotation.get("keywords", [])
 
-        if "SOLVEABLE" in keywords:
+        if SOLVEABLE_KEYWORD in keywords:
             solveable_ids.add(instance_id)
         else:
             unsolveable_ids.add(instance_id)
@@ -213,8 +222,7 @@ def convert_to_swebench_format(
         git_patch = ""
 
     # postprocess git_patch
-    setup_files = ["pyproject.toml", "tox.ini", "setup.py"]
-    git_patch = remove_files_from_patch(git_patch, setup_files)
+    git_patch = remove_files_from_patch(git_patch, SETUP_FILES_TO_REMOVE)
 
     # Create SWE-Bench format entry
     swebench_entry = {
@@ -269,9 +277,9 @@ def find_report_json(predictions_dir: Path, run_id: str) -> Path | None:
 
 def run_swebench_multimodal_evaluation(
     predictions_file: str,
-    dataset: str = "princeton-nlp/SWE-bench_Multimodal",
-    split: str = "dev",
-    workers: str = "12",
+    dataset: str = DEFAULT_DATASET,
+    split: str = DEFAULT_SPLIT,
+    workers: str = DEFAULT_EVAL_WORKERS,
     run_id: str | None = None,
 ) -> Path | None:
     """
@@ -363,10 +371,10 @@ def main() -> None:
     parser = argparse.ArgumentParser(
         description="Convert OpenHands output to SWE-Bench format and run multimodal evaluation",
         formatter_class=argparse.RawDescriptionHelpFormatter,
-        epilog="""
+        epilog=f"""
 Examples:
   uv run swebenchmultimodal-eval output.jsonl
-  uv run swebenchmultimodal-eval /path/to/output.jsonl --dataset princeton-nlp/SWE-bench_Multimodal
+  uv run swebenchmultimodal-eval /path/to/output.jsonl --dataset {DEFAULT_DATASET}
   uv run swebenchmultimodal-eval output.jsonl --model-name "MyModel-v1.0"
 """,
     )
@@ -375,15 +383,14 @@ def main() -> None:
 
     parser.add_argument(
         "--dataset",
-        default="princeton-nlp/SWE-bench_Multimodal",
-        help="SWE-Bench dataset to evaluate against "
-        "(default: princeton-nlp/SWE-bench_Multimodal)",
+        default=DEFAULT_DATASET,
+        help=f"SWE-Bench dataset to evaluate against (default: {DEFAULT_DATASET})",
     )
 
     parser.add_argument(
         "--split",
-        default="dev",
-        help="Dataset split to use (default: dev)",
+        default=DEFAULT_SPLIT,
+        help=f"Dataset split to use (default: {DEFAULT_SPLIT})",
     )
 
     parser.add_argument(
@@ -400,14 +407,14 @@ def main() -> None:
 
     parser.add_argument(
         "--model-name",
-        default="openhands",
-        help="Model name to use in the model_name_or_path field (default: openhands)",
+        default=DEFAULT_MODEL_NAME,
+        help=f"Model name to use in the model_name_or_path field (default: {DEFAULT_MODEL_NAME})",
     )
 
     parser.add_argument(
         "--workers",
-        default="12",
-        help="Number of workers to use when evaluating",
+        default=DEFAULT_EVAL_WORKERS,
+        help=f"Number of workers to use when evaluating (default: {DEFAULT_EVAL_WORKERS})",
     )
 
     parser.add_argument(
diff --git a/benchmarks/swebenchmultimodal/run_infer.py b/benchmarks/swebenchmultimodal/run_infer.py
index 68e4c5b8..93ce610f 100644
--- a/benchmarks/swebenchmultimodal/run_infer.py
+++ b/benchmarks/swebenchmultimodal/run_infer.py
@@ -10,6 +10,25 @@
     extract_custom_tag,
     get_official_docker_image,
 )
+from benchmarks.swebenchmultimodal.constants import (
+    ALLOWED_IMAGE_TYPES,
+    BUILD_TARGET,
+    DEFAULT_DATASET,
+    DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT,
+    DEFAULT_RUNTIME_API_URL,
+    DEFAULT_SKIP_BUILD,
+    DEFAULT_SPLIT,
+    ENV_REMOTE_RUNTIME_STARTUP_TIMEOUT,
+    ENV_RUNTIME_API_KEY,
+    ENV_RUNTIME_API_URL,
+    ENV_SDK_SHORT_SHA,
+    ENV_SETUP_COMMANDS,
+    ENV_SKIP_BUILD,
+    GIT_COMMIT_MESSAGE,
+    GIT_USER_EMAIL,
+    GIT_USER_NAME,
+    WORKSPACE_DIR,
+)
 from benchmarks.utils.args_parser import get_parser
 from benchmarks.utils.build_utils import build_image
 from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
@@ -58,7 +77,7 @@ def is_valid_image_url(url: str, allowed_types: list | None = None) -> bool:
         True if URL points to a valid image type, False otherwise
     """
     if allowed_types is None:
-        allowed_types = ["image/jpeg", "image/png", "image/gif", "image/webp"]
+        allowed_types = ALLOWED_IMAGE_TYPES
 
     try:
         # Send a HEAD request first to check headers without downloading the entire file
@@ -152,18 +171,21 @@ def prepare_workspace(
         """
         # Use multimodal image
         official_docker_image = get_official_docker_image(instance.id)
-        build_target = "source-minimal"
         custom_tag = extract_custom_tag(official_docker_image)
         # For non-binary targets, append target suffix
-        suffix = f"-{build_target}" if build_target != "binary" else ""
+        suffix = f"-{BUILD_TARGET}" if BUILD_TARGET != "binary" else ""
 
         if self.metadata.workspace_type == "docker":
             agent_server_image = (
                 f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}"
             )
-            SKIP_BUILD = os.getenv("SKIP_BUILD", "1").lower() in ("1", "true", "yes")
-            logger.info(f"SKIP_BUILD={SKIP_BUILD}")
-            if not SKIP_BUILD:
+            skip_build = os.getenv(ENV_SKIP_BUILD, DEFAULT_SKIP_BUILD).lower() in (
+                "1",
+                "true",
+                "yes",
+            )
+            logger.info(f"SKIP_BUILD={skip_build}")
+            if not skip_build:
                 logger.info(
                     f"Building workspace from {official_docker_image} "
                     f"for instance {instance.id}. "
@@ -177,7 +199,7 @@ def prepare_workspace(
                     base_image=official_docker_image,
                     target_image=EVAL_AGENT_SERVER_IMAGE,
                     custom_tag=custom_tag,
-                    target=build_target,
+                    target=BUILD_TARGET,
                     push=False,
                 )
                 logger.info(f"Image build output: {output}")
@@ -190,15 +212,15 @@ def prepare_workspace(
 
             workspace = DockerWorkspace(
                 server_image=agent_server_image,
-                working_dir="/workspace",
+                working_dir=WORKSPACE_DIR,
                 forward_env=forward_env or [],
             )
         elif self.metadata.workspace_type == "remote":
-            runtime_api_key = os.getenv("RUNTIME_API_KEY")
-            sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA)
+            runtime_api_key = os.getenv(ENV_RUNTIME_API_KEY)
+            sdk_short_sha = os.getenv(ENV_SDK_SHORT_SHA, SDK_SHORT_SHA)
             if not runtime_api_key:
                 raise ValueError(
-                    "RUNTIME_API_KEY environment variable is not set for remote workspace"
+                    f"{ENV_RUNTIME_API_KEY} environment variable is not set for remote workspace"
                 )
 
             agent_server_image = (
@@ -213,16 +235,19 @@ def prepare_workspace(
                 f"Using remote workspace with image {agent_server_image} "
                 f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})"
             )
-            startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600"))
+            startup_timeout = float(
+                os.getenv(
+                    ENV_REMOTE_RUNTIME_STARTUP_TIMEOUT,
+                    DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT,
+                )
+            )
             workspace = APIRemoteWorkspace(
-                runtime_api_url=os.getenv(
-                    "RUNTIME_API_URL", "https://runtime.eval.all-hands.dev"
-                ),
+                runtime_api_url=os.getenv(ENV_RUNTIME_API_URL, DEFAULT_RUNTIME_API_URL),
                 runtime_api_key=runtime_api_key,
                 server_image=agent_server_image,
                 init_timeout=startup_timeout,
                 startup_wait_timeout=startup_timeout,
-                target_type="source" if "source" in build_target else "binary",
+                target_type="source" if "source" in BUILD_TARGET else "binary",
                 forward_env=forward_env or [],
                 resource_factor=resource_factor,
             )
@@ -377,9 +402,9 @@ def evaluate_instance(
         # and prevent the commit from being created
         workspace.execute_command(
             f"cd {repo_path} && "
-            "git config --global user.email 'evaluation@openhands.dev' && "
-            "git config --global user.name 'OpenHands Evaluation' && "
-            "git commit --no-verify -m 'patch'"
+            f"git config --global user.email '{GIT_USER_EMAIL}' && "
+            f"git config --global user.name '{GIT_USER_NAME}' && "
+            f"git commit --no-verify -m '{GIT_COMMIT_MESSAGE}'"
         )
 
         # Get git patch (same as regular swebench - use base_commit)
@@ -424,7 +449,7 @@ def main() -> None:
         help="Path to prompt template file",
     )
     # Override the default dataset and split for multimodal
-    parser.set_defaults(dataset="princeton-nlp/SWE-bench_Multimodal", split="dev")
+    parser.set_defaults(dataset=DEFAULT_DATASET, split=DEFAULT_SPLIT)
     args = parser.parse_args()
 
     # Validate max_attempts
@@ -464,7 +489,7 @@ def main() -> None:
         details={},
         prompt_path=args.prompt_path,
         eval_limit=args.n_limit,
-        env_setup_commands=["export PIP_CACHE_DIR=~/.cache/pip"],
+        env_setup_commands=ENV_SETUP_COMMANDS,
         max_attempts=args.max_attempts,
         critic=critic,
         selected_instances_file=args.select,
diff --git a/tests/test_swebenchmultimodal.py b/tests/test_swebenchmultimodal.py
index 3238ee90..2bd6d221 100644
--- a/tests/test_swebenchmultimodal.py
+++ b/tests/test_swebenchmultimodal.py
@@ -2,9 +2,119 @@
 
 import tempfile
 
+from benchmarks.swebenchmultimodal import constants
 from benchmarks.swebenchmultimodal.eval_infer import convert_to_swebench_format
 
 
+class TestConstants:
+    """Tests for SWE-Bench Multimodal constants."""
+
+    def test_default_dataset_is_string(self):
+        """Test that DEFAULT_DATASET is a non-empty string."""
+        assert isinstance(constants.DEFAULT_DATASET, str)
+        assert len(constants.DEFAULT_DATASET) > 0
+        assert "SWE-bench_Multimodal" in constants.DEFAULT_DATASET
+
+    def test_default_split_is_string(self):
+        """Test that DEFAULT_SPLIT is a non-empty string."""
+        assert isinstance(constants.DEFAULT_SPLIT, str)
+        assert len(constants.DEFAULT_SPLIT) > 0
+
+    def test_docker_image_prefix_is_string(self):
+        """Test that DOCKER_IMAGE_PREFIX is a valid docker prefix."""
+        assert isinstance(constants.DOCKER_IMAGE_PREFIX, str)
+        assert constants.DOCKER_IMAGE_PREFIX.endswith("/")
+
+    def test_build_target_is_string(self):
+        """Test that BUILD_TARGET is a non-empty string."""
+        assert isinstance(constants.BUILD_TARGET, str)
+        assert len(constants.BUILD_TARGET) > 0
+
+    def test_workspace_dir_is_string(self):
+        """Test that WORKSPACE_DIR is a valid path string."""
+        assert isinstance(constants.WORKSPACE_DIR, str)
+        assert constants.WORKSPACE_DIR.startswith("/")
+
+    def test_env_variable_names_are_strings(self):
+        """Test that environment variable names are non-empty strings."""
+        env_vars = [
+            constants.ENV_SKIP_BUILD,
+            constants.ENV_RUNTIME_API_KEY,
+            constants.ENV_SDK_SHORT_SHA,
+            constants.ENV_REMOTE_RUNTIME_STARTUP_TIMEOUT,
+            constants.ENV_RUNTIME_API_URL,
+        ]
+        for env_var in env_vars:
+            assert isinstance(env_var, str)
+            assert len(env_var) > 0
+
+    def test_default_env_values_are_strings(self):
+        """Test that default environment values are non-empty strings."""
+        defaults = [
+            constants.DEFAULT_SKIP_BUILD,
+            constants.DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT,
+            constants.DEFAULT_RUNTIME_API_URL,
+        ]
+        for default in defaults:
+            assert isinstance(default, str)
+            assert len(default) > 0
+
+    def test_git_config_values_are_strings(self):
+        """Test that git configuration values are non-empty strings."""
+        git_values = [
+            constants.GIT_USER_EMAIL,
+            constants.GIT_USER_NAME,
+            constants.GIT_COMMIT_MESSAGE,
+        ]
+        for value in git_values:
+            assert isinstance(value, str)
+            assert len(value) > 0
+
+    def test_env_setup_commands_is_list(self):
+        """Test that ENV_SETUP_COMMANDS is a non-empty list of strings."""
+        assert isinstance(constants.ENV_SETUP_COMMANDS, list)
+        assert len(constants.ENV_SETUP_COMMANDS) > 0
+        for cmd in constants.ENV_SETUP_COMMANDS:
+            assert isinstance(cmd, str)
+
+    def test_allowed_image_types_is_list(self):
+        """Test that ALLOWED_IMAGE_TYPES is a non-empty list of MIME types."""
+        assert isinstance(constants.ALLOWED_IMAGE_TYPES, list)
+        assert len(constants.ALLOWED_IMAGE_TYPES) > 0
+        for mime_type in constants.ALLOWED_IMAGE_TYPES:
+            assert isinstance(mime_type, str)
+            assert mime_type.startswith("image/")
+
+    def test_eval_workers_is_numeric_string(self):
+        """Test that DEFAULT_EVAL_WORKERS is a numeric string."""
+        assert isinstance(constants.DEFAULT_EVAL_WORKERS, str)
+        assert constants.DEFAULT_EVAL_WORKERS.isdigit()
+
+    def test_default_model_name_is_string(self):
+        """Test that DEFAULT_MODEL_NAME is a non-empty string."""
+        assert isinstance(constants.DEFAULT_MODEL_NAME, str)
+        assert len(constants.DEFAULT_MODEL_NAME) > 0
+
+    def test_solveable_keyword_is_string(self):
+        """Test that SOLVEABLE_KEYWORD is a non-empty string."""
+        assert isinstance(constants.SOLVEABLE_KEYWORD, str)
+        assert len(constants.SOLVEABLE_KEYWORD) > 0
+
+    def test_setup_files_to_remove_is_list(self):
+        """Test that SETUP_FILES_TO_REMOVE is a non-empty list of filenames."""
+        assert isinstance(constants.SETUP_FILES_TO_REMOVE, list)
+        assert len(constants.SETUP_FILES_TO_REMOVE) > 0
+        for filename in constants.SETUP_FILES_TO_REMOVE:
+            assert isinstance(filename, str)
+            assert len(filename) > 0
+
+    def test_annotations_filename_is_string(self):
+        """Test that ANNOTATIONS_FILENAME is a non-empty string."""
+        assert isinstance(constants.ANNOTATIONS_FILENAME, str)
+        assert len(constants.ANNOTATIONS_FILENAME) > 0
+        assert constants.ANNOTATIONS_FILENAME.endswith(".json")
+
+
 class TestConvertToSwebenchFormat:
     """Tests for convert_to_swebench_format function."""