diff --git a/benchmarks/swebench/build_images.py b/benchmarks/swebench/build_images.py
index cc0ae6b9..2041ed58 100644
--- a/benchmarks/swebench/build_images.py
+++ b/benchmarks/swebench/build_images.py
@@ -12,6 +12,7 @@
 import sys
 from pathlib import Path
 
+from benchmarks.swebench import constants
 from benchmarks.utils.build_utils import (
     BuildOutput,
     build_all_images,
@@ -26,19 +27,19 @@ logger = get_logger(__name__)
 
 WRAPPER_DOCKERFILE = Path(__file__).with_name("Dockerfile.swebench-deps")
 
-# Repos that require the docutils/roman wrapper layer
-WRAPPED_REPOS = {"sphinx-doc"}
 
 
 def get_official_docker_image(
     instance_id: str,
-    docker_image_prefix="docker.io/swebench/",
+    docker_image_prefix: str = constants.DOCKER_IMAGE_PREFIX,
 ) -> str:
     # Official SWE-Bench image
     # swebench/sweb.eval.x86_64.django_1776_django-11333:v1
     repo, name = instance_id.split("__")
     official_image_name = docker_image_prefix.rstrip("/")
-    official_image_name += f"/sweb.eval.x86_64.{repo}_1776_{name}:latest".lower()
+    official_image_name += (
+        f"/sweb.eval.x86_64.{repo}_1776_{name}:{constants.DOCKER_IMAGE_TAG}".lower()
+    )
 
     logger.debug(f"Official SWE-Bench image: {official_image_name}")
     return official_image_name
@@ -60,12 +61,12 @@ def should_wrap_custom_tag(custom_tag: str) -> bool:
     prefix = "sweb.eval.x86_64."
     if custom_tag.startswith(prefix):
         custom_tag = custom_tag[len(prefix) :]
-    return custom_tag.split("_", 1)[0] in WRAPPED_REPOS
+    return custom_tag.split("_", 1)[0] in constants.WRAPPED_REPOS
 
 
 def should_wrap_instance_id(instance_id: str) -> bool:
     repo = instance_id.split("__")[0]
-    return repo in WRAPPED_REPOS
+    return repo in constants.WRAPPED_REPOS
 
 
 def collect_unique_base_images(
diff --git a/benchmarks/swebench/constants.py b/benchmarks/swebench/constants.py
new file mode 100644
index 00000000..88d795c8
--- /dev/null
+++ b/benchmarks/swebench/constants.py
@@ -0,0 +1,48 @@
+"""
+SWE-Bench hyperparameters and constant values.
+
+This module serves as the single source of truth for all constant values
+used in the SWE-Bench evaluation workflow.
+"""
+
+from typing import Final, Literal
+
+
+# Dataset
+DEFAULT_DATASET: Final[str] = "princeton-nlp/SWE-bench_Verified"
+
+# Docker
+DOCKER_IMAGE_PREFIX: Final[str] = "docker.io/swebench/"
+DOCKER_IMAGE_TAG: Final[str] = "latest"
+WRAPPED_REPOS: Final[frozenset[str]] = frozenset(
+    {"sphinx-doc"}
+)  # Repos requiring docutils/roman wrapper
+
+# Build target type (matches openhands.agent_server.docker.build.TargetType)
+TargetType = Literal["binary", "binary-minimal", "source", "source-minimal"]
+BUILD_TARGET_SOURCE_MINIMAL: Final[TargetType] = "source-minimal"
+BUILD_TARGET_BINARY: Final[TargetType] = "binary"
+DEFAULT_BUILD_TARGET: Final[TargetType] = BUILD_TARGET_SOURCE_MINIMAL
+
+# Runtime
+DEFAULT_RUNTIME_API_URL: Final[str] = "https://runtime.eval.all-hands.dev"
+DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT: Final[int] = 600
+
+# Evaluation
+DEFAULT_EVAL_WORKERS: Final[int] = 12
+
+# Model - preserving original behavior: function default is "OpenHands", CLI default is "openhands"
+DEFAULT_MODEL_NAME: Final[str] = "OpenHands"
+DEFAULT_CLI_MODEL_NAME: Final[str] = "openhands"
+
+# Git
+GIT_USER_EMAIL: Final[str] = "evaluation@openhands.dev"
+GIT_USER_NAME: Final[str] = "OpenHands Evaluation"
+GIT_COMMIT_MESSAGE: Final[str] = "patch"
+
+# Patch Processing
+SETUP_FILES_TO_REMOVE: Final[tuple[str, ...]] = (
+    "pyproject.toml",
+    "tox.ini",
+    "setup.py",
+)
diff --git a/benchmarks/swebench/eval_infer.py b/benchmarks/swebench/eval_infer.py
index f252a56a..b1c5ee69 100644
--- a/benchmarks/swebench/eval_infer.py
+++ b/benchmarks/swebench/eval_infer.py
@@ -16,6 +16,7 @@
 import sys
 from pathlib import Path
 
+from benchmarks.swebench import constants
 from benchmarks.utils.laminar import LaminarService
 from benchmarks.utils.patch_utils import remove_files_from_patch
 from benchmarks.utils.report_costs import generate_cost_report
@@ -26,7 +27,7 @@
 
 
 def convert_to_swebench_format(
-    input_file: str, output_file: str, model_name: str = "OpenHands"
+    input_file: str, output_file: str, model_name: str = constants.DEFAULT_MODEL_NAME
 ) -> None:
     """
     Convert OpenHands output.jsonl to SWE-Bench prediction format.
@@ -82,8 +83,9 @@ def convert_to_swebench_format(
                 git_patch = ""
 
             # postprocess git_patch
-            setup_files = ["pyproject.toml", "tox.ini", "setup.py"]
-            git_patch = remove_files_from_patch(git_patch, setup_files)
+            git_patch = remove_files_from_patch(
+                git_patch, constants.SETUP_FILES_TO_REMOVE
+            )
 
             # Create SWE-Bench format entry
             swebench_entry = {
@@ -114,8 +116,8 @@ def convert_to_swebench_format(
 
 def run_swebench_evaluation(
     predictions_file: str,
-    dataset: str = "princeton-nlp/SWE-bench_Verified",
-    workers: str = "12",
+    dataset: str = constants.DEFAULT_DATASET,
+    workers: int = constants.DEFAULT_EVAL_WORKERS,
 ) -> None:
     """
     Run SWE-Bench evaluation on the predictions file.
@@ -196,9 +198,8 @@ def main() -> None:
 
     parser.add_argument(
         "--dataset",
-        default="princeton-nlp/SWE-bench_Verified",
-        help="SWE-Bench dataset to evaluate against "
-        "(default: princeton-nlp/SWE-bench_Verified)",
+        default=constants.DEFAULT_DATASET,
+        help=f"SWE-Bench dataset to evaluate against (default: {constants.DEFAULT_DATASET})",
     )
 
     parser.add_argument(
@@ -215,14 +216,15 @@ def main() -> None:
 
     parser.add_argument(
         "--model-name",
-        default="openhands",
-        help="Model name to use in the model_name_or_path field (default: openhands)",
+        default=constants.DEFAULT_CLI_MODEL_NAME,
+        help=f"Model name to use in the model_name_or_path field (default: {constants.DEFAULT_CLI_MODEL_NAME})",
     )
 
     parser.add_argument(
         "--workers",
-        default="12",
-        help="Number of workers to use when evaluating",
+        type=int,
+        default=constants.DEFAULT_EVAL_WORKERS,
+        help=f"Number of workers to use when evaluating (default: {constants.DEFAULT_EVAL_WORKERS})",
     )
 
     args = parser.parse_args()
diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py
index 77faafd5..e19f0877 100644
--- a/benchmarks/swebench/run_infer.py
+++ b/benchmarks/swebench/run_infer.py
@@ -4,6 +4,7 @@
 
 from jinja2 import Environment, FileSystemLoader
 
+from benchmarks.swebench import constants
 from benchmarks.swebench.build_images import (
     extract_custom_tag,
     get_official_docker_image,
@@ -114,10 +115,12 @@ def prepare_workspace(
     Used by APIRemoteWorkspace for remote runtime allocation.
     """
     official_docker_image = get_official_docker_image(instance.id)
-    build_target = "source-minimal"
+    build_target = constants.DEFAULT_BUILD_TARGET
     custom_tag = extract_custom_tag(official_docker_image)
     # For non-binary targets, append target suffix
-    suffix = f"-{build_target}" if build_target != "binary" else ""
+    suffix = (
+        f"-{build_target}" if build_target != constants.BUILD_TARGET_BINARY else ""
+    )
     base_agent_image = (
         f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}"
     )
@@ -183,10 +186,15 @@ def prepare_workspace(
         f"Using remote workspace with image {agent_server_image} "
         f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})"
     )
-    startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600"))
+    startup_timeout = float(
+        os.getenv(
+            "REMOTE_RUNTIME_STARTUP_TIMEOUT",
+            str(constants.DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT),
+        )
+    )
     workspace = APIRemoteWorkspace(
         runtime_api_url=os.getenv(
-            "RUNTIME_API_URL", "https://runtime.eval.all-hands.dev"
+            "RUNTIME_API_URL", constants.DEFAULT_RUNTIME_API_URL
         ),
         runtime_api_key=runtime_api_key,
         server_image=agent_server_image,
@@ -280,9 +288,9 @@ def evaluate_instance(
     # Use --no-verify to bypass pre-commit hooks (e.g., husky) that can fail
     workspace.execute_command(
         f"cd {repo_path} && "
-        "git config --global user.email 'evaluation@openhands.dev' && "
-        "git config --global user.name 'OpenHands Evaluation' && "
-        "git commit --no-verify -m 'patch'"
+        f"git config --global user.email '{constants.GIT_USER_EMAIL}' && "
+        f"git config --global user.name '{constants.GIT_USER_NAME}' && "
+        f"git commit --no-verify -m '{constants.GIT_COMMIT_MESSAGE}'"
     )
 
     # Get git patch