diff --git a/benchmarks/swtbench/build_eval_env_images.py b/benchmarks/swtbench/build_eval_env_images.py
index 079ad66c..8a8dd7cd 100644
--- a/benchmarks/swtbench/build_eval_env_images.py
+++ b/benchmarks/swtbench/build_eval_env_images.py
@@ -9,6 +9,16 @@
 
 import docker
 
+from benchmarks.swtbench.constants import (
+    BUILD_MODE_CHOICES,
+    DEFAULT_BUILD_BATCH_SIZE,
+    DEFAULT_BUILD_MAX_RETRIES,
+    DEFAULT_BUILD_MAX_WORKERS,
+    DEFAULT_BUILD_MODE,
+    DEFAULT_EVAL_LIMIT,
+    DEFAULT_SPLIT,
+    PREBAKED_REGISTRY,
+)
 from benchmarks.swtbench.image_utils import ensure_swt_bench_repo
 from benchmarks.utils.dataset import get_dataset
 from benchmarks.utils.image_utils import image_exists as remote_image_exists
@@ -258,11 +268,11 @@ def main() -> None:
         description="Build and push prebaked SWT-bench eval env images."
     )
     parser.add_argument("--dataset", required=True, help="Dataset name")
-    parser.add_argument("--split", default="test", help="Dataset split")
+    parser.add_argument("--split", default=DEFAULT_SPLIT, help="Dataset split")
     parser.add_argument(
         "--eval-limit",
         type=int,
-        default=1,
+        default=DEFAULT_EVAL_LIMIT,
         help="Match inference sampling by limiting instances (0 to disable)",
     )
     parser.add_argument(
@@ -277,31 +287,31 @@ def main() -> None:
     )
     parser.add_argument(
         "--image-prefix",
-        default="ghcr.io/openhands/swtbench-eval",
+        default=PREBAKED_REGISTRY,
         help="Registry prefix for pushed images",
     )
     parser.add_argument(
         "--max-workers",
         type=int,
-        default=4,
+        default=DEFAULT_BUILD_MAX_WORKERS,
        help="Parallel builds for env images",
     )
     parser.add_argument(
         "--max-retries",
         type=int,
-        default=2,
+        default=DEFAULT_BUILD_MAX_RETRIES,
         help="Retries per batch for env image builds",
     )
     parser.add_argument(
         "--build-batch-size",
         type=int,
-        default=10,
+        default=DEFAULT_BUILD_BATCH_SIZE,
         help="Number of env images to build per batch",
     )
     parser.add_argument(
         "--build-mode",
-        choices=["api", "cli"],
-        default="cli",
+        choices=BUILD_MODE_CHOICES,
+        default=DEFAULT_BUILD_MODE,
         help="swt-bench build mode",
     )
     parser.add_argument(
diff --git a/benchmarks/swtbench/constants.py b/benchmarks/swtbench/constants.py
new file mode 100644
index 00000000..05f650c1
--- /dev/null
+++ b/benchmarks/swtbench/constants.py
@@ -0,0 +1,140 @@
+"""
+SWTBench Constants
+
+This module serves as the single source of truth for all hyperparameters
+and constant values used in the SWTBench evaluation workflow.
+"""
+
+from typing import Final, Literal, Tuple
+
+
+# Type alias for build targets (matches openhands.sdk.workspace.TargetType)
+TargetType = Literal["binary", "binary-minimal", "source", "source-minimal"]
+
+
+# =============================================================================
+# Docker/Image Related Constants
+# =============================================================================
+
+# Docker image prefixes
+SWEBENCH_DOCKER_IMAGE_PREFIX: Final[str] = "docker.io/swebench/"
+SWTBENCH_DOCKER_IMAGE_PREFIX: Final[str] = "docker.io/swtbench/"
+
+# Agent server image base
+AGENT_SERVER_IMAGE_BASE: Final[str] = "ghcr.io/all-hands-ai/agent-server"
+
+# Prebaked evaluation images registry
+PREBAKED_REGISTRY: Final[str] = "ghcr.io/openhands/swtbench-eval"
+
+# Build target for agent server images
+DEFAULT_BUILD_TARGET: Final[TargetType] = "source-minimal"
+
+# Image tag constants
+IMAGE_TAG_LATEST: Final[str] = "latest"
+IMAGE_NAME_SEPARATOR: Final[str] = "1776"
+
+# Build mode choices and default
+BUILD_MODE_CHOICES: Final[Tuple[str, ...]] = ("api", "cli")
+DEFAULT_BUILD_MODE: Final[str] = "cli"
+
+# =============================================================================
+# Dataset Related Constants
+# =============================================================================
+
+# Default dataset for evaluation
+DEFAULT_DATASET: Final[str] = "princeton-nlp/SWE-bench_Verified"
+
+# Default dataset split
+DEFAULT_SPLIT: Final[str] = "test"
+
+# Default model name for predictions
+DEFAULT_MODEL_NAME: Final[str] = "OpenHands"
+
+# =============================================================================
+# Environment Variable Names
+# =============================================================================
+
+ENV_SKIP_BUILD: Final[str] = "SKIP_BUILD"
+ENV_RUNTIME_API_KEY: Final[str] = "RUNTIME_API_KEY"
+ENV_SDK_SHORT_SHA: Final[str] = "SDK_SHORT_SHA"
+ENV_RUNTIME_API_URL: Final[str] = "RUNTIME_API_URL"
+ENV_REMOTE_RUNTIME_STARTUP_TIMEOUT: Final[str] = "REMOTE_RUNTIME_STARTUP_TIMEOUT"
+ENV_SWTBENCH_FORCE_CONDA: Final[str] = "SWTBENCH_FORCE_CONDA"
+
+# =============================================================================
+# Default Values
+# =============================================================================
+
+# Default value for SKIP_BUILD environment variable (truthy string)
+DEFAULT_SKIP_BUILD: Final[str] = "1"
+
+# Default runtime API URL
+DEFAULT_RUNTIME_API_URL: Final[str] = "https://runtime.eval.all-hands.dev"
+
+# Default startup timeout in seconds
+DEFAULT_STARTUP_TIMEOUT: Final[int] = 600
+
+# Default number of workers for evaluation
+DEFAULT_EVAL_WORKERS: Final[int] = 12
+
+# Default eval limit for image building
+DEFAULT_EVAL_LIMIT: Final[int] = 1
+
+# Default max workers for image building
+DEFAULT_BUILD_MAX_WORKERS: Final[int] = 4
+
+# Default max retries for image building
+DEFAULT_BUILD_MAX_RETRIES: Final[int] = 2
+
+# Default batch size for image building
+DEFAULT_BUILD_BATCH_SIZE: Final[int] = 10
+
+# =============================================================================
+# File/Directory Paths
+# =============================================================================
+
+# SWT-bench repository directory name
+SWT_BENCH_REPO_DIR: Final[str] = "swt-bench"
+
+# Evaluation results directory name
+EVALUATION_RESULTS_DIR: Final[str] = "evaluation_results"
+
+# Report filename
+REPORT_FILENAME: Final[str] = "output.report.json"
+
+# Run ID prefix for evaluation
+EVAL_RUN_ID_PREFIX: Final[str] = "eval_"
+
+# Eval note prefix
+EVAL_NOTE_PREFIX: Final[str] = "SWT-"
+
+# =============================================================================
+# Git/Repository Related Constants
+# =============================================================================
+
+# SWT-bench repository URL
+SWT_BENCH_REPO_URL: Final[str] = "https://github.com/logic-star-ai/swt-bench.git"
+
+# Git user configuration for commits
+GIT_USER_EMAIL: Final[str] = "evaluation@openhands.dev"
+GIT_USER_NAME: Final[str] = "OpenHands Evaluation"
+
+# =============================================================================
+# Patch Processing Constants
+# =============================================================================
+
+# Files to remove from patches during post-processing (tuple for immutability)
+SETUP_FILES_TO_REMOVE: Final[Tuple[str, ...]] = (
+    "pyproject.toml",
+    "tox.ini",
+    "setup.py",
+)
+
+# =============================================================================
+# Environment Setup Commands
+# =============================================================================
+
+# Default environment setup commands (tuple for immutability)
+DEFAULT_ENV_SETUP_COMMANDS: Final[Tuple[str, ...]] = (
+    "export PIP_CACHE_DIR=~/.cache/pip",
+)
diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py
index 4f5f0632..4f170185 100644
--- a/benchmarks/swtbench/eval_infer.py
+++ b/benchmarks/swtbench/eval_infer.py
@@ -18,6 +18,19 @@
 from pathlib import Path
 from time import monotonic
 
+from benchmarks.swtbench.constants import (
+    DEFAULT_DATASET,
+    DEFAULT_EVAL_WORKERS,
+    DEFAULT_MODEL_NAME,
+    DEFAULT_SPLIT,
+    ENV_SWTBENCH_FORCE_CONDA,
+    EVAL_RUN_ID_PREFIX,
+    EVALUATION_RESULTS_DIR,
+    PREBAKED_REGISTRY,
+    REPORT_FILENAME,
+    SETUP_FILES_TO_REMOVE,
+    SWT_BENCH_REPO_DIR,
+)
 from benchmarks.swtbench.image_utils import (
     compute_required_images,
     ensure_swt_bench_repo,
@@ -30,8 +43,6 @@
 
 logger = get_logger(__name__)
 
 
-PREBAKED_REGISTRY = "ghcr.io/openhands/swtbench-eval"
-
 def _load_prediction_instance_ids(predictions_file: Path) -> list[str]:
     instance_ids: list[str] = []
@@ -67,7 +78,7 @@
 def try_pull_prebaked_images(
     predictions_file: Path,
     dataset: str,
-    split: str = "test",
+    split: str = DEFAULT_SPLIT,
     registry: str = PREBAKED_REGISTRY,
 ) -> None:
     """
@@ -147,7 +158,7 @@ def update_report_with_submitted_instances(
 
 
 def convert_to_swtbench_format(
-    input_file: str, output_file: str, model_name: str = "OpenHands"
+    input_file: str, output_file: str, model_name: str = DEFAULT_MODEL_NAME
 ) -> None:
     """
     Convert OpenHands output.jsonl to SWT-Bench prediction format.
@@ -203,8 +214,7 @@ def convert_to_swtbench_format(
             git_patch = ""
 
         # postprocess git_patch
-        setup_files = ["pyproject.toml", "tox.ini", "setup.py"]
-        git_patch = remove_files_from_patch(git_patch, setup_files)
+        git_patch = remove_files_from_patch(git_patch, SETUP_FILES_TO_REMOVE)
 
         # Create SWT-Bench format entry
         swtbench_entry = {
@@ -236,8 +246,8 @@ def convert_to_swtbench_format(
 def run_swtbench_evaluation(
     predictions_file: str,
     # Must use SWE-bench dataset because SWT-bench dataset (which is based on SWE-bench) contains a bug in their harness.
-    dataset: str = "princeton-nlp/SWE-bench_Verified",
-    workers: str = "12",
+    dataset: str = DEFAULT_DATASET,
+    workers: int = DEFAULT_EVAL_WORKERS,
 ) -> None:
     """
     Run SWT-Bench evaluation on the predictions file.
@@ -252,7 +262,7 @@ def run_swtbench_evaluation(
         dataset: SWT-Bench dataset to evaluate against
         workers: Number of workers to use for evaluation
     """
-    use_legacy = os.getenv("SWTBENCH_FORCE_CONDA", "").lower() in ("1", "true", "yes")
+    use_legacy = os.getenv(ENV_SWTBENCH_FORCE_CONDA, "").lower() in ("1", "true", "yes")
     mode = "legacy-conda" if use_legacy else "prebaked-images"
     logger.info("Running SWT-Bench evaluation on %s (mode=%s)", predictions_file, mode)
 
@@ -301,7 +311,7 @@ def run_swtbench_evaluation(
         "--max_workers",
         str(workers),
         "--run_id",
-        f"eval_{predictions_path.stem}",
+        f"{EVAL_RUN_ID_PREFIX}{predictions_path.stem}",
     ]
 
     logger.info(f"Using Python executable: {python_executable}")
@@ -359,9 +369,8 @@ def main() -> None:
     # Must use SWE-bench dataset because SWT-bench dataset (which is based on SWE-bench) contains a bug in their harness.
     parser.add_argument(
         "--dataset",
-        default="princeton-nlp/SWE-bench_Verified",
-        help="SWT-Bench dataset to evaluate against "
-        "(default: princeton-nlp/SWE-bench_Verified)",
+        default=DEFAULT_DATASET,
+        help=f"SWT-Bench dataset to evaluate against (default: {DEFAULT_DATASET})",
     )
 
     parser.add_argument(
@@ -378,14 +387,15 @@ def main() -> None:
 
     parser.add_argument(
         "--model-name",
-        default="OpenHands",
-        help="Model name to use in the model_name_or_path field (default: OpenHands)",
+        default=DEFAULT_MODEL_NAME,
+        help=f"Model name to use in the model_name_or_path field (default: {DEFAULT_MODEL_NAME})",
     )
 
     parser.add_argument(
         "--workers",
-        default="12",
-        help="Number of workers to use when evaluating",
+        type=int,
+        default=DEFAULT_EVAL_WORKERS,
+        help=f"Number of workers to use when evaluating (default: {DEFAULT_EVAL_WORKERS})",
     )
 
     args = parser.parse_args()
@@ -414,8 +424,8 @@ def main() -> None:
     # Convert format
     convert_to_swtbench_format(str(input_file), str(output_file), args.model_name)
 
-    # Default: use prebaked images; SWTbenCH_FORCE_CONDA opts into legacy flow.
-    use_prebaked = os.getenv("SWTBENCH_FORCE_CONDA", "").lower() not in (
+    # Default: use prebaked images; SWTBENCH_FORCE_CONDA opts into legacy flow.
+    use_prebaked = os.getenv(ENV_SWTBENCH_FORCE_CONDA, "").lower() not in (
         "1",
         "true",
         "yes",
@@ -427,7 +437,7 @@ def main() -> None:
         )
     else:
         logger.info(
-            "SWTBENCH_FORCE_CONDA set; skipping prebaked image pull "
+            f"{ENV_SWTBENCH_FORCE_CONDA} set; skipping prebaked image pull "
             "and using legacy (pre-mamba) evaluation flow"
         )
 
@@ -440,14 +450,14 @@ def main() -> None:
     cleanup_phase_start = monotonic()
     # Move SWT-Bench evaluation report to same folder as output.jsonl
     cache_dir = Path.home() / ".cache" / "openhands" / "swt-bench"
-    swt_bench_dir = cache_dir / "swt-bench"
-    report_dir = swt_bench_dir / "evaluation_results"
-    run_id = f"eval_{output_file.stem}"
+    swt_bench_dir = cache_dir / SWT_BENCH_REPO_DIR
+    report_dir = swt_bench_dir / EVALUATION_RESULTS_DIR
+    run_id = f"{EVAL_RUN_ID_PREFIX}{output_file.stem}"
 
     model_name_safe = args.model_name.replace("/", "__")
     report_file = report_dir / f"{model_name_safe}.{run_id}.json"
     target_dir = input_file.parent
-    target_file = target_dir / "output.report.json"
+    target_file = target_dir / REPORT_FILENAME
     shutil.move(str(report_file), str(target_file))
     logger.info(f"Moved evaluation report to: {target_file}")
     update_report_with_submitted_instances(target_file, output_file)
diff --git a/benchmarks/swtbench/image_utils.py b/benchmarks/swtbench/image_utils.py
index e7aae1f4..3bcacc29 100644
--- a/benchmarks/swtbench/image_utils.py
+++ b/benchmarks/swtbench/image_utils.py
@@ -7,6 +7,11 @@
 from pathlib import Path
 from typing import Iterable
 
+from benchmarks.swtbench.constants import (
+    DEFAULT_SPLIT,
+    SWT_BENCH_REPO_DIR,
+    SWT_BENCH_REPO_URL,
+)
 from openhands.sdk import get_logger
 
 
@@ -20,7 +25,7 @@ def ensure_swt_bench_repo(cache_dir: Path | None = None) -> Path:
     Returns the repository path under the cache directory.
     """
     cache_dir = cache_dir or Path.home() / ".cache" / "openhands" / "swt-bench"
-    swt_bench_dir = cache_dir / "swt-bench"
+    swt_bench_dir = cache_dir / SWT_BENCH_REPO_DIR
     if swt_bench_dir.exists():
         return swt_bench_dir
 
@@ -31,7 +36,7 @@ def ensure_swt_bench_repo(cache_dir: Path | None = None) -> Path:
         [
             "git",
             "clone",
-            "https://github.com/logic-star-ai/swt-bench.git",
+            SWT_BENCH_REPO_URL,
             str(swt_bench_dir),
         ],
         text=True,
@@ -131,7 +136,7 @@ def main() -> None:
     )
     parser.add_argument("output_jsonl", type=Path, help="Path to output.jsonl")
     parser.add_argument("--dataset", required=True, help="Dataset name")
-    parser.add_argument("--split", default="test", help="Dataset split")
+    parser.add_argument("--split", default=DEFAULT_SPLIT, help="Dataset split")
     parser.add_argument(
         "--format",
         choices=["plain", "json"],
diff --git a/benchmarks/swtbench/run_infer.py b/benchmarks/swtbench/run_infer.py
index a454e580..10bf1416 100644
--- a/benchmarks/swtbench/run_infer.py
+++ b/benchmarks/swtbench/run_infer.py
@@ -4,6 +4,27 @@
 
 from jinja2 import Environment, FileSystemLoader
 
+from benchmarks.swtbench.constants import (
+    AGENT_SERVER_IMAGE_BASE,
+    DEFAULT_BUILD_TARGET,
+    DEFAULT_ENV_SETUP_COMMANDS,
+    DEFAULT_RUNTIME_API_URL,
+    DEFAULT_SKIP_BUILD,
+    DEFAULT_STARTUP_TIMEOUT,
+    ENV_REMOTE_RUNTIME_STARTUP_TIMEOUT,
+    ENV_RUNTIME_API_KEY,
+    ENV_RUNTIME_API_URL,
+    ENV_SDK_SHORT_SHA,
+    ENV_SKIP_BUILD,
+    EVAL_NOTE_PREFIX,
+    GIT_USER_EMAIL,
+    GIT_USER_NAME,
+    IMAGE_NAME_SEPARATOR,
+    IMAGE_TAG_LATEST,
+    SWEBENCH_DOCKER_IMAGE_PREFIX,
+    SWTBENCH_DOCKER_IMAGE_PREFIX,
+    TargetType,
+)
 from benchmarks.utils.args_parser import get_parser
 from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
 from benchmarks.utils.conversation import build_event_persistence_callback
@@ -34,26 +55,28 @@
 
 def get_official_docker_image(
     instance_id: str,
-    docker_image_prefix="docker.io/swebench/",
+    docker_image_prefix: str = SWEBENCH_DOCKER_IMAGE_PREFIX,
 ) -> str:
     # Official SWE-Bench image
     # swebench/sweb.eval.x86_64.django_1776_django-11333:v1
     repo, name = instance_id.split("__")
     official_image_name = docker_image_prefix.rstrip("/")
-    official_image_name += f"/sweb.eval.x86_64.{repo}_1776_{name}:latest".lower()
+    official_image_name += (
+        f"/sweb.eval.x86_64.{repo}_{IMAGE_NAME_SEPARATOR}_{name}:{IMAGE_TAG_LATEST}"
+    ).lower()
     logger.debug(f"Using official SWE-Bench image: {official_image_name}")
     return official_image_name
 
 
 def get_agent_server_docker_image(
     instance_id: str,
-    docker_image_prefix="docker.io/swtbench/",
-    target: str = "source-minimal",
+    docker_image_prefix: str = SWTBENCH_DOCKER_IMAGE_PREFIX,
+    target: TargetType = DEFAULT_BUILD_TARGET,
 ) -> str:
     """Get the agent server Docker image for an instance."""
     official_image_name = get_official_docker_image(instance_id, docker_image_prefix)
     return (
-        "ghcr.io/all-hands-ai/agent-server"
+        AGENT_SERVER_IMAGE_BASE
         + f":v{__version__}_{_base_slug(official_image_name)}_{target}"
     )
 
@@ -154,7 +177,7 @@ def prepare_workspace(
         forward_env: Environment variables to forward into the workspace.
     """
     official_docker_image = get_official_docker_image(instance.id)
-    build_target = "source-minimal"
+    build_target: TargetType = DEFAULT_BUILD_TARGET
 
     # Create a custom tag for the image
     name_tag = official_docker_image.split("/")[-1]
@@ -166,15 +189,19 @@ def prepare_workspace(
     agent_server_image = (
         f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}"
     )
-    SKIP_BUILD = os.getenv("SKIP_BUILD", "1").lower() in ("1", "true", "yes")
-    logger.info(f"SKIP_BUILD={SKIP_BUILD}")
-    if not SKIP_BUILD:
+    skip_build = os.getenv(ENV_SKIP_BUILD, DEFAULT_SKIP_BUILD).lower() in (
+        "1",
+        "true",
+        "yes",
+    )
+    logger.info(f"{ENV_SKIP_BUILD}={skip_build}")
+    if not skip_build:
         logger.info(
             f"Building workspace from {official_docker_image} "
             f"for instance {instance.id}. "
             "This may take a while...\n"
             "You can run benchmarks/swtbench/build_images.py and set "
-            "SKIP_BUILD=1 to skip building and use pre-built "
+            f"{ENV_SKIP_BUILD}=1 to skip building and use pre-built "
            "agent-server image."
         )
         # For SWT-bench, we use DockerDevWorkspace with base_image
@@ -191,11 +218,11 @@ def prepare_workspace(
             forward_env=forward_env or [],
         )
     elif self.metadata.workspace_type == "remote":
-        runtime_api_key = os.getenv("RUNTIME_API_KEY")
-        sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA)
+        runtime_api_key = os.getenv(ENV_RUNTIME_API_KEY)
+        sdk_short_sha = os.getenv(ENV_SDK_SHORT_SHA, SDK_SHORT_SHA)
         if not runtime_api_key:
             raise ValueError(
-                "RUNTIME_API_KEY environment variable is not set for remote workspace"
+                f"{ENV_RUNTIME_API_KEY} environment variable is not set for remote workspace"
             )
 
         agent_server_image = (
@@ -210,11 +237,13 @@ def prepare_workspace(
             f"Using remote workspace with image {agent_server_image} "
             f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})"
         )
-        startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600"))
+        startup_timeout = float(
+            os.getenv(
+                ENV_REMOTE_RUNTIME_STARTUP_TIMEOUT, str(DEFAULT_STARTUP_TIMEOUT)
+            )
+        )
         workspace = APIRemoteWorkspace(
-            runtime_api_url=os.getenv(
-                "RUNTIME_API_URL", "https://runtime.eval.all-hands.dev"
-            ),
+            runtime_api_url=os.getenv(ENV_RUNTIME_API_URL, DEFAULT_RUNTIME_API_URL),
             runtime_api_key=runtime_api_key,
             server_image=agent_server_image,
             target_type="source" if "source" in build_target else "binary",
@@ -283,7 +312,7 @@ def evaluate_instance(
     logger.info("repo_path: %s", repo_path)
 
     cp_testebed_repo = workspace.execute_command(
-        (f"mkdir -p {repo_path} ; cp -r /testbed/. {repo_path}")
+        f"mkdir -p {repo_path} ; cp -r /testbed/. {repo_path}"
     )
     assert cp_testebed_repo.exit_code == 0, (
         f"cp_testebed_repo failed: {cp_testebed_repo.stderr}"
@@ -309,8 +338,8 @@ def evaluate_instance(
     # Use --no-verify to bypass pre-commit hooks (e.g., husky) that can fail
     workspace.execute_command(
         f"cd {repo_path} && "
-        "git config --global user.email 'evaluation@openhands.dev' && "
-        "git config --global user.name 'OpenHands Evaluation' && "
+        f"git config --global user.email '{GIT_USER_EMAIL}' && "
+        f"git config --global user.name '{GIT_USER_NAME}' && "
         "git commit --no-verify -m 'patch'"
     )
 
@@ -378,7 +407,7 @@ def main() -> None:
         dataset_name=dataset_description,
         model_name=llm.model,
         max_iterations=args.max_iterations,
-        eval_note="SWT-" + args.note,
+        eval_note=EVAL_NOTE_PREFIX + args.note,
     )
 
     critic = create_critic(args)
@@ -392,7 +421,7 @@ def main() -> None:
         details={},
         prompt_path=args.prompt_path,
        eval_limit=args.n_limit,
-        env_setup_commands=["export PIP_CACHE_DIR=~/.cache/pip"],
+        env_setup_commands=list(DEFAULT_ENV_SETUP_COMMANDS),
         max_attempts=args.max_attempts,
         critic=critic,
         selected_instances_file=args.select,
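
For reference, a minimal sketch of how downstream code would consume the new constants module introduced by this patch. The helper names official_image_for and build_parser are hypothetical illustrations; only the imported constants, the official-image naming scheme, and the argparse defaults come from the diff above, and the snippet assumes benchmarks.swtbench.constants is importable.

import argparse

from benchmarks.swtbench.constants import (
    DEFAULT_EVAL_WORKERS,
    DEFAULT_SPLIT,
    IMAGE_NAME_SEPARATOR,
    IMAGE_TAG_LATEST,
    SWEBENCH_DOCKER_IMAGE_PREFIX,
)


def official_image_for(instance_id: str) -> str:
    # Mirrors get_official_docker_image() in run_infer.py, e.g.
    # docker.io/swebench/sweb.eval.x86_64.django_1776_django-11333:latest
    repo, name = instance_id.split("__")
    prefix = SWEBENCH_DOCKER_IMAGE_PREFIX.rstrip("/")
    return (
        f"{prefix}/sweb.eval.x86_64.{repo}_{IMAGE_NAME_SEPARATOR}_{name}"
        f":{IMAGE_TAG_LATEST}"
    ).lower()


def build_parser() -> argparse.ArgumentParser:
    # Hypothetical caller: CLI defaults come from constants.py instead of literals,
    # matching the argparse changes in build_eval_env_images.py and eval_infer.py.
    parser = argparse.ArgumentParser()
    parser.add_argument("--split", default=DEFAULT_SPLIT, help="Dataset split")
    parser.add_argument("--workers", type=int, default=DEFAULT_EVAL_WORKERS)
    return parser


if __name__ == "__main__":
    print(official_image_for("django__django-11333"))
    print(build_parser().parse_args(["--workers", "4"]))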