From 328490eac177eb28a0c58047dae4b057f3ffb5d8 Mon Sep 17 00:00:00 2001 From: openhands Date: Tue, 27 Jan 2026 18:11:16 +0000 Subject: [PATCH 1/4] refactor(swtbench): regroup all hyperparameters in constants.py This commit creates a single source of truth for all SWTBench hyperparameters and constant values by: 1. Creating benchmarks/swtbench/constants.py with all constants: - Docker/Image related constants (prefixes, registries, build targets) - Dataset related constants (default dataset, split, model name) - Environment variable names - Default values for various parameters - File/directory paths (repo dir, evaluation results dir, report filename) - Git/repository related constants - Patch processing constants - Environment setup commands 2. Updating all swtbench modules to import from constants.py: - run_infer.py - eval_infer.py - build_eval_env_images.py - image_utils.py This makes it easier to check and modify hyperparameters for the benchmark as they are now centralized in one location. Fixes #364 Co-authored-by: openhands --- benchmarks/swtbench/build_eval_env_images.py | 26 ++-- benchmarks/swtbench/constants.py | 129 +++++++++++++++++++ benchmarks/swtbench/eval_infer.py | 57 ++++---- benchmarks/swtbench/image_utils.py | 11 +- benchmarks/swtbench/run_infer.py | 70 ++++++---- 5 files changed, 236 insertions(+), 57 deletions(-) create mode 100644 benchmarks/swtbench/constants.py diff --git a/benchmarks/swtbench/build_eval_env_images.py b/benchmarks/swtbench/build_eval_env_images.py index 079ad66c..8a8dd7cd 100644 --- a/benchmarks/swtbench/build_eval_env_images.py +++ b/benchmarks/swtbench/build_eval_env_images.py @@ -9,6 +9,16 @@ import docker +from benchmarks.swtbench.constants import ( + BUILD_MODE_CHOICES, + DEFAULT_BUILD_BATCH_SIZE, + DEFAULT_BUILD_MAX_RETRIES, + DEFAULT_BUILD_MAX_WORKERS, + DEFAULT_BUILD_MODE, + DEFAULT_EVAL_LIMIT, + DEFAULT_SPLIT, + PREBAKED_REGISTRY, +) from benchmarks.swtbench.image_utils import ensure_swt_bench_repo from benchmarks.utils.dataset import get_dataset from benchmarks.utils.image_utils import image_exists as remote_image_exists @@ -258,11 +268,11 @@ def main() -> None: description="Build and push prebaked SWT-bench eval env images." ) parser.add_argument("--dataset", required=True, help="Dataset name") - parser.add_argument("--split", default="test", help="Dataset split") + parser.add_argument("--split", default=DEFAULT_SPLIT, help="Dataset split") parser.add_argument( "--eval-limit", type=int, - default=1, + default=DEFAULT_EVAL_LIMIT, help="Match inference sampling by limiting instances (0 to disable)", ) parser.add_argument( @@ -277,31 +287,31 @@ def main() -> None: ) parser.add_argument( "--image-prefix", - default="ghcr.io/openhands/swtbench-eval", + default=PREBAKED_REGISTRY, help="Registry prefix for pushed images", ) parser.add_argument( "--max-workers", type=int, - default=4, + default=DEFAULT_BUILD_MAX_WORKERS, help="Parallel builds for env images", ) parser.add_argument( "--max-retries", type=int, - default=2, + default=DEFAULT_BUILD_MAX_RETRIES, help="Retries per batch for env image builds", ) parser.add_argument( "--build-batch-size", type=int, - default=10, + default=DEFAULT_BUILD_BATCH_SIZE, help="Number of env images to build per batch", ) parser.add_argument( "--build-mode", - choices=["api", "cli"], - default="cli", + choices=BUILD_MODE_CHOICES, + default=DEFAULT_BUILD_MODE, help="swt-bench build mode", ) parser.add_argument( diff --git a/benchmarks/swtbench/constants.py b/benchmarks/swtbench/constants.py new file mode 100644 index 00000000..36dbdaa2 --- /dev/null +++ b/benchmarks/swtbench/constants.py @@ -0,0 +1,129 @@ +""" +SWTBench Constants + +This module serves as the single source of truth for all hyperparameters +and constant values used in the SWTBench evaluation workflow. +""" + +# ============================================================================= +# Docker/Image Related Constants +# ============================================================================= + +# Docker image prefixes +SWEBENCH_DOCKER_IMAGE_PREFIX = "docker.io/swebench/" +SWTBENCH_DOCKER_IMAGE_PREFIX = "docker.io/swtbench/" + +# Agent server image base +AGENT_SERVER_IMAGE_BASE = "ghcr.io/all-hands-ai/agent-server" + +# Prebaked evaluation images registry +PREBAKED_REGISTRY = "ghcr.io/openhands/swtbench-eval" + +# Build target for agent server images +DEFAULT_BUILD_TARGET = "source-minimal" + +# Image tag constants +IMAGE_TAG_LATEST = "latest" +IMAGE_NAME_SEPARATOR = "1776" + +# ============================================================================= +# Dataset Related Constants +# ============================================================================= + +# Default dataset for evaluation +DEFAULT_DATASET = "princeton-nlp/SWE-bench_Verified" + +# Default dataset split +DEFAULT_SPLIT = "test" + +# Default model name for predictions +DEFAULT_MODEL_NAME = "OpenHands" + +# ============================================================================= +# Environment Variable Names +# ============================================================================= + +ENV_SKIP_BUILD = "SKIP_BUILD" +ENV_RUNTIME_API_KEY = "RUNTIME_API_KEY" +ENV_SDK_SHORT_SHA = "SDK_SHORT_SHA" +ENV_RUNTIME_API_URL = "RUNTIME_API_URL" +ENV_REMOTE_RUNTIME_STARTUP_TIMEOUT = "REMOTE_RUNTIME_STARTUP_TIMEOUT" +ENV_SWTBENCH_FORCE_CONDA = "SWTBENCH_FORCE_CONDA" + +# ============================================================================= +# Default Values +# ============================================================================= + +# Default value for SKIP_BUILD environment variable +DEFAULT_SKIP_BUILD = "1" + +# Default runtime API URL +DEFAULT_RUNTIME_API_URL = "https://runtime.eval.all-hands.dev" + +# Default startup timeout in seconds +DEFAULT_STARTUP_TIMEOUT = "600" + +# Default number of workers for evaluation +DEFAULT_EVAL_WORKERS = "12" + +# Default eval limit for image building +DEFAULT_EVAL_LIMIT = 1 + +# Default max workers for image building +DEFAULT_BUILD_MAX_WORKERS = 4 + +# Default max retries for image building +DEFAULT_BUILD_MAX_RETRIES = 2 + +# Default batch size for image building +DEFAULT_BUILD_BATCH_SIZE = 10 + +# Default build mode +DEFAULT_BUILD_MODE = "cli" + +# Build mode choices +BUILD_MODE_CHOICES = ["api", "cli"] + +# ============================================================================= +# File/Directory Paths +# ============================================================================= + +# SWT-bench repository directory name +SWT_BENCH_REPO_DIR = "swt-bench" + +# Evaluation results directory name +EVALUATION_RESULTS_DIR = "evaluation_results" + +# Report filename +REPORT_FILENAME = "output.report.json" + +# Run ID prefix for evaluation +EVAL_RUN_ID_PREFIX = "eval_" + +# Eval note prefix +EVAL_NOTE_PREFIX = "SWT-" + +# ============================================================================= +# Git/Repository Related Constants +# ============================================================================= + +# SWT-bench repository URL +SWT_BENCH_REPO_URL = "https://github.com/logic-star-ai/swt-bench.git" + +# Git user configuration for commits +GIT_USER_EMAIL = "evaluation@openhands.dev" +GIT_USER_NAME = "OpenHands Evaluation" + +# ============================================================================= +# Patch Processing Constants +# ============================================================================= + +# Files to remove from patches during post-processing +SETUP_FILES_TO_REMOVE = ["pyproject.toml", "tox.ini", "setup.py"] + +# ============================================================================= +# Environment Setup Commands +# ============================================================================= + +# Default environment setup commands +DEFAULT_ENV_SETUP_COMMANDS = ["export PIP_CACHE_DIR=~/.cache/pip"] diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index 4f5f0632..72ae403f 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -18,6 +18,19 @@ from pathlib import Path from time import monotonic +from benchmarks.swtbench.constants import ( + DEFAULT_DATASET, + DEFAULT_EVAL_WORKERS, + DEFAULT_MODEL_NAME, + DEFAULT_SPLIT, + ENV_SWTBENCH_FORCE_CONDA, + EVAL_RUN_ID_PREFIX, + EVALUATION_RESULTS_DIR, + PREBAKED_REGISTRY, + REPORT_FILENAME, + SETUP_FILES_TO_REMOVE, + SWT_BENCH_REPO_DIR, +) from benchmarks.swtbench.image_utils import ( compute_required_images, ensure_swt_bench_repo, @@ -30,8 +43,6 @@ logger = get_logger(__name__) -PREBAKED_REGISTRY = "ghcr.io/openhands/swtbench-eval" - def _load_prediction_instance_ids(predictions_file: Path) -> list[str]: instance_ids: list[str] = [] @@ -67,7 +78,7 @@ def _load_prediction_instance_ids(predictions_file: Path) -> list[str]: def try_pull_prebaked_images( predictions_file: Path, dataset: str, - split: str = "test", + split: str = DEFAULT_SPLIT, registry: str = PREBAKED_REGISTRY, ) -> None: """ @@ -147,7 +158,7 @@ def update_report_with_submitted_instances( def convert_to_swtbench_format( - input_file: str, output_file: str, model_name: str = "OpenHands" + input_file: str, output_file: str, model_name: str = DEFAULT_MODEL_NAME ) -> None: """ Convert OpenHands output.jsonl to SWT-Bench prediction format. @@ -203,8 +214,7 @@ def convert_to_swtbench_format( git_patch = "" # postprocess git_patch - setup_files = ["pyproject.toml", "tox.ini", "setup.py"] - git_patch = remove_files_from_patch(git_patch, setup_files) + git_patch = remove_files_from_patch(git_patch, SETUP_FILES_TO_REMOVE) # Create SWT-Bench format entry swtbench_entry = { @@ -236,8 +246,8 @@ def convert_to_swtbench_format( def run_swtbench_evaluation( predictions_file: str, # Must use SWE-bench dataset because SWT-bench dataset (which is based on SWE-bench) contains a bug in their harness. - dataset: str = "princeton-nlp/SWE-bench_Verified", - workers: str = "12", + dataset: str = DEFAULT_DATASET, + workers: str = DEFAULT_EVAL_WORKERS, ) -> None: """ Run SWT-Bench evaluation on the predictions file. @@ -252,7 +262,7 @@ def run_swtbench_evaluation( dataset: SWT-Bench dataset to evaluate against workers: Number of workers to use for evaluation """ - use_legacy = os.getenv("SWTBENCH_FORCE_CONDA", "").lower() in ("1", "true", "yes") + use_legacy = os.getenv(ENV_SWTBENCH_FORCE_CONDA, "").lower() in ("1", "true", "yes") mode = "legacy-conda" if use_legacy else "prebaked-images" logger.info("Running SWT-Bench evaluation on %s (mode=%s)", predictions_file, mode) @@ -301,7 +311,7 @@ def run_swtbench_evaluation( "--max_workers", str(workers), "--run_id", - f"eval_{predictions_path.stem}", + f"{EVAL_RUN_ID_PREFIX}{predictions_path.stem}", ] logger.info(f"Using Python executable: {python_executable}") @@ -359,9 +369,8 @@ def main() -> None: # Must use SWE-bench dataset because SWT-bench dataset (which is based on SWE-bench) contains a bug in their harness. parser.add_argument( "--dataset", - default="princeton-nlp/SWE-bench_Verified", - help="SWT-Bench dataset to evaluate against " - "(default: princeton-nlp/SWE-bench_Verified)", + default=DEFAULT_DATASET, + help=f"SWT-Bench dataset to evaluate against (default: {DEFAULT_DATASET})", ) parser.add_argument( @@ -378,14 +387,14 @@ def main() -> None: parser.add_argument( "--model-name", - default="OpenHands", - help="Model name to use in the model_name_or_path field (default: OpenHands)", + default=DEFAULT_MODEL_NAME, + help=f"Model name to use in the model_name_or_path field (default: {DEFAULT_MODEL_NAME})", ) parser.add_argument( "--workers", - default="12", - help="Number of workers to use when evaluating", + default=DEFAULT_EVAL_WORKERS, + help=f"Number of workers to use when evaluating (default: {DEFAULT_EVAL_WORKERS})", ) args = parser.parse_args() @@ -414,8 +423,8 @@ def main() -> None: # Convert format convert_to_swtbench_format(str(input_file), str(output_file), args.model_name) - # Default: use prebaked images; SWTbenCH_FORCE_CONDA opts into legacy flow. - use_prebaked = os.getenv("SWTBENCH_FORCE_CONDA", "").lower() not in ( + # Default: use prebaked images; SWTBENCH_FORCE_CONDA opts into legacy flow. + use_prebaked = os.getenv(ENV_SWTBENCH_FORCE_CONDA, "").lower() not in ( "1", "true", "yes", @@ -427,7 +436,7 @@ def main() -> None: ) else: logger.info( - "SWTBENCH_FORCE_CONDA set; skipping prebaked image pull " + f"{ENV_SWTBENCH_FORCE_CONDA} set; skipping prebaked image pull " "and using legacy (pre-mamba) evaluation flow" ) @@ -440,14 +449,14 @@ def main() -> None: cleanup_phase_start = monotonic() # Move SWT-Bench evaluation report to same folder as output.jsonl cache_dir = Path.home() / ".cache" / "openhands" / "swt-bench" - swt_bench_dir = cache_dir / "swt-bench" - report_dir = swt_bench_dir / "evaluation_results" - run_id = f"eval_{output_file.stem}" + swt_bench_dir = cache_dir / SWT_BENCH_REPO_DIR + report_dir = swt_bench_dir / EVALUATION_RESULTS_DIR + run_id = f"{EVAL_RUN_ID_PREFIX}{output_file.stem}" model_name_safe = args.model_name.replace("/", "__") report_file = report_dir / f"{model_name_safe}.{run_id}.json" target_dir = input_file.parent - target_file = target_dir / "output.report.json" + target_file = target_dir / REPORT_FILENAME shutil.move(str(report_file), str(target_file)) logger.info(f"Moved evaluation report to: {target_file}") update_report_with_submitted_instances(target_file, output_file) diff --git a/benchmarks/swtbench/image_utils.py b/benchmarks/swtbench/image_utils.py index e7aae1f4..3bcacc29 100644 --- a/benchmarks/swtbench/image_utils.py +++ b/benchmarks/swtbench/image_utils.py @@ -7,6 +7,11 @@ from pathlib import Path from typing import Iterable +from benchmarks.swtbench.constants import ( + DEFAULT_SPLIT, + SWT_BENCH_REPO_DIR, + SWT_BENCH_REPO_URL, +) from openhands.sdk import get_logger @@ -20,7 +25,7 @@ def ensure_swt_bench_repo(cache_dir: Path | None = None) -> Path: Returns the repository path under the cache directory. """ cache_dir = cache_dir or Path.home() / ".cache" / "openhands" / "swt-bench" - swt_bench_dir = cache_dir / "swt-bench" + swt_bench_dir = cache_dir / SWT_BENCH_REPO_DIR if swt_bench_dir.exists(): return swt_bench_dir @@ -31,7 +36,7 @@ def ensure_swt_bench_repo(cache_dir: Path | None = None) -> Path: [ "git", "clone", - "https://github.com/logic-star-ai/swt-bench.git", + SWT_BENCH_REPO_URL, str(swt_bench_dir), ], text=True, @@ -131,7 +136,7 @@ def main() -> None: ) parser.add_argument("output_jsonl", type=Path, help="Path to output.jsonl") parser.add_argument("--dataset", required=True, help="Dataset name") - parser.add_argument("--split", default="test", help="Dataset split") + parser.add_argument("--split", default=DEFAULT_SPLIT, help="Dataset split") parser.add_argument( "--format", choices=["plain", "json"], diff --git a/benchmarks/swtbench/run_infer.py b/benchmarks/swtbench/run_infer.py index a454e580..94a0b814 100644 --- a/benchmarks/swtbench/run_infer.py +++ b/benchmarks/swtbench/run_infer.py @@ -4,6 +4,26 @@ from jinja2 import Environment, FileSystemLoader +from benchmarks.swtbench.constants import ( + AGENT_SERVER_IMAGE_BASE, + DEFAULT_BUILD_TARGET, + DEFAULT_ENV_SETUP_COMMANDS, + DEFAULT_RUNTIME_API_URL, + DEFAULT_SKIP_BUILD, + DEFAULT_STARTUP_TIMEOUT, + ENV_REMOTE_RUNTIME_STARTUP_TIMEOUT, + ENV_RUNTIME_API_KEY, + ENV_RUNTIME_API_URL, + ENV_SDK_SHORT_SHA, + ENV_SKIP_BUILD, + EVAL_NOTE_PREFIX, + GIT_USER_EMAIL, + GIT_USER_NAME, + IMAGE_NAME_SEPARATOR, + IMAGE_TAG_LATEST, + SWEBENCH_DOCKER_IMAGE_PREFIX, + SWTBENCH_DOCKER_IMAGE_PREFIX, +) from benchmarks.utils.args_parser import get_parser from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE from benchmarks.utils.conversation import build_event_persistence_callback @@ -34,26 +54,28 @@ def get_official_docker_image( instance_id: str, - docker_image_prefix="docker.io/swebench/", + docker_image_prefix: str = SWEBENCH_DOCKER_IMAGE_PREFIX, ) -> str: # Official SWE-Bench image # swebench/sweb.eval.x86_64.django_1776_django-11333:v1 repo, name = instance_id.split("__") official_image_name = docker_image_prefix.rstrip("/") - official_image_name += f"/sweb.eval.x86_64.{repo}_1776_{name}:latest".lower() + official_image_name += ( + f"/sweb.eval.x86_64.{repo}_{IMAGE_NAME_SEPARATOR}_{name}:{IMAGE_TAG_LATEST}" + ).lower() logger.debug(f"Using official SWE-Bench image: {official_image_name}") return official_image_name def get_agent_server_docker_image( instance_id: str, - docker_image_prefix="docker.io/swtbench/", - target: str = "source-minimal", + docker_image_prefix: str = SWTBENCH_DOCKER_IMAGE_PREFIX, + target: str = DEFAULT_BUILD_TARGET, ) -> str: """Get the agent server Docker image for an instance.""" official_image_name = get_official_docker_image(instance_id, docker_image_prefix) return ( - "ghcr.io/all-hands-ai/agent-server" + AGENT_SERVER_IMAGE_BASE + f":v{__version__}_{_base_slug(official_image_name)}_{target}" ) @@ -154,7 +176,7 @@ def prepare_workspace( forward_env: Environment variables to forward into the workspace. """ official_docker_image = get_official_docker_image(instance.id) - build_target = "source-minimal" + build_target = DEFAULT_BUILD_TARGET # Create a custom tag for the image name_tag = official_docker_image.split("/")[-1] @@ -166,15 +188,19 @@ def prepare_workspace( agent_server_image = ( f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}" ) - SKIP_BUILD = os.getenv("SKIP_BUILD", "1").lower() in ("1", "true", "yes") - logger.info(f"SKIP_BUILD={SKIP_BUILD}") - if not SKIP_BUILD: + skip_build = os.getenv(ENV_SKIP_BUILD, DEFAULT_SKIP_BUILD).lower() in ( + "1", + "true", + "yes", + ) + logger.info(f"{ENV_SKIP_BUILD}={skip_build}") + if not skip_build: logger.info( f"Building workspace from {official_docker_image} " f"for instance {instance.id}. " "This may take a while...\n" "You can run benchmarks/swtbench/build_images.py and set " - "SKIP_BUILD=1 to skip building and use pre-built " + f"{ENV_SKIP_BUILD}=1 to skip building and use pre-built " "agent-server image." ) # For SWT-bench, we use DockerDevWorkspace with base_image @@ -191,11 +217,11 @@ def prepare_workspace( forward_env=forward_env or [], ) elif self.metadata.workspace_type == "remote": - runtime_api_key = os.getenv("RUNTIME_API_KEY") - sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA) + runtime_api_key = os.getenv(ENV_RUNTIME_API_KEY) + sdk_short_sha = os.getenv(ENV_SDK_SHORT_SHA, SDK_SHORT_SHA) if not runtime_api_key: raise ValueError( - "RUNTIME_API_KEY environment variable is not set for remote workspace" + f"{ENV_RUNTIME_API_KEY} environment variable is not set for remote workspace" ) agent_server_image = ( @@ -210,11 +236,11 @@ def prepare_workspace( f"Using remote workspace with image {agent_server_image} " f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})" ) - startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600")) + startup_timeout = float( + os.getenv(ENV_REMOTE_RUNTIME_STARTUP_TIMEOUT, DEFAULT_STARTUP_TIMEOUT) + ) workspace = APIRemoteWorkspace( - runtime_api_url=os.getenv( - "RUNTIME_API_URL", "https://runtime.eval.all-hands.dev" - ), + runtime_api_url=os.getenv(ENV_RUNTIME_API_URL, DEFAULT_RUNTIME_API_URL), runtime_api_key=runtime_api_key, server_image=agent_server_image, target_type="source" if "source" in build_target else "binary", @@ -283,7 +309,7 @@ def evaluate_instance( logger.info("repo_path: %s", repo_path) cp_testebed_repo = workspace.execute_command( - (f"mkdir -p {repo_path} ; cp -r /testbed/. {repo_path}") + f"mkdir -p {repo_path} ; cp -r /testbed/. {repo_path}" ) assert cp_testebed_repo.exit_code == 0, ( f"cp_testebed_repo failed: {cp_testebed_repo.stderr}" @@ -309,8 +335,8 @@ def evaluate_instance( # Use --no-verify to bypass pre-commit hooks (e.g., husky) that can fail workspace.execute_command( f"cd {repo_path} && " - "git config --global user.email 'evaluation@openhands.dev' && " - "git config --global user.name 'OpenHands Evaluation' && " + f"git config --global user.email '{GIT_USER_EMAIL}' && " + f"git config --global user.name '{GIT_USER_NAME}' && " "git commit --no-verify -m 'patch'" ) @@ -378,7 +404,7 @@ def main() -> None: dataset_name=dataset_description, model_name=llm.model, max_iterations=args.max_iterations, - eval_note="SWT-" + args.note, + eval_note=EVAL_NOTE_PREFIX + args.note, ) critic = create_critic(args) @@ -392,7 +418,7 @@ def main() -> None: details={}, prompt_path=args.prompt_path, eval_limit=args.n_limit, - env_setup_commands=["export PIP_CACHE_DIR=~/.cache/pip"], + env_setup_commands=DEFAULT_ENV_SETUP_COMMANDS, max_attempts=args.max_attempts, critic=critic, selected_instances_file=args.select, From 53af2bd9b7ee98b5534f06b789895a741c42cd3c Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 28 Jan 2026 14:22:31 +0000 Subject: [PATCH 2/4] refactor(swtbench): improve constants.py with type safety and immutability - Add typing.Final annotations to all constants for type safety - Convert mutable lists to immutable tuples: - SETUP_FILES_TO_REMOVE - BUILD_MODE_CHOICES - DEFAULT_ENV_SETUP_COMMANDS - Add BuildMode enum for type-safe build mode selection - Convert string numeric constants to proper int types: - DEFAULT_STARTUP_TIMEOUT: '600' -> 600 - DEFAULT_EVAL_WORKERS: '12' -> 12 - Update callers to handle type changes: - eval_infer.py: workers parameter now int, add type=int to argparse - run_infer.py: convert tuple to list for env_setup_commands, convert int to str for os.getenv default Co-authored-by: openhands --- benchmarks/swtbench/constants.py | 104 ++++++++++++++++++------------ benchmarks/swtbench/eval_infer.py | 3 +- benchmarks/swtbench/run_infer.py | 6 +- 3 files changed, 67 insertions(+), 46 deletions(-) diff --git a/benchmarks/swtbench/constants.py b/benchmarks/swtbench/constants.py index 36dbdaa2..105316ce 100644 --- a/benchmarks/swtbench/constants.py +++ b/benchmarks/swtbench/constants.py @@ -5,125 +5,143 @@ and constant values used in the SWTBench evaluation workflow. """ +from enum import Enum +from typing import Final, Tuple + + # ============================================================================= # Docker/Image Related Constants # ============================================================================= # Docker image prefixes -SWEBENCH_DOCKER_IMAGE_PREFIX = "docker.io/swebench/" -SWTBENCH_DOCKER_IMAGE_PREFIX = "docker.io/swtbench/" +SWEBENCH_DOCKER_IMAGE_PREFIX: Final[str] = "docker.io/swebench/" +SWTBENCH_DOCKER_IMAGE_PREFIX: Final[str] = "docker.io/swtbench/" # Agent server image base -AGENT_SERVER_IMAGE_BASE = "ghcr.io/all-hands-ai/agent-server" +AGENT_SERVER_IMAGE_BASE: Final[str] = "ghcr.io/all-hands-ai/agent-server" # Prebaked evaluation images registry -PREBAKED_REGISTRY = "ghcr.io/openhands/swtbench-eval" +PREBAKED_REGISTRY: Final[str] = "ghcr.io/openhands/swtbench-eval" # Build target for agent server images -DEFAULT_BUILD_TARGET = "source-minimal" +DEFAULT_BUILD_TARGET: Final[str] = "source-minimal" # Image tag constants -IMAGE_TAG_LATEST = "latest" -IMAGE_NAME_SEPARATOR = "1776" +IMAGE_TAG_LATEST: Final[str] = "latest" +IMAGE_NAME_SEPARATOR: Final[str] = "1776" + + +class BuildMode(str, Enum): + """Build mode options for SWT-bench evaluation.""" + + API = "api" + CLI = "cli" + + +# Default build mode +DEFAULT_BUILD_MODE: Final[str] = BuildMode.CLI.value + +# Build mode choices (tuple for immutability) +BUILD_MODE_CHOICES: Final[Tuple[str, ...]] = tuple(m.value for m in BuildMode) # ============================================================================= # Dataset Related Constants # ============================================================================= # Default dataset for evaluation -DEFAULT_DATASET = "princeton-nlp/SWE-bench_Verified" +DEFAULT_DATASET: Final[str] = "princeton-nlp/SWE-bench_Verified" # Default dataset split -DEFAULT_SPLIT = "test" +DEFAULT_SPLIT: Final[str] = "test" # Default model name for predictions -DEFAULT_MODEL_NAME = "OpenHands" +DEFAULT_MODEL_NAME: Final[str] = "OpenHands" # ============================================================================= # Environment Variable Names # ============================================================================= -ENV_SKIP_BUILD = "SKIP_BUILD" -ENV_RUNTIME_API_KEY = "RUNTIME_API_KEY" -ENV_SDK_SHORT_SHA = "SDK_SHORT_SHA" -ENV_RUNTIME_API_URL = "RUNTIME_API_URL" -ENV_REMOTE_RUNTIME_STARTUP_TIMEOUT = "REMOTE_RUNTIME_STARTUP_TIMEOUT" -ENV_SWTBENCH_FORCE_CONDA = "SWTBENCH_FORCE_CONDA" +ENV_SKIP_BUILD: Final[str] = "SKIP_BUILD" +ENV_RUNTIME_API_KEY: Final[str] = "RUNTIME_API_KEY" +ENV_SDK_SHORT_SHA: Final[str] = "SDK_SHORT_SHA" +ENV_RUNTIME_API_URL: Final[str] = "RUNTIME_API_URL" +ENV_REMOTE_RUNTIME_STARTUP_TIMEOUT: Final[str] = "REMOTE_RUNTIME_STARTUP_TIMEOUT" +ENV_SWTBENCH_FORCE_CONDA: Final[str] = "SWTBENCH_FORCE_CONDA" # ============================================================================= # Default Values # ============================================================================= -# Default value for SKIP_BUILD environment variable -DEFAULT_SKIP_BUILD = "1" +# Default value for SKIP_BUILD environment variable (truthy string) +DEFAULT_SKIP_BUILD: Final[str] = "1" # Default runtime API URL -DEFAULT_RUNTIME_API_URL = "https://runtime.eval.all-hands.dev" +DEFAULT_RUNTIME_API_URL: Final[str] = "https://runtime.eval.all-hands.dev" # Default startup timeout in seconds -DEFAULT_STARTUP_TIMEOUT = "600" +DEFAULT_STARTUP_TIMEOUT: Final[int] = 600 # Default number of workers for evaluation -DEFAULT_EVAL_WORKERS = "12" +DEFAULT_EVAL_WORKERS: Final[int] = 12 # Default eval limit for image building -DEFAULT_EVAL_LIMIT = 1 +DEFAULT_EVAL_LIMIT: Final[int] = 1 # Default max workers for image building -DEFAULT_BUILD_MAX_WORKERS = 4 +DEFAULT_BUILD_MAX_WORKERS: Final[int] = 4 # Default max retries for image building -DEFAULT_BUILD_MAX_RETRIES = 2 +DEFAULT_BUILD_MAX_RETRIES: Final[int] = 2 # Default batch size for image building -DEFAULT_BUILD_BATCH_SIZE = 10 - -# Default build mode -DEFAULT_BUILD_MODE = "cli" - -# Build mode choices -BUILD_MODE_CHOICES = ["api", "cli"] +DEFAULT_BUILD_BATCH_SIZE: Final[int] = 10 # ============================================================================= # File/Directory Paths # ============================================================================= # SWT-bench repository directory name -SWT_BENCH_REPO_DIR = "swt-bench" +SWT_BENCH_REPO_DIR: Final[str] = "swt-bench" # Evaluation results directory name -EVALUATION_RESULTS_DIR = "evaluation_results" +EVALUATION_RESULTS_DIR: Final[str] = "evaluation_results" # Report filename -REPORT_FILENAME = "output.report.json" +REPORT_FILENAME: Final[str] = "output.report.json" # Run ID prefix for evaluation -EVAL_RUN_ID_PREFIX = "eval_" +EVAL_RUN_ID_PREFIX: Final[str] = "eval_" # Eval note prefix -EVAL_NOTE_PREFIX = "SWT-" +EVAL_NOTE_PREFIX: Final[str] = "SWT-" # ============================================================================= # Git/Repository Related Constants # ============================================================================= # SWT-bench repository URL -SWT_BENCH_REPO_URL = "https://github.com/logic-star-ai/swt-bench.git" +SWT_BENCH_REPO_URL: Final[str] = "https://github.com/logic-star-ai/swt-bench.git" # Git user configuration for commits -GIT_USER_EMAIL = "evaluation@openhands.dev" -GIT_USER_NAME = "OpenHands Evaluation" +GIT_USER_EMAIL: Final[str] = "evaluation@openhands.dev" +GIT_USER_NAME: Final[str] = "OpenHands Evaluation" # ============================================================================= # Patch Processing Constants # ============================================================================= -# Files to remove from patches during post-processing -SETUP_FILES_TO_REMOVE = ["pyproject.toml", "tox.ini", "setup.py"] +# Files to remove from patches during post-processing (tuple for immutability) +SETUP_FILES_TO_REMOVE: Final[Tuple[str, ...]] = ( + "pyproject.toml", + "tox.ini", + "setup.py", +) # ============================================================================= # Environment Setup Commands # ============================================================================= -# Default environment setup commands -DEFAULT_ENV_SETUP_COMMANDS = ["export PIP_CACHE_DIR=~/.cache/pip"] +# Default environment setup commands (tuple for immutability) +DEFAULT_ENV_SETUP_COMMANDS: Final[Tuple[str, ...]] = ( + "export PIP_CACHE_DIR=~/.cache/pip", +) diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index 72ae403f..4f170185 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -247,7 +247,7 @@ def run_swtbench_evaluation( predictions_file: str, # Must use SWE-bench dataset because SWT-bench dataset (which is based on SWE-bench) contains a bug in their harness. dataset: str = DEFAULT_DATASET, - workers: str = DEFAULT_EVAL_WORKERS, + workers: int = DEFAULT_EVAL_WORKERS, ) -> None: """ Run SWT-Bench evaluation on the predictions file. @@ -393,6 +393,7 @@ def main() -> None: parser.add_argument( "--workers", + type=int, default=DEFAULT_EVAL_WORKERS, help=f"Number of workers to use when evaluating (default: {DEFAULT_EVAL_WORKERS})", ) diff --git a/benchmarks/swtbench/run_infer.py b/benchmarks/swtbench/run_infer.py index 94a0b814..297f82e9 100644 --- a/benchmarks/swtbench/run_infer.py +++ b/benchmarks/swtbench/run_infer.py @@ -237,7 +237,9 @@ def prepare_workspace( f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})" ) startup_timeout = float( - os.getenv(ENV_REMOTE_RUNTIME_STARTUP_TIMEOUT, DEFAULT_STARTUP_TIMEOUT) + os.getenv( + ENV_REMOTE_RUNTIME_STARTUP_TIMEOUT, str(DEFAULT_STARTUP_TIMEOUT) + ) ) workspace = APIRemoteWorkspace( runtime_api_url=os.getenv(ENV_RUNTIME_API_URL, DEFAULT_RUNTIME_API_URL), @@ -418,7 +420,7 @@ def main() -> None: details={}, prompt_path=args.prompt_path, eval_limit=args.n_limit, - env_setup_commands=DEFAULT_ENV_SETUP_COMMANDS, + env_setup_commands=list(DEFAULT_ENV_SETUP_COMMANDS), max_attempts=args.max_attempts, critic=critic, selected_instances_file=args.select, From cd5cd071db68293603d22160e134090dd7df56bf Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 28 Jan 2026 14:28:20 +0000 Subject: [PATCH 3/4] fix(swtbench): fix pyright type error for TargetType - Add TargetType alias to constants.py matching openhands.sdk.workspace.TargetType - Update DEFAULT_BUILD_TARGET to use TargetType instead of str - Update run_infer.py to use TargetType for build_target variable and function parameter Co-authored-by: openhands --- benchmarks/swtbench/constants.py | 8 ++++++-- benchmarks/swtbench/run_infer.py | 5 +++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/benchmarks/swtbench/constants.py b/benchmarks/swtbench/constants.py index 105316ce..05a9d936 100644 --- a/benchmarks/swtbench/constants.py +++ b/benchmarks/swtbench/constants.py @@ -6,7 +6,11 @@ """ from enum import Enum -from typing import Final, Tuple +from typing import Final, Literal, Tuple + + +# Type alias for build targets (matches openhands.sdk.workspace.TargetType) +TargetType = Literal["binary", "binary-minimal", "source", "source-minimal"] # ============================================================================= @@ -24,7 +28,7 @@ PREBAKED_REGISTRY: Final[str] = "ghcr.io/openhands/swtbench-eval" # Build target for agent server images -DEFAULT_BUILD_TARGET: Final[str] = "source-minimal" +DEFAULT_BUILD_TARGET: Final[TargetType] = "source-minimal" # Image tag constants IMAGE_TAG_LATEST: Final[str] = "latest" diff --git a/benchmarks/swtbench/run_infer.py b/benchmarks/swtbench/run_infer.py index 297f82e9..10bf1416 100644 --- a/benchmarks/swtbench/run_infer.py +++ b/benchmarks/swtbench/run_infer.py @@ -23,6 +23,7 @@ IMAGE_TAG_LATEST, SWEBENCH_DOCKER_IMAGE_PREFIX, SWTBENCH_DOCKER_IMAGE_PREFIX, + TargetType, ) from benchmarks.utils.args_parser import get_parser from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE @@ -70,7 +71,7 @@ def get_official_docker_image( def get_agent_server_docker_image( instance_id: str, docker_image_prefix: str = SWTBENCH_DOCKER_IMAGE_PREFIX, - target: str = DEFAULT_BUILD_TARGET, + target: TargetType = DEFAULT_BUILD_TARGET, ) -> str: """Get the agent server Docker image for an instance.""" official_image_name = get_official_docker_image(instance_id, docker_image_prefix) @@ -176,7 +177,7 @@ def prepare_workspace( forward_env: Environment variables to forward into the workspace. """ official_docker_image = get_official_docker_image(instance.id) - build_target = DEFAULT_BUILD_TARGET + build_target: TargetType = DEFAULT_BUILD_TARGET # Create a custom tag for the image name_tag = official_docker_image.split("/")[-1] From 0fa09ce22faed447650584a0ea6de3c604578df7 Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 28 Jan 2026 15:58:34 +0000 Subject: [PATCH 4/4] refactor(swtbench): simplify BuildMode by removing unused enum Remove the BuildMode enum class and replace with simple string constants. The enum was only used to generate choices and default values, which can be done more simply with a tuple and string constant. Co-authored-by: openhands --- benchmarks/swtbench/constants.py | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/benchmarks/swtbench/constants.py b/benchmarks/swtbench/constants.py index 05a9d936..05f650c1 100644 --- a/benchmarks/swtbench/constants.py +++ b/benchmarks/swtbench/constants.py @@ -5,7 +5,6 @@ and constant values used in the SWTBench evaluation workflow. """ -from enum import Enum from typing import Final, Literal, Tuple @@ -34,19 +33,9 @@ IMAGE_TAG_LATEST: Final[str] = "latest" IMAGE_NAME_SEPARATOR: Final[str] = "1776" - -class BuildMode(str, Enum): - """Build mode options for SWT-bench evaluation.""" - - API = "api" - CLI = "cli" - - -# Default build mode -DEFAULT_BUILD_MODE: Final[str] = BuildMode.CLI.value - -# Build mode choices (tuple for immutability) -BUILD_MODE_CHOICES: Final[Tuple[str, ...]] = tuple(m.value for m in BuildMode) +# Build mode choices and default +BUILD_MODE_CHOICES: Final[Tuple[str, ...]] = ("api", "cli") +DEFAULT_BUILD_MODE: Final[str] = "cli" # ============================================================================= # Dataset Related Constants