From 545be812a29c0f6afda1fb372b598fd3c77f69ec Mon Sep 17 00:00:00 2001 From: openhands Date: Tue, 27 Jan 2026 18:11:56 +0000 Subject: [PATCH 1/6] Regroup all swebench hyperparameters in a single source of truth constants.py This commit creates a new constants.py module in benchmarks/swebench that serves as the single source of truth for all constant values used in the SWE-Bench evaluation workflow. Changes: - Create benchmarks/swebench/constants.py with all constant values: - Dataset configuration (DEFAULT_DATASET, DEFAULT_SPLIT) - Docker image configuration (DOCKER_IMAGE_PREFIX, DOCKER_IMAGE_TAG) - Build configuration (BUILD_TARGET_*, DEFAULT_BUILD_TARGET) - Runtime configuration (DEFAULT_RUNTIME_API_URL, DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT) - Evaluation configuration (DEFAULT_MAX_ITERATIONS, DEFAULT_NUM_WORKERS, etc.) - Model configuration (DEFAULT_MODEL_NAME) - Git configuration (GIT_USER_EMAIL, GIT_USER_NAME, GIT_COMMIT_MESSAGE) - Patch processing (SETUP_FILES_TO_REMOVE) - Update run_infer.py, eval_infer.py, and build_images.py to import and use constants from the constants module Fixes #348 Co-authored-by: openhands --- benchmarks/swebench/build_images.py | 7 +++- benchmarks/swebench/constants.py | 60 +++++++++++++++++++++++++++++ benchmarks/swebench/eval_infer.py | 25 ++++++------ benchmarks/swebench/run_infer.py | 22 +++++++---- 4 files changed, 93 insertions(+), 21 deletions(-) create mode 100644 benchmarks/swebench/constants.py diff --git a/benchmarks/swebench/build_images.py b/benchmarks/swebench/build_images.py index cc0ae6b9..b6849581 100644 --- a/benchmarks/swebench/build_images.py +++ b/benchmarks/swebench/build_images.py @@ -12,6 +12,7 @@ import sys from pathlib import Path +from benchmarks.swebench import constants from benchmarks.utils.build_utils import ( BuildOutput, build_all_images, @@ -32,13 +33,15 @@ def get_official_docker_image( instance_id: str, - docker_image_prefix="docker.io/swebench/", + docker_image_prefix: str = constants.DOCKER_IMAGE_PREFIX, ) -> str: # Official SWE-Bench image # swebench/sweb.eval.x86_64.django_1776_django-11333:v1 repo, name = instance_id.split("__") official_image_name = docker_image_prefix.rstrip("/") - official_image_name += f"/sweb.eval.x86_64.{repo}_1776_{name}:latest".lower() + official_image_name += ( + f"/sweb.eval.x86_64.{repo}_1776_{name}:{constants.DOCKER_IMAGE_TAG}".lower() + ) logger.debug(f"Official SWE-Bench image: {official_image_name}") return official_image_name diff --git a/benchmarks/swebench/constants.py b/benchmarks/swebench/constants.py new file mode 100644 index 00000000..1206b507 --- /dev/null +++ b/benchmarks/swebench/constants.py @@ -0,0 +1,60 @@ +""" +SWE-Bench hyperparameters and constant values. + +This module serves as the single source of truth for all constant values +used in the SWE-Bench evaluation workflow. +""" + +# ============================================================================= +# Dataset Configuration +# ============================================================================= +DEFAULT_DATASET = "princeton-nlp/SWE-bench_Verified" +DEFAULT_SPLIT = "test" + +# ============================================================================= +# Docker Image Configuration +# ============================================================================= +DOCKER_IMAGE_PREFIX = "docker.io/swebench/" +DOCKER_IMAGE_TAG = "latest" + +# ============================================================================= +# Build Configuration +# ============================================================================= +BUILD_TARGET_SOURCE_MINIMAL = "source-minimal" +BUILD_TARGET_BINARY = "binary" +DEFAULT_BUILD_TARGET = BUILD_TARGET_SOURCE_MINIMAL + +# ============================================================================= +# Runtime Configuration +# ============================================================================= +DEFAULT_RUNTIME_API_URL = "https://runtime.eval.all-hands.dev" +DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT = "600" + +# ============================================================================= +# Evaluation Configuration +# ============================================================================= +DEFAULT_MAX_ITERATIONS = 100 +DEFAULT_NUM_WORKERS = 1 +DEFAULT_MAX_ATTEMPTS = 3 +DEFAULT_MAX_RETRIES = 3 +DEFAULT_EVAL_WORKERS = "12" +DEFAULT_N_LIMIT = 0 +DEFAULT_NOTE = "initial" +DEFAULT_OUTPUT_DIR = "./eval_outputs" + +# ============================================================================= +# Model Configuration +# ============================================================================= +DEFAULT_MODEL_NAME = "openhands" + +# ============================================================================= +# Git Configuration +# ============================================================================= +GIT_USER_EMAIL = "evaluation@openhands.dev" +GIT_USER_NAME = "OpenHands Evaluation" +GIT_COMMIT_MESSAGE = "patch" + +# ============================================================================= +# Patch Processing +# ============================================================================= +SETUP_FILES_TO_REMOVE = ["pyproject.toml", "tox.ini", "setup.py"] diff --git a/benchmarks/swebench/eval_infer.py b/benchmarks/swebench/eval_infer.py index f252a56a..c3ae28f9 100644 --- a/benchmarks/swebench/eval_infer.py +++ b/benchmarks/swebench/eval_infer.py @@ -16,6 +16,7 @@ import sys from pathlib import Path +from benchmarks.swebench import constants from benchmarks.utils.laminar import LaminarService from benchmarks.utils.patch_utils import remove_files_from_patch from benchmarks.utils.report_costs import generate_cost_report @@ -26,7 +27,7 @@ def convert_to_swebench_format( - input_file: str, output_file: str, model_name: str = "OpenHands" + input_file: str, output_file: str, model_name: str = constants.DEFAULT_MODEL_NAME ) -> None: """ Convert OpenHands output.jsonl to SWE-Bench prediction format. @@ -82,8 +83,9 @@ def convert_to_swebench_format( git_patch = "" # postprocess git_patch - setup_files = ["pyproject.toml", "tox.ini", "setup.py"] - git_patch = remove_files_from_patch(git_patch, setup_files) + git_patch = remove_files_from_patch( + git_patch, constants.SETUP_FILES_TO_REMOVE + ) # Create SWE-Bench format entry swebench_entry = { @@ -114,8 +116,8 @@ def convert_to_swebench_format( def run_swebench_evaluation( predictions_file: str, - dataset: str = "princeton-nlp/SWE-bench_Verified", - workers: str = "12", + dataset: str = constants.DEFAULT_DATASET, + workers: str = constants.DEFAULT_EVAL_WORKERS, ) -> None: """ Run SWE-Bench evaluation on the predictions file. @@ -196,9 +198,8 @@ def main() -> None: parser.add_argument( "--dataset", - default="princeton-nlp/SWE-bench_Verified", - help="SWE-Bench dataset to evaluate against " - "(default: princeton-nlp/SWE-bench_Verified)", + default=constants.DEFAULT_DATASET, + help=f"SWE-Bench dataset to evaluate against (default: {constants.DEFAULT_DATASET})", ) parser.add_argument( @@ -215,14 +216,14 @@ def main() -> None: parser.add_argument( "--model-name", - default="openhands", - help="Model name to use in the model_name_or_path field (default: openhands)", + default=constants.DEFAULT_MODEL_NAME, + help=f"Model name to use in the model_name_or_path field (default: {constants.DEFAULT_MODEL_NAME})", ) parser.add_argument( "--workers", - default="12", - help="Number of workers to use when evaluating", + default=constants.DEFAULT_EVAL_WORKERS, + help=f"Number of workers to use when evaluating (default: {constants.DEFAULT_EVAL_WORKERS})", ) args = parser.parse_args() diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py index 77faafd5..8c9050b6 100644 --- a/benchmarks/swebench/run_infer.py +++ b/benchmarks/swebench/run_infer.py @@ -4,6 +4,7 @@ from jinja2 import Environment, FileSystemLoader +from benchmarks.swebench import constants from benchmarks.swebench.build_images import ( extract_custom_tag, get_official_docker_image, @@ -114,10 +115,12 @@ def prepare_workspace( Used by APIRemoteWorkspace for remote runtime allocation. """ official_docker_image = get_official_docker_image(instance.id) - build_target = "source-minimal" + build_target = constants.DEFAULT_BUILD_TARGET custom_tag = extract_custom_tag(official_docker_image) # For non-binary targets, append target suffix - suffix = f"-{build_target}" if build_target != "binary" else "" + suffix = ( + f"-{build_target}" if build_target != constants.BUILD_TARGET_BINARY else "" + ) base_agent_image = ( f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}" ) @@ -183,10 +186,15 @@ def prepare_workspace( f"Using remote workspace with image {agent_server_image} " f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})" ) - startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600")) + startup_timeout = float( + os.getenv( + "REMOTE_RUNTIME_STARTUP_TIMEOUT", + constants.DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT, + ) + ) workspace = APIRemoteWorkspace( runtime_api_url=os.getenv( - "RUNTIME_API_URL", "https://runtime.eval.all-hands.dev" + "RUNTIME_API_URL", constants.DEFAULT_RUNTIME_API_URL ), runtime_api_key=runtime_api_key, server_image=agent_server_image, @@ -280,9 +288,9 @@ def evaluate_instance( # Use --no-verify to bypass pre-commit hooks (e.g., husky) that can fail workspace.execute_command( f"cd {repo_path} && " - "git config --global user.email 'evaluation@openhands.dev' && " - "git config --global user.name 'OpenHands Evaluation' && " - "git commit --no-verify -m 'patch'" + f"git config --global user.email '{constants.GIT_USER_EMAIL}' && " + f"git config --global user.name '{constants.GIT_USER_NAME}' && " + f"git commit --no-verify -m '{constants.GIT_COMMIT_MESSAGE}'" ) # Get git patch From 1bc9344d303a2e86b866e88b41bbd0471a3ca185 Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 28 Jan 2026 13:35:47 +0000 Subject: [PATCH 2/6] refactor: clean up constants.py and fix behavior-altering issues - Fix DEFAULT_MODEL_NAME case: 'openhands' -> 'OpenHands' to match original function default - Move WRAPPED_REPOS from build_images.py to constants.py - Remove unused constants: DEFAULT_SPLIT, DEFAULT_MAX_ITERATIONS, DEFAULT_NUM_WORKERS, DEFAULT_MAX_ATTEMPTS, DEFAULT_MAX_RETRIES, DEFAULT_N_LIMIT, DEFAULT_NOTE, DEFAULT_OUTPUT_DIR - Fix type: DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT is now int (600) instead of string Co-authored-by: openhands --- benchmarks/swebench/build_images.py | 6 ++---- benchmarks/swebench/constants.py | 14 ++++---------- benchmarks/swebench/run_infer.py | 2 +- 3 files changed, 7 insertions(+), 15 deletions(-) diff --git a/benchmarks/swebench/build_images.py b/benchmarks/swebench/build_images.py index b6849581..2041ed58 100644 --- a/benchmarks/swebench/build_images.py +++ b/benchmarks/swebench/build_images.py @@ -27,8 +27,6 @@ logger = get_logger(__name__) WRAPPER_DOCKERFILE = Path(__file__).with_name("Dockerfile.swebench-deps") -# Repos that require the docutils/roman wrapper layer -WRAPPED_REPOS = {"sphinx-doc"} def get_official_docker_image( @@ -63,12 +61,12 @@ def should_wrap_custom_tag(custom_tag: str) -> bool: prefix = "sweb.eval.x86_64." if custom_tag.startswith(prefix): custom_tag = custom_tag[len(prefix) :] - return custom_tag.split("_", 1)[0] in WRAPPED_REPOS + return custom_tag.split("_", 1)[0] in constants.WRAPPED_REPOS def should_wrap_instance_id(instance_id: str) -> bool: repo = instance_id.split("__")[0] - return repo in WRAPPED_REPOS + return repo in constants.WRAPPED_REPOS def collect_unique_base_images( diff --git a/benchmarks/swebench/constants.py b/benchmarks/swebench/constants.py index 1206b507..a9c58891 100644 --- a/benchmarks/swebench/constants.py +++ b/benchmarks/swebench/constants.py @@ -9,13 +9,14 @@ # Dataset Configuration # ============================================================================= DEFAULT_DATASET = "princeton-nlp/SWE-bench_Verified" -DEFAULT_SPLIT = "test" # ============================================================================= # Docker Image Configuration # ============================================================================= DOCKER_IMAGE_PREFIX = "docker.io/swebench/" DOCKER_IMAGE_TAG = "latest" +# Repos that require the docutils/roman wrapper layer +WRAPPED_REPOS = {"sphinx-doc"} # ============================================================================= # Build Configuration @@ -28,24 +29,17 @@ # Runtime Configuration # ============================================================================= DEFAULT_RUNTIME_API_URL = "https://runtime.eval.all-hands.dev" -DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT = "600" +DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT = 600 # ============================================================================= # Evaluation Configuration # ============================================================================= -DEFAULT_MAX_ITERATIONS = 100 -DEFAULT_NUM_WORKERS = 1 -DEFAULT_MAX_ATTEMPTS = 3 -DEFAULT_MAX_RETRIES = 3 DEFAULT_EVAL_WORKERS = "12" -DEFAULT_N_LIMIT = 0 -DEFAULT_NOTE = "initial" -DEFAULT_OUTPUT_DIR = "./eval_outputs" # ============================================================================= # Model Configuration # ============================================================================= -DEFAULT_MODEL_NAME = "openhands" +DEFAULT_MODEL_NAME = "OpenHands" # ============================================================================= # Git Configuration diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py index 8c9050b6..e19f0877 100644 --- a/benchmarks/swebench/run_infer.py +++ b/benchmarks/swebench/run_infer.py @@ -189,7 +189,7 @@ def prepare_workspace( startup_timeout = float( os.getenv( "REMOTE_RUNTIME_STARTUP_TIMEOUT", - constants.DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT, + str(constants.DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT), ) ) workspace = APIRemoteWorkspace( From f1447e5c7a085bfdb5544cddf07d2fac333512cf Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 28 Jan 2026 14:48:37 +0000 Subject: [PATCH 3/6] refactor: improve constants.py with type safety and behavior preservation - Fix breaking change: add DEFAULT_CLI_MODEL_NAME to preserve original CLI default ('openhands') - Use typing.Final for all constants to indicate immutability - Use frozenset for WRAPPED_REPOS (immutable) - Use tuple for SETUP_FILES_TO_REMOVE (immutable) - Change DEFAULT_EVAL_WORKERS to int type with proper conversion at usage sites - Simplify section headers for cleaner code - Add type=int to --workers argparse argument for proper type handling Co-authored-by: openhands --- benchmarks/swebench/constants.py | 85 ++++++++++++++----------------- benchmarks/swebench/eval_infer.py | 7 +-- 2 files changed, 43 insertions(+), 49 deletions(-) diff --git a/benchmarks/swebench/constants.py b/benchmarks/swebench/constants.py index a9c58891..21312293 100644 --- a/benchmarks/swebench/constants.py +++ b/benchmarks/swebench/constants.py @@ -5,50 +5,43 @@ used in the SWE-Bench evaluation workflow. """ -# ============================================================================= -# Dataset Configuration -# ============================================================================= -DEFAULT_DATASET = "princeton-nlp/SWE-bench_Verified" - -# ============================================================================= -# Docker Image Configuration -# ============================================================================= -DOCKER_IMAGE_PREFIX = "docker.io/swebench/" -DOCKER_IMAGE_TAG = "latest" -# Repos that require the docutils/roman wrapper layer -WRAPPED_REPOS = {"sphinx-doc"} - -# ============================================================================= -# Build Configuration -# ============================================================================= -BUILD_TARGET_SOURCE_MINIMAL = "source-minimal" -BUILD_TARGET_BINARY = "binary" -DEFAULT_BUILD_TARGET = BUILD_TARGET_SOURCE_MINIMAL - -# ============================================================================= -# Runtime Configuration -# ============================================================================= -DEFAULT_RUNTIME_API_URL = "https://runtime.eval.all-hands.dev" -DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT = 600 - -# ============================================================================= -# Evaluation Configuration -# ============================================================================= -DEFAULT_EVAL_WORKERS = "12" - -# ============================================================================= -# Model Configuration -# ============================================================================= -DEFAULT_MODEL_NAME = "OpenHands" - -# ============================================================================= -# Git Configuration -# ============================================================================= -GIT_USER_EMAIL = "evaluation@openhands.dev" -GIT_USER_NAME = "OpenHands Evaluation" -GIT_COMMIT_MESSAGE = "patch" - -# ============================================================================= +from typing import Final + + +# Dataset +DEFAULT_DATASET: Final[str] = "princeton-nlp/SWE-bench_Verified" + +# Docker +DOCKER_IMAGE_PREFIX: Final[str] = "docker.io/swebench/" +DOCKER_IMAGE_TAG: Final[str] = "latest" +WRAPPED_REPOS: Final[frozenset[str]] = frozenset( + {"sphinx-doc"} +) # Repos requiring docutils/roman wrapper + +# Build +BUILD_TARGET_SOURCE_MINIMAL: Final[str] = "source-minimal" +BUILD_TARGET_BINARY: Final[str] = "binary" +DEFAULT_BUILD_TARGET: Final[str] = BUILD_TARGET_SOURCE_MINIMAL + +# Runtime +DEFAULT_RUNTIME_API_URL: Final[str] = "https://runtime.eval.all-hands.dev" +DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT: Final[int] = 600 + +# Evaluation +DEFAULT_EVAL_WORKERS: Final[int] = 12 + +# Model - preserving original behavior: function default is "OpenHands", CLI default is "openhands" +DEFAULT_MODEL_NAME: Final[str] = "OpenHands" +DEFAULT_CLI_MODEL_NAME: Final[str] = "openhands" + +# Git +GIT_USER_EMAIL: Final[str] = "evaluation@openhands.dev" +GIT_USER_NAME: Final[str] = "OpenHands Evaluation" +GIT_COMMIT_MESSAGE: Final[str] = "patch" + # Patch Processing -# ============================================================================= -SETUP_FILES_TO_REMOVE = ["pyproject.toml", "tox.ini", "setup.py"] +SETUP_FILES_TO_REMOVE: Final[tuple[str, ...]] = ( + "pyproject.toml", + "tox.ini", + "setup.py", +) diff --git a/benchmarks/swebench/eval_infer.py b/benchmarks/swebench/eval_infer.py index c3ae28f9..b1c5ee69 100644 --- a/benchmarks/swebench/eval_infer.py +++ b/benchmarks/swebench/eval_infer.py @@ -117,7 +117,7 @@ def convert_to_swebench_format( def run_swebench_evaluation( predictions_file: str, dataset: str = constants.DEFAULT_DATASET, - workers: str = constants.DEFAULT_EVAL_WORKERS, + workers: int = constants.DEFAULT_EVAL_WORKERS, ) -> None: """ Run SWE-Bench evaluation on the predictions file. @@ -216,12 +216,13 @@ def main() -> None: parser.add_argument( "--model-name", - default=constants.DEFAULT_MODEL_NAME, - help=f"Model name to use in the model_name_or_path field (default: {constants.DEFAULT_MODEL_NAME})", + default=constants.DEFAULT_CLI_MODEL_NAME, + help=f"Model name to use in the model_name_or_path field (default: {constants.DEFAULT_CLI_MODEL_NAME})", ) parser.add_argument( "--workers", + type=int, default=constants.DEFAULT_EVAL_WORKERS, help=f"Number of workers to use when evaluating (default: {constants.DEFAULT_EVAL_WORKERS})", ) From fee639d8b0000f8b917b3df359ab0a1ccf94f609 Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 28 Jan 2026 15:17:22 +0000 Subject: [PATCH 4/6] fix: use proper Literal type for build target constants The pyright type checker was failing because DEFAULT_BUILD_TARGET was typed as Final[str] but build_image() expects a TargetType which is Literal['binary', 'binary-minimal', 'source', 'source-minimal']. This fix adds a local TargetType alias and properly types the build target constants to match the expected type signature. --- benchmarks/swebench/constants.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/benchmarks/swebench/constants.py b/benchmarks/swebench/constants.py index 21312293..88d795c8 100644 --- a/benchmarks/swebench/constants.py +++ b/benchmarks/swebench/constants.py @@ -5,7 +5,7 @@ used in the SWE-Bench evaluation workflow. """ -from typing import Final +from typing import Final, Literal # Dataset @@ -18,10 +18,11 @@ {"sphinx-doc"} ) # Repos requiring docutils/roman wrapper -# Build -BUILD_TARGET_SOURCE_MINIMAL: Final[str] = "source-minimal" -BUILD_TARGET_BINARY: Final[str] = "binary" -DEFAULT_BUILD_TARGET: Final[str] = BUILD_TARGET_SOURCE_MINIMAL +# Build target type (matches openhands.agent_server.docker.build.TargetType) +TargetType = Literal["binary", "binary-minimal", "source", "source-minimal"] +BUILD_TARGET_SOURCE_MINIMAL: Final[TargetType] = "source-minimal" +BUILD_TARGET_BINARY: Final[TargetType] = "binary" +DEFAULT_BUILD_TARGET: Final[TargetType] = BUILD_TARGET_SOURCE_MINIMAL # Runtime DEFAULT_RUNTIME_API_URL: Final[str] = "https://runtime.eval.all-hands.dev" From 214c3abdfe4272ada5045f648dc3ac006cae9b41 Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 28 Jan 2026 16:42:08 +0000 Subject: [PATCH 5/6] refactor: centralize CLI argument defaults in utils/constants.py - Remove hardcoded dataset default from args_parser.py (now None) - Each benchmark sets its own dataset default via parser.set_defaults() - Add shared constants to utils/constants.py: - DEFAULT_WORKSPACE='remote' (behavior change from 'docker') - DEFAULT_SPLIT, DEFAULT_MAX_ITERATIONS, DEFAULT_NUM_EVAL_WORKERS - DEFAULT_OUTPUT_DIR, DEFAULT_MAX_ATTEMPTS, DEFAULT_MAX_RETRIES - DEFAULT_NOTE, DEFAULT_N_LIMIT, DEFAULT_CRITIC - Update args_parser.py to use constants for all defaults - Update critics.py to use DEFAULT_CRITIC constant - Update swebench/run_infer.py to set its own dataset default Co-authored-by: openhands --- benchmarks/swebench/run_infer.py | 2 ++ benchmarks/utils/args_parser.py | 54 ++++++++++++++++++++++---------- benchmarks/utils/constants.py | 33 +++++++++++++++++-- benchmarks/utils/critics.py | 5 +-- 4 files changed, 73 insertions(+), 21 deletions(-) diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py index e19f0877..942dbbbb 100644 --- a/benchmarks/swebench/run_infer.py +++ b/benchmarks/swebench/run_infer.py @@ -334,6 +334,8 @@ def main() -> None: choices=choices, help="Path to prompt template file", ) + # Set SWE-bench specific default dataset + parser.set_defaults(dataset=constants.DEFAULT_DATASET) args = parser.parse_args() # Validate max_attempts diff --git a/benchmarks/utils/args_parser.py b/benchmarks/utils/args_parser.py index 60f08d73..4391054e 100644 --- a/benchmarks/utils/args_parser.py +++ b/benchmarks/utils/args_parser.py @@ -1,15 +1,19 @@ """ -Argument parsing utilities for SWE-bench benchmarks. +Argument parsing utilities for benchmarks. """ import argparse +from benchmarks.utils import constants from benchmarks.utils.critics import add_critic_args def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: """Create and return argument parser. + Note: --dataset has no default. Each benchmark should set its own default + using parser.set_defaults(dataset=). + Returns: ArgumentParser instance """ @@ -23,41 +27,57 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: parser.add_argument( "--dataset", type=str, - default="princeton-nlp/SWE-bench_Verified", - help="Dataset name", + default=None, + help="Dataset name (required unless benchmark provides default)", + ) + parser.add_argument( + "--split", + type=str, + default=constants.DEFAULT_SPLIT, + help=f"Dataset split (default: {constants.DEFAULT_SPLIT})", ) - parser.add_argument("--split", type=str, default="test", help="Dataset split") parser.add_argument( "--workspace", type=str, - default="docker", + default=constants.DEFAULT_WORKSPACE, choices=["docker", "remote"], - help="Type of workspace to use (default: docker)", + help=f"Type of workspace to use (default: {constants.DEFAULT_WORKSPACE})", ) parser.add_argument( - "--max-iterations", type=int, default=100, help="Maximum iterations" + "--max-iterations", + type=int, + default=constants.DEFAULT_MAX_ITERATIONS, + help=f"Maximum iterations (default: {constants.DEFAULT_MAX_ITERATIONS})", ) parser.add_argument( - "--num-workers", type=int, default=1, help="Number of evaluation workers" + "--num-workers", + type=int, + default=constants.DEFAULT_NUM_EVAL_WORKERS, + help=f"Number of evaluation workers (default: {constants.DEFAULT_NUM_EVAL_WORKERS})", + ) + parser.add_argument( + "--note", + type=str, + default=constants.DEFAULT_NOTE, + help=f"Evaluation note (default: {constants.DEFAULT_NOTE})", ) - parser.add_argument("--note", type=str, default="initial", help="Evaluation note") parser.add_argument( "--output-dir", type=str, - default="./eval_outputs", - help="Evaluation output directory", + default=constants.DEFAULT_OUTPUT_DIR, + help=f"Evaluation output directory (default: {constants.DEFAULT_OUTPUT_DIR})", ) parser.add_argument( "--n-limit", type=int, - default=0, - help="Limit number of instances to evaluate", + default=constants.DEFAULT_N_LIMIT, + help=f"Limit number of instances to evaluate (default: {constants.DEFAULT_N_LIMIT})", ) parser.add_argument( "--max-attempts", type=int, - default=3, - help="Maximum number of attempts for iterative mode (default: 3, min: 1)", + default=constants.DEFAULT_MAX_ATTEMPTS, + help=f"Maximum number of attempts for iterative mode (default: {constants.DEFAULT_MAX_ATTEMPTS}, min: 1)", ) # Add critic arguments @@ -72,7 +92,7 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: parser.add_argument( "--max-retries", type=int, - default=3, - help="Maximum retries for instances that throw exceptions (default: 3)", + default=constants.DEFAULT_MAX_RETRIES, + help=f"Maximum retries for instances that throw exceptions (default: {constants.DEFAULT_MAX_RETRIES})", ) return parser diff --git a/benchmarks/utils/constants.py b/benchmarks/utils/constants.py index 9337b847..124eb041 100644 --- a/benchmarks/utils/constants.py +++ b/benchmarks/utils/constants.py @@ -1,2 +1,31 @@ -OUTPUT_FILENAME = "output.jsonl" -EVAL_AGENT_SERVER_IMAGE = "ghcr.io/openhands/eval-agent-server" +""" +Shared constants for all benchmarks. + +This module contains default values used across multiple benchmarks. +Benchmark-specific constants should be defined in their own constants.py files. +""" + +from typing import Final + + +# Output +OUTPUT_FILENAME: Final[str] = "output.jsonl" + +# Docker +EVAL_AGENT_SERVER_IMAGE: Final[str] = "ghcr.io/openhands/eval-agent-server" + +# Workspace +DEFAULT_WORKSPACE: Final[str] = "remote" +DEFAULT_SPLIT: Final[str] = "test" + +# Evaluation +DEFAULT_MAX_ITERATIONS: Final[int] = 100 +DEFAULT_NUM_EVAL_WORKERS: Final[int] = 1 +DEFAULT_OUTPUT_DIR: Final[str] = "./eval_outputs" +DEFAULT_MAX_ATTEMPTS: Final[int] = 3 +DEFAULT_MAX_RETRIES: Final[int] = 3 +DEFAULT_NOTE: Final[str] = "initial" +DEFAULT_N_LIMIT: Final[int] = 0 + +# Critic +DEFAULT_CRITIC: Final[str] = "pass" diff --git a/benchmarks/utils/critics.py b/benchmarks/utils/critics.py index af9c55ae..b97083a1 100644 --- a/benchmarks/utils/critics.py +++ b/benchmarks/utils/critics.py @@ -11,6 +11,7 @@ from pathlib import Path from typing import Set +from benchmarks.utils import constants from benchmarks.utils.models import EvalInstanceID, EvalOutput from openhands.sdk import get_logger from openhands.sdk.critic import ( @@ -37,9 +38,9 @@ def add_critic_args(parser: ArgumentParser) -> None: parser.add_argument( "--critic", type=str, - default="pass", + default=constants.DEFAULT_CRITIC, help=( - "Name of the critic to use for evaluation (default: 'pass'). " + f"Name of the critic to use for evaluation (default: '{constants.DEFAULT_CRITIC}'). " "Critics determine whether an agent's output is considered successful " "and whether another attempt should be made in iterative evaluation mode. " "Available critics: " From 76069a935c89012cc636cc73ebc0d4d3b03f9618 Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 28 Jan 2026 16:50:31 +0000 Subject: [PATCH 6/6] Revert "refactor: centralize CLI argument defaults in utils/constants.py" This reverts commit 214c3abdfe4272ada5045f648dc3ac006cae9b41. --- benchmarks/swebench/run_infer.py | 2 -- benchmarks/utils/args_parser.py | 54 ++++++++++---------------------- benchmarks/utils/constants.py | 33 ++----------------- benchmarks/utils/critics.py | 5 ++- 4 files changed, 21 insertions(+), 73 deletions(-) diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py index 942dbbbb..e19f0877 100644 --- a/benchmarks/swebench/run_infer.py +++ b/benchmarks/swebench/run_infer.py @@ -334,8 +334,6 @@ def main() -> None: choices=choices, help="Path to prompt template file", ) - # Set SWE-bench specific default dataset - parser.set_defaults(dataset=constants.DEFAULT_DATASET) args = parser.parse_args() # Validate max_attempts diff --git a/benchmarks/utils/args_parser.py b/benchmarks/utils/args_parser.py index 4391054e..60f08d73 100644 --- a/benchmarks/utils/args_parser.py +++ b/benchmarks/utils/args_parser.py @@ -1,19 +1,15 @@ """ -Argument parsing utilities for benchmarks. +Argument parsing utilities for SWE-bench benchmarks. """ import argparse -from benchmarks.utils import constants from benchmarks.utils.critics import add_critic_args def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: """Create and return argument parser. - Note: --dataset has no default. Each benchmark should set its own default - using parser.set_defaults(dataset=). - Returns: ArgumentParser instance """ @@ -27,57 +23,41 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: parser.add_argument( "--dataset", type=str, - default=None, - help="Dataset name (required unless benchmark provides default)", - ) - parser.add_argument( - "--split", - type=str, - default=constants.DEFAULT_SPLIT, - help=f"Dataset split (default: {constants.DEFAULT_SPLIT})", + default="princeton-nlp/SWE-bench_Verified", + help="Dataset name", ) + parser.add_argument("--split", type=str, default="test", help="Dataset split") parser.add_argument( "--workspace", type=str, - default=constants.DEFAULT_WORKSPACE, + default="docker", choices=["docker", "remote"], - help=f"Type of workspace to use (default: {constants.DEFAULT_WORKSPACE})", + help="Type of workspace to use (default: docker)", ) parser.add_argument( - "--max-iterations", - type=int, - default=constants.DEFAULT_MAX_ITERATIONS, - help=f"Maximum iterations (default: {constants.DEFAULT_MAX_ITERATIONS})", + "--max-iterations", type=int, default=100, help="Maximum iterations" ) parser.add_argument( - "--num-workers", - type=int, - default=constants.DEFAULT_NUM_EVAL_WORKERS, - help=f"Number of evaluation workers (default: {constants.DEFAULT_NUM_EVAL_WORKERS})", - ) - parser.add_argument( - "--note", - type=str, - default=constants.DEFAULT_NOTE, - help=f"Evaluation note (default: {constants.DEFAULT_NOTE})", + "--num-workers", type=int, default=1, help="Number of evaluation workers" ) + parser.add_argument("--note", type=str, default="initial", help="Evaluation note") parser.add_argument( "--output-dir", type=str, - default=constants.DEFAULT_OUTPUT_DIR, - help=f"Evaluation output directory (default: {constants.DEFAULT_OUTPUT_DIR})", + default="./eval_outputs", + help="Evaluation output directory", ) parser.add_argument( "--n-limit", type=int, - default=constants.DEFAULT_N_LIMIT, - help=f"Limit number of instances to evaluate (default: {constants.DEFAULT_N_LIMIT})", + default=0, + help="Limit number of instances to evaluate", ) parser.add_argument( "--max-attempts", type=int, - default=constants.DEFAULT_MAX_ATTEMPTS, - help=f"Maximum number of attempts for iterative mode (default: {constants.DEFAULT_MAX_ATTEMPTS}, min: 1)", + default=3, + help="Maximum number of attempts for iterative mode (default: 3, min: 1)", ) # Add critic arguments @@ -92,7 +72,7 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: parser.add_argument( "--max-retries", type=int, - default=constants.DEFAULT_MAX_RETRIES, - help=f"Maximum retries for instances that throw exceptions (default: {constants.DEFAULT_MAX_RETRIES})", + default=3, + help="Maximum retries for instances that throw exceptions (default: 3)", ) return parser diff --git a/benchmarks/utils/constants.py b/benchmarks/utils/constants.py index 124eb041..9337b847 100644 --- a/benchmarks/utils/constants.py +++ b/benchmarks/utils/constants.py @@ -1,31 +1,2 @@ -""" -Shared constants for all benchmarks. - -This module contains default values used across multiple benchmarks. -Benchmark-specific constants should be defined in their own constants.py files. -""" - -from typing import Final - - -# Output -OUTPUT_FILENAME: Final[str] = "output.jsonl" - -# Docker -EVAL_AGENT_SERVER_IMAGE: Final[str] = "ghcr.io/openhands/eval-agent-server" - -# Workspace -DEFAULT_WORKSPACE: Final[str] = "remote" -DEFAULT_SPLIT: Final[str] = "test" - -# Evaluation -DEFAULT_MAX_ITERATIONS: Final[int] = 100 -DEFAULT_NUM_EVAL_WORKERS: Final[int] = 1 -DEFAULT_OUTPUT_DIR: Final[str] = "./eval_outputs" -DEFAULT_MAX_ATTEMPTS: Final[int] = 3 -DEFAULT_MAX_RETRIES: Final[int] = 3 -DEFAULT_NOTE: Final[str] = "initial" -DEFAULT_N_LIMIT: Final[int] = 0 - -# Critic -DEFAULT_CRITIC: Final[str] = "pass" +OUTPUT_FILENAME = "output.jsonl" +EVAL_AGENT_SERVER_IMAGE = "ghcr.io/openhands/eval-agent-server" diff --git a/benchmarks/utils/critics.py b/benchmarks/utils/critics.py index b97083a1..af9c55ae 100644 --- a/benchmarks/utils/critics.py +++ b/benchmarks/utils/critics.py @@ -11,7 +11,6 @@ from pathlib import Path from typing import Set -from benchmarks.utils import constants from benchmarks.utils.models import EvalInstanceID, EvalOutput from openhands.sdk import get_logger from openhands.sdk.critic import ( @@ -38,9 +37,9 @@ def add_critic_args(parser: ArgumentParser) -> None: parser.add_argument( "--critic", type=str, - default=constants.DEFAULT_CRITIC, + default="pass", help=( - f"Name of the critic to use for evaluation (default: '{constants.DEFAULT_CRITIC}'). " + "Name of the critic to use for evaluation (default: 'pass'). " "Critics determine whether an agent's output is considered successful " "and whether another attempt should be made in iterative evaluation mode. " "Available critics: "