From 545be812a29c0f6afda1fb372b598fd3c77f69ec Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 27 Jan 2026 18:11:56 +0000
Subject: [PATCH 1/6] Regroup all swebench hyperparameters in a single source
 of truth constants.py

This commit creates a new constants.py module in benchmarks/swebench that
serves as the single source of truth for all constant values used in the
SWE-Bench evaluation workflow.

Changes:
- Create benchmarks/swebench/constants.py with all constant values:
  - Dataset configuration (DEFAULT_DATASET, DEFAULT_SPLIT)
  - Docker image configuration (DOCKER_IMAGE_PREFIX, DOCKER_IMAGE_TAG)
  - Build configuration (BUILD_TARGET_*, DEFAULT_BUILD_TARGET)
  - Runtime configuration (DEFAULT_RUNTIME_API_URL, DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT)
  - Evaluation configuration (DEFAULT_MAX_ITERATIONS, DEFAULT_NUM_WORKERS, etc.)
  - Model configuration (DEFAULT_MODEL_NAME)
  - Git configuration (GIT_USER_EMAIL, GIT_USER_NAME, GIT_COMMIT_MESSAGE)
  - Patch processing (SETUP_FILES_TO_REMOVE)

- Update run_infer.py, eval_infer.py, and build_images.py to import and use
  constants from the constants module

Fixes #348

Co-authored-by: openhands <openhands@all-hands.dev>
---
 benchmarks/swebench/build_images.py |  7 +++-
 benchmarks/swebench/constants.py    | 60 +++++++++++++++++++++++++++++
 benchmarks/swebench/eval_infer.py   | 25 ++++++------
 benchmarks/swebench/run_infer.py    | 22 +++++++----
 4 files changed, 93 insertions(+), 21 deletions(-)
 create mode 100644 benchmarks/swebench/constants.py

diff --git a/benchmarks/swebench/build_images.py b/benchmarks/swebench/build_images.py
index cc0ae6b9..b6849581 100644
--- a/benchmarks/swebench/build_images.py
+++ b/benchmarks/swebench/build_images.py
@@ -12,6 +12,7 @@
 import sys
 from pathlib import Path
 
+from benchmarks.swebench import constants
 from benchmarks.utils.build_utils import (
     BuildOutput,
     build_all_images,
@@ -32,13 +33,15 @@
 
 def get_official_docker_image(
     instance_id: str,
-    docker_image_prefix="docker.io/swebench/",
+    docker_image_prefix: str = constants.DOCKER_IMAGE_PREFIX,
 ) -> str:
     # Official SWE-Bench image
     # swebench/sweb.eval.x86_64.django_1776_django-11333:v1
     repo, name = instance_id.split("__")
     official_image_name = docker_image_prefix.rstrip("/")
-    official_image_name += f"/sweb.eval.x86_64.{repo}_1776_{name}:latest".lower()
+    official_image_name += (
+        f"/sweb.eval.x86_64.{repo}_1776_{name}:{constants.DOCKER_IMAGE_TAG}".lower()
+    )
     logger.debug(f"Official SWE-Bench image: {official_image_name}")
     return official_image_name
 
diff --git a/benchmarks/swebench/constants.py b/benchmarks/swebench/constants.py
new file mode 100644
index 00000000..1206b507
--- /dev/null
+++ b/benchmarks/swebench/constants.py
@@ -0,0 +1,60 @@
+"""
+SWE-Bench hyperparameters and constant values.
+
+This module serves as the single source of truth for all constant values
+used in the SWE-Bench evaluation workflow.
+"""
+
+# =============================================================================
+# Dataset Configuration
+# =============================================================================
+DEFAULT_DATASET = "princeton-nlp/SWE-bench_Verified"
+DEFAULT_SPLIT = "test"
+
+# =============================================================================
+# Docker Image Configuration
+# =============================================================================
+DOCKER_IMAGE_PREFIX = "docker.io/swebench/"
+DOCKER_IMAGE_TAG = "latest"
+
+# =============================================================================
+# Build Configuration
+# =============================================================================
+BUILD_TARGET_SOURCE_MINIMAL = "source-minimal"
+BUILD_TARGET_BINARY = "binary"
+DEFAULT_BUILD_TARGET = BUILD_TARGET_SOURCE_MINIMAL
+
+# =============================================================================
+# Runtime Configuration
+# =============================================================================
+DEFAULT_RUNTIME_API_URL = "https://runtime.eval.all-hands.dev"
+DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT = "600"
+
+# =============================================================================
+# Evaluation Configuration
+# =============================================================================
+DEFAULT_MAX_ITERATIONS = 100
+DEFAULT_NUM_WORKERS = 1
+DEFAULT_MAX_ATTEMPTS = 3
+DEFAULT_MAX_RETRIES = 3
+DEFAULT_EVAL_WORKERS = "12"
+DEFAULT_N_LIMIT = 0
+DEFAULT_NOTE = "initial"
+DEFAULT_OUTPUT_DIR = "./eval_outputs"
+
+# =============================================================================
+# Model Configuration
+# =============================================================================
+DEFAULT_MODEL_NAME = "openhands"
+
+# =============================================================================
+# Git Configuration
+# =============================================================================
+GIT_USER_EMAIL = "evaluation@openhands.dev"
+GIT_USER_NAME = "OpenHands Evaluation"
+GIT_COMMIT_MESSAGE = "patch"
+
+# =============================================================================
+# Patch Processing
+# =============================================================================
+SETUP_FILES_TO_REMOVE = ["pyproject.toml", "tox.ini", "setup.py"]
diff --git a/benchmarks/swebench/eval_infer.py b/benchmarks/swebench/eval_infer.py
index f252a56a..c3ae28f9 100644
--- a/benchmarks/swebench/eval_infer.py
+++ b/benchmarks/swebench/eval_infer.py
@@ -16,6 +16,7 @@
 import sys
 from pathlib import Path
 
+from benchmarks.swebench import constants
 from benchmarks.utils.laminar import LaminarService
 from benchmarks.utils.patch_utils import remove_files_from_patch
 from benchmarks.utils.report_costs import generate_cost_report
@@ -26,7 +27,7 @@
 
 
 def convert_to_swebench_format(
-    input_file: str, output_file: str, model_name: str = "OpenHands"
+    input_file: str, output_file: str, model_name: str = constants.DEFAULT_MODEL_NAME
 ) -> None:
     """
     Convert OpenHands output.jsonl to SWE-Bench prediction format.
@@ -82,8 +83,9 @@ def convert_to_swebench_format(
                     git_patch = ""
 
                 # postprocess git_patch
-                setup_files = ["pyproject.toml", "tox.ini", "setup.py"]
-                git_patch = remove_files_from_patch(git_patch, setup_files)
+                git_patch = remove_files_from_patch(
+                    git_patch, constants.SETUP_FILES_TO_REMOVE
+                )
 
                 # Create SWE-Bench format entry
                 swebench_entry = {
@@ -114,8 +116,8 @@ def convert_to_swebench_format(
 
 def run_swebench_evaluation(
     predictions_file: str,
-    dataset: str = "princeton-nlp/SWE-bench_Verified",
-    workers: str = "12",
+    dataset: str = constants.DEFAULT_DATASET,
+    workers: str = constants.DEFAULT_EVAL_WORKERS,
 ) -> None:
     """
     Run SWE-Bench evaluation on the predictions file.
@@ -196,9 +198,8 @@ def main() -> None:
 
     parser.add_argument(
         "--dataset",
-        default="princeton-nlp/SWE-bench_Verified",
-        help="SWE-Bench dataset to evaluate against "
-        "(default: princeton-nlp/SWE-bench_Verified)",
+        default=constants.DEFAULT_DATASET,
+        help=f"SWE-Bench dataset to evaluate against (default: {constants.DEFAULT_DATASET})",
     )
 
     parser.add_argument(
@@ -215,14 +216,14 @@ def main() -> None:
 
     parser.add_argument(
         "--model-name",
-        default="openhands",
-        help="Model name to use in the model_name_or_path field (default: openhands)",
+        default=constants.DEFAULT_MODEL_NAME,
+        help=f"Model name to use in the model_name_or_path field (default: {constants.DEFAULT_MODEL_NAME})",
     )
 
     parser.add_argument(
         "--workers",
-        default="12",
-        help="Number of workers to use when evaluating",
+        default=constants.DEFAULT_EVAL_WORKERS,
+        help=f"Number of workers to use when evaluating (default: {constants.DEFAULT_EVAL_WORKERS})",
     )
 
     args = parser.parse_args()
diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py
index 77faafd5..8c9050b6 100644
--- a/benchmarks/swebench/run_infer.py
+++ b/benchmarks/swebench/run_infer.py
@@ -4,6 +4,7 @@
 
 from jinja2 import Environment, FileSystemLoader
 
+from benchmarks.swebench import constants
 from benchmarks.swebench.build_images import (
     extract_custom_tag,
     get_official_docker_image,
@@ -114,10 +115,12 @@ def prepare_workspace(
                            Used by APIRemoteWorkspace for remote runtime allocation.
         """
         official_docker_image = get_official_docker_image(instance.id)
-        build_target = "source-minimal"
+        build_target = constants.DEFAULT_BUILD_TARGET
         custom_tag = extract_custom_tag(official_docker_image)
         # For non-binary targets, append target suffix
-        suffix = f"-{build_target}" if build_target != "binary" else ""
+        suffix = (
+            f"-{build_target}" if build_target != constants.BUILD_TARGET_BINARY else ""
+        )
         base_agent_image = (
             f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}"
         )
@@ -183,10 +186,15 @@ def prepare_workspace(
                 f"Using remote workspace with image {agent_server_image} "
                 f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})"
             )
-            startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600"))
+            startup_timeout = float(
+                os.getenv(
+                    "REMOTE_RUNTIME_STARTUP_TIMEOUT",
+                    constants.DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT,
+                )
+            )
             workspace = APIRemoteWorkspace(
                 runtime_api_url=os.getenv(
-                    "RUNTIME_API_URL", "https://runtime.eval.all-hands.dev"
+                    "RUNTIME_API_URL", constants.DEFAULT_RUNTIME_API_URL
                 ),
                 runtime_api_key=runtime_api_key,
                 server_image=agent_server_image,
@@ -280,9 +288,9 @@ def evaluate_instance(
         # Use --no-verify to bypass pre-commit hooks (e.g., husky) that can fail
         workspace.execute_command(
             f"cd {repo_path} && "
-            "git config --global user.email 'evaluation@openhands.dev' && "
-            "git config --global user.name 'OpenHands Evaluation' && "
-            "git commit --no-verify -m 'patch'"
+            f"git config --global user.email '{constants.GIT_USER_EMAIL}' && "
+            f"git config --global user.name '{constants.GIT_USER_NAME}' && "
+            f"git commit --no-verify -m '{constants.GIT_COMMIT_MESSAGE}'"
         )
 
         # Get git patch

From 1bc9344d303a2e86b866e88b41bbd0471a3ca185 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Wed, 28 Jan 2026 13:35:47 +0000
Subject: [PATCH 2/6] refactor: clean up constants.py and fix behavior-altering
 issues

- Fix DEFAULT_MODEL_NAME case: 'openhands' -> 'OpenHands' to match original function default
- Move WRAPPED_REPOS from build_images.py to constants.py
- Remove unused constants: DEFAULT_SPLIT, DEFAULT_MAX_ITERATIONS, DEFAULT_NUM_WORKERS,
  DEFAULT_MAX_ATTEMPTS, DEFAULT_MAX_RETRIES, DEFAULT_N_LIMIT, DEFAULT_NOTE, DEFAULT_OUTPUT_DIR
- Fix type: DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT is now int (600) instead of string

Co-authored-by: openhands <openhands@all-hands.dev>
---
 benchmarks/swebench/build_images.py |  6 ++----
 benchmarks/swebench/constants.py    | 14 ++++----------
 benchmarks/swebench/run_infer.py    |  2 +-
 3 files changed, 7 insertions(+), 15 deletions(-)

diff --git a/benchmarks/swebench/build_images.py b/benchmarks/swebench/build_images.py
index b6849581..2041ed58 100644
--- a/benchmarks/swebench/build_images.py
+++ b/benchmarks/swebench/build_images.py
@@ -27,8 +27,6 @@
 
 logger = get_logger(__name__)
 WRAPPER_DOCKERFILE = Path(__file__).with_name("Dockerfile.swebench-deps")
-# Repos that require the docutils/roman wrapper layer
-WRAPPED_REPOS = {"sphinx-doc"}
 
 
 def get_official_docker_image(
@@ -63,12 +61,12 @@ def should_wrap_custom_tag(custom_tag: str) -> bool:
     prefix = "sweb.eval.x86_64."
     if custom_tag.startswith(prefix):
         custom_tag = custom_tag[len(prefix) :]
-    return custom_tag.split("_", 1)[0] in WRAPPED_REPOS
+    return custom_tag.split("_", 1)[0] in constants.WRAPPED_REPOS
 
 
 def should_wrap_instance_id(instance_id: str) -> bool:
     repo = instance_id.split("__")[0]
-    return repo in WRAPPED_REPOS
+    return repo in constants.WRAPPED_REPOS
 
 
 def collect_unique_base_images(
diff --git a/benchmarks/swebench/constants.py b/benchmarks/swebench/constants.py
index 1206b507..a9c58891 100644
--- a/benchmarks/swebench/constants.py
+++ b/benchmarks/swebench/constants.py
@@ -9,13 +9,14 @@
 # Dataset Configuration
 # =============================================================================
 DEFAULT_DATASET = "princeton-nlp/SWE-bench_Verified"
-DEFAULT_SPLIT = "test"
 
 # =============================================================================
 # Docker Image Configuration
 # =============================================================================
 DOCKER_IMAGE_PREFIX = "docker.io/swebench/"
 DOCKER_IMAGE_TAG = "latest"
+# Repos that require the docutils/roman wrapper layer
+WRAPPED_REPOS = {"sphinx-doc"}
 
 # =============================================================================
 # Build Configuration
@@ -28,24 +29,17 @@
 # Runtime Configuration
 # =============================================================================
 DEFAULT_RUNTIME_API_URL = "https://runtime.eval.all-hands.dev"
-DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT = "600"
+DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT = 600
 
 # =============================================================================
 # Evaluation Configuration
 # =============================================================================
-DEFAULT_MAX_ITERATIONS = 100
-DEFAULT_NUM_WORKERS = 1
-DEFAULT_MAX_ATTEMPTS = 3
-DEFAULT_MAX_RETRIES = 3
 DEFAULT_EVAL_WORKERS = "12"
-DEFAULT_N_LIMIT = 0
-DEFAULT_NOTE = "initial"
-DEFAULT_OUTPUT_DIR = "./eval_outputs"
 
 # =============================================================================
 # Model Configuration
 # =============================================================================
-DEFAULT_MODEL_NAME = "openhands"
+DEFAULT_MODEL_NAME = "OpenHands"
 
 # =============================================================================
 # Git Configuration
diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py
index 8c9050b6..e19f0877 100644
--- a/benchmarks/swebench/run_infer.py
+++ b/benchmarks/swebench/run_infer.py
@@ -189,7 +189,7 @@ def prepare_workspace(
             startup_timeout = float(
                 os.getenv(
                     "REMOTE_RUNTIME_STARTUP_TIMEOUT",
-                    constants.DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT,
+                    str(constants.DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT),
                 )
             )
             workspace = APIRemoteWorkspace(

From f1447e5c7a085bfdb5544cddf07d2fac333512cf Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Wed, 28 Jan 2026 14:48:37 +0000
Subject: [PATCH 3/6] refactor: improve constants.py with type safety and
 behavior preservation

- Fix breaking change: add DEFAULT_CLI_MODEL_NAME to preserve original CLI default ('openhands')
- Use typing.Final for all constants to indicate immutability
- Use frozenset for WRAPPED_REPOS (immutable)
- Use tuple for SETUP_FILES_TO_REMOVE (immutable)
- Change DEFAULT_EVAL_WORKERS to an int and update the run_swebench_evaluation() annotation to match
- Simplify section headers for cleaner code
- Add type=int to --workers argparse argument for proper type handling

Co-authored-by: openhands <openhands@all-hands.dev>
---
 benchmarks/swebench/constants.py  | 85 ++++++++++++++-----------------
 benchmarks/swebench/eval_infer.py |  7 +--
 2 files changed, 43 insertions(+), 49 deletions(-)

diff --git a/benchmarks/swebench/constants.py b/benchmarks/swebench/constants.py
index a9c58891..21312293 100644
--- a/benchmarks/swebench/constants.py
+++ b/benchmarks/swebench/constants.py
@@ -5,50 +5,43 @@
 used in the SWE-Bench evaluation workflow.
 """
 
-# =============================================================================
-# Dataset Configuration
-# =============================================================================
-DEFAULT_DATASET = "princeton-nlp/SWE-bench_Verified"
-
-# =============================================================================
-# Docker Image Configuration
-# =============================================================================
-DOCKER_IMAGE_PREFIX = "docker.io/swebench/"
-DOCKER_IMAGE_TAG = "latest"
-# Repos that require the docutils/roman wrapper layer
-WRAPPED_REPOS = {"sphinx-doc"}
-
-# =============================================================================
-# Build Configuration
-# =============================================================================
-BUILD_TARGET_SOURCE_MINIMAL = "source-minimal"
-BUILD_TARGET_BINARY = "binary"
-DEFAULT_BUILD_TARGET = BUILD_TARGET_SOURCE_MINIMAL
-
-# =============================================================================
-# Runtime Configuration
-# =============================================================================
-DEFAULT_RUNTIME_API_URL = "https://runtime.eval.all-hands.dev"
-DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT = 600
-
-# =============================================================================
-# Evaluation Configuration
-# =============================================================================
-DEFAULT_EVAL_WORKERS = "12"
-
-# =============================================================================
-# Model Configuration
-# =============================================================================
-DEFAULT_MODEL_NAME = "OpenHands"
-
-# =============================================================================
-# Git Configuration
-# =============================================================================
-GIT_USER_EMAIL = "evaluation@openhands.dev"
-GIT_USER_NAME = "OpenHands Evaluation"
-GIT_COMMIT_MESSAGE = "patch"
-
-# =============================================================================
+from typing import Final
+
+
+# Dataset
+DEFAULT_DATASET: Final[str] = "princeton-nlp/SWE-bench_Verified"
+
+# Docker
+DOCKER_IMAGE_PREFIX: Final[str] = "docker.io/swebench/"
+DOCKER_IMAGE_TAG: Final[str] = "latest"
+WRAPPED_REPOS: Final[frozenset[str]] = frozenset(
+    {"sphinx-doc"}
+)  # Repos requiring docutils/roman wrapper
+
+# Build
+BUILD_TARGET_SOURCE_MINIMAL: Final[str] = "source-minimal"
+BUILD_TARGET_BINARY: Final[str] = "binary"
+DEFAULT_BUILD_TARGET: Final[str] = BUILD_TARGET_SOURCE_MINIMAL
+
+# Runtime
+DEFAULT_RUNTIME_API_URL: Final[str] = "https://runtime.eval.all-hands.dev"
+DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT: Final[int] = 600
+
+# Evaluation
+DEFAULT_EVAL_WORKERS: Final[int] = 12
+
+# Model - preserves original behavior: convert_to_swebench_format() defaults to "OpenHands", the --model-name CLI flag to "openhands"
+DEFAULT_MODEL_NAME: Final[str] = "OpenHands"
+DEFAULT_CLI_MODEL_NAME: Final[str] = "openhands"
+
+# Git
+GIT_USER_EMAIL: Final[str] = "evaluation@openhands.dev"
+GIT_USER_NAME: Final[str] = "OpenHands Evaluation"
+GIT_COMMIT_MESSAGE: Final[str] = "patch"
+
 # Patch Processing
-# =============================================================================
-SETUP_FILES_TO_REMOVE = ["pyproject.toml", "tox.ini", "setup.py"]
+SETUP_FILES_TO_REMOVE: Final[tuple[str, ...]] = (
+    "pyproject.toml",
+    "tox.ini",
+    "setup.py",
+)
diff --git a/benchmarks/swebench/eval_infer.py b/benchmarks/swebench/eval_infer.py
index c3ae28f9..b1c5ee69 100644
--- a/benchmarks/swebench/eval_infer.py
+++ b/benchmarks/swebench/eval_infer.py
@@ -117,7 +117,7 @@ def convert_to_swebench_format(
 def run_swebench_evaluation(
     predictions_file: str,
     dataset: str = constants.DEFAULT_DATASET,
-    workers: str = constants.DEFAULT_EVAL_WORKERS,
+    workers: int = constants.DEFAULT_EVAL_WORKERS,
 ) -> None:
     """
     Run SWE-Bench evaluation on the predictions file.
@@ -216,12 +216,13 @@ def main() -> None:
 
     parser.add_argument(
         "--model-name",
-        default=constants.DEFAULT_MODEL_NAME,
-        help=f"Model name to use in the model_name_or_path field (default: {constants.DEFAULT_MODEL_NAME})",
+        default=constants.DEFAULT_CLI_MODEL_NAME,
+        help=f"Model name to use in the model_name_or_path field (default: {constants.DEFAULT_CLI_MODEL_NAME})",
     )
 
     parser.add_argument(
         "--workers",
+        type=int,
         default=constants.DEFAULT_EVAL_WORKERS,
         help=f"Number of workers to use when evaluating (default: {constants.DEFAULT_EVAL_WORKERS})",
     )

From fee639d8b0000f8b917b3df359ab0a1ccf94f609 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Wed, 28 Jan 2026 15:17:22 +0000
Subject: [PATCH 4/6] fix: use proper Literal type for build target constants

The pyright type checker was failing because DEFAULT_BUILD_TARGET was
typed as Final[str], but build_image() expects a TargetType, which is
Literal['binary', 'binary-minimal', 'source', 'source-minimal'].

This fix adds a local TargetType alias and properly types the build
target constants to match the expected type signature.
---
 benchmarks/swebench/constants.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/benchmarks/swebench/constants.py b/benchmarks/swebench/constants.py
index 21312293..88d795c8 100644
--- a/benchmarks/swebench/constants.py
+++ b/benchmarks/swebench/constants.py
@@ -5,7 +5,7 @@
 used in the SWE-Bench evaluation workflow.
 """
 
-from typing import Final
+from typing import Final, Literal
 
 
 # Dataset
@@ -18,10 +18,11 @@
     {"sphinx-doc"}
 )  # Repos requiring docutils/roman wrapper
 
-# Build
-BUILD_TARGET_SOURCE_MINIMAL: Final[str] = "source-minimal"
-BUILD_TARGET_BINARY: Final[str] = "binary"
-DEFAULT_BUILD_TARGET: Final[str] = BUILD_TARGET_SOURCE_MINIMAL
+# Build target type (matches openhands.agent_server.docker.build.TargetType)
+TargetType = Literal["binary", "binary-minimal", "source", "source-minimal"]
+BUILD_TARGET_SOURCE_MINIMAL: Final[TargetType] = "source-minimal"
+BUILD_TARGET_BINARY: Final[TargetType] = "binary"
+DEFAULT_BUILD_TARGET: Final[TargetType] = BUILD_TARGET_SOURCE_MINIMAL
 
 # Runtime
 DEFAULT_RUNTIME_API_URL: Final[str] = "https://runtime.eval.all-hands.dev"

From 214c3abdfe4272ada5045f648dc3ac006cae9b41 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Wed, 28 Jan 2026 16:42:08 +0000
Subject: [PATCH 5/6] refactor: centralize CLI argument defaults in
 utils/constants.py

- Remove hardcoded dataset default from args_parser.py (now None)
- Each benchmark sets its own dataset default via parser.set_defaults()
- Add shared constants to utils/constants.py:
  - DEFAULT_WORKSPACE='remote' (behavior change from 'docker')
  - DEFAULT_SPLIT, DEFAULT_MAX_ITERATIONS, DEFAULT_NUM_EVAL_WORKERS
  - DEFAULT_OUTPUT_DIR, DEFAULT_MAX_ATTEMPTS, DEFAULT_MAX_RETRIES
  - DEFAULT_NOTE, DEFAULT_N_LIMIT, DEFAULT_CRITIC
- Update args_parser.py to use constants for all defaults
- Update critics.py to use DEFAULT_CRITIC constant
- Update swebench/run_infer.py to set its own dataset default

Co-authored-by: openhands <openhands@all-hands.dev>
---
 benchmarks/swebench/run_infer.py |  2 ++
 benchmarks/utils/args_parser.py  | 54 ++++++++++++++++++++++----------
 benchmarks/utils/constants.py    | 33 +++++++++++++++++--
 benchmarks/utils/critics.py      |  5 +--
 4 files changed, 73 insertions(+), 21 deletions(-)

diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py
index e19f0877..942dbbbb 100644
--- a/benchmarks/swebench/run_infer.py
+++ b/benchmarks/swebench/run_infer.py
@@ -334,6 +334,8 @@ def main() -> None:
         choices=choices,
         help="Path to prompt template file",
     )
+    # Set SWE-bench specific default dataset
+    parser.set_defaults(dataset=constants.DEFAULT_DATASET)
     args = parser.parse_args()
 
     # Validate max_attempts
diff --git a/benchmarks/utils/args_parser.py b/benchmarks/utils/args_parser.py
index 60f08d73..4391054e 100644
--- a/benchmarks/utils/args_parser.py
+++ b/benchmarks/utils/args_parser.py
@@ -1,15 +1,19 @@
 """
-Argument parsing utilities for SWE-bench benchmarks.
+Argument parsing utilities for benchmarks.
 """
 
 import argparse
 
+from benchmarks.utils import constants
 from benchmarks.utils.critics import add_critic_args
 
 
 def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser:
     """Create and return argument parser.
 
+    Note: --dataset has no default. Each benchmark should set its own default
+    using parser.set_defaults(dataset=<benchmark_specific_constant>).
+
     Returns:
         ArgumentParser instance
     """
@@ -23,41 +27,57 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser:
     parser.add_argument(
         "--dataset",
         type=str,
-        default="princeton-nlp/SWE-bench_Verified",
-        help="Dataset name",
+        default=None,
+        help="Dataset name (required unless benchmark provides default)",
+    )
+    parser.add_argument(
+        "--split",
+        type=str,
+        default=constants.DEFAULT_SPLIT,
+        help=f"Dataset split (default: {constants.DEFAULT_SPLIT})",
     )
-    parser.add_argument("--split", type=str, default="test", help="Dataset split")
     parser.add_argument(
         "--workspace",
         type=str,
-        default="docker",
+        default=constants.DEFAULT_WORKSPACE,
         choices=["docker", "remote"],
-        help="Type of workspace to use (default: docker)",
+        help=f"Type of workspace to use (default: {constants.DEFAULT_WORKSPACE})",
     )
     parser.add_argument(
-        "--max-iterations", type=int, default=100, help="Maximum iterations"
+        "--max-iterations",
+        type=int,
+        default=constants.DEFAULT_MAX_ITERATIONS,
+        help=f"Maximum iterations (default: {constants.DEFAULT_MAX_ITERATIONS})",
     )
     parser.add_argument(
-        "--num-workers", type=int, default=1, help="Number of evaluation workers"
+        "--num-workers",
+        type=int,
+        default=constants.DEFAULT_NUM_EVAL_WORKERS,
+        help=f"Number of evaluation workers (default: {constants.DEFAULT_NUM_EVAL_WORKERS})",
+    )
+    parser.add_argument(
+        "--note",
+        type=str,
+        default=constants.DEFAULT_NOTE,
+        help=f"Evaluation note (default: {constants.DEFAULT_NOTE})",
     )
-    parser.add_argument("--note", type=str, default="initial", help="Evaluation note")
     parser.add_argument(
         "--output-dir",
         type=str,
-        default="./eval_outputs",
-        help="Evaluation output directory",
+        default=constants.DEFAULT_OUTPUT_DIR,
+        help=f"Evaluation output directory (default: {constants.DEFAULT_OUTPUT_DIR})",
     )
     parser.add_argument(
         "--n-limit",
         type=int,
-        default=0,
-        help="Limit number of instances to evaluate",
+        default=constants.DEFAULT_N_LIMIT,
+        help=f"Limit number of instances to evaluate (default: {constants.DEFAULT_N_LIMIT})",
     )
     parser.add_argument(
         "--max-attempts",
         type=int,
-        default=3,
-        help="Maximum number of attempts for iterative mode (default: 3, min: 1)",
+        default=constants.DEFAULT_MAX_ATTEMPTS,
+        help=f"Maximum number of attempts for iterative mode (default: {constants.DEFAULT_MAX_ATTEMPTS}, min: 1)",
     )
 
     # Add critic arguments
@@ -72,7 +92,7 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser:
     parser.add_argument(
         "--max-retries",
         type=int,
-        default=3,
-        help="Maximum retries for instances that throw exceptions (default: 3)",
+        default=constants.DEFAULT_MAX_RETRIES,
+        help=f"Maximum retries for instances that throw exceptions (default: {constants.DEFAULT_MAX_RETRIES})",
     )
     return parser
diff --git a/benchmarks/utils/constants.py b/benchmarks/utils/constants.py
index 9337b847..124eb041 100644
--- a/benchmarks/utils/constants.py
+++ b/benchmarks/utils/constants.py
@@ -1,2 +1,31 @@
-OUTPUT_FILENAME = "output.jsonl"
-EVAL_AGENT_SERVER_IMAGE = "ghcr.io/openhands/eval-agent-server"
+"""
+Shared constants for all benchmarks.
+
+This module contains default values used across multiple benchmarks.
+Benchmark-specific constants should be defined in their own constants.py files.
+"""
+
+from typing import Final
+
+
+# Output
+OUTPUT_FILENAME: Final[str] = "output.jsonl"
+
+# Docker
+EVAL_AGENT_SERVER_IMAGE: Final[str] = "ghcr.io/openhands/eval-agent-server"
+
+# Workspace
+DEFAULT_WORKSPACE: Final[str] = "remote"
+DEFAULT_SPLIT: Final[str] = "test"
+
+# Evaluation
+DEFAULT_MAX_ITERATIONS: Final[int] = 100
+DEFAULT_NUM_EVAL_WORKERS: Final[int] = 1
+DEFAULT_OUTPUT_DIR: Final[str] = "./eval_outputs"
+DEFAULT_MAX_ATTEMPTS: Final[int] = 3
+DEFAULT_MAX_RETRIES: Final[int] = 3
+DEFAULT_NOTE: Final[str] = "initial"
+DEFAULT_N_LIMIT: Final[int] = 0
+
+# Critic
+DEFAULT_CRITIC: Final[str] = "pass"
diff --git a/benchmarks/utils/critics.py b/benchmarks/utils/critics.py
index af9c55ae..b97083a1 100644
--- a/benchmarks/utils/critics.py
+++ b/benchmarks/utils/critics.py
@@ -11,6 +11,7 @@
 from pathlib import Path
 from typing import Set
 
+from benchmarks.utils import constants
 from benchmarks.utils.models import EvalInstanceID, EvalOutput
 from openhands.sdk import get_logger
 from openhands.sdk.critic import (
@@ -37,9 +38,9 @@ def add_critic_args(parser: ArgumentParser) -> None:
     parser.add_argument(
         "--critic",
         type=str,
-        default="pass",
+        default=constants.DEFAULT_CRITIC,
         help=(
-            "Name of the critic to use for evaluation (default: 'pass'). "
+            f"Name of the critic to use for evaluation (default: '{constants.DEFAULT_CRITIC}'). "
             "Critics determine whether an agent's output is considered successful "
             "and whether another attempt should be made in iterative evaluation mode. "
             "Available critics: "

From 76069a935c89012cc636cc73ebc0d4d3b03f9618 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Wed, 28 Jan 2026 16:50:31 +0000
Subject: [PATCH 6/6] Revert "refactor: centralize CLI argument defaults in
 utils/constants.py"

This reverts commit 214c3abdfe4272ada5045f648dc3ac006cae9b41.
---
 benchmarks/swebench/run_infer.py |  2 --
 benchmarks/utils/args_parser.py  | 54 ++++++++++----------------------
 benchmarks/utils/constants.py    | 33 ++-----------------
 benchmarks/utils/critics.py      |  5 ++-
 4 files changed, 21 insertions(+), 73 deletions(-)

diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py
index 942dbbbb..e19f0877 100644
--- a/benchmarks/swebench/run_infer.py
+++ b/benchmarks/swebench/run_infer.py
@@ -334,8 +334,6 @@ def main() -> None:
         choices=choices,
         help="Path to prompt template file",
     )
-    # Set SWE-bench specific default dataset
-    parser.set_defaults(dataset=constants.DEFAULT_DATASET)
     args = parser.parse_args()
 
     # Validate max_attempts
diff --git a/benchmarks/utils/args_parser.py b/benchmarks/utils/args_parser.py
index 4391054e..60f08d73 100644
--- a/benchmarks/utils/args_parser.py
+++ b/benchmarks/utils/args_parser.py
@@ -1,19 +1,15 @@
 """
-Argument parsing utilities for benchmarks.
+Argument parsing utilities for SWE-bench benchmarks.
 """
 
 import argparse
 
-from benchmarks.utils import constants
 from benchmarks.utils.critics import add_critic_args
 
 
 def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser:
     """Create and return argument parser.
 
-    Note: --dataset has no default. Each benchmark should set its own default
-    using parser.set_defaults(dataset=<benchmark_specific_constant>).
-
     Returns:
         ArgumentParser instance
     """
@@ -27,57 +23,41 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser:
     parser.add_argument(
         "--dataset",
         type=str,
-        default=None,
-        help="Dataset name (required unless benchmark provides default)",
-    )
-    parser.add_argument(
-        "--split",
-        type=str,
-        default=constants.DEFAULT_SPLIT,
-        help=f"Dataset split (default: {constants.DEFAULT_SPLIT})",
+        default="princeton-nlp/SWE-bench_Verified",
+        help="Dataset name",
     )
+    parser.add_argument("--split", type=str, default="test", help="Dataset split")
     parser.add_argument(
         "--workspace",
         type=str,
-        default=constants.DEFAULT_WORKSPACE,
+        default="docker",
         choices=["docker", "remote"],
-        help=f"Type of workspace to use (default: {constants.DEFAULT_WORKSPACE})",
+        help="Type of workspace to use (default: docker)",
     )
     parser.add_argument(
-        "--max-iterations",
-        type=int,
-        default=constants.DEFAULT_MAX_ITERATIONS,
-        help=f"Maximum iterations (default: {constants.DEFAULT_MAX_ITERATIONS})",
+        "--max-iterations", type=int, default=100, help="Maximum iterations"
     )
     parser.add_argument(
-        "--num-workers",
-        type=int,
-        default=constants.DEFAULT_NUM_EVAL_WORKERS,
-        help=f"Number of evaluation workers (default: {constants.DEFAULT_NUM_EVAL_WORKERS})",
-    )
-    parser.add_argument(
-        "--note",
-        type=str,
-        default=constants.DEFAULT_NOTE,
-        help=f"Evaluation note (default: {constants.DEFAULT_NOTE})",
+        "--num-workers", type=int, default=1, help="Number of evaluation workers"
     )
+    parser.add_argument("--note", type=str, default="initial", help="Evaluation note")
     parser.add_argument(
         "--output-dir",
         type=str,
-        default=constants.DEFAULT_OUTPUT_DIR,
-        help=f"Evaluation output directory (default: {constants.DEFAULT_OUTPUT_DIR})",
+        default="./eval_outputs",
+        help="Evaluation output directory",
     )
     parser.add_argument(
         "--n-limit",
         type=int,
-        default=constants.DEFAULT_N_LIMIT,
-        help=f"Limit number of instances to evaluate (default: {constants.DEFAULT_N_LIMIT})",
+        default=0,
+        help="Limit number of instances to evaluate",
     )
     parser.add_argument(
         "--max-attempts",
         type=int,
-        default=constants.DEFAULT_MAX_ATTEMPTS,
-        help=f"Maximum number of attempts for iterative mode (default: {constants.DEFAULT_MAX_ATTEMPTS}, min: 1)",
+        default=3,
+        help="Maximum number of attempts for iterative mode (default: 3, min: 1)",
     )
 
     # Add critic arguments
@@ -92,7 +72,7 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser:
     parser.add_argument(
         "--max-retries",
         type=int,
-        default=constants.DEFAULT_MAX_RETRIES,
-        help=f"Maximum retries for instances that throw exceptions (default: {constants.DEFAULT_MAX_RETRIES})",
+        default=3,
+        help="Maximum retries for instances that throw exceptions (default: 3)",
     )
     return parser
diff --git a/benchmarks/utils/constants.py b/benchmarks/utils/constants.py
index 124eb041..9337b847 100644
--- a/benchmarks/utils/constants.py
+++ b/benchmarks/utils/constants.py
@@ -1,31 +1,2 @@
-"""
-Shared constants for all benchmarks.
-
-This module contains default values used across multiple benchmarks.
-Benchmark-specific constants should be defined in their own constants.py files.
-"""
-
-from typing import Final
-
-
-# Output
-OUTPUT_FILENAME: Final[str] = "output.jsonl"
-
-# Docker
-EVAL_AGENT_SERVER_IMAGE: Final[str] = "ghcr.io/openhands/eval-agent-server"
-
-# Workspace
-DEFAULT_WORKSPACE: Final[str] = "remote"
-DEFAULT_SPLIT: Final[str] = "test"
-
-# Evaluation
-DEFAULT_MAX_ITERATIONS: Final[int] = 100
-DEFAULT_NUM_EVAL_WORKERS: Final[int] = 1
-DEFAULT_OUTPUT_DIR: Final[str] = "./eval_outputs"
-DEFAULT_MAX_ATTEMPTS: Final[int] = 3
-DEFAULT_MAX_RETRIES: Final[int] = 3
-DEFAULT_NOTE: Final[str] = "initial"
-DEFAULT_N_LIMIT: Final[int] = 0
-
-# Critic
-DEFAULT_CRITIC: Final[str] = "pass"
+OUTPUT_FILENAME = "output.jsonl"
+EVAL_AGENT_SERVER_IMAGE = "ghcr.io/openhands/eval-agent-server"
diff --git a/benchmarks/utils/critics.py b/benchmarks/utils/critics.py
index b97083a1..af9c55ae 100644
--- a/benchmarks/utils/critics.py
+++ b/benchmarks/utils/critics.py
@@ -11,7 +11,6 @@
 from pathlib import Path
 from typing import Set
 
-from benchmarks.utils import constants
 from benchmarks.utils.models import EvalInstanceID, EvalOutput
 from openhands.sdk import get_logger
 from openhands.sdk.critic import (
@@ -38,9 +37,9 @@ def add_critic_args(parser: ArgumentParser) -> None:
     parser.add_argument(
         "--critic",
         type=str,
-        default=constants.DEFAULT_CRITIC,
+        default="pass",
         help=(
-            f"Name of the critic to use for evaluation (default: '{constants.DEFAULT_CRITIC}'). "
+            "Name of the critic to use for evaluation (default: 'pass'). "
             "Critics determine whether an agent's output is considered successful "
             "and whether another attempt should be made in iterative evaluation mode. "
             "Available critics: "