From 1f3437b3bf616a9bd736de17edb86e6740ec4de6 Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 28 Jan 2026 17:34:56 +0000 Subject: [PATCH 01/33] Align default argument values with evaluation repository Update args_parser.py and benchmark-specific run_infer.py files to use default values that match the evaluation repository (OpenHands/evaluation) eval-job/values.yaml configuration. Shared defaults updated in args_parser.py: - workspace: 'docker' -> 'remote' - max-iterations: 100 -> 500 - critic: 'pass' -> 'finish_with_patch' Benchmark-specific overrides using parser.set_defaults(): - gaia: dataset='gaia-benchmark/GAIA' - swtbench: dataset='eth-sri/SWT-bench_Verified_bm25_27k_zsp' - commit0: max_attempts=1, max_retries=1 (in addition to existing dataset) Also updated AGENTS.md to document the default values alignment pattern. Co-authored-by: openhands --- AGENTS.md | 20 ++++++++++++++++++++ benchmarks/commit0/run_infer.py | 8 ++++++-- benchmarks/gaia/run_infer.py | 4 +++- benchmarks/swtbench/run_infer.py | 2 ++ benchmarks/utils/args_parser.py | 29 +++++++++++++++++++++-------- benchmarks/utils/critics.py | 8 ++++---- 6 files changed, 56 insertions(+), 15 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 0206a51d..dae512f9 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -83,6 +83,26 @@ make build # Rebuild environment 4. Register CLI entrypoint in `pyproject.toml` under `[project.scripts]` 5. Update README.md with usage instructions +# Default Values Alignment +Default values in `benchmarks/utils/args_parser.py` are aligned with the evaluation +repository (OpenHands/evaluation) `eval-job/values.yaml`. This ensures consistency +between local development and production runs. + +**Shared defaults in args_parser.py:** +- `--workspace`: "remote" (production uses remote workspaces) +- `--max-iterations`: 500 (sufficient for complex tasks) +- `--critic`: "finish_with_patch" (ensures agent produces valid patches) +- `--max-attempts`: 3 (allows retries on critic failures) +- `--max-retries`: 3 (handles transient errors) + +**Benchmark-specific overrides:** Use `parser.set_defaults()` in each benchmark's +`run_infer.py` before calling `parse_args()`: +- `gaia`: dataset="gaia-benchmark/GAIA" +- `swebench`: dataset="princeton-nlp/SWE-bench_Verified" (default) +- `swtbench`: dataset="eth-sri/SWT-bench_Verified_bm25_27k_zsp" +- `commit0`: dataset="wentingzhao/commit0_combined", max_attempts=1, max_retries=1 +- `swebenchmultimodal`: dataset="princeton-nlp/SWE-bench_Multimodal", split="dev" + # LLM Configuration LLM configs use JSON matching the [LLM class schema](https://github.com/OpenHands/software-agent-sdk/blob/main/openhands/sdk/llm/llm.py#L93): ```json diff --git a/benchmarks/commit0/run_infer.py b/benchmarks/commit0/run_infer.py index 2e473669..c3ecf84e 100644 --- a/benchmarks/commit0/run_infer.py +++ b/benchmarks/commit0/run_infer.py @@ -596,8 +596,12 @@ def main() -> None: default="lite", help="all, lite, or each repo name", ) - # Override the default dataset for commit0 - parser.set_defaults(dataset="wentingzhao/commit0_combined") + # Override defaults for commit0 (matches evaluation repository values.yaml) + parser.set_defaults( + dataset="wentingzhao/commit0_combined", + max_attempts=1, + max_retries=1, + ) args = parser.parse_args() # Validate max_attempts diff --git a/benchmarks/gaia/run_infer.py b/benchmarks/gaia/run_infer.py index 9a0a700d..18287950 100644 --- a/benchmarks/gaia/run_infer.py +++ b/benchmarks/gaia/run_infer.py @@ -551,6 +551,8 @@ def main() -> None: required=True, help="GAIA 
level to evaluate (e.g., 2023_level1, 2023_level2, 2023_level3)", ) + # Override defaults for GAIA (matches evaluation repository values.yaml) + parser.set_defaults(dataset="gaia-benchmark/GAIA") args = parser.parse_args() # Create critic instance from parsed arguments @@ -585,7 +587,7 @@ def main() -> None: # Create metadata metadata = EvalMetadata( llm=llm, - dataset="gaia-benchmark/GAIA", + dataset=args.dataset, dataset_split=args.split, max_iterations=args.max_iterations, eval_output_dir=structured_output_dir, diff --git a/benchmarks/swtbench/run_infer.py b/benchmarks/swtbench/run_infer.py index a454e580..147f30cf 100644 --- a/benchmarks/swtbench/run_infer.py +++ b/benchmarks/swtbench/run_infer.py @@ -355,6 +355,8 @@ def main() -> None: choices=choices, help="Path to prompt template file", ) + # Override defaults for SWT-bench (matches evaluation repository values.yaml) + parser.set_defaults(dataset="eth-sri/SWT-bench_Verified_bm25_27k_zsp") args = parser.parse_args() # Validate max_attempts diff --git a/benchmarks/utils/args_parser.py b/benchmarks/utils/args_parser.py index 60f08d73..7572e843 100644 --- a/benchmarks/utils/args_parser.py +++ b/benchmarks/utils/args_parser.py @@ -1,5 +1,11 @@ """ -Argument parsing utilities for SWE-bench benchmarks. +Argument parsing utilities for benchmarks. + +Default values are aligned with the evaluation repository (OpenHands/evaluation) +to ensure consistency between local development and production runs. + +Benchmark-specific values should be set via parser.set_defaults() in each +benchmark's run_infer.py to override these common defaults. """ import argparse @@ -8,10 +14,17 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: - """Create and return argument parser. + """Create and return argument parser with common defaults. + + Default values match the most common settings used across benchmarks + in the evaluation repository. Individual benchmarks can override + these using parser.set_defaults() before calling parse_args(). + + Args: + add_llm_config: Whether to add the llm_config_path positional argument. Returns: - ArgumentParser instance + ArgumentParser instance with common benchmark arguments. 
""" parser = argparse.ArgumentParser(description="Run Evaluation inference") if add_llm_config: @@ -30,15 +43,15 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: parser.add_argument( "--workspace", type=str, - default="docker", + default="remote", choices=["docker", "remote"], - help="Type of workspace to use (default: docker)", + help="Type of workspace to use (default: remote)", ) parser.add_argument( - "--max-iterations", type=int, default=100, help="Maximum iterations" + "--max-iterations", type=int, default=500, help="Maximum iterations" ) parser.add_argument( - "--num-workers", type=int, default=1, help="Number of evaluation workers" + "--num-workers", type=int, default=1, help="Number of inference workers" ) parser.add_argument("--note", type=str, default="initial", help="Evaluation note") parser.add_argument( @@ -60,7 +73,7 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: help="Maximum number of attempts for iterative mode (default: 3, min: 1)", ) - # Add critic arguments + # Add critic arguments (default: finish_with_patch) add_critic_args(parser) parser.add_argument( diff --git a/benchmarks/utils/critics.py b/benchmarks/utils/critics.py index af9c55ae..6bc78bea 100644 --- a/benchmarks/utils/critics.py +++ b/benchmarks/utils/critics.py @@ -37,17 +37,17 @@ def add_critic_args(parser: ArgumentParser) -> None: parser.add_argument( "--critic", type=str, - default="pass", + default="finish_with_patch", help=( - "Name of the critic to use for evaluation (default: 'pass'). " + "Name of the critic to use for evaluation (default: 'finish_with_patch'). " "Critics determine whether an agent's output is considered successful " "and whether another attempt should be made in iterative evaluation mode. " "Available critics: " "'pass' - Always accepts the output (no retry logic, suitable for single-attempt runs), " "'finish_with_patch' - Requires both AgentFinishAction and non-empty git patch, " "'empty_patch_critic' - Only requires non-empty git patch. " - "For single-attempt runs (default), 'pass' is recommended as the actual evaluation " - "is performed by the benchmark's own scoring system." + "For production runs, 'finish_with_patch' is recommended as it ensures " + "the agent produces a valid patch before completing." 
), ) parser.add_argument( From e58ddb79272aee520e1e12dc16cb530c67933b97 Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 28 Jan 2026 17:38:42 +0000 Subject: [PATCH 02/33] Add explicit set_defaults for swebench and update comment for swebenchmultimodal - swebench: Add explicit set_defaults(dataset, split) for consistency with other benchmarks, even though values match global defaults - swebenchmultimodal: Update comment to match the pattern used in other benchmarks Co-authored-by: openhands --- benchmarks/swebench/run_infer.py | 3 +++ benchmarks/swebenchmultimodal/run_infer.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py index e19f0877..7124d863 100644 --- a/benchmarks/swebench/run_infer.py +++ b/benchmarks/swebench/run_infer.py @@ -334,6 +334,9 @@ def main() -> None: choices=choices, help="Path to prompt template file", ) + # SWE-bench defaults match the global args_parser defaults (evaluation repository values.yaml) + # Explicit set_defaults for consistency with other benchmarks + parser.set_defaults(dataset="princeton-nlp/SWE-bench_Verified", split="test") args = parser.parse_args() # Validate max_attempts diff --git a/benchmarks/swebenchmultimodal/run_infer.py b/benchmarks/swebenchmultimodal/run_infer.py index 68e4c5b8..d47f3c74 100644 --- a/benchmarks/swebenchmultimodal/run_infer.py +++ b/benchmarks/swebenchmultimodal/run_infer.py @@ -423,7 +423,7 @@ def main() -> None: choices=choices, help="Path to prompt template file", ) - # Override the default dataset and split for multimodal + # Override defaults for SWE-bench Multimodal (matches evaluation repository values.yaml) parser.set_defaults(dataset="princeton-nlp/SWE-bench_Multimodal", split="dev") args = parser.parse_args() From dcb940f311d9f44f6d517be45be89c31d484b5c4 Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 28 Jan 2026 17:40:48 +0000 Subject: [PATCH 03/33] Remove default dataset from args_parser.py Each benchmark now sets its own dataset default via set_defaults(), so no global default is needed. Co-authored-by: openhands --- benchmarks/utils/args_parser.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/utils/args_parser.py b/benchmarks/utils/args_parser.py index 7572e843..63263a7c 100644 --- a/benchmarks/utils/args_parser.py +++ b/benchmarks/utils/args_parser.py @@ -36,8 +36,7 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: parser.add_argument( "--dataset", type=str, - default="princeton-nlp/SWE-bench_Verified", - help="Dataset name", + help="Dataset name (each benchmark sets its default via set_defaults)", ) parser.add_argument("--split", type=str, default="test", help="Dataset split") parser.add_argument( From c34d730d024626199a52e9c4ce6c8ae4642a110f Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 28 Jan 2026 17:42:57 +0000 Subject: [PATCH 04/33] Add default value for llm_config_path All benchmarks in the evaluation repository use .llm_config/runtime.json as the LLM config path, so use this as the default. 
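For illustration only (a minimal argparse sketch, not part of this patch), an
optional positional declared with nargs="?" falls back to the default when the
argument is omitted and still accepts an explicit path:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "llm_config_path",
        type=str,
        nargs="?",
        default=".llm_config/runtime.json",
        help="Path to JSON LLM configuration",
    )

    assert parser.parse_args([]).llm_config_path == ".llm_config/runtime.json"
    assert parser.parse_args(["custom.json"]).llm_config_path == "custom.json"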
Co-authored-by: openhands --- benchmarks/utils/args_parser.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/benchmarks/utils/args_parser.py b/benchmarks/utils/args_parser.py index 63263a7c..698dea7a 100644 --- a/benchmarks/utils/args_parser.py +++ b/benchmarks/utils/args_parser.py @@ -31,7 +31,9 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: parser.add_argument( "llm_config_path", type=str, - help="Path to JSON LLM configuration", + nargs="?", + default=".llm_config/runtime.json", + help="Path to JSON LLM configuration (default: .llm_config/runtime.json)", ) parser.add_argument( "--dataset", From 6af188a8f723968419dc6411978402fa707181cd Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 28 Jan 2026 17:44:36 +0000 Subject: [PATCH 05/33] Revert "Add default value for llm_config_path" This reverts commit c34d730d024626199a52e9c4ce6c8ae4642a110f. --- benchmarks/utils/args_parser.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/benchmarks/utils/args_parser.py b/benchmarks/utils/args_parser.py index 698dea7a..63263a7c 100644 --- a/benchmarks/utils/args_parser.py +++ b/benchmarks/utils/args_parser.py @@ -31,9 +31,7 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: parser.add_argument( "llm_config_path", type=str, - nargs="?", - default=".llm_config/runtime.json", - help="Path to JSON LLM configuration (default: .llm_config/runtime.json)", + help="Path to JSON LLM configuration", ) parser.add_argument( "--dataset", From 5fcb61d45a5687273186792ecae95f01ad2345ec Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 07:52:42 +0000 Subject: [PATCH 06/33] WIP: Add config.py files and refactor to use INFER_DEFAULTS - Created config.py with INFER_DEFAULTS and EVAL_DEFAULTS for each benchmark - Removed all defaults from utils/args_parser.py - Removed default from critics.py - Updated swebench, gaia, swtbench run_infer.py to use INFER_DEFAULTS - Started commit0 update (import added) Co-authored-by: openhands --- benchmarks/commit0/config.py | 26 ++++++++++++++ benchmarks/commit0/run_infer.py | 1 + benchmarks/gaia/config.py | 25 +++++++++++++ benchmarks/gaia/run_infer.py | 4 +-- benchmarks/swebench/config.py | 27 ++++++++++++++ benchmarks/swebench/run_infer.py | 5 ++- benchmarks/swebenchmultimodal/config.py | 28 +++++++++++++++ benchmarks/swtbench/config.py | 28 +++++++++++++++ benchmarks/swtbench/eval_infer.py | 6 ++-- benchmarks/swtbench/run_infer.py | 4 +-- benchmarks/utils/args_parser.py | 47 +++++++++---------------- benchmarks/utils/critics.py | 9 ++--- 12 files changed, 164 insertions(+), 46 deletions(-) create mode 100644 benchmarks/commit0/config.py create mode 100644 benchmarks/gaia/config.py create mode 100644 benchmarks/swebench/config.py create mode 100644 benchmarks/swebenchmultimodal/config.py create mode 100644 benchmarks/swtbench/config.py diff --git a/benchmarks/commit0/config.py b/benchmarks/commit0/config.py new file mode 100644 index 00000000..5855adbf --- /dev/null +++ b/benchmarks/commit0/config.py @@ -0,0 +1,26 @@ +""" +Commit0 benchmark configuration. + +Default values aligned with evaluation repository (OpenHands/evaluation). 
+""" + +# Inference defaults (used by run_infer.py) +INFER_DEFAULTS = { + "dataset": "wentingzhao/commit0_combined", + "split": "test", + "repo_split": "lite", + "workspace": "remote", + "num_workers": 8, + "max_iterations": 500, + "max_attempts": 1, + "max_retries": 1, + "critic": "finish_with_patch", + "output_dir": "./eval_outputs", + "n_limit": 0, + "note": "initial", +} + +# Evaluation defaults (used by eval_infer.py) +EVAL_DEFAULTS = { + "model_name": "openhands", +} diff --git a/benchmarks/commit0/run_infer.py b/benchmarks/commit0/run_infer.py index c3ecf84e..e1e79f06 100644 --- a/benchmarks/commit0/run_infer.py +++ b/benchmarks/commit0/run_infer.py @@ -12,6 +12,7 @@ extract_custom_tag, get_base_docker_image, ) +from benchmarks.commit0.config import INFER_DEFAULTS from benchmarks.utils.args_parser import get_parser from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE from benchmarks.utils.conversation import build_event_persistence_callback diff --git a/benchmarks/gaia/config.py b/benchmarks/gaia/config.py new file mode 100644 index 00000000..8a6977d4 --- /dev/null +++ b/benchmarks/gaia/config.py @@ -0,0 +1,25 @@ +""" +GAIA benchmark configuration. + +Default values aligned with evaluation repository (OpenHands/evaluation). +""" + +# Inference defaults (used by run_infer.py) +INFER_DEFAULTS = { + "dataset": "gaia-benchmark/GAIA", + "split": "validation", + "workspace": "remote", + "num_workers": 30, + "max_iterations": 500, + "max_attempts": 3, + "max_retries": 3, + "critic": "finish_with_patch", + "output_dir": "./eval_outputs", + "n_limit": 0, + "note": "initial", +} + +# Evaluation defaults (used by eval_infer.py) +EVAL_DEFAULTS = { + "model_name": "openhands", +} diff --git a/benchmarks/gaia/run_infer.py b/benchmarks/gaia/run_infer.py index 18287950..78e65581 100644 --- a/benchmarks/gaia/run_infer.py +++ b/benchmarks/gaia/run_infer.py @@ -11,6 +11,7 @@ from datasets import DatasetDict, load_dataset from PIL import Image +from benchmarks.gaia.config import INFER_DEFAULTS from benchmarks.gaia.scorer import question_scorer from benchmarks.gaia.utils import image_to_jpg_base64_url, image_to_png_base64_url from benchmarks.utils.args_parser import get_parser @@ -551,8 +552,7 @@ def main() -> None: required=True, help="GAIA level to evaluate (e.g., 2023_level1, 2023_level2, 2023_level3)", ) - # Override defaults for GAIA (matches evaluation repository values.yaml) - parser.set_defaults(dataset="gaia-benchmark/GAIA") + parser.set_defaults(**INFER_DEFAULTS) args = parser.parse_args() # Create critic instance from parsed arguments diff --git a/benchmarks/swebench/config.py b/benchmarks/swebench/config.py new file mode 100644 index 00000000..13d0839c --- /dev/null +++ b/benchmarks/swebench/config.py @@ -0,0 +1,27 @@ +""" +SWE-bench benchmark configuration. + +Default values aligned with evaluation repository (OpenHands/evaluation). 
+""" + +# Inference defaults (used by run_infer.py) +INFER_DEFAULTS = { + "dataset": "princeton-nlp/SWE-bench_Verified", + "split": "test", + "workspace": "remote", + "num_workers": 30, + "max_iterations": 500, + "max_attempts": 3, + "max_retries": 3, + "critic": "finish_with_patch", + "output_dir": "./eval_outputs", + "n_limit": 0, + "note": "initial", +} + +# Evaluation defaults (used by eval_infer.py) +EVAL_DEFAULTS = { + "dataset": "princeton-nlp/SWE-bench_Verified", + "model_name": "openhands", + "workers": 12, +} diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py index 7124d863..1064fb42 100644 --- a/benchmarks/swebench/run_infer.py +++ b/benchmarks/swebench/run_infer.py @@ -5,6 +5,7 @@ from jinja2 import Environment, FileSystemLoader from benchmarks.swebench import constants +from benchmarks.swebench.config import INFER_DEFAULTS from benchmarks.swebench.build_images import ( extract_custom_tag, get_official_docker_image, @@ -334,9 +335,7 @@ def main() -> None: choices=choices, help="Path to prompt template file", ) - # SWE-bench defaults match the global args_parser defaults (evaluation repository values.yaml) - # Explicit set_defaults for consistency with other benchmarks - parser.set_defaults(dataset="princeton-nlp/SWE-bench_Verified", split="test") + parser.set_defaults(**INFER_DEFAULTS) args = parser.parse_args() # Validate max_attempts diff --git a/benchmarks/swebenchmultimodal/config.py b/benchmarks/swebenchmultimodal/config.py new file mode 100644 index 00000000..d43306c0 --- /dev/null +++ b/benchmarks/swebenchmultimodal/config.py @@ -0,0 +1,28 @@ +""" +SWE-bench Multimodal benchmark configuration. + +Default values aligned with evaluation repository (OpenHands/evaluation). +""" + +# Inference defaults (used by run_infer.py) +INFER_DEFAULTS = { + "dataset": "princeton-nlp/SWE-bench_Multimodal", + "split": "dev", + "workspace": "remote", + "num_workers": 30, + "max_iterations": 500, + "max_attempts": 3, + "max_retries": 3, + "critic": "finish_with_patch", + "output_dir": "./eval_outputs", + "n_limit": 0, + "note": "initial", +} + +# Evaluation defaults (used by eval_infer.py) +EVAL_DEFAULTS = { + "dataset": "princeton-nlp/SWE-bench_Multimodal", + "split": "dev", + "model_name": "openhands", + "workers": 12, +} diff --git a/benchmarks/swtbench/config.py b/benchmarks/swtbench/config.py new file mode 100644 index 00000000..2643f46d --- /dev/null +++ b/benchmarks/swtbench/config.py @@ -0,0 +1,28 @@ +""" +SWT-bench benchmark configuration. + +Default values aligned with evaluation repository (OpenHands/evaluation). 
+""" + +# Inference defaults (used by run_infer.py) +INFER_DEFAULTS = { + "dataset": "eth-sri/SWT-bench_Verified_bm25_27k_zsp", + "split": "test", + "workspace": "remote", + "num_workers": 30, + "max_iterations": 500, + "max_attempts": 3, + "max_retries": 3, + "critic": "finish_with_patch", + "output_dir": "./eval_outputs", + "n_limit": 0, + "note": "initial", +} + +# Evaluation defaults (used by eval_infer.py) +# Note: eval uses SWE-bench dataset, not SWT-bench dataset +EVAL_DEFAULTS = { + "dataset": "princeton-nlp/SWE-bench_Verified", + "model_name": "OpenHands", + "workers": 24, +} diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index 4f5f0632..5cb3a4d2 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -237,7 +237,7 @@ def run_swtbench_evaluation( predictions_file: str, # Must use SWE-bench dataset because SWT-bench dataset (which is based on SWE-bench) contains a bug in their harness. dataset: str = "princeton-nlp/SWE-bench_Verified", - workers: str = "12", + workers: str = "24", ) -> None: """ Run SWT-Bench evaluation on the predictions file. @@ -384,8 +384,8 @@ def main() -> None: parser.add_argument( "--workers", - default="12", - help="Number of workers to use when evaluating", + default="24", + help="Number of workers to use when evaluating (default: 24)", ) args = parser.parse_args() diff --git a/benchmarks/swtbench/run_infer.py b/benchmarks/swtbench/run_infer.py index 147f30cf..7c863539 100644 --- a/benchmarks/swtbench/run_infer.py +++ b/benchmarks/swtbench/run_infer.py @@ -4,6 +4,7 @@ from jinja2 import Environment, FileSystemLoader +from benchmarks.swtbench.config import INFER_DEFAULTS from benchmarks.utils.args_parser import get_parser from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE from benchmarks.utils.conversation import build_event_persistence_callback @@ -355,8 +356,7 @@ def main() -> None: choices=choices, help="Path to prompt template file", ) - # Override defaults for SWT-bench (matches evaluation repository values.yaml) - parser.set_defaults(dataset="eth-sri/SWT-bench_Verified_bm25_27k_zsp") + parser.set_defaults(**INFER_DEFAULTS) args = parser.parse_args() # Validate max_attempts diff --git a/benchmarks/utils/args_parser.py b/benchmarks/utils/args_parser.py index 63263a7c..cae2f928 100644 --- a/benchmarks/utils/args_parser.py +++ b/benchmarks/utils/args_parser.py @@ -1,11 +1,9 @@ """ Argument parsing utilities for benchmarks. -Default values are aligned with the evaluation repository (OpenHands/evaluation) -to ensure consistency between local development and production runs. - -Benchmark-specific values should be set via parser.set_defaults() in each -benchmark's run_infer.py to override these common defaults. +This module defines common arguments used across all benchmarks. +No default values are set here - each benchmark must set its own defaults +via parser.set_defaults() to match the evaluation repository configuration. """ import argparse @@ -14,17 +12,16 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: - """Create and return argument parser with common defaults. + """Create and return argument parser without defaults. - Default values match the most common settings used across benchmarks - in the evaluation repository. Individual benchmarks can override - these using parser.set_defaults() before calling parse_args(). 
+ Each benchmark must call parser.set_defaults() before parse_args() + to set values matching the evaluation repository (OpenHands/evaluation). Args: add_llm_config: Whether to add the llm_config_path positional argument. Returns: - ArgumentParser instance with common benchmark arguments. + ArgumentParser instance with common benchmark arguments (no defaults). """ parser = argparse.ArgumentParser(description="Run Evaluation inference") if add_llm_config: @@ -36,55 +33,45 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: parser.add_argument( "--dataset", type=str, - help="Dataset name (each benchmark sets its default via set_defaults)", + help="Dataset name", ) - parser.add_argument("--split", type=str, default="test", help="Dataset split") + parser.add_argument("--split", type=str, help="Dataset split") parser.add_argument( "--workspace", type=str, - default="remote", choices=["docker", "remote"], - help="Type of workspace to use (default: remote)", - ) - parser.add_argument( - "--max-iterations", type=int, default=500, help="Maximum iterations" - ) - parser.add_argument( - "--num-workers", type=int, default=1, help="Number of inference workers" + help="Type of workspace to use", ) - parser.add_argument("--note", type=str, default="initial", help="Evaluation note") + parser.add_argument("--max-iterations", type=int, help="Maximum iterations") + parser.add_argument("--num-workers", type=int, help="Number of inference workers") + parser.add_argument("--note", type=str, help="Evaluation note") parser.add_argument( "--output-dir", type=str, - default="./eval_outputs", help="Evaluation output directory", ) parser.add_argument( "--n-limit", type=int, - default=0, - help="Limit number of instances to evaluate", + help="Limit number of instances to evaluate (0 = no limit)", ) parser.add_argument( "--max-attempts", type=int, - default=3, - help="Maximum number of attempts for iterative mode (default: 3, min: 1)", + help="Maximum number of attempts for iterative mode (min: 1)", ) - # Add critic arguments (default: finish_with_patch) + # Add critic arguments (no default) add_critic_args(parser) parser.add_argument( "--select", type=str, - default=None, help="Path to text file containing instance IDs to select (one per line)", ) parser.add_argument( "--max-retries", type=int, - default=3, - help="Maximum retries for instances that throw exceptions (default: 3)", + help="Maximum retries for instances that throw exceptions", ) return parser diff --git a/benchmarks/utils/critics.py b/benchmarks/utils/critics.py index 6bc78bea..b2978294 100644 --- a/benchmarks/utils/critics.py +++ b/benchmarks/utils/critics.py @@ -37,17 +37,14 @@ def add_critic_args(parser: ArgumentParser) -> None: parser.add_argument( "--critic", type=str, - default="finish_with_patch", help=( - "Name of the critic to use for evaluation (default: 'finish_with_patch'). " + "Name of the critic to use for evaluation. " "Critics determine whether an agent's output is considered successful " "and whether another attempt should be made in iterative evaluation mode. " "Available critics: " - "'pass' - Always accepts the output (no retry logic, suitable for single-attempt runs), " + "'pass' - Always accepts the output (no retry logic), " "'finish_with_patch' - Requires both AgentFinishAction and non-empty git patch, " - "'empty_patch_critic' - Only requires non-empty git patch. " - "For production runs, 'finish_with_patch' is recommended as it ensures " - "the agent produces a valid patch before completing." 
+ "'empty_patch_critic' - Only requires non-empty git patch." ), ) parser.add_argument( From 7fec81f96c344e911f25494b1392bf23945ca5fa Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 08:08:29 +0000 Subject: [PATCH 07/33] Fix import ordering to pass ruff lint checks Co-authored-by: openhands --- benchmarks/commit0/run_infer.py | 1 - benchmarks/swebench/run_infer.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/commit0/run_infer.py b/benchmarks/commit0/run_infer.py index e1e79f06..c3ecf84e 100644 --- a/benchmarks/commit0/run_infer.py +++ b/benchmarks/commit0/run_infer.py @@ -12,7 +12,6 @@ extract_custom_tag, get_base_docker_image, ) -from benchmarks.commit0.config import INFER_DEFAULTS from benchmarks.utils.args_parser import get_parser from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE from benchmarks.utils.conversation import build_event_persistence_callback diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py index 1064fb42..259e9163 100644 --- a/benchmarks/swebench/run_infer.py +++ b/benchmarks/swebench/run_infer.py @@ -5,13 +5,13 @@ from jinja2 import Environment, FileSystemLoader from benchmarks.swebench import constants -from benchmarks.swebench.config import INFER_DEFAULTS from benchmarks.swebench.build_images import ( extract_custom_tag, get_official_docker_image, should_wrap_instance_id, wrap_image, ) +from benchmarks.swebench.config import INFER_DEFAULTS from benchmarks.utils.args_parser import get_parser from benchmarks.utils.build_utils import build_image from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE From 4ed5b722c69bcdedb7d1cd1edbb7af0436bbdaf4 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 08:14:56 +0000 Subject: [PATCH 08/33] Add missing workers field to GAIA and Commit0 EVAL_DEFAULTS Align EVAL_DEFAULTS with NUM_EVAL_WORKERS from evaluation repository values.yaml: - GAIA: workers=1 - Commit0: workers=1 Co-authored-by: openhands --- benchmarks/commit0/config.py | 1 + benchmarks/gaia/config.py | 1 + 2 files changed, 2 insertions(+) diff --git a/benchmarks/commit0/config.py b/benchmarks/commit0/config.py index 5855adbf..5acfbe62 100644 --- a/benchmarks/commit0/config.py +++ b/benchmarks/commit0/config.py @@ -23,4 +23,5 @@ # Evaluation defaults (used by eval_infer.py) EVAL_DEFAULTS = { "model_name": "openhands", + "workers": 1, } diff --git a/benchmarks/gaia/config.py b/benchmarks/gaia/config.py index 8a6977d4..af62c044 100644 --- a/benchmarks/gaia/config.py +++ b/benchmarks/gaia/config.py @@ -22,4 +22,5 @@ # Evaluation defaults (used by eval_infer.py) EVAL_DEFAULTS = { "model_name": "openhands", + "workers": 1, } From 2c1a9e19e0bf2f9fc7ba0a3c0fb0d192e28b2ee6 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 08:19:02 +0000 Subject: [PATCH 09/33] Use EVAL_DEFAULTS from config in eval_infer.py files Update eval_infer.py files to import and use EVAL_DEFAULTS from their respective config.py files via parser.set_defaults(): - swebench/eval_infer.py: uses EVAL_DEFAULTS for dataset, model_name, workers - swtbench/eval_infer.py: uses EVAL_DEFAULTS for dataset, model_name, workers - swebenchmultimodal/eval_infer.py: uses EVAL_DEFAULTS for dataset, split, model_name, workers This ensures the default values defined in config.py are actually used by the evaluation scripts, aligning with the pattern used in run_infer.py files for INFER_DEFAULTS. 
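As a minimal sketch of the pattern (illustration only; the values mirror
EVAL_DEFAULTS in benchmarks/swebench/config.py), parser.set_defaults() fills in
each listed dest when the corresponding flag is not passed, while explicit CLI
values still win:

    import argparse

    EVAL_DEFAULTS = {"dataset": "princeton-nlp/SWE-bench_Verified", "workers": 12}

    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset", help="SWE-Bench dataset to evaluate against")
    parser.add_argument("--workers", type=int, help="Number of workers to use when evaluating")
    parser.set_defaults(**EVAL_DEFAULTS)

    args = parser.parse_args([])
    assert args.dataset == "princeton-nlp/SWE-bench_Verified"
    assert args.workers == 12

    args = parser.parse_args(["--workers", "4"])
    assert args.workers == 4  # explicit CLI values override the defaults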
Co-authored-by: openhands --- benchmarks/swebench/eval_infer.py | 7 +++++-- benchmarks/swebenchmultimodal/eval_infer.py | 15 +++++++-------- benchmarks/swtbench/eval_infer.py | 10 ++++++---- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/benchmarks/swebench/eval_infer.py b/benchmarks/swebench/eval_infer.py index b1c5ee69..8bb7a7eb 100644 --- a/benchmarks/swebench/eval_infer.py +++ b/benchmarks/swebench/eval_infer.py @@ -17,6 +17,7 @@ from pathlib import Path from benchmarks.swebench import constants +from benchmarks.swebench.config import EVAL_DEFAULTS from benchmarks.utils.laminar import LaminarService from benchmarks.utils.patch_utils import remove_files_from_patch from benchmarks.utils.report_costs import generate_cost_report @@ -223,10 +224,12 @@ def main() -> None: parser.add_argument( "--workers", type=int, - default=constants.DEFAULT_EVAL_WORKERS, - help=f"Number of workers to use when evaluating (default: {constants.DEFAULT_EVAL_WORKERS})", + help="Number of workers to use when evaluating", ) + # Apply EVAL_DEFAULTS from config + parser.set_defaults(**EVAL_DEFAULTS) + args = parser.parse_args() # Validate input file diff --git a/benchmarks/swebenchmultimodal/eval_infer.py b/benchmarks/swebenchmultimodal/eval_infer.py index 0984b3e5..1e675b7d 100644 --- a/benchmarks/swebenchmultimodal/eval_infer.py +++ b/benchmarks/swebenchmultimodal/eval_infer.py @@ -16,6 +16,7 @@ from pathlib import Path from typing import Any +from benchmarks.swebenchmultimodal.config import EVAL_DEFAULTS from benchmarks.utils.patch_utils import remove_files_from_patch from benchmarks.utils.report_costs import generate_cost_report from openhands.sdk import get_logger @@ -375,15 +376,12 @@ def main() -> None: parser.add_argument( "--dataset", - default="princeton-nlp/SWE-bench_Multimodal", - help="SWE-Bench dataset to evaluate against " - "(default: princeton-nlp/SWE-bench_Multimodal)", + help="SWE-Bench dataset to evaluate against", ) parser.add_argument( "--split", - default="dev", - help="Dataset split to use (default: dev)", + help="Dataset split to use", ) parser.add_argument( @@ -400,13 +398,11 @@ def main() -> None: parser.add_argument( "--model-name", - default="openhands", - help="Model name to use in the model_name_or_path field (default: openhands)", + help="Model name to use in the model_name_or_path field", ) parser.add_argument( "--workers", - default="12", help="Number of workers to use when evaluating", ) @@ -415,6 +411,9 @@ def main() -> None: help="Run ID for the evaluation (default: eval_)", ) + # Apply EVAL_DEFAULTS from config + parser.set_defaults(**EVAL_DEFAULTS) + args = parser.parse_args() # Validate input file diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index 5cb3a4d2..5fb0cc5c 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -18,6 +18,7 @@ from pathlib import Path from time import monotonic +from benchmarks.swtbench.config import EVAL_DEFAULTS from benchmarks.swtbench.image_utils import ( compute_required_images, ensure_swt_bench_repo, @@ -378,16 +379,17 @@ def main() -> None: parser.add_argument( "--model-name", - default="OpenHands", - help="Model name to use in the model_name_or_path field (default: OpenHands)", + help="Model name to use in the model_name_or_path field", ) parser.add_argument( "--workers", - default="24", - help="Number of workers to use when evaluating (default: 24)", + help="Number of workers to use when evaluating", ) + # Apply EVAL_DEFAULTS from config + 
parser.set_defaults(**EVAL_DEFAULTS) + args = parser.parse_args() # Validate input file From 4e8cb53b8813a659937230304a8c4b37371637fa Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 08:22:04 +0000 Subject: [PATCH 10/33] Use INFER_DEFAULTS from config in commit0 and swebenchmultimodal run_infer.py Update run_infer.py files to import and use INFER_DEFAULTS from their respective config.py files via parser.set_defaults(): - commit0/run_infer.py: uses INFER_DEFAULTS for all inference settings - swebenchmultimodal/run_infer.py: uses INFER_DEFAULTS for all inference settings This ensures the default values defined in config.py are actually used by the inference scripts, completing the alignment with the evaluation repository values.yaml. Co-authored-by: openhands --- benchmarks/commit0/run_infer.py | 10 +++------- benchmarks/swebenchmultimodal/run_infer.py | 5 +++-- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/benchmarks/commit0/run_infer.py b/benchmarks/commit0/run_infer.py index c3ecf84e..4cc166d8 100644 --- a/benchmarks/commit0/run_infer.py +++ b/benchmarks/commit0/run_infer.py @@ -12,6 +12,7 @@ extract_custom_tag, get_base_docker_image, ) +from benchmarks.commit0.config import INFER_DEFAULTS from benchmarks.utils.args_parser import get_parser from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE from benchmarks.utils.conversation import build_event_persistence_callback @@ -593,15 +594,10 @@ def main() -> None: parser.add_argument( "--repo-split", type=str, - default="lite", help="all, lite, or each repo name", ) - # Override defaults for commit0 (matches evaluation repository values.yaml) - parser.set_defaults( - dataset="wentingzhao/commit0_combined", - max_attempts=1, - max_retries=1, - ) + # Apply INFER_DEFAULTS from config (matches evaluation repository values.yaml) + parser.set_defaults(**INFER_DEFAULTS) args = parser.parse_args() # Validate max_attempts diff --git a/benchmarks/swebenchmultimodal/run_infer.py b/benchmarks/swebenchmultimodal/run_infer.py index d47f3c74..85fc8254 100644 --- a/benchmarks/swebenchmultimodal/run_infer.py +++ b/benchmarks/swebenchmultimodal/run_infer.py @@ -10,6 +10,7 @@ extract_custom_tag, get_official_docker_image, ) +from benchmarks.swebenchmultimodal.config import INFER_DEFAULTS from benchmarks.utils.args_parser import get_parser from benchmarks.utils.build_utils import build_image from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE @@ -423,8 +424,8 @@ def main() -> None: choices=choices, help="Path to prompt template file", ) - # Override defaults for SWE-bench Multimodal (matches evaluation repository values.yaml) - parser.set_defaults(dataset="princeton-nlp/SWE-bench_Multimodal", split="dev") + # Apply INFER_DEFAULTS from config (matches evaluation repository values.yaml) + parser.set_defaults(**INFER_DEFAULTS) args = parser.parse_args() # Validate max_attempts From 7b8ab3dee67dc242c59168393830b6d11759888c Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 08:30:41 +0000 Subject: [PATCH 11/33] Use EVAL_DEFAULTS from config in commit0 and gaia eval_infer.py Update eval_infer.py files to import and use EVAL_DEFAULTS from their respective config.py files via parser.set_defaults(): - commit0/eval_infer.py: uses EVAL_DEFAULTS for model_name - gaia/eval_infer.py: uses EVAL_DEFAULTS for model_name This ensures all benchmarks consistently use their config.py defaults. 
Co-authored-by: openhands --- benchmarks/commit0/eval_infer.py | 7 +++++-- benchmarks/gaia/eval_infer.py | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/benchmarks/commit0/eval_infer.py b/benchmarks/commit0/eval_infer.py index f03e73f6..4470b36d 100644 --- a/benchmarks/commit0/eval_infer.py +++ b/benchmarks/commit0/eval_infer.py @@ -15,6 +15,7 @@ import sys from pathlib import Path +from benchmarks.commit0.config import EVAL_DEFAULTS from benchmarks.utils.laminar import LaminarService from benchmarks.utils.report_costs import generate_cost_report @@ -174,10 +175,12 @@ def main() -> None: parser.add_argument( "--model-name", - default="openhands", - help="Model name to use in the model_name_or_path field (default: openhands)", + help="Model name to use in the model_name_or_path field", ) + # Apply EVAL_DEFAULTS from config + parser.set_defaults(**EVAL_DEFAULTS) + args = parser.parse_args() # Validate input file diff --git a/benchmarks/gaia/eval_infer.py b/benchmarks/gaia/eval_infer.py index 889d132d..715211f3 100644 --- a/benchmarks/gaia/eval_infer.py +++ b/benchmarks/gaia/eval_infer.py @@ -18,6 +18,7 @@ import sys from pathlib import Path +from benchmarks.gaia.config import EVAL_DEFAULTS from benchmarks.utils.laminar import LaminarService from benchmarks.utils.report_costs import generate_cost_report from openhands.sdk import get_logger @@ -197,10 +198,12 @@ def main() -> None: parser.add_argument( "--model-name", - default="openhands", - help="Model name to use in the model_name_or_path field (default: openhands)", + help="Model name to use in the model_name_or_path field", ) + # Apply EVAL_DEFAULTS from config + parser.set_defaults(**EVAL_DEFAULTS) + args = parser.parse_args() # Validate input file From 05d34f9740e5726b09b55b7ea8e5ed3f2ba93c21 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 08:39:41 +0000 Subject: [PATCH 12/33] Move common defaults (note, n_limit, output_dir) to args_parser.py These fields are not benchmark-specific and should have global defaults: - note: 'initial' (user-facing option for run identification) - n_limit: 0 (no limit by default) - output_dir: OUTPUT_DIR from constants.py ('./eval_outputs') Added OUTPUT_DIR constant to benchmarks/utils/constants.py. This keeps INFER_DEFAULTS focused on benchmark-specific values from the evaluation repository's values.yaml. 
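A hypothetical sketch of the "0 = no limit" convention for --n-limit (the
helper below is illustrative only, not the actual benchmark code):

    def apply_n_limit(instances: list, n_limit: int) -> list:
        # n_limit == 0 (the global default) means "evaluate every instance"
        return instances[:n_limit] if n_limit > 0 else instances

    assert apply_n_limit(["a", "b", "c"], 0) == ["a", "b", "c"]
    assert apply_n_limit(["a", "b", "c"], 2) == ["a", "b"]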
Co-authored-by: openhands --- benchmarks/commit0/config.py | 3 --- benchmarks/gaia/config.py | 3 --- benchmarks/swebench/config.py | 3 --- benchmarks/swebenchmultimodal/config.py | 3 --- benchmarks/swtbench/config.py | 3 --- benchmarks/utils/args_parser.py | 9 ++++++--- benchmarks/utils/constants.py | 1 + 7 files changed, 7 insertions(+), 18 deletions(-) diff --git a/benchmarks/commit0/config.py b/benchmarks/commit0/config.py index 5acfbe62..9c84c35b 100644 --- a/benchmarks/commit0/config.py +++ b/benchmarks/commit0/config.py @@ -15,9 +15,6 @@ "max_attempts": 1, "max_retries": 1, "critic": "finish_with_patch", - "output_dir": "./eval_outputs", - "n_limit": 0, - "note": "initial", } # Evaluation defaults (used by eval_infer.py) diff --git a/benchmarks/gaia/config.py b/benchmarks/gaia/config.py index af62c044..192114ae 100644 --- a/benchmarks/gaia/config.py +++ b/benchmarks/gaia/config.py @@ -14,9 +14,6 @@ "max_attempts": 3, "max_retries": 3, "critic": "finish_with_patch", - "output_dir": "./eval_outputs", - "n_limit": 0, - "note": "initial", } # Evaluation defaults (used by eval_infer.py) diff --git a/benchmarks/swebench/config.py b/benchmarks/swebench/config.py index 13d0839c..4eba91f6 100644 --- a/benchmarks/swebench/config.py +++ b/benchmarks/swebench/config.py @@ -14,9 +14,6 @@ "max_attempts": 3, "max_retries": 3, "critic": "finish_with_patch", - "output_dir": "./eval_outputs", - "n_limit": 0, - "note": "initial", } # Evaluation defaults (used by eval_infer.py) diff --git a/benchmarks/swebenchmultimodal/config.py b/benchmarks/swebenchmultimodal/config.py index d43306c0..e9affe05 100644 --- a/benchmarks/swebenchmultimodal/config.py +++ b/benchmarks/swebenchmultimodal/config.py @@ -14,9 +14,6 @@ "max_attempts": 3, "max_retries": 3, "critic": "finish_with_patch", - "output_dir": "./eval_outputs", - "n_limit": 0, - "note": "initial", } # Evaluation defaults (used by eval_infer.py) diff --git a/benchmarks/swtbench/config.py b/benchmarks/swtbench/config.py index 2643f46d..9b3e727c 100644 --- a/benchmarks/swtbench/config.py +++ b/benchmarks/swtbench/config.py @@ -14,9 +14,6 @@ "max_attempts": 3, "max_retries": 3, "critic": "finish_with_patch", - "output_dir": "./eval_outputs", - "n_limit": 0, - "note": "initial", } # Evaluation defaults (used by eval_infer.py) diff --git a/benchmarks/utils/args_parser.py b/benchmarks/utils/args_parser.py index cae2f928..28c9444a 100644 --- a/benchmarks/utils/args_parser.py +++ b/benchmarks/utils/args_parser.py @@ -2,12 +2,13 @@ Argument parsing utilities for benchmarks. This module defines common arguments used across all benchmarks. -No default values are set here - each benchmark must set its own defaults -via parser.set_defaults() to match the evaluation repository configuration. +Benchmark-specific defaults should be set via parser.set_defaults() +to match the evaluation repository configuration. 
""" import argparse +from benchmarks.utils.constants import OUTPUT_DIR from benchmarks.utils.critics import add_critic_args @@ -44,15 +45,17 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: ) parser.add_argument("--max-iterations", type=int, help="Maximum iterations") parser.add_argument("--num-workers", type=int, help="Number of inference workers") - parser.add_argument("--note", type=str, help="Evaluation note") + parser.add_argument("--note", type=str, default="initial", help="Evaluation note") parser.add_argument( "--output-dir", type=str, + default=OUTPUT_DIR, help="Evaluation output directory", ) parser.add_argument( "--n-limit", type=int, + default=0, help="Limit number of instances to evaluate (0 = no limit)", ) parser.add_argument( diff --git a/benchmarks/utils/constants.py b/benchmarks/utils/constants.py index 9337b847..e7f4f42b 100644 --- a/benchmarks/utils/constants.py +++ b/benchmarks/utils/constants.py @@ -1,2 +1,3 @@ OUTPUT_FILENAME = "output.jsonl" +OUTPUT_DIR = "./eval_outputs" EVAL_AGENT_SERVER_IMAGE = "ghcr.io/openhands/eval-agent-server" From 3d6955a607142df9020884f16ef90e8cd1c1dfea Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 08:50:09 +0000 Subject: [PATCH 13/33] Remove unused fields from INFER_DEFAULTS and EVAL_DEFAULTS - gaia: Remove max_retries from INFER_DEFAULTS (not used in run_infer.py) - gaia: Remove workers from EVAL_DEFAULTS (not used in eval_infer.py) - commit0: Remove workers from EVAL_DEFAULTS (not used in eval_infer.py) Each config now only contains fields that are actually used by the corresponding run_infer.py and eval_infer.py scripts. Co-authored-by: openhands --- benchmarks/commit0/config.py | 1 - benchmarks/gaia/config.py | 2 -- 2 files changed, 3 deletions(-) diff --git a/benchmarks/commit0/config.py b/benchmarks/commit0/config.py index 9c84c35b..f003693d 100644 --- a/benchmarks/commit0/config.py +++ b/benchmarks/commit0/config.py @@ -20,5 +20,4 @@ # Evaluation defaults (used by eval_infer.py) EVAL_DEFAULTS = { "model_name": "openhands", - "workers": 1, } diff --git a/benchmarks/gaia/config.py b/benchmarks/gaia/config.py index 192114ae..32d15960 100644 --- a/benchmarks/gaia/config.py +++ b/benchmarks/gaia/config.py @@ -12,12 +12,10 @@ "num_workers": 30, "max_iterations": 500, "max_attempts": 3, - "max_retries": 3, "critic": "finish_with_patch", } # Evaluation defaults (used by eval_infer.py) EVAL_DEFAULTS = { "model_name": "openhands", - "workers": 1, } From 26d428b29b00bbca22c7f1ca00a8156f06a23724 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 08:53:21 +0000 Subject: [PATCH 14/33] Make --note optional with no default Remove the default value 'initial' from --note argument. When not specified, no note identifier is appended to the output directory. The construct_eval_output_dir function already handles None/empty values gracefully by not appending the _N_ suffix. 
Co-authored-by: openhands --- benchmarks/utils/args_parser.py | 2 +- benchmarks/utils/evaluation_utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/utils/args_parser.py b/benchmarks/utils/args_parser.py index 28c9444a..6e6485e2 100644 --- a/benchmarks/utils/args_parser.py +++ b/benchmarks/utils/args_parser.py @@ -45,7 +45,7 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: ) parser.add_argument("--max-iterations", type=int, help="Maximum iterations") parser.add_argument("--num-workers", type=int, help="Number of inference workers") - parser.add_argument("--note", type=str, default="initial", help="Evaluation note") + parser.add_argument("--note", type=str, help="Optional evaluation note") parser.add_argument( "--output-dir", type=str, diff --git a/benchmarks/utils/evaluation_utils.py b/benchmarks/utils/evaluation_utils.py index 517b85d3..030457ea 100644 --- a/benchmarks/utils/evaluation_utils.py +++ b/benchmarks/utils/evaluation_utils.py @@ -18,7 +18,7 @@ def construct_eval_output_dir( dataset_name: str, model_name: str, max_iterations: int, - eval_note: str, + eval_note: str | None, ) -> str: """Construct the structured evaluation output directory path.""" # Format: eval_out/-// From e53928f66c45c067a6fdbbf7637a708baf090bbe Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 10:17:42 +0000 Subject: [PATCH 15/33] Use INFER_DEFAULTS for commit0 hardcoded values Replace hardcoded dataset, split, and repo_split values with references to INFER_DEFAULTS in: - commit0/run_infer.py: Commit0Evaluation class __init__ and prepare_instances - commit0/build_images.py: set only the specific defaults needed (dataset, split, repo_split) This ensures all commit0 code uses the centralized config values. 
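A minimal sketch (illustration only) of the fallback pattern used in the diff
below, where the or-fallback covers both None and empty strings:

    INFER_DEFAULTS = {
        "dataset": "wentingzhao/commit0_combined",
        "split": "test",
        "repo_split": "lite",
    }

    def resolve_repo_split(repo_split: str | None = None) -> str:
        return repo_split or INFER_DEFAULTS["repo_split"]

    assert resolve_repo_split() == "lite"
    assert resolve_repo_split("all") == "all"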
Co-authored-by: openhands --- benchmarks/commit0/build_images.py | 10 +++++++--- benchmarks/commit0/run_infer.py | 18 +++++++++--------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/benchmarks/commit0/build_images.py b/benchmarks/commit0/build_images.py index b59704ea..642bce72 100644 --- a/benchmarks/commit0/build_images.py +++ b/benchmarks/commit0/build_images.py @@ -4,7 +4,7 @@ Example: uv run benchmarks/commit0/build_images.py \ - --dataset wentingzhao/commit0_combined --split test --repo-split lite \ + --repo-split lite \ --image ghcr.io/openhands/eval-agent-server --push --max-workers 16 """ @@ -13,6 +13,7 @@ from commit0.harness.constants import SPLIT +from benchmarks.commit0.config import INFER_DEFAULTS from benchmarks.utils.build_utils import ( build_all_images, default_build_output_dir, @@ -90,7 +91,6 @@ def main(argv: list[str]) -> int: parser.add_argument( "--repo-split", type=str, - default="lite", help="Commit0 repo split (lite, all, or repo name)", ) parser.add_argument( @@ -99,7 +99,11 @@ def main(argv: list[str]) -> int: default="", help="Override base image prefix (default: env EVAL_DOCKER_IMAGE_PREFIX)", ) - parser.set_defaults(dataset="wentingzhao/commit0_combined") + parser.set_defaults( + dataset=INFER_DEFAULTS["dataset"], + split=INFER_DEFAULTS["split"], + repo_split=INFER_DEFAULTS["repo_split"], + ) args = parser.parse_args(argv) docker_image_prefix = args.docker_image_prefix or None diff --git a/benchmarks/commit0/run_infer.py b/benchmarks/commit0/run_infer.py index 4cc166d8..7ec21b6e 100644 --- a/benchmarks/commit0/run_infer.py +++ b/benchmarks/commit0/run_infer.py @@ -111,9 +111,9 @@ def __init__( self, metadata: EvalMetadata, num_workers: int = 1, - repo_split: str = "lite", - dataset_name: str = "wentingzhao/commit0_combined", - dataset_split: str = "test", + repo_split: str | None = None, + dataset_name: str | None = None, + dataset_split: str | None = None, ): super().__init__(metadata=metadata, num_workers=num_workers) # Store additional parameters in metadata.details for access in methods @@ -121,9 +121,9 @@ def __init__( metadata.details = {} metadata.details.update( { - "repo_split": repo_split, - "dataset_name": dataset_name, - "dataset_split": dataset_split, + "repo_split": repo_split or INFER_DEFAULTS["repo_split"], + "dataset_name": dataset_name or INFER_DEFAULTS["dataset"], + "dataset_split": dataset_split or INFER_DEFAULTS["split"], } ) @@ -131,9 +131,9 @@ def prepare_instances(self) -> List[EvalInstance]: logger.info("Setting up Commit0 evaluation data") details = self.metadata.details or {} - dataset_name = details.get("dataset_name", "wentingzhao/commit0_combined") - dataset_split = details.get("dataset_split", "test") - repo_split = details.get("repo_split", "lite") + dataset_name = details.get("dataset_name", INFER_DEFAULTS["dataset"]) + dataset_split = details.get("dataset_split", INFER_DEFAULTS["split"]) + repo_split = details.get("repo_split", INFER_DEFAULTS["repo_split"]) dataset = load_dataset(dataset_name, split=dataset_split) df = commit0_setup(dataset, repo_split) From 9fe08b9f3f7a43f2a93ebea93360d29047f140a4 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 10:24:14 +0000 Subject: [PATCH 16/33] Revert commit0/eval_infer.py and remove EVAL_DEFAULTS The commit0 eval_infer.py is a simple JSON processor that doesn't need centralized defaults. Reverted to main version. 
Co-authored-by: openhands --- benchmarks/commit0/config.py | 5 ----- benchmarks/commit0/eval_infer.py | 7 ++----- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/benchmarks/commit0/config.py b/benchmarks/commit0/config.py index f003693d..922feca4 100644 --- a/benchmarks/commit0/config.py +++ b/benchmarks/commit0/config.py @@ -16,8 +16,3 @@ "max_retries": 1, "critic": "finish_with_patch", } - -# Evaluation defaults (used by eval_infer.py) -EVAL_DEFAULTS = { - "model_name": "openhands", -} diff --git a/benchmarks/commit0/eval_infer.py b/benchmarks/commit0/eval_infer.py index 4470b36d..f03e73f6 100644 --- a/benchmarks/commit0/eval_infer.py +++ b/benchmarks/commit0/eval_infer.py @@ -15,7 +15,6 @@ import sys from pathlib import Path -from benchmarks.commit0.config import EVAL_DEFAULTS from benchmarks.utils.laminar import LaminarService from benchmarks.utils.report_costs import generate_cost_report @@ -175,12 +174,10 @@ def main() -> None: parser.add_argument( "--model-name", - help="Model name to use in the model_name_or_path field", + default="openhands", + help="Model name to use in the model_name_or_path field (default: openhands)", ) - # Apply EVAL_DEFAULTS from config - parser.set_defaults(**EVAL_DEFAULTS) - args = parser.parse_args() # Validate input file From 5b77dfa102817b8295097bb85bd14ac7d84611e3 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 10:26:56 +0000 Subject: [PATCH 17/33] Revert gaia/eval_infer.py and remove EVAL_DEFAULTS The gaia eval_infer.py is a simple JSON processor that doesn't need centralized defaults. Reverted to main version. Co-authored-by: openhands --- benchmarks/gaia/config.py | 5 ----- benchmarks/gaia/eval_infer.py | 7 ++----- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/benchmarks/gaia/config.py b/benchmarks/gaia/config.py index 32d15960..4f2ce74c 100644 --- a/benchmarks/gaia/config.py +++ b/benchmarks/gaia/config.py @@ -14,8 +14,3 @@ "max_attempts": 3, "critic": "finish_with_patch", } - -# Evaluation defaults (used by eval_infer.py) -EVAL_DEFAULTS = { - "model_name": "openhands", -} diff --git a/benchmarks/gaia/eval_infer.py b/benchmarks/gaia/eval_infer.py index 715211f3..889d132d 100644 --- a/benchmarks/gaia/eval_infer.py +++ b/benchmarks/gaia/eval_infer.py @@ -18,7 +18,6 @@ import sys from pathlib import Path -from benchmarks.gaia.config import EVAL_DEFAULTS from benchmarks.utils.laminar import LaminarService from benchmarks.utils.report_costs import generate_cost_report from openhands.sdk import get_logger @@ -198,12 +197,10 @@ def main() -> None: parser.add_argument( "--model-name", - help="Model name to use in the model_name_or_path field", + default="openhands", + help="Model name to use in the model_name_or_path field (default: openhands)", ) - # Apply EVAL_DEFAULTS from config - parser.set_defaults(**EVAL_DEFAULTS) - args = parser.parse_args() # Validate input file From b19fb1dc67b150f8a0de6e5811e449587f17f988 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 10:30:23 +0000 Subject: [PATCH 18/33] Use constants.py values in swebench/config.py Import DEFAULT_DATASET, DEFAULT_CLI_MODEL_NAME, DEFAULT_EVAL_WORKERS from constants.py instead of duplicating the values. This ensures constants.py remains the single source of truth for these values. 
Co-authored-by: openhands --- benchmarks/swebench/config.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/benchmarks/swebench/config.py b/benchmarks/swebench/config.py index 4eba91f6..13f0cd2a 100644 --- a/benchmarks/swebench/config.py +++ b/benchmarks/swebench/config.py @@ -4,9 +4,16 @@ Default values aligned with evaluation repository (OpenHands/evaluation). """ +from benchmarks.swebench.constants import ( + DEFAULT_CLI_MODEL_NAME, + DEFAULT_DATASET, + DEFAULT_EVAL_WORKERS, +) + + # Inference defaults (used by run_infer.py) INFER_DEFAULTS = { - "dataset": "princeton-nlp/SWE-bench_Verified", + "dataset": DEFAULT_DATASET, "split": "test", "workspace": "remote", "num_workers": 30, @@ -18,7 +25,7 @@ # Evaluation defaults (used by eval_infer.py) EVAL_DEFAULTS = { - "dataset": "princeton-nlp/SWE-bench_Verified", - "model_name": "openhands", - "workers": 12, + "dataset": DEFAULT_DATASET, + "model_name": DEFAULT_CLI_MODEL_NAME, + "workers": DEFAULT_EVAL_WORKERS, } From 5b68a77f065e24ba630a00d7bbaeab4c37245bc3 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 10:35:48 +0000 Subject: [PATCH 19/33] Move DEFAULT_DATASET, DEFAULT_EVAL_WORKERS, DEFAULT_CLI_MODEL_NAME to config.py Remove these constants from constants.py and update eval_infer.py to use EVAL_DEFAULTS from config.py instead. config.py is now the single source of truth for dataset, model_name, and workers defaults. Co-authored-by: openhands --- benchmarks/swebench/config.py | 15 ++++----------- benchmarks/swebench/constants.py | 13 +++---------- benchmarks/swebench/eval_infer.py | 12 +++++------- 3 files changed, 12 insertions(+), 28 deletions(-) diff --git a/benchmarks/swebench/config.py b/benchmarks/swebench/config.py index 13f0cd2a..4eba91f6 100644 --- a/benchmarks/swebench/config.py +++ b/benchmarks/swebench/config.py @@ -4,16 +4,9 @@ Default values aligned with evaluation repository (OpenHands/evaluation). """ -from benchmarks.swebench.constants import ( - DEFAULT_CLI_MODEL_NAME, - DEFAULT_DATASET, - DEFAULT_EVAL_WORKERS, -) - - # Inference defaults (used by run_infer.py) INFER_DEFAULTS = { - "dataset": DEFAULT_DATASET, + "dataset": "princeton-nlp/SWE-bench_Verified", "split": "test", "workspace": "remote", "num_workers": 30, @@ -25,7 +18,7 @@ # Evaluation defaults (used by eval_infer.py) EVAL_DEFAULTS = { - "dataset": DEFAULT_DATASET, - "model_name": DEFAULT_CLI_MODEL_NAME, - "workers": DEFAULT_EVAL_WORKERS, + "dataset": "princeton-nlp/SWE-bench_Verified", + "model_name": "openhands", + "workers": 12, } diff --git a/benchmarks/swebench/constants.py b/benchmarks/swebench/constants.py index 88d795c8..6cfd4809 100644 --- a/benchmarks/swebench/constants.py +++ b/benchmarks/swebench/constants.py @@ -1,16 +1,13 @@ """ SWE-Bench hyperparameters and constant values. -This module serves as the single source of truth for all constant values -used in the SWE-Bench evaluation workflow. +This module provides constant values used in the SWE-Bench evaluation workflow. +For dataset, model, and worker defaults, see config.py (INFER_DEFAULTS, EVAL_DEFAULTS). 
""" from typing import Final, Literal -# Dataset -DEFAULT_DATASET: Final[str] = "princeton-nlp/SWE-bench_Verified" - # Docker DOCKER_IMAGE_PREFIX: Final[str] = "docker.io/swebench/" DOCKER_IMAGE_TAG: Final[str] = "latest" @@ -28,12 +25,8 @@ DEFAULT_RUNTIME_API_URL: Final[str] = "https://runtime.eval.all-hands.dev" DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT: Final[int] = 600 -# Evaluation -DEFAULT_EVAL_WORKERS: Final[int] = 12 - -# Model - preserving original behavior: function default is "OpenHands", CLI default is "openhands" +# Model - preserving original behavior: function default is "OpenHands" DEFAULT_MODEL_NAME: Final[str] = "OpenHands" -DEFAULT_CLI_MODEL_NAME: Final[str] = "openhands" # Git GIT_USER_EMAIL: Final[str] = "evaluation@openhands.dev" diff --git a/benchmarks/swebench/eval_infer.py b/benchmarks/swebench/eval_infer.py index 8bb7a7eb..eefcc4a4 100644 --- a/benchmarks/swebench/eval_infer.py +++ b/benchmarks/swebench/eval_infer.py @@ -28,7 +28,7 @@ def convert_to_swebench_format( - input_file: str, output_file: str, model_name: str = constants.DEFAULT_MODEL_NAME + input_file: str, output_file: str, model_name: str = EVAL_DEFAULTS["model_name"] ) -> None: """ Convert OpenHands output.jsonl to SWE-Bench prediction format. @@ -117,8 +117,8 @@ def convert_to_swebench_format( def run_swebench_evaluation( predictions_file: str, - dataset: str = constants.DEFAULT_DATASET, - workers: int = constants.DEFAULT_EVAL_WORKERS, + dataset: str = EVAL_DEFAULTS["dataset"], + workers: int = EVAL_DEFAULTS["workers"], ) -> None: """ Run SWE-Bench evaluation on the predictions file. @@ -199,8 +199,7 @@ def main() -> None: parser.add_argument( "--dataset", - default=constants.DEFAULT_DATASET, - help=f"SWE-Bench dataset to evaluate against (default: {constants.DEFAULT_DATASET})", + help="SWE-Bench dataset to evaluate against", ) parser.add_argument( @@ -217,8 +216,7 @@ def main() -> None: parser.add_argument( "--model-name", - default=constants.DEFAULT_CLI_MODEL_NAME, - help=f"Model name to use in the model_name_or_path field (default: {constants.DEFAULT_CLI_MODEL_NAME})", + help="Model name to use in the model_name_or_path field", ) parser.add_argument( From e3b2b2d522eba2740657a07302a86b92040b4b14 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 10:44:44 +0000 Subject: [PATCH 20/33] Keep DEFAULT_CLI_MODEL_NAME in constants.py, remove model_name from EVAL_DEFAULTS model_name is specific to the CLI and should stay in constants.py. EVAL_DEFAULTS now only contains dataset and workers. 
Co-authored-by: openhands --- benchmarks/swebench/config.py | 1 - benchmarks/swebench/constants.py | 3 ++- benchmarks/swebench/eval_infer.py | 4 +++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/benchmarks/swebench/config.py b/benchmarks/swebench/config.py index 4eba91f6..a6ea209a 100644 --- a/benchmarks/swebench/config.py +++ b/benchmarks/swebench/config.py @@ -19,6 +19,5 @@ # Evaluation defaults (used by eval_infer.py) EVAL_DEFAULTS = { "dataset": "princeton-nlp/SWE-bench_Verified", - "model_name": "openhands", "workers": 12, } diff --git a/benchmarks/swebench/constants.py b/benchmarks/swebench/constants.py index 6cfd4809..46ca83ea 100644 --- a/benchmarks/swebench/constants.py +++ b/benchmarks/swebench/constants.py @@ -25,8 +25,9 @@ DEFAULT_RUNTIME_API_URL: Final[str] = "https://runtime.eval.all-hands.dev" DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT: Final[int] = 600 -# Model - preserving original behavior: function default is "OpenHands" +# Model - preserving original behavior: function default is "OpenHands", CLI default is "openhands" DEFAULT_MODEL_NAME: Final[str] = "OpenHands" +DEFAULT_CLI_MODEL_NAME: Final[str] = "openhands" # Git GIT_USER_EMAIL: Final[str] = "evaluation@openhands.dev" diff --git a/benchmarks/swebench/eval_infer.py b/benchmarks/swebench/eval_infer.py index eefcc4a4..0d688f31 100644 --- a/benchmarks/swebench/eval_infer.py +++ b/benchmarks/swebench/eval_infer.py @@ -28,7 +28,9 @@ def convert_to_swebench_format( - input_file: str, output_file: str, model_name: str = EVAL_DEFAULTS["model_name"] + input_file: str, + output_file: str, + model_name: str = constants.DEFAULT_CLI_MODEL_NAME, ) -> None: """ Convert OpenHands output.jsonl to SWE-Bench prediction format. From 24ba5bd6dd297216419d143fa179bdeea8a604d1 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 10:48:39 +0000 Subject: [PATCH 21/33] Remove model_name from swebenchmultimodal and swtbench EVAL_DEFAULTS Revert eval_infer.py files to main and remove model_name from EVAL_DEFAULTS. The model_name is hardcoded in the eval_infer.py files. 
Co-authored-by: openhands --- benchmarks/swebenchmultimodal/config.py | 1 - benchmarks/swebenchmultimodal/eval_infer.py | 15 ++++++++------- benchmarks/swtbench/config.py | 1 - benchmarks/swtbench/eval_infer.py | 10 ++++------ 4 files changed, 12 insertions(+), 15 deletions(-) diff --git a/benchmarks/swebenchmultimodal/config.py b/benchmarks/swebenchmultimodal/config.py index e9affe05..53550855 100644 --- a/benchmarks/swebenchmultimodal/config.py +++ b/benchmarks/swebenchmultimodal/config.py @@ -20,6 +20,5 @@ EVAL_DEFAULTS = { "dataset": "princeton-nlp/SWE-bench_Multimodal", "split": "dev", - "model_name": "openhands", "workers": 12, } diff --git a/benchmarks/swebenchmultimodal/eval_infer.py b/benchmarks/swebenchmultimodal/eval_infer.py index 1e675b7d..0984b3e5 100644 --- a/benchmarks/swebenchmultimodal/eval_infer.py +++ b/benchmarks/swebenchmultimodal/eval_infer.py @@ -16,7 +16,6 @@ from pathlib import Path from typing import Any -from benchmarks.swebenchmultimodal.config import EVAL_DEFAULTS from benchmarks.utils.patch_utils import remove_files_from_patch from benchmarks.utils.report_costs import generate_cost_report from openhands.sdk import get_logger @@ -376,12 +375,15 @@ def main() -> None: parser.add_argument( "--dataset", - help="SWE-Bench dataset to evaluate against", + default="princeton-nlp/SWE-bench_Multimodal", + help="SWE-Bench dataset to evaluate against " + "(default: princeton-nlp/SWE-bench_Multimodal)", ) parser.add_argument( "--split", - help="Dataset split to use", + default="dev", + help="Dataset split to use (default: dev)", ) parser.add_argument( @@ -398,11 +400,13 @@ def main() -> None: parser.add_argument( "--model-name", - help="Model name to use in the model_name_or_path field", + default="openhands", + help="Model name to use in the model_name_or_path field (default: openhands)", ) parser.add_argument( "--workers", + default="12", help="Number of workers to use when evaluating", ) @@ -411,9 +415,6 @@ def main() -> None: help="Run ID for the evaluation (default: eval_)", ) - # Apply EVAL_DEFAULTS from config - parser.set_defaults(**EVAL_DEFAULTS) - args = parser.parse_args() # Validate input file diff --git a/benchmarks/swtbench/config.py b/benchmarks/swtbench/config.py index 9b3e727c..f2473d58 100644 --- a/benchmarks/swtbench/config.py +++ b/benchmarks/swtbench/config.py @@ -20,6 +20,5 @@ # Note: eval uses SWE-bench dataset, not SWT-bench dataset EVAL_DEFAULTS = { "dataset": "princeton-nlp/SWE-bench_Verified", - "model_name": "OpenHands", "workers": 24, } diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index 5fb0cc5c..4f5f0632 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -18,7 +18,6 @@ from pathlib import Path from time import monotonic -from benchmarks.swtbench.config import EVAL_DEFAULTS from benchmarks.swtbench.image_utils import ( compute_required_images, ensure_swt_bench_repo, @@ -238,7 +237,7 @@ def run_swtbench_evaluation( predictions_file: str, # Must use SWE-bench dataset because SWT-bench dataset (which is based on SWE-bench) contains a bug in their harness. dataset: str = "princeton-nlp/SWE-bench_Verified", - workers: str = "24", + workers: str = "12", ) -> None: """ Run SWT-Bench evaluation on the predictions file. 
@@ -379,17 +378,16 @@ def main() -> None: parser.add_argument( "--model-name", - help="Model name to use in the model_name_or_path field", + default="OpenHands", + help="Model name to use in the model_name_or_path field (default: OpenHands)", ) parser.add_argument( "--workers", + default="12", help="Number of workers to use when evaluating", ) - # Apply EVAL_DEFAULTS from config - parser.set_defaults(**EVAL_DEFAULTS) - args = parser.parse_args() # Validate input file From 19b214f5474a18f7c99fdf459eca37ec31397aba Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 10:53:25 +0000 Subject: [PATCH 22/33] Use EVAL_DEFAULTS for dataset, split, workers in swebenchmultimodal and swtbench eval_infer Import EVAL_DEFAULTS and use parser.set_defaults() to apply them. model_name remains hardcoded in the argument parser. Co-authored-by: openhands --- benchmarks/swebenchmultimodal/eval_infer.py | 12 ++++++------ benchmarks/swtbench/eval_infer.py | 9 +++++---- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/benchmarks/swebenchmultimodal/eval_infer.py b/benchmarks/swebenchmultimodal/eval_infer.py index 0984b3e5..b65b0c66 100644 --- a/benchmarks/swebenchmultimodal/eval_infer.py +++ b/benchmarks/swebenchmultimodal/eval_infer.py @@ -16,6 +16,7 @@ from pathlib import Path from typing import Any +from benchmarks.swebenchmultimodal.config import EVAL_DEFAULTS from benchmarks.utils.patch_utils import remove_files_from_patch from benchmarks.utils.report_costs import generate_cost_report from openhands.sdk import get_logger @@ -375,15 +376,12 @@ def main() -> None: parser.add_argument( "--dataset", - default="princeton-nlp/SWE-bench_Multimodal", - help="SWE-Bench dataset to evaluate against " - "(default: princeton-nlp/SWE-bench_Multimodal)", + help="SWE-Bench dataset to evaluate against", ) parser.add_argument( "--split", - default="dev", - help="Dataset split to use (default: dev)", + help="Dataset split to use", ) parser.add_argument( @@ -406,10 +404,12 @@ def main() -> None: parser.add_argument( "--workers", - default="12", + type=int, help="Number of workers to use when evaluating", ) + parser.set_defaults(**EVAL_DEFAULTS) + parser.add_argument( "--run-id", help="Run ID for the evaluation (default: eval_)", diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index 4f5f0632..c21bc0cd 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -18,6 +18,7 @@ from pathlib import Path from time import monotonic +from benchmarks.swtbench.config import EVAL_DEFAULTS from benchmarks.swtbench.image_utils import ( compute_required_images, ensure_swt_bench_repo, @@ -359,9 +360,7 @@ def main() -> None: # Must use SWE-bench dataset because SWT-bench dataset (which is based on SWE-bench) contains a bug in their harness. 
parser.add_argument( "--dataset", - default="princeton-nlp/SWE-bench_Verified", - help="SWT-Bench dataset to evaluate against " - "(default: princeton-nlp/SWE-bench_Verified)", + help="SWT-Bench dataset to evaluate against", ) parser.add_argument( @@ -384,10 +383,12 @@ def main() -> None: parser.add_argument( "--workers", - default="12", + type=int, help="Number of workers to use when evaluating", ) + parser.set_defaults(**EVAL_DEFAULTS) + args = parser.parse_args() # Validate input file From c11887c6fb58997ebba903157c0c23152a167f77 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 10:58:25 +0000 Subject: [PATCH 23/33] Use INFER_DEFAULTS for dataset/split in swtbench image_utils and build_eval_env_images Update image_utils.py, build_eval_env_images.py, and eval_infer.py to import and use INFER_DEFAULTS instead of hardcoding dataset and split values. Co-authored-by: openhands --- benchmarks/swtbench/build_eval_env_images.py | 9 +++++++-- benchmarks/swtbench/eval_infer.py | 4 ++-- benchmarks/swtbench/image_utils.py | 9 +++++++-- 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/benchmarks/swtbench/build_eval_env_images.py b/benchmarks/swtbench/build_eval_env_images.py index 079ad66c..1c6e0820 100644 --- a/benchmarks/swtbench/build_eval_env_images.py +++ b/benchmarks/swtbench/build_eval_env_images.py @@ -9,6 +9,7 @@ import docker +from benchmarks.swtbench.config import INFER_DEFAULTS from benchmarks.swtbench.image_utils import ensure_swt_bench_repo from benchmarks.utils.dataset import get_dataset from benchmarks.utils.image_utils import image_exists as remote_image_exists @@ -257,8 +258,12 @@ def main() -> None: parser = argparse.ArgumentParser( description="Build and push prebaked SWT-bench eval env images." ) - parser.add_argument("--dataset", required=True, help="Dataset name") - parser.add_argument("--split", default="test", help="Dataset split") + parser.add_argument("--dataset", help="Dataset name") + parser.add_argument("--split", help="Dataset split") + parser.set_defaults( + dataset=INFER_DEFAULTS["dataset"], + split=INFER_DEFAULTS["split"], + ) parser.add_argument( "--eval-limit", type=int, diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index c21bc0cd..1464cc01 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -18,7 +18,7 @@ from pathlib import Path from time import monotonic -from benchmarks.swtbench.config import EVAL_DEFAULTS +from benchmarks.swtbench.config import EVAL_DEFAULTS, INFER_DEFAULTS from benchmarks.swtbench.image_utils import ( compute_required_images, ensure_swt_bench_repo, @@ -68,7 +68,7 @@ def _load_prediction_instance_ids(predictions_file: Path) -> list[str]: def try_pull_prebaked_images( predictions_file: Path, dataset: str, - split: str = "test", + split: str = INFER_DEFAULTS["split"], registry: str = PREBAKED_REGISTRY, ) -> None: """ diff --git a/benchmarks/swtbench/image_utils.py b/benchmarks/swtbench/image_utils.py index e7aae1f4..1459ee13 100644 --- a/benchmarks/swtbench/image_utils.py +++ b/benchmarks/swtbench/image_utils.py @@ -7,6 +7,7 @@ from pathlib import Path from typing import Iterable +from benchmarks.swtbench.config import INFER_DEFAULTS from openhands.sdk import get_logger @@ -130,8 +131,12 @@ def main() -> None: description="List SWT-bench base/env images required for a predictions file." 
) parser.add_argument("output_jsonl", type=Path, help="Path to output.jsonl") - parser.add_argument("--dataset", required=True, help="Dataset name") - parser.add_argument("--split", default="test", help="Dataset split") + parser.add_argument("--dataset", help="Dataset name") + parser.add_argument("--split", help="Dataset split") + parser.set_defaults( + dataset=INFER_DEFAULTS["dataset"], + split=INFER_DEFAULTS["split"], + ) parser.add_argument( "--format", choices=["plain", "json"], From 0b229371af40503711b06799916b77dc55580ea6 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 11:04:30 +0000 Subject: [PATCH 24/33] Fix swtbench: use EVAL_DEFAULTS for eval-related files, add split to EVAL_DEFAULTS image_utils.py and build_eval_env_images.py are used for evaluation, so they should use EVAL_DEFAULTS (princeton-nlp/SWE-bench_Verified) not INFER_DEFAULTS (eth-sri/SWT-bench_Verified_bm25_27k_zsp). Added split='test' to EVAL_DEFAULTS to match values.yaml. Co-authored-by: openhands --- benchmarks/swtbench/build_eval_env_images.py | 6 +++--- benchmarks/swtbench/config.py | 1 + benchmarks/swtbench/eval_infer.py | 4 ++-- benchmarks/swtbench/image_utils.py | 7 ++----- 4 files changed, 8 insertions(+), 10 deletions(-) diff --git a/benchmarks/swtbench/build_eval_env_images.py b/benchmarks/swtbench/build_eval_env_images.py index 1c6e0820..fde30ed9 100644 --- a/benchmarks/swtbench/build_eval_env_images.py +++ b/benchmarks/swtbench/build_eval_env_images.py @@ -9,7 +9,7 @@ import docker -from benchmarks.swtbench.config import INFER_DEFAULTS +from benchmarks.swtbench.config import EVAL_DEFAULTS from benchmarks.swtbench.image_utils import ensure_swt_bench_repo from benchmarks.utils.dataset import get_dataset from benchmarks.utils.image_utils import image_exists as remote_image_exists @@ -261,8 +261,8 @@ def main() -> None: parser.add_argument("--dataset", help="Dataset name") parser.add_argument("--split", help="Dataset split") parser.set_defaults( - dataset=INFER_DEFAULTS["dataset"], - split=INFER_DEFAULTS["split"], + dataset=EVAL_DEFAULTS["dataset"], + split=EVAL_DEFAULTS["split"], ) parser.add_argument( "--eval-limit", diff --git a/benchmarks/swtbench/config.py b/benchmarks/swtbench/config.py index f2473d58..a9f3276a 100644 --- a/benchmarks/swtbench/config.py +++ b/benchmarks/swtbench/config.py @@ -20,5 +20,6 @@ # Note: eval uses SWE-bench dataset, not SWT-bench dataset EVAL_DEFAULTS = { "dataset": "princeton-nlp/SWE-bench_Verified", + "split": "test", "workers": 24, } diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index 1464cc01..8c37775f 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -18,7 +18,7 @@ from pathlib import Path from time import monotonic -from benchmarks.swtbench.config import EVAL_DEFAULTS, INFER_DEFAULTS +from benchmarks.swtbench.config import EVAL_DEFAULTS from benchmarks.swtbench.image_utils import ( compute_required_images, ensure_swt_bench_repo, @@ -68,7 +68,7 @@ def _load_prediction_instance_ids(predictions_file: Path) -> list[str]: def try_pull_prebaked_images( predictions_file: Path, dataset: str, - split: str = INFER_DEFAULTS["split"], + split: str = EVAL_DEFAULTS["split"], registry: str = PREBAKED_REGISTRY, ) -> None: """ diff --git a/benchmarks/swtbench/image_utils.py b/benchmarks/swtbench/image_utils.py index 1459ee13..c5d34035 100644 --- a/benchmarks/swtbench/image_utils.py +++ b/benchmarks/swtbench/image_utils.py @@ -7,7 +7,7 @@ from pathlib import Path from typing import Iterable -from 
benchmarks.swtbench.config import INFER_DEFAULTS +from benchmarks.swtbench.config import EVAL_DEFAULTS from openhands.sdk import get_logger @@ -133,10 +133,7 @@ def main() -> None: parser.add_argument("output_jsonl", type=Path, help="Path to output.jsonl") parser.add_argument("--dataset", help="Dataset name") parser.add_argument("--split", help="Dataset split") - parser.set_defaults( - dataset=INFER_DEFAULTS["dataset"], - split=INFER_DEFAULTS["split"], - ) + parser.set_defaults(**EVAL_DEFAULTS) parser.add_argument( "--format", choices=["plain", "json"], From 98bc7b4f354a4013e7e67f86c55d4a113da4d914 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 11:07:02 +0000 Subject: [PATCH 25/33] Revert AGENTS.md and fix commit0/build_images.py docstring Revert AGENTS.md to main version. Restore original docstring example in build_images.py. Co-authored-by: openhands --- AGENTS.md | 20 -------------------- benchmarks/commit0/build_images.py | 2 +- 2 files changed, 1 insertion(+), 21 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index dae512f9..0206a51d 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -83,26 +83,6 @@ make build # Rebuild environment 4. Register CLI entrypoint in `pyproject.toml` under `[project.scripts]` 5. Update README.md with usage instructions -# Default Values Alignment -Default values in `benchmarks/utils/args_parser.py` are aligned with the evaluation -repository (OpenHands/evaluation) `eval-job/values.yaml`. This ensures consistency -between local development and production runs. - -**Shared defaults in args_parser.py:** -- `--workspace`: "remote" (production uses remote workspaces) -- `--max-iterations`: 500 (sufficient for complex tasks) -- `--critic`: "finish_with_patch" (ensures agent produces valid patches) -- `--max-attempts`: 3 (allows retries on critic failures) -- `--max-retries`: 3 (handles transient errors) - -**Benchmark-specific overrides:** Use `parser.set_defaults()` in each benchmark's -`run_infer.py` before calling `parse_args()`: -- `gaia`: dataset="gaia-benchmark/GAIA" -- `swebench`: dataset="princeton-nlp/SWE-bench_Verified" (default) -- `swtbench`: dataset="eth-sri/SWT-bench_Verified_bm25_27k_zsp" -- `commit0`: dataset="wentingzhao/commit0_combined", max_attempts=1, max_retries=1 -- `swebenchmultimodal`: dataset="princeton-nlp/SWE-bench_Multimodal", split="dev" - # LLM Configuration LLM configs use JSON matching the [LLM class schema](https://github.com/OpenHands/software-agent-sdk/blob/main/openhands/sdk/llm/llm.py#L93): ```json diff --git a/benchmarks/commit0/build_images.py b/benchmarks/commit0/build_images.py index 642bce72..8b891d85 100644 --- a/benchmarks/commit0/build_images.py +++ b/benchmarks/commit0/build_images.py @@ -4,7 +4,7 @@ Example: uv run benchmarks/commit0/build_images.py \ - --repo-split lite \ + --dataset wentingzhao/commit0_combined --split test --repo-split lite \ --image ghcr.io/openhands/eval-agent-server --push --max-workers 16 """ From a6507ed723bf14228f59a6fdd466789a0080f3b3 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 11:10:27 +0000 Subject: [PATCH 26/33] Move workspace default to args_parser.py, remove from INFER_DEFAULTS Set workspace default='remote' in args_parser.py since it's the same for all benchmarks. Remove workspace from all INFER_DEFAULTS in config.py files. 
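As a quick illustration of why this is safe, here is a minimal, hypothetical argparse sketch (illustrative only, not repository code; only the --workspace name and its "remote" default mirror the diff below) of the precedence chain these changes rely on: an argument-level default applies when nothing else is given, parser.set_defaults() overrides it, and an explicit command-line flag overrides both.

```python
# Minimal sketch of argparse default precedence:
# add_argument(default=...) < parser.set_defaults() < explicit CLI flag.
import argparse


def get_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser()
    # Shared default, analogous to --workspace in args_parser.py.
    parser.add_argument(
        "--workspace",
        choices=["docker", "remote"],
        default="remote",
        help="Type of workspace to use (default: remote)",
    )
    return parser


if __name__ == "__main__":
    # Argument-level default applies when nothing else is set.
    print(get_parser().parse_args([]).workspace)  # -> remote

    # A benchmark-level set_defaults() overrides the argument-level default...
    parser = get_parser()
    parser.set_defaults(workspace="docker")
    print(parser.parse_args([]).workspace)  # -> docker

    # ...and an explicit command-line value overrides both.
    print(parser.parse_args(["--workspace", "remote"]).workspace)  # -> remote
```

So a benchmark that genuinely needs a different workspace can still override it via set_defaults() or on the command line, while benchmarks that match the shared default no longer need an entry at all.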
Co-authored-by: openhands --- benchmarks/commit0/config.py | 1 - benchmarks/gaia/config.py | 1 - benchmarks/swebench/config.py | 1 - benchmarks/swebenchmultimodal/config.py | 1 - benchmarks/swtbench/config.py | 1 - benchmarks/utils/args_parser.py | 3 ++- 6 files changed, 2 insertions(+), 6 deletions(-) diff --git a/benchmarks/commit0/config.py b/benchmarks/commit0/config.py index 922feca4..fb3c10d9 100644 --- a/benchmarks/commit0/config.py +++ b/benchmarks/commit0/config.py @@ -9,7 +9,6 @@ "dataset": "wentingzhao/commit0_combined", "split": "test", "repo_split": "lite", - "workspace": "remote", "num_workers": 8, "max_iterations": 500, "max_attempts": 1, diff --git a/benchmarks/gaia/config.py b/benchmarks/gaia/config.py index 4f2ce74c..d4d529c4 100644 --- a/benchmarks/gaia/config.py +++ b/benchmarks/gaia/config.py @@ -8,7 +8,6 @@ INFER_DEFAULTS = { "dataset": "gaia-benchmark/GAIA", "split": "validation", - "workspace": "remote", "num_workers": 30, "max_iterations": 500, "max_attempts": 3, diff --git a/benchmarks/swebench/config.py b/benchmarks/swebench/config.py index a6ea209a..c265963c 100644 --- a/benchmarks/swebench/config.py +++ b/benchmarks/swebench/config.py @@ -8,7 +8,6 @@ INFER_DEFAULTS = { "dataset": "princeton-nlp/SWE-bench_Verified", "split": "test", - "workspace": "remote", "num_workers": 30, "max_iterations": 500, "max_attempts": 3, diff --git a/benchmarks/swebenchmultimodal/config.py b/benchmarks/swebenchmultimodal/config.py index 53550855..00db964f 100644 --- a/benchmarks/swebenchmultimodal/config.py +++ b/benchmarks/swebenchmultimodal/config.py @@ -8,7 +8,6 @@ INFER_DEFAULTS = { "dataset": "princeton-nlp/SWE-bench_Multimodal", "split": "dev", - "workspace": "remote", "num_workers": 30, "max_iterations": 500, "max_attempts": 3, diff --git a/benchmarks/swtbench/config.py b/benchmarks/swtbench/config.py index a9f3276a..73a957dd 100644 --- a/benchmarks/swtbench/config.py +++ b/benchmarks/swtbench/config.py @@ -8,7 +8,6 @@ INFER_DEFAULTS = { "dataset": "eth-sri/SWT-bench_Verified_bm25_27k_zsp", "split": "test", - "workspace": "remote", "num_workers": 30, "max_iterations": 500, "max_attempts": 3, diff --git a/benchmarks/utils/args_parser.py b/benchmarks/utils/args_parser.py index 6e6485e2..70803949 100644 --- a/benchmarks/utils/args_parser.py +++ b/benchmarks/utils/args_parser.py @@ -40,8 +40,9 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: parser.add_argument( "--workspace", type=str, + default="remote", choices=["docker", "remote"], - help="Type of workspace to use", + help="Type of workspace to use (default: remote)", ) parser.add_argument("--max-iterations", type=int, help="Maximum iterations") parser.add_argument("--num-workers", type=int, help="Number of inference workers") From ebcdec14e5d1de3cc04dfaafaccd4d9f3eec7b03 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 11:15:31 +0000 Subject: [PATCH 27/33] Move max_iterations default to args_parser.py, remove from INFER_DEFAULTS Set max_iterations default=500 in args_parser.py since it's the same for all benchmarks. Remove max_iterations from all INFER_DEFAULTS in config.py files. 
Co-authored-by: openhands --- benchmarks/commit0/config.py | 1 - benchmarks/gaia/config.py | 1 - benchmarks/swebench/config.py | 1 - benchmarks/swebenchmultimodal/config.py | 1 - benchmarks/swtbench/config.py | 1 - benchmarks/utils/args_parser.py | 7 ++++++- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmarks/commit0/config.py b/benchmarks/commit0/config.py index fb3c10d9..2bf77c9d 100644 --- a/benchmarks/commit0/config.py +++ b/benchmarks/commit0/config.py @@ -10,7 +10,6 @@ "split": "test", "repo_split": "lite", "num_workers": 8, - "max_iterations": 500, "max_attempts": 1, "max_retries": 1, "critic": "finish_with_patch", diff --git a/benchmarks/gaia/config.py b/benchmarks/gaia/config.py index d4d529c4..50a473f5 100644 --- a/benchmarks/gaia/config.py +++ b/benchmarks/gaia/config.py @@ -9,7 +9,6 @@ "dataset": "gaia-benchmark/GAIA", "split": "validation", "num_workers": 30, - "max_iterations": 500, "max_attempts": 3, "critic": "finish_with_patch", } diff --git a/benchmarks/swebench/config.py b/benchmarks/swebench/config.py index c265963c..4b24e297 100644 --- a/benchmarks/swebench/config.py +++ b/benchmarks/swebench/config.py @@ -9,7 +9,6 @@ "dataset": "princeton-nlp/SWE-bench_Verified", "split": "test", "num_workers": 30, - "max_iterations": 500, "max_attempts": 3, "max_retries": 3, "critic": "finish_with_patch", diff --git a/benchmarks/swebenchmultimodal/config.py b/benchmarks/swebenchmultimodal/config.py index 00db964f..de11a727 100644 --- a/benchmarks/swebenchmultimodal/config.py +++ b/benchmarks/swebenchmultimodal/config.py @@ -9,7 +9,6 @@ "dataset": "princeton-nlp/SWE-bench_Multimodal", "split": "dev", "num_workers": 30, - "max_iterations": 500, "max_attempts": 3, "max_retries": 3, "critic": "finish_with_patch", diff --git a/benchmarks/swtbench/config.py b/benchmarks/swtbench/config.py index 73a957dd..e41ee0f3 100644 --- a/benchmarks/swtbench/config.py +++ b/benchmarks/swtbench/config.py @@ -9,7 +9,6 @@ "dataset": "eth-sri/SWT-bench_Verified_bm25_27k_zsp", "split": "test", "num_workers": 30, - "max_iterations": 500, "max_attempts": 3, "max_retries": 3, "critic": "finish_with_patch", diff --git a/benchmarks/utils/args_parser.py b/benchmarks/utils/args_parser.py index 70803949..d554dfeb 100644 --- a/benchmarks/utils/args_parser.py +++ b/benchmarks/utils/args_parser.py @@ -44,7 +44,12 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: choices=["docker", "remote"], help="Type of workspace to use (default: remote)", ) - parser.add_argument("--max-iterations", type=int, help="Maximum iterations") + parser.add_argument( + "--max-iterations", + type=int, + default=500, + help="Maximum iterations (default: 500)", + ) parser.add_argument("--num-workers", type=int, help="Number of inference workers") parser.add_argument("--note", type=str, help="Optional evaluation note") parser.add_argument( From 2443e7de354a38a18764d0b240ca3617b6150318 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 11:18:16 +0000 Subject: [PATCH 28/33] Move critic default to critics.py, remove from INFER_DEFAULTS Set critic default='finish_with_patch' in critics.py since it's the same for all benchmarks. Remove critic from all INFER_DEFAULTS in config.py files. 
Co-authored-by: openhands --- benchmarks/commit0/config.py | 1 - benchmarks/gaia/config.py | 1 - benchmarks/swebench/config.py | 1 - benchmarks/swebenchmultimodal/config.py | 1 - benchmarks/swtbench/config.py | 1 - benchmarks/utils/critics.py | 3 ++- 6 files changed, 2 insertions(+), 6 deletions(-) diff --git a/benchmarks/commit0/config.py b/benchmarks/commit0/config.py index 2bf77c9d..ee83c646 100644 --- a/benchmarks/commit0/config.py +++ b/benchmarks/commit0/config.py @@ -12,5 +12,4 @@ "num_workers": 8, "max_attempts": 1, "max_retries": 1, - "critic": "finish_with_patch", } diff --git a/benchmarks/gaia/config.py b/benchmarks/gaia/config.py index 50a473f5..6208d844 100644 --- a/benchmarks/gaia/config.py +++ b/benchmarks/gaia/config.py @@ -10,5 +10,4 @@ "split": "validation", "num_workers": 30, "max_attempts": 3, - "critic": "finish_with_patch", } diff --git a/benchmarks/swebench/config.py b/benchmarks/swebench/config.py index 4b24e297..c882b7b5 100644 --- a/benchmarks/swebench/config.py +++ b/benchmarks/swebench/config.py @@ -11,7 +11,6 @@ "num_workers": 30, "max_attempts": 3, "max_retries": 3, - "critic": "finish_with_patch", } # Evaluation defaults (used by eval_infer.py) diff --git a/benchmarks/swebenchmultimodal/config.py b/benchmarks/swebenchmultimodal/config.py index de11a727..53be4375 100644 --- a/benchmarks/swebenchmultimodal/config.py +++ b/benchmarks/swebenchmultimodal/config.py @@ -11,7 +11,6 @@ "num_workers": 30, "max_attempts": 3, "max_retries": 3, - "critic": "finish_with_patch", } # Evaluation defaults (used by eval_infer.py) diff --git a/benchmarks/swtbench/config.py b/benchmarks/swtbench/config.py index e41ee0f3..1d87073c 100644 --- a/benchmarks/swtbench/config.py +++ b/benchmarks/swtbench/config.py @@ -11,7 +11,6 @@ "num_workers": 30, "max_attempts": 3, "max_retries": 3, - "critic": "finish_with_patch", } # Evaluation defaults (used by eval_infer.py) diff --git a/benchmarks/utils/critics.py b/benchmarks/utils/critics.py index b2978294..fa9f9d92 100644 --- a/benchmarks/utils/critics.py +++ b/benchmarks/utils/critics.py @@ -37,8 +37,9 @@ def add_critic_args(parser: ArgumentParser) -> None: parser.add_argument( "--critic", type=str, + default="finish_with_patch", help=( - "Name of the critic to use for evaluation. " + "Name of the critic to use for evaluation (default: finish_with_patch). " "Critics determine whether an agent's output is considered successful " "and whether another attempt should be made in iterative evaluation mode. " "Available critics: " From 231886635f3c584f19801b92f09a353aed9006eb Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 11:22:49 +0000 Subject: [PATCH 29/33] Revert constants.py, hardcode output-dir default in args_parser.py Revert benchmarks/utils/constants.py to main version. Hardcode './eval_outputs' as default for --output-dir in args_parser.py. 
Co-authored-by: openhands --- benchmarks/utils/args_parser.py | 3 +-- benchmarks/utils/constants.py | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/benchmarks/utils/args_parser.py b/benchmarks/utils/args_parser.py index d554dfeb..5e3e14af 100644 --- a/benchmarks/utils/args_parser.py +++ b/benchmarks/utils/args_parser.py @@ -8,7 +8,6 @@ import argparse -from benchmarks.utils.constants import OUTPUT_DIR from benchmarks.utils.critics import add_critic_args @@ -55,7 +54,7 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: parser.add_argument( "--output-dir", type=str, - default=OUTPUT_DIR, + default="./eval_outputs", help="Evaluation output directory", ) parser.add_argument( diff --git a/benchmarks/utils/constants.py b/benchmarks/utils/constants.py index e7f4f42b..9337b847 100644 --- a/benchmarks/utils/constants.py +++ b/benchmarks/utils/constants.py @@ -1,3 +1,2 @@ OUTPUT_FILENAME = "output.jsonl" -OUTPUT_DIR = "./eval_outputs" EVAL_AGENT_SERVER_IMAGE = "ghcr.io/openhands/eval-agent-server" From b8ad269c3dd28f0e1873e331f63ef3a7bf9f57ee Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 11:38:03 +0000 Subject: [PATCH 30/33] Add default level='2023_all' for GAIA benchmark Add level to GAIA INFER_DEFAULTS matching production configuration. Make --level argument optional since it now has a default. Co-authored-by: openhands --- benchmarks/gaia/config.py | 1 + benchmarks/gaia/run_infer.py | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/gaia/config.py b/benchmarks/gaia/config.py index 6208d844..49fad361 100644 --- a/benchmarks/gaia/config.py +++ b/benchmarks/gaia/config.py @@ -8,6 +8,7 @@ INFER_DEFAULTS = { "dataset": "gaia-benchmark/GAIA", "split": "validation", + "level": "2023_all", "num_workers": 30, "max_attempts": 3, } diff --git a/benchmarks/gaia/run_infer.py b/benchmarks/gaia/run_infer.py index 78e65581..7198ea63 100644 --- a/benchmarks/gaia/run_infer.py +++ b/benchmarks/gaia/run_infer.py @@ -549,8 +549,7 @@ def main() -> None: parser.add_argument( "--level", type=str, - required=True, - help="GAIA level to evaluate (e.g., 2023_level1, 2023_level2, 2023_level3)", + help="GAIA level to evaluate (e.g., 2023_level1, 2023_level2, 2023_level3, 2023_all)", ) parser.set_defaults(**INFER_DEFAULTS) args = parser.parse_args() From 19be07fc9b60189fe04192a39871a3099a8f9c39 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 12:07:32 +0000 Subject: [PATCH 31/33] Simplify max_attempts and max_retries defaults - Keep default=3 for max_attempts and max_retries in args_parser.py - Remove redundant max_attempts=3 and max_retries=3 from config.py files (gaia, swebench, swebenchmultimodal, swtbench) since they match the default - Keep max_attempts=1 and max_retries=1 in commit0/config.py since it differs from the default - Remove max_retries from commit0/build_images.py set_defaults (uses global default) Co-authored-by: openhands --- benchmarks/commit0/build_images.py | 3 +- benchmarks/commit0/config.py | 6 +++ benchmarks/gaia/config.py | 6 ++- benchmarks/swebench/build_images.py | 2 + benchmarks/swebench/config.py | 7 +++- benchmarks/swebenchmultimodal/build_images.py | 2 + benchmarks/swebenchmultimodal/config.py | 7 +++- benchmarks/swtbench/build_images.py | 40 +++++++++++++++++-- benchmarks/swtbench/config.py | 7 +++- benchmarks/utils/args_parser.py | 6 ++- 10 files changed, 72 insertions(+), 14 deletions(-) diff --git a/benchmarks/commit0/build_images.py b/benchmarks/commit0/build_images.py index 
8b891d85..3f24567e 100644 --- a/benchmarks/commit0/build_images.py +++ b/benchmarks/commit0/build_images.py @@ -13,7 +13,7 @@ from commit0.harness.constants import SPLIT -from benchmarks.commit0.config import INFER_DEFAULTS +from benchmarks.commit0.config import BUILD_DEFAULTS, INFER_DEFAULTS from benchmarks.utils.build_utils import ( build_all_images, default_build_output_dir, @@ -103,6 +103,7 @@ def main(argv: list[str]) -> int: dataset=INFER_DEFAULTS["dataset"], split=INFER_DEFAULTS["split"], repo_split=INFER_DEFAULTS["repo_split"], + **BUILD_DEFAULTS, ) args = parser.parse_args(argv) diff --git a/benchmarks/commit0/config.py b/benchmarks/commit0/config.py index ee83c646..f0b1decb 100644 --- a/benchmarks/commit0/config.py +++ b/benchmarks/commit0/config.py @@ -5,6 +5,7 @@ """ # Inference defaults (used by run_infer.py) +# Note: commit0 uses max_attempts=1 and max_retries=1 (different from default of 3) INFER_DEFAULTS = { "dataset": "wentingzhao/commit0_combined", "split": "test", @@ -13,3 +14,8 @@ "max_attempts": 1, "max_retries": 1, } + +# Build defaults (used by build_images.py) +BUILD_DEFAULTS = { + "max_workers": 16, +} diff --git a/benchmarks/gaia/config.py b/benchmarks/gaia/config.py index 49fad361..dadaa20a 100644 --- a/benchmarks/gaia/config.py +++ b/benchmarks/gaia/config.py @@ -10,5 +10,9 @@ "split": "validation", "level": "2023_all", "num_workers": 30, - "max_attempts": 3, +} + +# Build defaults (used by build_images.py) +BUILD_DEFAULTS = { + "max_workers": 1, } diff --git a/benchmarks/swebench/build_images.py b/benchmarks/swebench/build_images.py index 2041ed58..cae96b87 100644 --- a/benchmarks/swebench/build_images.py +++ b/benchmarks/swebench/build_images.py @@ -13,6 +13,7 @@ from pathlib import Path from benchmarks.swebench import constants +from benchmarks.swebench.config import BUILD_DEFAULTS from benchmarks.utils.build_utils import ( BuildOutput, build_all_images, @@ -158,6 +159,7 @@ def _wrap_if_needed(result: BuildOutput, push: bool) -> BuildOutput: def main(argv: list[str]) -> int: parser = get_build_parser() + parser.set_defaults(**BUILD_DEFAULTS) args = parser.parse_args(argv) base_images: list[str] = collect_unique_base_images( diff --git a/benchmarks/swebench/config.py b/benchmarks/swebench/config.py index c882b7b5..cb3059e5 100644 --- a/benchmarks/swebench/config.py +++ b/benchmarks/swebench/config.py @@ -9,8 +9,6 @@ "dataset": "princeton-nlp/SWE-bench_Verified", "split": "test", "num_workers": 30, - "max_attempts": 3, - "max_retries": 3, } # Evaluation defaults (used by eval_infer.py) @@ -18,3 +16,8 @@ "dataset": "princeton-nlp/SWE-bench_Verified", "workers": 12, } + +# Build defaults (used by build_images.py) +BUILD_DEFAULTS = { + "max_workers": 32, +} diff --git a/benchmarks/swebenchmultimodal/build_images.py b/benchmarks/swebenchmultimodal/build_images.py index d32b5dc6..987cf7bd 100644 --- a/benchmarks/swebenchmultimodal/build_images.py +++ b/benchmarks/swebenchmultimodal/build_images.py @@ -10,6 +10,7 @@ import sys +from benchmarks.swebenchmultimodal.config import BUILD_DEFAULTS from benchmarks.utils.build_utils import ( build_all_images, default_build_output_dir, @@ -68,6 +69,7 @@ def collect_unique_base_images(dataset, split, n_limit): def main(argv: list[str]) -> int: parser = get_build_parser() + parser.set_defaults(**BUILD_DEFAULTS) args = parser.parse_args(argv) base_images: list[str] = collect_unique_base_images( diff --git a/benchmarks/swebenchmultimodal/config.py b/benchmarks/swebenchmultimodal/config.py index 53be4375..a0bcb772 100644 --- 
a/benchmarks/swebenchmultimodal/config.py +++ b/benchmarks/swebenchmultimodal/config.py @@ -9,8 +9,6 @@ "dataset": "princeton-nlp/SWE-bench_Multimodal", "split": "dev", "num_workers": 30, - "max_attempts": 3, - "max_retries": 3, } # Evaluation defaults (used by eval_infer.py) @@ -19,3 +17,8 @@ "split": "dev", "workers": 12, } + +# Build defaults (used by build_images.py) +BUILD_DEFAULTS = { + "max_workers": 32, +} diff --git a/benchmarks/swtbench/build_images.py b/benchmarks/swtbench/build_images.py index 09db613d..3fcd2d8d 100644 --- a/benchmarks/swtbench/build_images.py +++ b/benchmarks/swtbench/build_images.py @@ -5,18 +5,50 @@ SWT-Bench uses the same base environment images and build flow as SWE-Bench. This module simply forwards to the SWE-Bench build logic to avoid duplication while keeping the SWT entrypoint stable for workflows. + +Note: SWT-bench uses max_workers=16 (vs SWE-bench's 32) via BUILD_DEFAULTS. """ import sys from benchmarks.swebench.build_images import ( - main as swebench_main, + _wrap_if_needed, + collect_unique_base_images, + extract_custom_tag, +) +from benchmarks.swtbench.config import BUILD_DEFAULTS +from benchmarks.utils.build_utils import ( + build_all_images, + default_build_output_dir, + get_build_parser, ) -# Re-export the SWE-Bench logic under the SWT entrypoint -def main(argv: list[str]) -> int: # pragma: no cover - thin wrapper - return swebench_main(argv) +def main(argv: list[str]) -> int: + parser = get_build_parser() + parser.set_defaults(**BUILD_DEFAULTS) + args = parser.parse_args(argv) + + base_images: list[str] = collect_unique_base_images( + args.dataset, + args.split, + args.n_limit, + args.select, + ) + build_dir = default_build_output_dir(args.dataset, args.split) + + return build_all_images( + base_images=base_images, + target=args.target, + build_dir=build_dir, + image=args.image, + push=args.push, + max_workers=args.max_workers, + dry_run=args.dry_run, + max_retries=args.max_retries, + base_image_to_custom_tag_fn=extract_custom_tag, + post_build_fn=_wrap_if_needed, + ) if __name__ == "__main__": diff --git a/benchmarks/swtbench/config.py b/benchmarks/swtbench/config.py index 1d87073c..ad38f825 100644 --- a/benchmarks/swtbench/config.py +++ b/benchmarks/swtbench/config.py @@ -9,8 +9,6 @@ "dataset": "eth-sri/SWT-bench_Verified_bm25_27k_zsp", "split": "test", "num_workers": 30, - "max_attempts": 3, - "max_retries": 3, } # Evaluation defaults (used by eval_infer.py) @@ -20,3 +18,8 @@ "split": "test", "workers": 24, } + +# Build defaults (used by build_images.py) +BUILD_DEFAULTS = { + "max_workers": 16, +} diff --git a/benchmarks/utils/args_parser.py b/benchmarks/utils/args_parser.py index 5e3e14af..6ae98855 100644 --- a/benchmarks/utils/args_parser.py +++ b/benchmarks/utils/args_parser.py @@ -66,7 +66,8 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: parser.add_argument( "--max-attempts", type=int, - help="Maximum number of attempts for iterative mode (min: 1)", + default=3, + help="Maximum number of attempts for iterative mode (default: 3, min: 1)", ) # Add critic arguments (no default) @@ -80,6 +81,7 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: parser.add_argument( "--max-retries", type=int, - help="Maximum retries for instances that throw exceptions", + default=3, + help="Maximum retries for instances that throw exceptions (default: 3)", ) return parser From 3d3c73ad83d933ab68571637a6704803e61cd024 Mon Sep 17 00:00:00 2001 From: simonrosenberg 
<157206163+simonrosenberg@users.noreply.github.com> Date: Thu, 29 Jan 2026 14:01:18 +0100 Subject: [PATCH 32/33] Apply suggestion from @simonrosenberg --- benchmarks/commit0/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/commit0/config.py b/benchmarks/commit0/config.py index f0b1decb..e5d13e17 100644 --- a/benchmarks/commit0/config.py +++ b/benchmarks/commit0/config.py @@ -10,7 +10,7 @@ "dataset": "wentingzhao/commit0_combined", "split": "test", "repo_split": "lite", - "num_workers": 8, + "num_workers": 16, "max_attempts": 1, "max_retries": 1, } From a8052aefc2806a1ae38a78b62615c681245156e3 Mon Sep 17 00:00:00 2001 From: simonrosenberg <157206163+simonrosenberg@users.noreply.github.com> Date: Thu, 29 Jan 2026 15:53:20 +0100 Subject: [PATCH 33/33] Apply suggestion from @simonrosenberg --- benchmarks/commit0/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/commit0/config.py b/benchmarks/commit0/config.py index e5d13e17..dc5e2bc8 100644 --- a/benchmarks/commit0/config.py +++ b/benchmarks/commit0/config.py @@ -12,7 +12,7 @@ "repo_split": "lite", "num_workers": 16, "max_attempts": 1, - "max_retries": 1, + "max_retries": 3, } # Build defaults (used by build_images.py)