From 1f3437b3bf616a9bd736de17edb86e6740ec4de6 Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 28 Jan 2026 17:34:56 +0000 Subject: [PATCH 01/33] Align default argument values with evaluation repository Update args_parser.py and benchmark-specific run_infer.py files to use default values that match the evaluation repository (OpenHands/evaluation) eval-job/values.yaml configuration. Shared defaults updated in args_parser.py: - workspace: 'docker' -> 'remote' - max-iterations: 100 -> 500 - critic: 'pass' -> 'finish_with_patch' Benchmark-specific overrides using parser.set_defaults(): - gaia: dataset='gaia-benchmark/GAIA' - swtbench: dataset='eth-sri/SWT-bench_Verified_bm25_27k_zsp' - commit0: max_attempts=1, max_retries=1 (in addition to existing dataset) Also updated AGENTS.md to document the default values alignment pattern. Co-authored-by: openhands --- AGENTS.md | 20 ++++++++++++++++++++ benchmarks/commit0/run_infer.py | 8 ++++++-- benchmarks/gaia/run_infer.py | 4 +++- benchmarks/swtbench/run_infer.py | 2 ++ benchmarks/utils/args_parser.py | 29 +++++++++++++++++++++-------- benchmarks/utils/critics.py | 8 ++++---- 6 files changed, 56 insertions(+), 15 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 0206a51d..dae512f9 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -83,6 +83,26 @@ make build # Rebuild environment 4. Register CLI entrypoint in `pyproject.toml` under `[project.scripts]` 5. Update README.md with usage instructions +# Default Values Alignment +Default values in `benchmarks/utils/args_parser.py` are aligned with the evaluation +repository (OpenHands/evaluation) `eval-job/values.yaml`. This ensures consistency +between local development and production runs. + +**Shared defaults in args_parser.py:** +- `--workspace`: "remote" (production uses remote workspaces) +- `--max-iterations`: 500 (sufficient for complex tasks) +- `--critic`: "finish_with_patch" (ensures agent produces valid patches) +- `--max-attempts`: 3 (allows retries on critic failures) +- `--max-retries`: 3 (handles transient errors) + +**Benchmark-specific overrides:** Use `parser.set_defaults()` in each benchmark's +`run_infer.py` before calling `parse_args()`: +- `gaia`: dataset="gaia-benchmark/GAIA" +- `swebench`: dataset="princeton-nlp/SWE-bench_Verified" (default) +- `swtbench`: dataset="eth-sri/SWT-bench_Verified_bm25_27k_zsp" +- `commit0`: dataset="wentingzhao/commit0_combined", max_attempts=1, max_retries=1 +- `swebenchmultimodal`: dataset="princeton-nlp/SWE-bench_Multimodal", split="dev" + # LLM Configuration LLM configs use JSON matching the [LLM class schema](https://github.com/OpenHands/software-agent-sdk/blob/main/openhands/sdk/llm/llm.py#L93): ```json diff --git a/benchmarks/commit0/run_infer.py b/benchmarks/commit0/run_infer.py index 2e473669..c3ecf84e 100644 --- a/benchmarks/commit0/run_infer.py +++ b/benchmarks/commit0/run_infer.py @@ -596,8 +596,12 @@ def main() -> None: default="lite", help="all, lite, or each repo name", ) - # Override the default dataset for commit0 - parser.set_defaults(dataset="wentingzhao/commit0_combined") + # Override defaults for commit0 (matches evaluation repository values.yaml) + parser.set_defaults( + dataset="wentingzhao/commit0_combined", + max_attempts=1, + max_retries=1, + ) args = parser.parse_args() # Validate max_attempts diff --git a/benchmarks/gaia/run_infer.py b/benchmarks/gaia/run_infer.py index 9a0a700d..18287950 100644 --- a/benchmarks/gaia/run_infer.py +++ b/benchmarks/gaia/run_infer.py @@ -551,6 +551,8 @@ def main() -> None: required=True, help="GAIA 
level to evaluate (e.g., 2023_level1, 2023_level2, 2023_level3)", ) + # Override defaults for GAIA (matches evaluation repository values.yaml) + parser.set_defaults(dataset="gaia-benchmark/GAIA") args = parser.parse_args() # Create critic instance from parsed arguments @@ -585,7 +587,7 @@ def main() -> None: # Create metadata metadata = EvalMetadata( llm=llm, - dataset="gaia-benchmark/GAIA", + dataset=args.dataset, dataset_split=args.split, max_iterations=args.max_iterations, eval_output_dir=structured_output_dir, diff --git a/benchmarks/swtbench/run_infer.py b/benchmarks/swtbench/run_infer.py index a454e580..147f30cf 100644 --- a/benchmarks/swtbench/run_infer.py +++ b/benchmarks/swtbench/run_infer.py @@ -355,6 +355,8 @@ def main() -> None: choices=choices, help="Path to prompt template file", ) + # Override defaults for SWT-bench (matches evaluation repository values.yaml) + parser.set_defaults(dataset="eth-sri/SWT-bench_Verified_bm25_27k_zsp") args = parser.parse_args() # Validate max_attempts diff --git a/benchmarks/utils/args_parser.py b/benchmarks/utils/args_parser.py index 60f08d73..7572e843 100644 --- a/benchmarks/utils/args_parser.py +++ b/benchmarks/utils/args_parser.py @@ -1,5 +1,11 @@ """ -Argument parsing utilities for SWE-bench benchmarks. +Argument parsing utilities for benchmarks. + +Default values are aligned with the evaluation repository (OpenHands/evaluation) +to ensure consistency between local development and production runs. + +Benchmark-specific values should be set via parser.set_defaults() in each +benchmark's run_infer.py to override these common defaults. """ import argparse @@ -8,10 +14,17 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: - """Create and return argument parser. + """Create and return argument parser with common defaults. + + Default values match the most common settings used across benchmarks + in the evaluation repository. Individual benchmarks can override + these using parser.set_defaults() before calling parse_args(). + + Args: + add_llm_config: Whether to add the llm_config_path positional argument. Returns: - ArgumentParser instance + ArgumentParser instance with common benchmark arguments. 
""" parser = argparse.ArgumentParser(description="Run Evaluation inference") if add_llm_config: @@ -30,15 +43,15 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: parser.add_argument( "--workspace", type=str, - default="docker", + default="remote", choices=["docker", "remote"], - help="Type of workspace to use (default: docker)", + help="Type of workspace to use (default: remote)", ) parser.add_argument( - "--max-iterations", type=int, default=100, help="Maximum iterations" + "--max-iterations", type=int, default=500, help="Maximum iterations" ) parser.add_argument( - "--num-workers", type=int, default=1, help="Number of evaluation workers" + "--num-workers", type=int, default=1, help="Number of inference workers" ) parser.add_argument("--note", type=str, default="initial", help="Evaluation note") parser.add_argument( @@ -60,7 +73,7 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: help="Maximum number of attempts for iterative mode (default: 3, min: 1)", ) - # Add critic arguments + # Add critic arguments (default: finish_with_patch) add_critic_args(parser) parser.add_argument( diff --git a/benchmarks/utils/critics.py b/benchmarks/utils/critics.py index af9c55ae..6bc78bea 100644 --- a/benchmarks/utils/critics.py +++ b/benchmarks/utils/critics.py @@ -37,17 +37,17 @@ def add_critic_args(parser: ArgumentParser) -> None: parser.add_argument( "--critic", type=str, - default="pass", + default="finish_with_patch", help=( - "Name of the critic to use for evaluation (default: 'pass'). " + "Name of the critic to use for evaluation (default: 'finish_with_patch'). " "Critics determine whether an agent's output is considered successful " "and whether another attempt should be made in iterative evaluation mode. " "Available critics: " "'pass' - Always accepts the output (no retry logic, suitable for single-attempt runs), " "'finish_with_patch' - Requires both AgentFinishAction and non-empty git patch, " "'empty_patch_critic' - Only requires non-empty git patch. " - "For single-attempt runs (default), 'pass' is recommended as the actual evaluation " - "is performed by the benchmark's own scoring system." + "For production runs, 'finish_with_patch' is recommended as it ensures " + "the agent produces a valid patch before completing." 
), ) parser.add_argument( From e58ddb79272aee520e1e12dc16cb530c67933b97 Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 28 Jan 2026 17:38:42 +0000 Subject: [PATCH 02/33] Add explicit set_defaults for swebench and update comment for swebenchmultimodal - swebench: Add explicit set_defaults(dataset, split) for consistency with other benchmarks, even though values match global defaults - swebenchmultimodal: Update comment to match the pattern used in other benchmarks Co-authored-by: openhands --- benchmarks/swebench/run_infer.py | 3 +++ benchmarks/swebenchmultimodal/run_infer.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py index e19f0877..7124d863 100644 --- a/benchmarks/swebench/run_infer.py +++ b/benchmarks/swebench/run_infer.py @@ -334,6 +334,9 @@ def main() -> None: choices=choices, help="Path to prompt template file", ) + # SWE-bench defaults match the global args_parser defaults (evaluation repository values.yaml) + # Explicit set_defaults for consistency with other benchmarks + parser.set_defaults(dataset="princeton-nlp/SWE-bench_Verified", split="test") args = parser.parse_args() # Validate max_attempts diff --git a/benchmarks/swebenchmultimodal/run_infer.py b/benchmarks/swebenchmultimodal/run_infer.py index 68e4c5b8..d47f3c74 100644 --- a/benchmarks/swebenchmultimodal/run_infer.py +++ b/benchmarks/swebenchmultimodal/run_infer.py @@ -423,7 +423,7 @@ def main() -> None: choices=choices, help="Path to prompt template file", ) - # Override the default dataset and split for multimodal + # Override defaults for SWE-bench Multimodal (matches evaluation repository values.yaml) parser.set_defaults(dataset="princeton-nlp/SWE-bench_Multimodal", split="dev") args = parser.parse_args() From dcb940f311d9f44f6d517be45be89c31d484b5c4 Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 28 Jan 2026 17:40:48 +0000 Subject: [PATCH 03/33] Remove default dataset from args_parser.py Each benchmark now sets its own dataset default via set_defaults(), so no global default is needed. Co-authored-by: openhands --- benchmarks/utils/args_parser.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/utils/args_parser.py b/benchmarks/utils/args_parser.py index 7572e843..63263a7c 100644 --- a/benchmarks/utils/args_parser.py +++ b/benchmarks/utils/args_parser.py @@ -36,8 +36,7 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: parser.add_argument( "--dataset", type=str, - default="princeton-nlp/SWE-bench_Verified", - help="Dataset name", + help="Dataset name (each benchmark sets its default via set_defaults)", ) parser.add_argument("--split", type=str, default="test", help="Dataset split") parser.add_argument( From c34d730d024626199a52e9c4ce6c8ae4642a110f Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 28 Jan 2026 17:42:57 +0000 Subject: [PATCH 04/33] Add default value for llm_config_path All benchmarks in the evaluation repository use .llm_config/runtime.json as the LLM config path, so use this as the default. 
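For illustration only (a minimal argparse sketch, not part of this patch), an
optional positional declared with nargs="?" falls back to the default when the
argument is omitted and still accepts an explicit path:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "llm_config_path",
        type=str,
        nargs="?",
        default=".llm_config/runtime.json",
        help="Path to JSON LLM configuration",
    )

    assert parser.parse_args([]).llm_config_path == ".llm_config/runtime.json"
    assert parser.parse_args(["custom.json"]).llm_config_path == "custom.json"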
Co-authored-by: openhands --- benchmarks/utils/args_parser.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/benchmarks/utils/args_parser.py b/benchmarks/utils/args_parser.py index 63263a7c..698dea7a 100644 --- a/benchmarks/utils/args_parser.py +++ b/benchmarks/utils/args_parser.py @@ -31,7 +31,9 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: parser.add_argument( "llm_config_path", type=str, - help="Path to JSON LLM configuration", + nargs="?", + default=".llm_config/runtime.json", + help="Path to JSON LLM configuration (default: .llm_config/runtime.json)", ) parser.add_argument( "--dataset", From 6af188a8f723968419dc6411978402fa707181cd Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 28 Jan 2026 17:44:36 +0000 Subject: [PATCH 05/33] Revert "Add default value for llm_config_path" This reverts commit c34d730d024626199a52e9c4ce6c8ae4642a110f. --- benchmarks/utils/args_parser.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/benchmarks/utils/args_parser.py b/benchmarks/utils/args_parser.py index 698dea7a..63263a7c 100644 --- a/benchmarks/utils/args_parser.py +++ b/benchmarks/utils/args_parser.py @@ -31,9 +31,7 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: parser.add_argument( "llm_config_path", type=str, - nargs="?", - default=".llm_config/runtime.json", - help="Path to JSON LLM configuration (default: .llm_config/runtime.json)", + help="Path to JSON LLM configuration", ) parser.add_argument( "--dataset", From 5fcb61d45a5687273186792ecae95f01ad2345ec Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 07:52:42 +0000 Subject: [PATCH 06/33] WIP: Add config.py files and refactor to use INFER_DEFAULTS - Created config.py with INFER_DEFAULTS and EVAL_DEFAULTS for each benchmark - Removed all defaults from utils/args_parser.py - Removed default from critics.py - Updated swebench, gaia, swtbench run_infer.py to use INFER_DEFAULTS - Started commit0 update (import added) Co-authored-by: openhands --- benchmarks/commit0/config.py | 26 ++++++++++++++ benchmarks/commit0/run_infer.py | 1 + benchmarks/gaia/config.py | 25 +++++++++++++ benchmarks/gaia/run_infer.py | 4 +-- benchmarks/swebench/config.py | 27 ++++++++++++++ benchmarks/swebench/run_infer.py | 5 ++- benchmarks/swebenchmultimodal/config.py | 28 +++++++++++++++ benchmarks/swtbench/config.py | 28 +++++++++++++++ benchmarks/swtbench/eval_infer.py | 6 ++-- benchmarks/swtbench/run_infer.py | 4 +-- benchmarks/utils/args_parser.py | 47 +++++++++---------------- benchmarks/utils/critics.py | 9 ++--- 12 files changed, 164 insertions(+), 46 deletions(-) create mode 100644 benchmarks/commit0/config.py create mode 100644 benchmarks/gaia/config.py create mode 100644 benchmarks/swebench/config.py create mode 100644 benchmarks/swebenchmultimodal/config.py create mode 100644 benchmarks/swtbench/config.py diff --git a/benchmarks/commit0/config.py b/benchmarks/commit0/config.py new file mode 100644 index 00000000..5855adbf --- /dev/null +++ b/benchmarks/commit0/config.py @@ -0,0 +1,26 @@ +""" +Commit0 benchmark configuration. + +Default values aligned with evaluation repository (OpenHands/evaluation). 
+""" + +# Inference defaults (used by run_infer.py) +INFER_DEFAULTS = { + "dataset": "wentingzhao/commit0_combined", + "split": "test", + "repo_split": "lite", + "workspace": "remote", + "num_workers": 8, + "max_iterations": 500, + "max_attempts": 1, + "max_retries": 1, + "critic": "finish_with_patch", + "output_dir": "./eval_outputs", + "n_limit": 0, + "note": "initial", +} + +# Evaluation defaults (used by eval_infer.py) +EVAL_DEFAULTS = { + "model_name": "openhands", +} diff --git a/benchmarks/commit0/run_infer.py b/benchmarks/commit0/run_infer.py index c3ecf84e..e1e79f06 100644 --- a/benchmarks/commit0/run_infer.py +++ b/benchmarks/commit0/run_infer.py @@ -12,6 +12,7 @@ extract_custom_tag, get_base_docker_image, ) +from benchmarks.commit0.config import INFER_DEFAULTS from benchmarks.utils.args_parser import get_parser from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE from benchmarks.utils.conversation import build_event_persistence_callback diff --git a/benchmarks/gaia/config.py b/benchmarks/gaia/config.py new file mode 100644 index 00000000..8a6977d4 --- /dev/null +++ b/benchmarks/gaia/config.py @@ -0,0 +1,25 @@ +""" +GAIA benchmark configuration. + +Default values aligned with evaluation repository (OpenHands/evaluation). +""" + +# Inference defaults (used by run_infer.py) +INFER_DEFAULTS = { + "dataset": "gaia-benchmark/GAIA", + "split": "validation", + "workspace": "remote", + "num_workers": 30, + "max_iterations": 500, + "max_attempts": 3, + "max_retries": 3, + "critic": "finish_with_patch", + "output_dir": "./eval_outputs", + "n_limit": 0, + "note": "initial", +} + +# Evaluation defaults (used by eval_infer.py) +EVAL_DEFAULTS = { + "model_name": "openhands", +} diff --git a/benchmarks/gaia/run_infer.py b/benchmarks/gaia/run_infer.py index 18287950..78e65581 100644 --- a/benchmarks/gaia/run_infer.py +++ b/benchmarks/gaia/run_infer.py @@ -11,6 +11,7 @@ from datasets import DatasetDict, load_dataset from PIL import Image +from benchmarks.gaia.config import INFER_DEFAULTS from benchmarks.gaia.scorer import question_scorer from benchmarks.gaia.utils import image_to_jpg_base64_url, image_to_png_base64_url from benchmarks.utils.args_parser import get_parser @@ -551,8 +552,7 @@ def main() -> None: required=True, help="GAIA level to evaluate (e.g., 2023_level1, 2023_level2, 2023_level3)", ) - # Override defaults for GAIA (matches evaluation repository values.yaml) - parser.set_defaults(dataset="gaia-benchmark/GAIA") + parser.set_defaults(**INFER_DEFAULTS) args = parser.parse_args() # Create critic instance from parsed arguments diff --git a/benchmarks/swebench/config.py b/benchmarks/swebench/config.py new file mode 100644 index 00000000..13d0839c --- /dev/null +++ b/benchmarks/swebench/config.py @@ -0,0 +1,27 @@ +""" +SWE-bench benchmark configuration. + +Default values aligned with evaluation repository (OpenHands/evaluation). 
+""" + +# Inference defaults (used by run_infer.py) +INFER_DEFAULTS = { + "dataset": "princeton-nlp/SWE-bench_Verified", + "split": "test", + "workspace": "remote", + "num_workers": 30, + "max_iterations": 500, + "max_attempts": 3, + "max_retries": 3, + "critic": "finish_with_patch", + "output_dir": "./eval_outputs", + "n_limit": 0, + "note": "initial", +} + +# Evaluation defaults (used by eval_infer.py) +EVAL_DEFAULTS = { + "dataset": "princeton-nlp/SWE-bench_Verified", + "model_name": "openhands", + "workers": 12, +} diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py index 7124d863..1064fb42 100644 --- a/benchmarks/swebench/run_infer.py +++ b/benchmarks/swebench/run_infer.py @@ -5,6 +5,7 @@ from jinja2 import Environment, FileSystemLoader from benchmarks.swebench import constants +from benchmarks.swebench.config import INFER_DEFAULTS from benchmarks.swebench.build_images import ( extract_custom_tag, get_official_docker_image, @@ -334,9 +335,7 @@ def main() -> None: choices=choices, help="Path to prompt template file", ) - # SWE-bench defaults match the global args_parser defaults (evaluation repository values.yaml) - # Explicit set_defaults for consistency with other benchmarks - parser.set_defaults(dataset="princeton-nlp/SWE-bench_Verified", split="test") + parser.set_defaults(**INFER_DEFAULTS) args = parser.parse_args() # Validate max_attempts diff --git a/benchmarks/swebenchmultimodal/config.py b/benchmarks/swebenchmultimodal/config.py new file mode 100644 index 00000000..d43306c0 --- /dev/null +++ b/benchmarks/swebenchmultimodal/config.py @@ -0,0 +1,28 @@ +""" +SWE-bench Multimodal benchmark configuration. + +Default values aligned with evaluation repository (OpenHands/evaluation). +""" + +# Inference defaults (used by run_infer.py) +INFER_DEFAULTS = { + "dataset": "princeton-nlp/SWE-bench_Multimodal", + "split": "dev", + "workspace": "remote", + "num_workers": 30, + "max_iterations": 500, + "max_attempts": 3, + "max_retries": 3, + "critic": "finish_with_patch", + "output_dir": "./eval_outputs", + "n_limit": 0, + "note": "initial", +} + +# Evaluation defaults (used by eval_infer.py) +EVAL_DEFAULTS = { + "dataset": "princeton-nlp/SWE-bench_Multimodal", + "split": "dev", + "model_name": "openhands", + "workers": 12, +} diff --git a/benchmarks/swtbench/config.py b/benchmarks/swtbench/config.py new file mode 100644 index 00000000..2643f46d --- /dev/null +++ b/benchmarks/swtbench/config.py @@ -0,0 +1,28 @@ +""" +SWT-bench benchmark configuration. + +Default values aligned with evaluation repository (OpenHands/evaluation). 
+""" + +# Inference defaults (used by run_infer.py) +INFER_DEFAULTS = { + "dataset": "eth-sri/SWT-bench_Verified_bm25_27k_zsp", + "split": "test", + "workspace": "remote", + "num_workers": 30, + "max_iterations": 500, + "max_attempts": 3, + "max_retries": 3, + "critic": "finish_with_patch", + "output_dir": "./eval_outputs", + "n_limit": 0, + "note": "initial", +} + +# Evaluation defaults (used by eval_infer.py) +# Note: eval uses SWE-bench dataset, not SWT-bench dataset +EVAL_DEFAULTS = { + "dataset": "princeton-nlp/SWE-bench_Verified", + "model_name": "OpenHands", + "workers": 24, +} diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index 4f5f0632..5cb3a4d2 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -237,7 +237,7 @@ def run_swtbench_evaluation( predictions_file: str, # Must use SWE-bench dataset because SWT-bench dataset (which is based on SWE-bench) contains a bug in their harness. dataset: str = "princeton-nlp/SWE-bench_Verified", - workers: str = "12", + workers: str = "24", ) -> None: """ Run SWT-Bench evaluation on the predictions file. @@ -384,8 +384,8 @@ def main() -> None: parser.add_argument( "--workers", - default="12", - help="Number of workers to use when evaluating", + default="24", + help="Number of workers to use when evaluating (default: 24)", ) args = parser.parse_args() diff --git a/benchmarks/swtbench/run_infer.py b/benchmarks/swtbench/run_infer.py index 147f30cf..7c863539 100644 --- a/benchmarks/swtbench/run_infer.py +++ b/benchmarks/swtbench/run_infer.py @@ -4,6 +4,7 @@ from jinja2 import Environment, FileSystemLoader +from benchmarks.swtbench.config import INFER_DEFAULTS from benchmarks.utils.args_parser import get_parser from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE from benchmarks.utils.conversation import build_event_persistence_callback @@ -355,8 +356,7 @@ def main() -> None: choices=choices, help="Path to prompt template file", ) - # Override defaults for SWT-bench (matches evaluation repository values.yaml) - parser.set_defaults(dataset="eth-sri/SWT-bench_Verified_bm25_27k_zsp") + parser.set_defaults(**INFER_DEFAULTS) args = parser.parse_args() # Validate max_attempts diff --git a/benchmarks/utils/args_parser.py b/benchmarks/utils/args_parser.py index 63263a7c..cae2f928 100644 --- a/benchmarks/utils/args_parser.py +++ b/benchmarks/utils/args_parser.py @@ -1,11 +1,9 @@ """ Argument parsing utilities for benchmarks. -Default values are aligned with the evaluation repository (OpenHands/evaluation) -to ensure consistency between local development and production runs. - -Benchmark-specific values should be set via parser.set_defaults() in each -benchmark's run_infer.py to override these common defaults. +This module defines common arguments used across all benchmarks. +No default values are set here - each benchmark must set its own defaults +via parser.set_defaults() to match the evaluation repository configuration. """ import argparse @@ -14,17 +12,16 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: - """Create and return argument parser with common defaults. + """Create and return argument parser without defaults. - Default values match the most common settings used across benchmarks - in the evaluation repository. Individual benchmarks can override - these using parser.set_defaults() before calling parse_args(). 
+ Each benchmark must call parser.set_defaults() before parse_args() + to set values matching the evaluation repository (OpenHands/evaluation). Args: add_llm_config: Whether to add the llm_config_path positional argument. Returns: - ArgumentParser instance with common benchmark arguments. + ArgumentParser instance with common benchmark arguments (no defaults). """ parser = argparse.ArgumentParser(description="Run Evaluation inference") if add_llm_config: @@ -36,55 +33,45 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: parser.add_argument( "--dataset", type=str, - help="Dataset name (each benchmark sets its default via set_defaults)", + help="Dataset name", ) - parser.add_argument("--split", type=str, default="test", help="Dataset split") + parser.add_argument("--split", type=str, help="Dataset split") parser.add_argument( "--workspace", type=str, - default="remote", choices=["docker", "remote"], - help="Type of workspace to use (default: remote)", - ) - parser.add_argument( - "--max-iterations", type=int, default=500, help="Maximum iterations" - ) - parser.add_argument( - "--num-workers", type=int, default=1, help="Number of inference workers" + help="Type of workspace to use", ) - parser.add_argument("--note", type=str, default="initial", help="Evaluation note") + parser.add_argument("--max-iterations", type=int, help="Maximum iterations") + parser.add_argument("--num-workers", type=int, help="Number of inference workers") + parser.add_argument("--note", type=str, help="Evaluation note") parser.add_argument( "--output-dir", type=str, - default="./eval_outputs", help="Evaluation output directory", ) parser.add_argument( "--n-limit", type=int, - default=0, - help="Limit number of instances to evaluate", + help="Limit number of instances to evaluate (0 = no limit)", ) parser.add_argument( "--max-attempts", type=int, - default=3, - help="Maximum number of attempts for iterative mode (default: 3, min: 1)", + help="Maximum number of attempts for iterative mode (min: 1)", ) - # Add critic arguments (default: finish_with_patch) + # Add critic arguments (no default) add_critic_args(parser) parser.add_argument( "--select", type=str, - default=None, help="Path to text file containing instance IDs to select (one per line)", ) parser.add_argument( "--max-retries", type=int, - default=3, - help="Maximum retries for instances that throw exceptions (default: 3)", + help="Maximum retries for instances that throw exceptions", ) return parser diff --git a/benchmarks/utils/critics.py b/benchmarks/utils/critics.py index 6bc78bea..b2978294 100644 --- a/benchmarks/utils/critics.py +++ b/benchmarks/utils/critics.py @@ -37,17 +37,14 @@ def add_critic_args(parser: ArgumentParser) -> None: parser.add_argument( "--critic", type=str, - default="finish_with_patch", help=( - "Name of the critic to use for evaluation (default: 'finish_with_patch'). " + "Name of the critic to use for evaluation. " "Critics determine whether an agent's output is considered successful " "and whether another attempt should be made in iterative evaluation mode. " "Available critics: " - "'pass' - Always accepts the output (no retry logic, suitable for single-attempt runs), " + "'pass' - Always accepts the output (no retry logic), " "'finish_with_patch' - Requires both AgentFinishAction and non-empty git patch, " - "'empty_patch_critic' - Only requires non-empty git patch. " - "For production runs, 'finish_with_patch' is recommended as it ensures " - "the agent produces a valid patch before completing." 
+ "'empty_patch_critic' - Only requires non-empty git patch." ), ) parser.add_argument( From 7fec81f96c344e911f25494b1392bf23945ca5fa Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 08:08:29 +0000 Subject: [PATCH 07/33] Fix import ordering to pass ruff lint checks Co-authored-by: openhands --- benchmarks/commit0/run_infer.py | 1 - benchmarks/swebench/run_infer.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/commit0/run_infer.py b/benchmarks/commit0/run_infer.py index e1e79f06..c3ecf84e 100644 --- a/benchmarks/commit0/run_infer.py +++ b/benchmarks/commit0/run_infer.py @@ -12,7 +12,6 @@ extract_custom_tag, get_base_docker_image, ) -from benchmarks.commit0.config import INFER_DEFAULTS from benchmarks.utils.args_parser import get_parser from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE from benchmarks.utils.conversation import build_event_persistence_callback diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py index 1064fb42..259e9163 100644 --- a/benchmarks/swebench/run_infer.py +++ b/benchmarks/swebench/run_infer.py @@ -5,13 +5,13 @@ from jinja2 import Environment, FileSystemLoader from benchmarks.swebench import constants -from benchmarks.swebench.config import INFER_DEFAULTS from benchmarks.swebench.build_images import ( extract_custom_tag, get_official_docker_image, should_wrap_instance_id, wrap_image, ) +from benchmarks.swebench.config import INFER_DEFAULTS from benchmarks.utils.args_parser import get_parser from benchmarks.utils.build_utils import build_image from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE From 4ed5b722c69bcdedb7d1cd1edbb7af0436bbdaf4 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 08:14:56 +0000 Subject: [PATCH 08/33] Add missing workers field to GAIA and Commit0 EVAL_DEFAULTS Align EVAL_DEFAULTS with NUM_EVAL_WORKERS from evaluation repository values.yaml: - GAIA: workers=1 - Commit0: workers=1 Co-authored-by: openhands --- benchmarks/commit0/config.py | 1 + benchmarks/gaia/config.py | 1 + 2 files changed, 2 insertions(+) diff --git a/benchmarks/commit0/config.py b/benchmarks/commit0/config.py index 5855adbf..5acfbe62 100644 --- a/benchmarks/commit0/config.py +++ b/benchmarks/commit0/config.py @@ -23,4 +23,5 @@ # Evaluation defaults (used by eval_infer.py) EVAL_DEFAULTS = { "model_name": "openhands", + "workers": 1, } diff --git a/benchmarks/gaia/config.py b/benchmarks/gaia/config.py index 8a6977d4..af62c044 100644 --- a/benchmarks/gaia/config.py +++ b/benchmarks/gaia/config.py @@ -22,4 +22,5 @@ # Evaluation defaults (used by eval_infer.py) EVAL_DEFAULTS = { "model_name": "openhands", + "workers": 1, } From 2c1a9e19e0bf2f9fc7ba0a3c0fb0d192e28b2ee6 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 08:19:02 +0000 Subject: [PATCH 09/33] Use EVAL_DEFAULTS from config in eval_infer.py files Update eval_infer.py files to import and use EVAL_DEFAULTS from their respective config.py files via parser.set_defaults(): - swebench/eval_infer.py: uses EVAL_DEFAULTS for dataset, model_name, workers - swtbench/eval_infer.py: uses EVAL_DEFAULTS for dataset, model_name, workers - swebenchmultimodal/eval_infer.py: uses EVAL_DEFAULTS for dataset, split, model_name, workers This ensures the default values defined in config.py are actually used by the evaluation scripts, aligning with the pattern used in run_infer.py files for INFER_DEFAULTS. 
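As a minimal sketch of the pattern (illustration only; the values mirror
EVAL_DEFAULTS in benchmarks/swebench/config.py), parser.set_defaults() fills in
each listed dest when the corresponding flag is not passed, while explicit CLI
values still win:

    import argparse

    EVAL_DEFAULTS = {"dataset": "princeton-nlp/SWE-bench_Verified", "workers": 12}

    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset", help="SWE-Bench dataset to evaluate against")
    parser.add_argument("--workers", type=int, help="Number of workers to use when evaluating")
    parser.set_defaults(**EVAL_DEFAULTS)

    args = parser.parse_args([])
    assert args.dataset == "princeton-nlp/SWE-bench_Verified"
    assert args.workers == 12

    args = parser.parse_args(["--workers", "4"])
    assert args.workers == 4  # explicit CLI values override the defaults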
Co-authored-by: openhands --- benchmarks/swebench/eval_infer.py | 7 +++++-- benchmarks/swebenchmultimodal/eval_infer.py | 15 +++++++-------- benchmarks/swtbench/eval_infer.py | 10 ++++++---- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/benchmarks/swebench/eval_infer.py b/benchmarks/swebench/eval_infer.py index b1c5ee69..8bb7a7eb 100644 --- a/benchmarks/swebench/eval_infer.py +++ b/benchmarks/swebench/eval_infer.py @@ -17,6 +17,7 @@ from pathlib import Path from benchmarks.swebench import constants +from benchmarks.swebench.config import EVAL_DEFAULTS from benchmarks.utils.laminar import LaminarService from benchmarks.utils.patch_utils import remove_files_from_patch from benchmarks.utils.report_costs import generate_cost_report @@ -223,10 +224,12 @@ def main() -> None: parser.add_argument( "--workers", type=int, - default=constants.DEFAULT_EVAL_WORKERS, - help=f"Number of workers to use when evaluating (default: {constants.DEFAULT_EVAL_WORKERS})", + help="Number of workers to use when evaluating", ) + # Apply EVAL_DEFAULTS from config + parser.set_defaults(**EVAL_DEFAULTS) + args = parser.parse_args() # Validate input file diff --git a/benchmarks/swebenchmultimodal/eval_infer.py b/benchmarks/swebenchmultimodal/eval_infer.py index 0984b3e5..1e675b7d 100644 --- a/benchmarks/swebenchmultimodal/eval_infer.py +++ b/benchmarks/swebenchmultimodal/eval_infer.py @@ -16,6 +16,7 @@ from pathlib import Path from typing import Any +from benchmarks.swebenchmultimodal.config import EVAL_DEFAULTS from benchmarks.utils.patch_utils import remove_files_from_patch from benchmarks.utils.report_costs import generate_cost_report from openhands.sdk import get_logger @@ -375,15 +376,12 @@ def main() -> None: parser.add_argument( "--dataset", - default="princeton-nlp/SWE-bench_Multimodal", - help="SWE-Bench dataset to evaluate against " - "(default: princeton-nlp/SWE-bench_Multimodal)", + help="SWE-Bench dataset to evaluate against", ) parser.add_argument( "--split", - default="dev", - help="Dataset split to use (default: dev)", + help="Dataset split to use", ) parser.add_argument( @@ -400,13 +398,11 @@ def main() -> None: parser.add_argument( "--model-name", - default="openhands", - help="Model name to use in the model_name_or_path field (default: openhands)", + help="Model name to use in the model_name_or_path field", ) parser.add_argument( "--workers", - default="12", help="Number of workers to use when evaluating", ) @@ -415,6 +411,9 @@ def main() -> None: help="Run ID for the evaluation (default: eval_)", ) + # Apply EVAL_DEFAULTS from config + parser.set_defaults(**EVAL_DEFAULTS) + args = parser.parse_args() # Validate input file diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index 5cb3a4d2..5fb0cc5c 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -18,6 +18,7 @@ from pathlib import Path from time import monotonic +from benchmarks.swtbench.config import EVAL_DEFAULTS from benchmarks.swtbench.image_utils import ( compute_required_images, ensure_swt_bench_repo, @@ -378,16 +379,17 @@ def main() -> None: parser.add_argument( "--model-name", - default="OpenHands", - help="Model name to use in the model_name_or_path field (default: OpenHands)", + help="Model name to use in the model_name_or_path field", ) parser.add_argument( "--workers", - default="24", - help="Number of workers to use when evaluating (default: 24)", + help="Number of workers to use when evaluating", ) + # Apply EVAL_DEFAULTS from config + 
parser.set_defaults(**EVAL_DEFAULTS) + args = parser.parse_args() # Validate input file From 4e8cb53b8813a659937230304a8c4b37371637fa Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 08:22:04 +0000 Subject: [PATCH 10/33] Use INFER_DEFAULTS from config in commit0 and swebenchmultimodal run_infer.py Update run_infer.py files to import and use INFER_DEFAULTS from their respective config.py files via parser.set_defaults(): - commit0/run_infer.py: uses INFER_DEFAULTS for all inference settings - swebenchmultimodal/run_infer.py: uses INFER_DEFAULTS for all inference settings This ensures the default values defined in config.py are actually used by the inference scripts, completing the alignment with the evaluation repository values.yaml. Co-authored-by: openhands --- benchmarks/commit0/run_infer.py | 10 +++------- benchmarks/swebenchmultimodal/run_infer.py | 5 +++-- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/benchmarks/commit0/run_infer.py b/benchmarks/commit0/run_infer.py index c3ecf84e..4cc166d8 100644 --- a/benchmarks/commit0/run_infer.py +++ b/benchmarks/commit0/run_infer.py @@ -12,6 +12,7 @@ extract_custom_tag, get_base_docker_image, ) +from benchmarks.commit0.config import INFER_DEFAULTS from benchmarks.utils.args_parser import get_parser from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE from benchmarks.utils.conversation import build_event_persistence_callback @@ -593,15 +594,10 @@ def main() -> None: parser.add_argument( "--repo-split", type=str, - default="lite", help="all, lite, or each repo name", ) - # Override defaults for commit0 (matches evaluation repository values.yaml) - parser.set_defaults( - dataset="wentingzhao/commit0_combined", - max_attempts=1, - max_retries=1, - ) + # Apply INFER_DEFAULTS from config (matches evaluation repository values.yaml) + parser.set_defaults(**INFER_DEFAULTS) args = parser.parse_args() # Validate max_attempts diff --git a/benchmarks/swebenchmultimodal/run_infer.py b/benchmarks/swebenchmultimodal/run_infer.py index d47f3c74..85fc8254 100644 --- a/benchmarks/swebenchmultimodal/run_infer.py +++ b/benchmarks/swebenchmultimodal/run_infer.py @@ -10,6 +10,7 @@ extract_custom_tag, get_official_docker_image, ) +from benchmarks.swebenchmultimodal.config import INFER_DEFAULTS from benchmarks.utils.args_parser import get_parser from benchmarks.utils.build_utils import build_image from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE @@ -423,8 +424,8 @@ def main() -> None: choices=choices, help="Path to prompt template file", ) - # Override defaults for SWE-bench Multimodal (matches evaluation repository values.yaml) - parser.set_defaults(dataset="princeton-nlp/SWE-bench_Multimodal", split="dev") + # Apply INFER_DEFAULTS from config (matches evaluation repository values.yaml) + parser.set_defaults(**INFER_DEFAULTS) args = parser.parse_args() # Validate max_attempts From 7b8ab3dee67dc242c59168393830b6d11759888c Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 08:30:41 +0000 Subject: [PATCH 11/33] Use EVAL_DEFAULTS from config in commit0 and gaia eval_infer.py Update eval_infer.py files to import and use EVAL_DEFAULTS from their respective config.py files via parser.set_defaults(): - commit0/eval_infer.py: uses EVAL_DEFAULTS for model_name - gaia/eval_infer.py: uses EVAL_DEFAULTS for model_name This ensures all benchmarks consistently use their config.py defaults. 
Co-authored-by: openhands --- benchmarks/commit0/eval_infer.py | 7 +++++-- benchmarks/gaia/eval_infer.py | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/benchmarks/commit0/eval_infer.py b/benchmarks/commit0/eval_infer.py index f03e73f6..4470b36d 100644 --- a/benchmarks/commit0/eval_infer.py +++ b/benchmarks/commit0/eval_infer.py @@ -15,6 +15,7 @@ import sys from pathlib import Path +from benchmarks.commit0.config import EVAL_DEFAULTS from benchmarks.utils.laminar import LaminarService from benchmarks.utils.report_costs import generate_cost_report @@ -174,10 +175,12 @@ def main() -> None: parser.add_argument( "--model-name", - default="openhands", - help="Model name to use in the model_name_or_path field (default: openhands)", + help="Model name to use in the model_name_or_path field", ) + # Apply EVAL_DEFAULTS from config + parser.set_defaults(**EVAL_DEFAULTS) + args = parser.parse_args() # Validate input file diff --git a/benchmarks/gaia/eval_infer.py b/benchmarks/gaia/eval_infer.py index 889d132d..715211f3 100644 --- a/benchmarks/gaia/eval_infer.py +++ b/benchmarks/gaia/eval_infer.py @@ -18,6 +18,7 @@ import sys from pathlib import Path +from benchmarks.gaia.config import EVAL_DEFAULTS from benchmarks.utils.laminar import LaminarService from benchmarks.utils.report_costs import generate_cost_report from openhands.sdk import get_logger @@ -197,10 +198,12 @@ def main() -> None: parser.add_argument( "--model-name", - default="openhands", - help="Model name to use in the model_name_or_path field (default: openhands)", + help="Model name to use in the model_name_or_path field", ) + # Apply EVAL_DEFAULTS from config + parser.set_defaults(**EVAL_DEFAULTS) + args = parser.parse_args() # Validate input file From 05d34f9740e5726b09b55b7ea8e5ed3f2ba93c21 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 08:39:41 +0000 Subject: [PATCH 12/33] Move common defaults (note, n_limit, output_dir) to args_parser.py These fields are not benchmark-specific and should have global defaults: - note: 'initial' (user-facing option for run identification) - n_limit: 0 (no limit by default) - output_dir: OUTPUT_DIR from constants.py ('./eval_outputs') Added OUTPUT_DIR constant to benchmarks/utils/constants.py. This keeps INFER_DEFAULTS focused on benchmark-specific values from the evaluation repository's values.yaml. 
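A hypothetical sketch of the "0 = no limit" convention for --n-limit (the
helper below is illustrative only, not the actual benchmark code):

    def apply_n_limit(instances: list, n_limit: int) -> list:
        # n_limit == 0 (the global default) means "evaluate every instance"
        return instances[:n_limit] if n_limit > 0 else instances

    assert apply_n_limit(["a", "b", "c"], 0) == ["a", "b", "c"]
    assert apply_n_limit(["a", "b", "c"], 2) == ["a", "b"]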
Co-authored-by: openhands --- benchmarks/commit0/config.py | 3 --- benchmarks/gaia/config.py | 3 --- benchmarks/swebench/config.py | 3 --- benchmarks/swebenchmultimodal/config.py | 3 --- benchmarks/swtbench/config.py | 3 --- benchmarks/utils/args_parser.py | 9 ++++++--- benchmarks/utils/constants.py | 1 + 7 files changed, 7 insertions(+), 18 deletions(-) diff --git a/benchmarks/commit0/config.py b/benchmarks/commit0/config.py index 5acfbe62..9c84c35b 100644 --- a/benchmarks/commit0/config.py +++ b/benchmarks/commit0/config.py @@ -15,9 +15,6 @@ "max_attempts": 1, "max_retries": 1, "critic": "finish_with_patch", - "output_dir": "./eval_outputs", - "n_limit": 0, - "note": "initial", } # Evaluation defaults (used by eval_infer.py) diff --git a/benchmarks/gaia/config.py b/benchmarks/gaia/config.py index af62c044..192114ae 100644 --- a/benchmarks/gaia/config.py +++ b/benchmarks/gaia/config.py @@ -14,9 +14,6 @@ "max_attempts": 3, "max_retries": 3, "critic": "finish_with_patch", - "output_dir": "./eval_outputs", - "n_limit": 0, - "note": "initial", } # Evaluation defaults (used by eval_infer.py) diff --git a/benchmarks/swebench/config.py b/benchmarks/swebench/config.py index 13d0839c..4eba91f6 100644 --- a/benchmarks/swebench/config.py +++ b/benchmarks/swebench/config.py @@ -14,9 +14,6 @@ "max_attempts": 3, "max_retries": 3, "critic": "finish_with_patch", - "output_dir": "./eval_outputs", - "n_limit": 0, - "note": "initial", } # Evaluation defaults (used by eval_infer.py) diff --git a/benchmarks/swebenchmultimodal/config.py b/benchmarks/swebenchmultimodal/config.py index d43306c0..e9affe05 100644 --- a/benchmarks/swebenchmultimodal/config.py +++ b/benchmarks/swebenchmultimodal/config.py @@ -14,9 +14,6 @@ "max_attempts": 3, "max_retries": 3, "critic": "finish_with_patch", - "output_dir": "./eval_outputs", - "n_limit": 0, - "note": "initial", } # Evaluation defaults (used by eval_infer.py) diff --git a/benchmarks/swtbench/config.py b/benchmarks/swtbench/config.py index 2643f46d..9b3e727c 100644 --- a/benchmarks/swtbench/config.py +++ b/benchmarks/swtbench/config.py @@ -14,9 +14,6 @@ "max_attempts": 3, "max_retries": 3, "critic": "finish_with_patch", - "output_dir": "./eval_outputs", - "n_limit": 0, - "note": "initial", } # Evaluation defaults (used by eval_infer.py) diff --git a/benchmarks/utils/args_parser.py b/benchmarks/utils/args_parser.py index cae2f928..28c9444a 100644 --- a/benchmarks/utils/args_parser.py +++ b/benchmarks/utils/args_parser.py @@ -2,12 +2,13 @@ Argument parsing utilities for benchmarks. This module defines common arguments used across all benchmarks. -No default values are set here - each benchmark must set its own defaults -via parser.set_defaults() to match the evaluation repository configuration. +Benchmark-specific defaults should be set via parser.set_defaults() +to match the evaluation repository configuration. 
""" import argparse +from benchmarks.utils.constants import OUTPUT_DIR from benchmarks.utils.critics import add_critic_args @@ -44,15 +45,17 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: ) parser.add_argument("--max-iterations", type=int, help="Maximum iterations") parser.add_argument("--num-workers", type=int, help="Number of inference workers") - parser.add_argument("--note", type=str, help="Evaluation note") + parser.add_argument("--note", type=str, default="initial", help="Evaluation note") parser.add_argument( "--output-dir", type=str, + default=OUTPUT_DIR, help="Evaluation output directory", ) parser.add_argument( "--n-limit", type=int, + default=0, help="Limit number of instances to evaluate (0 = no limit)", ) parser.add_argument( diff --git a/benchmarks/utils/constants.py b/benchmarks/utils/constants.py index 9337b847..e7f4f42b 100644 --- a/benchmarks/utils/constants.py +++ b/benchmarks/utils/constants.py @@ -1,2 +1,3 @@ OUTPUT_FILENAME = "output.jsonl" +OUTPUT_DIR = "./eval_outputs" EVAL_AGENT_SERVER_IMAGE = "ghcr.io/openhands/eval-agent-server" From 3d6955a607142df9020884f16ef90e8cd1c1dfea Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 08:50:09 +0000 Subject: [PATCH 13/33] Remove unused fields from INFER_DEFAULTS and EVAL_DEFAULTS - gaia: Remove max_retries from INFER_DEFAULTS (not used in run_infer.py) - gaia: Remove workers from EVAL_DEFAULTS (not used in eval_infer.py) - commit0: Remove workers from EVAL_DEFAULTS (not used in eval_infer.py) Each config now only contains fields that are actually used by the corresponding run_infer.py and eval_infer.py scripts. Co-authored-by: openhands --- benchmarks/commit0/config.py | 1 - benchmarks/gaia/config.py | 2 -- 2 files changed, 3 deletions(-) diff --git a/benchmarks/commit0/config.py b/benchmarks/commit0/config.py index 9c84c35b..f003693d 100644 --- a/benchmarks/commit0/config.py +++ b/benchmarks/commit0/config.py @@ -20,5 +20,4 @@ # Evaluation defaults (used by eval_infer.py) EVAL_DEFAULTS = { "model_name": "openhands", - "workers": 1, } diff --git a/benchmarks/gaia/config.py b/benchmarks/gaia/config.py index 192114ae..32d15960 100644 --- a/benchmarks/gaia/config.py +++ b/benchmarks/gaia/config.py @@ -12,12 +12,10 @@ "num_workers": 30, "max_iterations": 500, "max_attempts": 3, - "max_retries": 3, "critic": "finish_with_patch", } # Evaluation defaults (used by eval_infer.py) EVAL_DEFAULTS = { "model_name": "openhands", - "workers": 1, } From 26d428b29b00bbca22c7f1ca00a8156f06a23724 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 08:53:21 +0000 Subject: [PATCH 14/33] Make --note optional with no default Remove the default value 'initial' from --note argument. When not specified, no note identifier is appended to the output directory. The construct_eval_output_dir function already handles None/empty values gracefully by not appending the _N_ suffix. 
Co-authored-by: openhands --- benchmarks/utils/args_parser.py | 2 +- benchmarks/utils/evaluation_utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/utils/args_parser.py b/benchmarks/utils/args_parser.py index 28c9444a..6e6485e2 100644 --- a/benchmarks/utils/args_parser.py +++ b/benchmarks/utils/args_parser.py @@ -45,7 +45,7 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: ) parser.add_argument("--max-iterations", type=int, help="Maximum iterations") parser.add_argument("--num-workers", type=int, help="Number of inference workers") - parser.add_argument("--note", type=str, default="initial", help="Evaluation note") + parser.add_argument("--note", type=str, help="Optional evaluation note") parser.add_argument( "--output-dir", type=str, diff --git a/benchmarks/utils/evaluation_utils.py b/benchmarks/utils/evaluation_utils.py index 517b85d3..030457ea 100644 --- a/benchmarks/utils/evaluation_utils.py +++ b/benchmarks/utils/evaluation_utils.py @@ -18,7 +18,7 @@ def construct_eval_output_dir( dataset_name: str, model_name: str, max_iterations: int, - eval_note: str, + eval_note: str | None, ) -> str: """Construct the structured evaluation output directory path.""" # Format: eval_out/-// From e53928f66c45c067a6fdbbf7637a708baf090bbe Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 10:17:42 +0000 Subject: [PATCH 15/33] Use INFER_DEFAULTS for commit0 hardcoded values Replace hardcoded dataset, split, and repo_split values with references to INFER_DEFAULTS in: - commit0/run_infer.py: Commit0Evaluation class __init__ and prepare_instances - commit0/build_images.py: set only the specific defaults needed (dataset, split, repo_split) This ensures all commit0 code uses the centralized config values. 
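A minimal sketch (illustration only) of the fallback pattern used in the diff
below, where the or-fallback covers both None and empty strings:

    INFER_DEFAULTS = {
        "dataset": "wentingzhao/commit0_combined",
        "split": "test",
        "repo_split": "lite",
    }

    def resolve_repo_split(repo_split: str | None = None) -> str:
        return repo_split or INFER_DEFAULTS["repo_split"]

    assert resolve_repo_split() == "lite"
    assert resolve_repo_split("all") == "all"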
Co-authored-by: openhands --- benchmarks/commit0/build_images.py | 10 +++++++--- benchmarks/commit0/run_infer.py | 18 +++++++++--------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/benchmarks/commit0/build_images.py b/benchmarks/commit0/build_images.py index b59704ea..642bce72 100644 --- a/benchmarks/commit0/build_images.py +++ b/benchmarks/commit0/build_images.py @@ -4,7 +4,7 @@ Example: uv run benchmarks/commit0/build_images.py \ - --dataset wentingzhao/commit0_combined --split test --repo-split lite \ + --repo-split lite \ --image ghcr.io/openhands/eval-agent-server --push --max-workers 16 """ @@ -13,6 +13,7 @@ from commit0.harness.constants import SPLIT +from benchmarks.commit0.config import INFER_DEFAULTS from benchmarks.utils.build_utils import ( build_all_images, default_build_output_dir, @@ -90,7 +91,6 @@ def main(argv: list[str]) -> int: parser.add_argument( "--repo-split", type=str, - default="lite", help="Commit0 repo split (lite, all, or repo name)", ) parser.add_argument( @@ -99,7 +99,11 @@ def main(argv: list[str]) -> int: default="", help="Override base image prefix (default: env EVAL_DOCKER_IMAGE_PREFIX)", ) - parser.set_defaults(dataset="wentingzhao/commit0_combined") + parser.set_defaults( + dataset=INFER_DEFAULTS["dataset"], + split=INFER_DEFAULTS["split"], + repo_split=INFER_DEFAULTS["repo_split"], + ) args = parser.parse_args(argv) docker_image_prefix = args.docker_image_prefix or None diff --git a/benchmarks/commit0/run_infer.py b/benchmarks/commit0/run_infer.py index 4cc166d8..7ec21b6e 100644 --- a/benchmarks/commit0/run_infer.py +++ b/benchmarks/commit0/run_infer.py @@ -111,9 +111,9 @@ def __init__( self, metadata: EvalMetadata, num_workers: int = 1, - repo_split: str = "lite", - dataset_name: str = "wentingzhao/commit0_combined", - dataset_split: str = "test", + repo_split: str | None = None, + dataset_name: str | None = None, + dataset_split: str | None = None, ): super().__init__(metadata=metadata, num_workers=num_workers) # Store additional parameters in metadata.details for access in methods @@ -121,9 +121,9 @@ def __init__( metadata.details = {} metadata.details.update( { - "repo_split": repo_split, - "dataset_name": dataset_name, - "dataset_split": dataset_split, + "repo_split": repo_split or INFER_DEFAULTS["repo_split"], + "dataset_name": dataset_name or INFER_DEFAULTS["dataset"], + "dataset_split": dataset_split or INFER_DEFAULTS["split"], } ) @@ -131,9 +131,9 @@ def prepare_instances(self) -> List[EvalInstance]: logger.info("Setting up Commit0 evaluation data") details = self.metadata.details or {} - dataset_name = details.get("dataset_name", "wentingzhao/commit0_combined") - dataset_split = details.get("dataset_split", "test") - repo_split = details.get("repo_split", "lite") + dataset_name = details.get("dataset_name", INFER_DEFAULTS["dataset"]) + dataset_split = details.get("dataset_split", INFER_DEFAULTS["split"]) + repo_split = details.get("repo_split", INFER_DEFAULTS["repo_split"]) dataset = load_dataset(dataset_name, split=dataset_split) df = commit0_setup(dataset, repo_split) From 9fe08b9f3f7a43f2a93ebea93360d29047f140a4 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 10:24:14 +0000 Subject: [PATCH 16/33] Revert commit0/eval_infer.py and remove EVAL_DEFAULTS The commit0 eval_infer.py is a simple JSON processor that doesn't need centralized defaults. Reverted to main version. 
Co-authored-by: openhands --- benchmarks/commit0/config.py | 5 ----- benchmarks/commit0/eval_infer.py | 7 ++----- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/benchmarks/commit0/config.py b/benchmarks/commit0/config.py index f003693d..922feca4 100644 --- a/benchmarks/commit0/config.py +++ b/benchmarks/commit0/config.py @@ -16,8 +16,3 @@ "max_retries": 1, "critic": "finish_with_patch", } - -# Evaluation defaults (used by eval_infer.py) -EVAL_DEFAULTS = { - "model_name": "openhands", -} diff --git a/benchmarks/commit0/eval_infer.py b/benchmarks/commit0/eval_infer.py index 4470b36d..f03e73f6 100644 --- a/benchmarks/commit0/eval_infer.py +++ b/benchmarks/commit0/eval_infer.py @@ -15,7 +15,6 @@ import sys from pathlib import Path -from benchmarks.commit0.config import EVAL_DEFAULTS from benchmarks.utils.laminar import LaminarService from benchmarks.utils.report_costs import generate_cost_report @@ -175,12 +174,10 @@ def main() -> None: parser.add_argument( "--model-name", - help="Model name to use in the model_name_or_path field", + default="openhands", + help="Model name to use in the model_name_or_path field (default: openhands)", ) - # Apply EVAL_DEFAULTS from config - parser.set_defaults(**EVAL_DEFAULTS) - args = parser.parse_args() # Validate input file From 5b77dfa102817b8295097bb85bd14ac7d84611e3 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 10:26:56 +0000 Subject: [PATCH 17/33] Revert gaia/eval_infer.py and remove EVAL_DEFAULTS The gaia eval_infer.py is a simple JSON processor that doesn't need centralized defaults. Reverted to main version. Co-authored-by: openhands --- benchmarks/gaia/config.py | 5 ----- benchmarks/gaia/eval_infer.py | 7 ++----- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/benchmarks/gaia/config.py b/benchmarks/gaia/config.py index 32d15960..4f2ce74c 100644 --- a/benchmarks/gaia/config.py +++ b/benchmarks/gaia/config.py @@ -14,8 +14,3 @@ "max_attempts": 3, "critic": "finish_with_patch", } - -# Evaluation defaults (used by eval_infer.py) -EVAL_DEFAULTS = { - "model_name": "openhands", -} diff --git a/benchmarks/gaia/eval_infer.py b/benchmarks/gaia/eval_infer.py index 715211f3..889d132d 100644 --- a/benchmarks/gaia/eval_infer.py +++ b/benchmarks/gaia/eval_infer.py @@ -18,7 +18,6 @@ import sys from pathlib import Path -from benchmarks.gaia.config import EVAL_DEFAULTS from benchmarks.utils.laminar import LaminarService from benchmarks.utils.report_costs import generate_cost_report from openhands.sdk import get_logger @@ -198,12 +197,10 @@ def main() -> None: parser.add_argument( "--model-name", - help="Model name to use in the model_name_or_path field", + default="openhands", + help="Model name to use in the model_name_or_path field (default: openhands)", ) - # Apply EVAL_DEFAULTS from config - parser.set_defaults(**EVAL_DEFAULTS) - args = parser.parse_args() # Validate input file From b19fb1dc67b150f8a0de6e5811e449587f17f988 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 10:30:23 +0000 Subject: [PATCH 18/33] Use constants.py values in swebench/config.py Import DEFAULT_DATASET, DEFAULT_CLI_MODEL_NAME, DEFAULT_EVAL_WORKERS from constants.py instead of duplicating the values. This ensures constants.py remains the single source of truth for these values. 
Co-authored-by: openhands --- benchmarks/swebench/config.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/benchmarks/swebench/config.py b/benchmarks/swebench/config.py index 4eba91f6..13f0cd2a 100644 --- a/benchmarks/swebench/config.py +++ b/benchmarks/swebench/config.py @@ -4,9 +4,16 @@ Default values aligned with evaluation repository (OpenHands/evaluation). """ +from benchmarks.swebench.constants import ( + DEFAULT_CLI_MODEL_NAME, + DEFAULT_DATASET, + DEFAULT_EVAL_WORKERS, +) + + # Inference defaults (used by run_infer.py) INFER_DEFAULTS = { - "dataset": "princeton-nlp/SWE-bench_Verified", + "dataset": DEFAULT_DATASET, "split": "test", "workspace": "remote", "num_workers": 30, @@ -18,7 +25,7 @@ # Evaluation defaults (used by eval_infer.py) EVAL_DEFAULTS = { - "dataset": "princeton-nlp/SWE-bench_Verified", - "model_name": "openhands", - "workers": 12, + "dataset": DEFAULT_DATASET, + "model_name": DEFAULT_CLI_MODEL_NAME, + "workers": DEFAULT_EVAL_WORKERS, } From 5b68a77f065e24ba630a00d7bbaeab4c37245bc3 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 10:35:48 +0000 Subject: [PATCH 19/33] Move DEFAULT_DATASET, DEFAULT_EVAL_WORKERS, DEFAULT_CLI_MODEL_NAME to config.py Remove these constants from constants.py and update eval_infer.py to use EVAL_DEFAULTS from config.py instead. config.py is now the single source of truth for dataset, model_name, and workers defaults. Co-authored-by: openhands --- benchmarks/swebench/config.py | 15 ++++----------- benchmarks/swebench/constants.py | 13 +++---------- benchmarks/swebench/eval_infer.py | 12 +++++------- 3 files changed, 12 insertions(+), 28 deletions(-) diff --git a/benchmarks/swebench/config.py b/benchmarks/swebench/config.py index 13f0cd2a..4eba91f6 100644 --- a/benchmarks/swebench/config.py +++ b/benchmarks/swebench/config.py @@ -4,16 +4,9 @@ Default values aligned with evaluation repository (OpenHands/evaluation). """ -from benchmarks.swebench.constants import ( - DEFAULT_CLI_MODEL_NAME, - DEFAULT_DATASET, - DEFAULT_EVAL_WORKERS, -) - - # Inference defaults (used by run_infer.py) INFER_DEFAULTS = { - "dataset": DEFAULT_DATASET, + "dataset": "princeton-nlp/SWE-bench_Verified", "split": "test", "workspace": "remote", "num_workers": 30, @@ -25,7 +18,7 @@ # Evaluation defaults (used by eval_infer.py) EVAL_DEFAULTS = { - "dataset": DEFAULT_DATASET, - "model_name": DEFAULT_CLI_MODEL_NAME, - "workers": DEFAULT_EVAL_WORKERS, + "dataset": "princeton-nlp/SWE-bench_Verified", + "model_name": "openhands", + "workers": 12, } diff --git a/benchmarks/swebench/constants.py b/benchmarks/swebench/constants.py index 88d795c8..6cfd4809 100644 --- a/benchmarks/swebench/constants.py +++ b/benchmarks/swebench/constants.py @@ -1,16 +1,13 @@ """ SWE-Bench hyperparameters and constant values. -This module serves as the single source of truth for all constant values -used in the SWE-Bench evaluation workflow. +This module provides constant values used in the SWE-Bench evaluation workflow. +For dataset, model, and worker defaults, see config.py (INFER_DEFAULTS, EVAL_DEFAULTS). 
""" from typing import Final, Literal -# Dataset -DEFAULT_DATASET: Final[str] = "princeton-nlp/SWE-bench_Verified" - # Docker DOCKER_IMAGE_PREFIX: Final[str] = "docker.io/swebench/" DOCKER_IMAGE_TAG: Final[str] = "latest" @@ -28,12 +25,8 @@ DEFAULT_RUNTIME_API_URL: Final[str] = "https://runtime.eval.all-hands.dev" DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT: Final[int] = 600 -# Evaluation -DEFAULT_EVAL_WORKERS: Final[int] = 12 - -# Model - preserving original behavior: function default is "OpenHands", CLI default is "openhands" +# Model - preserving original behavior: function default is "OpenHands" DEFAULT_MODEL_NAME: Final[str] = "OpenHands" -DEFAULT_CLI_MODEL_NAME: Final[str] = "openhands" # Git GIT_USER_EMAIL: Final[str] = "evaluation@openhands.dev" diff --git a/benchmarks/swebench/eval_infer.py b/benchmarks/swebench/eval_infer.py index 8bb7a7eb..eefcc4a4 100644 --- a/benchmarks/swebench/eval_infer.py +++ b/benchmarks/swebench/eval_infer.py @@ -28,7 +28,7 @@ def convert_to_swebench_format( - input_file: str, output_file: str, model_name: str = constants.DEFAULT_MODEL_NAME + input_file: str, output_file: str, model_name: str = EVAL_DEFAULTS["model_name"] ) -> None: """ Convert OpenHands output.jsonl to SWE-Bench prediction format. @@ -117,8 +117,8 @@ def convert_to_swebench_format( def run_swebench_evaluation( predictions_file: str, - dataset: str = constants.DEFAULT_DATASET, - workers: int = constants.DEFAULT_EVAL_WORKERS, + dataset: str = EVAL_DEFAULTS["dataset"], + workers: int = EVAL_DEFAULTS["workers"], ) -> None: """ Run SWE-Bench evaluation on the predictions file. @@ -199,8 +199,7 @@ def main() -> None: parser.add_argument( "--dataset", - default=constants.DEFAULT_DATASET, - help=f"SWE-Bench dataset to evaluate against (default: {constants.DEFAULT_DATASET})", + help="SWE-Bench dataset to evaluate against", ) parser.add_argument( @@ -217,8 +216,7 @@ def main() -> None: parser.add_argument( "--model-name", - default=constants.DEFAULT_CLI_MODEL_NAME, - help=f"Model name to use in the model_name_or_path field (default: {constants.DEFAULT_CLI_MODEL_NAME})", + help="Model name to use in the model_name_or_path field", ) parser.add_argument( From e3b2b2d522eba2740657a07302a86b92040b4b14 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 10:44:44 +0000 Subject: [PATCH 20/33] Keep DEFAULT_CLI_MODEL_NAME in constants.py, remove model_name from EVAL_DEFAULTS model_name is specific to the CLI and should stay in constants.py. EVAL_DEFAULTS now only contains dataset and workers. 
Co-authored-by: openhands --- benchmarks/swebench/config.py | 1 - benchmarks/swebench/constants.py | 3 ++- benchmarks/swebench/eval_infer.py | 4 +++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/benchmarks/swebench/config.py b/benchmarks/swebench/config.py index 4eba91f6..a6ea209a 100644 --- a/benchmarks/swebench/config.py +++ b/benchmarks/swebench/config.py @@ -19,6 +19,5 @@ # Evaluation defaults (used by eval_infer.py) EVAL_DEFAULTS = { "dataset": "princeton-nlp/SWE-bench_Verified", - "model_name": "openhands", "workers": 12, } diff --git a/benchmarks/swebench/constants.py b/benchmarks/swebench/constants.py index 6cfd4809..46ca83ea 100644 --- a/benchmarks/swebench/constants.py +++ b/benchmarks/swebench/constants.py @@ -25,8 +25,9 @@ DEFAULT_RUNTIME_API_URL: Final[str] = "https://runtime.eval.all-hands.dev" DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT: Final[int] = 600 -# Model - preserving original behavior: function default is "OpenHands" +# Model - preserving original behavior: function default is "OpenHands", CLI default is "openhands" DEFAULT_MODEL_NAME: Final[str] = "OpenHands" +DEFAULT_CLI_MODEL_NAME: Final[str] = "openhands" # Git GIT_USER_EMAIL: Final[str] = "evaluation@openhands.dev" diff --git a/benchmarks/swebench/eval_infer.py b/benchmarks/swebench/eval_infer.py index eefcc4a4..0d688f31 100644 --- a/benchmarks/swebench/eval_infer.py +++ b/benchmarks/swebench/eval_infer.py @@ -28,7 +28,9 @@ def convert_to_swebench_format( - input_file: str, output_file: str, model_name: str = EVAL_DEFAULTS["model_name"] + input_file: str, + output_file: str, + model_name: str = constants.DEFAULT_CLI_MODEL_NAME, ) -> None: """ Convert OpenHands output.jsonl to SWE-Bench prediction format. From 24ba5bd6dd297216419d143fa179bdeea8a604d1 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 10:48:39 +0000 Subject: [PATCH 21/33] Remove model_name from swebenchmultimodal and swtbench EVAL_DEFAULTS Revert eval_infer.py files to main and remove model_name from EVAL_DEFAULTS. The model_name is hardcoded in the eval_infer.py files. 
Co-authored-by: openhands --- benchmarks/swebenchmultimodal/config.py | 1 - benchmarks/swebenchmultimodal/eval_infer.py | 15 ++++++++------- benchmarks/swtbench/config.py | 1 - benchmarks/swtbench/eval_infer.py | 10 ++++------ 4 files changed, 12 insertions(+), 15 deletions(-) diff --git a/benchmarks/swebenchmultimodal/config.py b/benchmarks/swebenchmultimodal/config.py index e9affe05..53550855 100644 --- a/benchmarks/swebenchmultimodal/config.py +++ b/benchmarks/swebenchmultimodal/config.py @@ -20,6 +20,5 @@ EVAL_DEFAULTS = { "dataset": "princeton-nlp/SWE-bench_Multimodal", "split": "dev", - "model_name": "openhands", "workers": 12, } diff --git a/benchmarks/swebenchmultimodal/eval_infer.py b/benchmarks/swebenchmultimodal/eval_infer.py index 1e675b7d..0984b3e5 100644 --- a/benchmarks/swebenchmultimodal/eval_infer.py +++ b/benchmarks/swebenchmultimodal/eval_infer.py @@ -16,7 +16,6 @@ from pathlib import Path from typing import Any -from benchmarks.swebenchmultimodal.config import EVAL_DEFAULTS from benchmarks.utils.patch_utils import remove_files_from_patch from benchmarks.utils.report_costs import generate_cost_report from openhands.sdk import get_logger @@ -376,12 +375,15 @@ def main() -> None: parser.add_argument( "--dataset", - help="SWE-Bench dataset to evaluate against", + default="princeton-nlp/SWE-bench_Multimodal", + help="SWE-Bench dataset to evaluate against " + "(default: princeton-nlp/SWE-bench_Multimodal)", ) parser.add_argument( "--split", - help="Dataset split to use", + default="dev", + help="Dataset split to use (default: dev)", ) parser.add_argument( @@ -398,11 +400,13 @@ def main() -> None: parser.add_argument( "--model-name", - help="Model name to use in the model_name_or_path field", + default="openhands", + help="Model name to use in the model_name_or_path field (default: openhands)", ) parser.add_argument( "--workers", + default="12", help="Number of workers to use when evaluating", ) @@ -411,9 +415,6 @@ def main() -> None: help="Run ID for the evaluation (default: eval_)", ) - # Apply EVAL_DEFAULTS from config - parser.set_defaults(**EVAL_DEFAULTS) - args = parser.parse_args() # Validate input file diff --git a/benchmarks/swtbench/config.py b/benchmarks/swtbench/config.py index 9b3e727c..f2473d58 100644 --- a/benchmarks/swtbench/config.py +++ b/benchmarks/swtbench/config.py @@ -20,6 +20,5 @@ # Note: eval uses SWE-bench dataset, not SWT-bench dataset EVAL_DEFAULTS = { "dataset": "princeton-nlp/SWE-bench_Verified", - "model_name": "OpenHands", "workers": 24, } diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index 5fb0cc5c..4f5f0632 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -18,7 +18,6 @@ from pathlib import Path from time import monotonic -from benchmarks.swtbench.config import EVAL_DEFAULTS from benchmarks.swtbench.image_utils import ( compute_required_images, ensure_swt_bench_repo, @@ -238,7 +237,7 @@ def run_swtbench_evaluation( predictions_file: str, # Must use SWE-bench dataset because SWT-bench dataset (which is based on SWE-bench) contains a bug in their harness. dataset: str = "princeton-nlp/SWE-bench_Verified", - workers: str = "24", + workers: str = "12", ) -> None: """ Run SWT-Bench evaluation on the predictions file. 
@@ -379,17 +378,16 @@ def main() -> None: parser.add_argument( "--model-name", - help="Model name to use in the model_name_or_path field", + default="OpenHands", + help="Model name to use in the model_name_or_path field (default: OpenHands)", ) parser.add_argument( "--workers", + default="12", help="Number of workers to use when evaluating", ) - # Apply EVAL_DEFAULTS from config - parser.set_defaults(**EVAL_DEFAULTS) - args = parser.parse_args() # Validate input file From 19b214f5474a18f7c99fdf459eca37ec31397aba Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 10:53:25 +0000 Subject: [PATCH 22/33] Use EVAL_DEFAULTS for dataset, split, workers in swebenchmultimodal and swtbench eval_infer Import EVAL_DEFAULTS and use parser.set_defaults() to apply them. model_name remains hardcoded in the argument parser. Co-authored-by: openhands --- benchmarks/swebenchmultimodal/eval_infer.py | 12 ++++++------ benchmarks/swtbench/eval_infer.py | 9 +++++---- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/benchmarks/swebenchmultimodal/eval_infer.py b/benchmarks/swebenchmultimodal/eval_infer.py index 0984b3e5..b65b0c66 100644 --- a/benchmarks/swebenchmultimodal/eval_infer.py +++ b/benchmarks/swebenchmultimodal/eval_infer.py @@ -16,6 +16,7 @@ from pathlib import Path from typing import Any +from benchmarks.swebenchmultimodal.config import EVAL_DEFAULTS from benchmarks.utils.patch_utils import remove_files_from_patch from benchmarks.utils.report_costs import generate_cost_report from openhands.sdk import get_logger @@ -375,15 +376,12 @@ def main() -> None: parser.add_argument( "--dataset", - default="princeton-nlp/SWE-bench_Multimodal", - help="SWE-Bench dataset to evaluate against " - "(default: princeton-nlp/SWE-bench_Multimodal)", + help="SWE-Bench dataset to evaluate against", ) parser.add_argument( "--split", - default="dev", - help="Dataset split to use (default: dev)", + help="Dataset split to use", ) parser.add_argument( @@ -406,10 +404,12 @@ def main() -> None: parser.add_argument( "--workers", - default="12", + type=int, help="Number of workers to use when evaluating", ) + parser.set_defaults(**EVAL_DEFAULTS) + parser.add_argument( "--run-id", help="Run ID for the evaluation (default: eval_)", diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index 4f5f0632..c21bc0cd 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -18,6 +18,7 @@ from pathlib import Path from time import monotonic +from benchmarks.swtbench.config import EVAL_DEFAULTS from benchmarks.swtbench.image_utils import ( compute_required_images, ensure_swt_bench_repo, @@ -359,9 +360,7 @@ def main() -> None: # Must use SWE-bench dataset because SWT-bench dataset (which is based on SWE-bench) contains a bug in their harness. 
parser.add_argument( "--dataset", - default="princeton-nlp/SWE-bench_Verified", - help="SWT-Bench dataset to evaluate against " - "(default: princeton-nlp/SWE-bench_Verified)", + help="SWT-Bench dataset to evaluate against", ) parser.add_argument( @@ -384,10 +383,12 @@ def main() -> None: parser.add_argument( "--workers", - default="12", + type=int, help="Number of workers to use when evaluating", ) + parser.set_defaults(**EVAL_DEFAULTS) + args = parser.parse_args() # Validate input file From c11887c6fb58997ebba903157c0c23152a167f77 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 10:58:25 +0000 Subject: [PATCH 23/33] Use INFER_DEFAULTS for dataset/split in swtbench image_utils and build_eval_env_images Update image_utils.py, build_eval_env_images.py, and eval_infer.py to import and use INFER_DEFAULTS instead of hardcoding dataset and split values. Co-authored-by: openhands --- benchmarks/swtbench/build_eval_env_images.py | 9 +++++++-- benchmarks/swtbench/eval_infer.py | 4 ++-- benchmarks/swtbench/image_utils.py | 9 +++++++-- 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/benchmarks/swtbench/build_eval_env_images.py b/benchmarks/swtbench/build_eval_env_images.py index 079ad66c..1c6e0820 100644 --- a/benchmarks/swtbench/build_eval_env_images.py +++ b/benchmarks/swtbench/build_eval_env_images.py @@ -9,6 +9,7 @@ import docker +from benchmarks.swtbench.config import INFER_DEFAULTS from benchmarks.swtbench.image_utils import ensure_swt_bench_repo from benchmarks.utils.dataset import get_dataset from benchmarks.utils.image_utils import image_exists as remote_image_exists @@ -257,8 +258,12 @@ def main() -> None: parser = argparse.ArgumentParser( description="Build and push prebaked SWT-bench eval env images." ) - parser.add_argument("--dataset", required=True, help="Dataset name") - parser.add_argument("--split", default="test", help="Dataset split") + parser.add_argument("--dataset", help="Dataset name") + parser.add_argument("--split", help="Dataset split") + parser.set_defaults( + dataset=INFER_DEFAULTS["dataset"], + split=INFER_DEFAULTS["split"], + ) parser.add_argument( "--eval-limit", type=int, diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index c21bc0cd..1464cc01 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -18,7 +18,7 @@ from pathlib import Path from time import monotonic -from benchmarks.swtbench.config import EVAL_DEFAULTS +from benchmarks.swtbench.config import EVAL_DEFAULTS, INFER_DEFAULTS from benchmarks.swtbench.image_utils import ( compute_required_images, ensure_swt_bench_repo, @@ -68,7 +68,7 @@ def _load_prediction_instance_ids(predictions_file: Path) -> list[str]: def try_pull_prebaked_images( predictions_file: Path, dataset: str, - split: str = "test", + split: str = INFER_DEFAULTS["split"], registry: str = PREBAKED_REGISTRY, ) -> None: """ diff --git a/benchmarks/swtbench/image_utils.py b/benchmarks/swtbench/image_utils.py index e7aae1f4..1459ee13 100644 --- a/benchmarks/swtbench/image_utils.py +++ b/benchmarks/swtbench/image_utils.py @@ -7,6 +7,7 @@ from pathlib import Path from typing import Iterable +from benchmarks.swtbench.config import INFER_DEFAULTS from openhands.sdk import get_logger @@ -130,8 +131,12 @@ def main() -> None: description="List SWT-bench base/env images required for a predictions file." 
) parser.add_argument("output_jsonl", type=Path, help="Path to output.jsonl") - parser.add_argument("--dataset", required=True, help="Dataset name") - parser.add_argument("--split", default="test", help="Dataset split") + parser.add_argument("--dataset", help="Dataset name") + parser.add_argument("--split", help="Dataset split") + parser.set_defaults( + dataset=INFER_DEFAULTS["dataset"], + split=INFER_DEFAULTS["split"], + ) parser.add_argument( "--format", choices=["plain", "json"], From 0b229371af40503711b06799916b77dc55580ea6 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 11:04:30 +0000 Subject: [PATCH 24/33] Fix swtbench: use EVAL_DEFAULTS for eval-related files, add split to EVAL_DEFAULTS image_utils.py and build_eval_env_images.py are used for evaluation, so they should use EVAL_DEFAULTS (princeton-nlp/SWE-bench_Verified) not INFER_DEFAULTS (eth-sri/SWT-bench_Verified_bm25_27k_zsp). Added split='test' to EVAL_DEFAULTS to match values.yaml. Co-authored-by: openhands --- benchmarks/swtbench/build_eval_env_images.py | 6 +++--- benchmarks/swtbench/config.py | 1 + benchmarks/swtbench/eval_infer.py | 4 ++-- benchmarks/swtbench/image_utils.py | 7 ++----- 4 files changed, 8 insertions(+), 10 deletions(-) diff --git a/benchmarks/swtbench/build_eval_env_images.py b/benchmarks/swtbench/build_eval_env_images.py index 1c6e0820..fde30ed9 100644 --- a/benchmarks/swtbench/build_eval_env_images.py +++ b/benchmarks/swtbench/build_eval_env_images.py @@ -9,7 +9,7 @@ import docker -from benchmarks.swtbench.config import INFER_DEFAULTS +from benchmarks.swtbench.config import EVAL_DEFAULTS from benchmarks.swtbench.image_utils import ensure_swt_bench_repo from benchmarks.utils.dataset import get_dataset from benchmarks.utils.image_utils import image_exists as remote_image_exists @@ -261,8 +261,8 @@ def main() -> None: parser.add_argument("--dataset", help="Dataset name") parser.add_argument("--split", help="Dataset split") parser.set_defaults( - dataset=INFER_DEFAULTS["dataset"], - split=INFER_DEFAULTS["split"], + dataset=EVAL_DEFAULTS["dataset"], + split=EVAL_DEFAULTS["split"], ) parser.add_argument( "--eval-limit", diff --git a/benchmarks/swtbench/config.py b/benchmarks/swtbench/config.py index f2473d58..a9f3276a 100644 --- a/benchmarks/swtbench/config.py +++ b/benchmarks/swtbench/config.py @@ -20,5 +20,6 @@ # Note: eval uses SWE-bench dataset, not SWT-bench dataset EVAL_DEFAULTS = { "dataset": "princeton-nlp/SWE-bench_Verified", + "split": "test", "workers": 24, } diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index 1464cc01..8c37775f 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -18,7 +18,7 @@ from pathlib import Path from time import monotonic -from benchmarks.swtbench.config import EVAL_DEFAULTS, INFER_DEFAULTS +from benchmarks.swtbench.config import EVAL_DEFAULTS from benchmarks.swtbench.image_utils import ( compute_required_images, ensure_swt_bench_repo, @@ -68,7 +68,7 @@ def _load_prediction_instance_ids(predictions_file: Path) -> list[str]: def try_pull_prebaked_images( predictions_file: Path, dataset: str, - split: str = INFER_DEFAULTS["split"], + split: str = EVAL_DEFAULTS["split"], registry: str = PREBAKED_REGISTRY, ) -> None: """ diff --git a/benchmarks/swtbench/image_utils.py b/benchmarks/swtbench/image_utils.py index 1459ee13..c5d34035 100644 --- a/benchmarks/swtbench/image_utils.py +++ b/benchmarks/swtbench/image_utils.py @@ -7,7 +7,7 @@ from pathlib import Path from typing import Iterable -from 
benchmarks.swtbench.config import INFER_DEFAULTS +from benchmarks.swtbench.config import EVAL_DEFAULTS from openhands.sdk import get_logger @@ -133,10 +133,7 @@ def main() -> None: parser.add_argument("output_jsonl", type=Path, help="Path to output.jsonl") parser.add_argument("--dataset", help="Dataset name") parser.add_argument("--split", help="Dataset split") - parser.set_defaults( - dataset=INFER_DEFAULTS["dataset"], - split=INFER_DEFAULTS["split"], - ) + parser.set_defaults(**EVAL_DEFAULTS) parser.add_argument( "--format", choices=["plain", "json"], From 98bc7b4f354a4013e7e67f86c55d4a113da4d914 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 11:07:02 +0000 Subject: [PATCH 25/33] Revert AGENTS.md and fix commit0/build_images.py docstring Revert AGENTS.md to main version. Restore original docstring example in build_images.py. Co-authored-by: openhands --- AGENTS.md | 20 -------------------- benchmarks/commit0/build_images.py | 2 +- 2 files changed, 1 insertion(+), 21 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index dae512f9..0206a51d 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -83,26 +83,6 @@ make build # Rebuild environment 4. Register CLI entrypoint in `pyproject.toml` under `[project.scripts]` 5. Update README.md with usage instructions -# Default Values Alignment -Default values in `benchmarks/utils/args_parser.py` are aligned with the evaluation -repository (OpenHands/evaluation) `eval-job/values.yaml`. This ensures consistency -between local development and production runs. - -**Shared defaults in args_parser.py:** -- `--workspace`: "remote" (production uses remote workspaces) -- `--max-iterations`: 500 (sufficient for complex tasks) -- `--critic`: "finish_with_patch" (ensures agent produces valid patches) -- `--max-attempts`: 3 (allows retries on critic failures) -- `--max-retries`: 3 (handles transient errors) - -**Benchmark-specific overrides:** Use `parser.set_defaults()` in each benchmark's -`run_infer.py` before calling `parse_args()`: -- `gaia`: dataset="gaia-benchmark/GAIA" -- `swebench`: dataset="princeton-nlp/SWE-bench_Verified" (default) -- `swtbench`: dataset="eth-sri/SWT-bench_Verified_bm25_27k_zsp" -- `commit0`: dataset="wentingzhao/commit0_combined", max_attempts=1, max_retries=1 -- `swebenchmultimodal`: dataset="princeton-nlp/SWE-bench_Multimodal", split="dev" - # LLM Configuration LLM configs use JSON matching the [LLM class schema](https://github.com/OpenHands/software-agent-sdk/blob/main/openhands/sdk/llm/llm.py#L93): ```json diff --git a/benchmarks/commit0/build_images.py b/benchmarks/commit0/build_images.py index 642bce72..8b891d85 100644 --- a/benchmarks/commit0/build_images.py +++ b/benchmarks/commit0/build_images.py @@ -4,7 +4,7 @@ Example: uv run benchmarks/commit0/build_images.py \ - --repo-split lite \ + --dataset wentingzhao/commit0_combined --split test --repo-split lite \ --image ghcr.io/openhands/eval-agent-server --push --max-workers 16 """ From a6507ed723bf14228f59a6fdd466789a0080f3b3 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 11:10:27 +0000 Subject: [PATCH 26/33] Move workspace default to args_parser.py, remove from INFER_DEFAULTS Set workspace default='remote' in args_parser.py since it's the same for all benchmarks. Remove workspace from all INFER_DEFAULTS in config.py files. 
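As a quick illustration of why this is safe, here is a minimal, hypothetical argparse sketch (illustrative only, not repository code; only the --workspace name and its "remote" default mirror the diff below) of the precedence chain these changes rely on: an argument-level default applies when nothing else is given, parser.set_defaults() overrides it, and an explicit command-line flag overrides both.

```python
# Minimal sketch of argparse default precedence:
# add_argument(default=...) < parser.set_defaults() < explicit CLI flag.
import argparse


def get_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser()
    # Shared default, analogous to --workspace in args_parser.py.
    parser.add_argument(
        "--workspace",
        choices=["docker", "remote"],
        default="remote",
        help="Type of workspace to use (default: remote)",
    )
    return parser


if __name__ == "__main__":
    # Argument-level default applies when nothing else is set.
    print(get_parser().parse_args([]).workspace)  # -> remote

    # A benchmark-level set_defaults() overrides the argument-level default...
    parser = get_parser()
    parser.set_defaults(workspace="docker")
    print(parser.parse_args([]).workspace)  # -> docker

    # ...and an explicit command-line value overrides both.
    print(parser.parse_args(["--workspace", "remote"]).workspace)  # -> remote
```

So a benchmark that genuinely needs a different workspace can still override it via set_defaults() or on the command line, while benchmarks that match the shared default no longer need an entry at all.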
Co-authored-by: openhands --- benchmarks/commit0/config.py | 1 - benchmarks/gaia/config.py | 1 - benchmarks/swebench/config.py | 1 - benchmarks/swebenchmultimodal/config.py | 1 - benchmarks/swtbench/config.py | 1 - benchmarks/utils/args_parser.py | 3 ++- 6 files changed, 2 insertions(+), 6 deletions(-) diff --git a/benchmarks/commit0/config.py b/benchmarks/commit0/config.py index 922feca4..fb3c10d9 100644 --- a/benchmarks/commit0/config.py +++ b/benchmarks/commit0/config.py @@ -9,7 +9,6 @@ "dataset": "wentingzhao/commit0_combined", "split": "test", "repo_split": "lite", - "workspace": "remote", "num_workers": 8, "max_iterations": 500, "max_attempts": 1, diff --git a/benchmarks/gaia/config.py b/benchmarks/gaia/config.py index 4f2ce74c..d4d529c4 100644 --- a/benchmarks/gaia/config.py +++ b/benchmarks/gaia/config.py @@ -8,7 +8,6 @@ INFER_DEFAULTS = { "dataset": "gaia-benchmark/GAIA", "split": "validation", - "workspace": "remote", "num_workers": 30, "max_iterations": 500, "max_attempts": 3, diff --git a/benchmarks/swebench/config.py b/benchmarks/swebench/config.py index a6ea209a..c265963c 100644 --- a/benchmarks/swebench/config.py +++ b/benchmarks/swebench/config.py @@ -8,7 +8,6 @@ INFER_DEFAULTS = { "dataset": "princeton-nlp/SWE-bench_Verified", "split": "test", - "workspace": "remote", "num_workers": 30, "max_iterations": 500, "max_attempts": 3, diff --git a/benchmarks/swebenchmultimodal/config.py b/benchmarks/swebenchmultimodal/config.py index 53550855..00db964f 100644 --- a/benchmarks/swebenchmultimodal/config.py +++ b/benchmarks/swebenchmultimodal/config.py @@ -8,7 +8,6 @@ INFER_DEFAULTS = { "dataset": "princeton-nlp/SWE-bench_Multimodal", "split": "dev", - "workspace": "remote", "num_workers": 30, "max_iterations": 500, "max_attempts": 3, diff --git a/benchmarks/swtbench/config.py b/benchmarks/swtbench/config.py index a9f3276a..73a957dd 100644 --- a/benchmarks/swtbench/config.py +++ b/benchmarks/swtbench/config.py @@ -8,7 +8,6 @@ INFER_DEFAULTS = { "dataset": "eth-sri/SWT-bench_Verified_bm25_27k_zsp", "split": "test", - "workspace": "remote", "num_workers": 30, "max_iterations": 500, "max_attempts": 3, diff --git a/benchmarks/utils/args_parser.py b/benchmarks/utils/args_parser.py index 6e6485e2..70803949 100644 --- a/benchmarks/utils/args_parser.py +++ b/benchmarks/utils/args_parser.py @@ -40,8 +40,9 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: parser.add_argument( "--workspace", type=str, + default="remote", choices=["docker", "remote"], - help="Type of workspace to use", + help="Type of workspace to use (default: remote)", ) parser.add_argument("--max-iterations", type=int, help="Maximum iterations") parser.add_argument("--num-workers", type=int, help="Number of inference workers") From ebcdec14e5d1de3cc04dfaafaccd4d9f3eec7b03 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 11:15:31 +0000 Subject: [PATCH 27/33] Move max_iterations default to args_parser.py, remove from INFER_DEFAULTS Set max_iterations default=500 in args_parser.py since it's the same for all benchmarks. Remove max_iterations from all INFER_DEFAULTS in config.py files. 
Co-authored-by: openhands --- benchmarks/commit0/config.py | 1 - benchmarks/gaia/config.py | 1 - benchmarks/swebench/config.py | 1 - benchmarks/swebenchmultimodal/config.py | 1 - benchmarks/swtbench/config.py | 1 - benchmarks/utils/args_parser.py | 7 ++++++- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmarks/commit0/config.py b/benchmarks/commit0/config.py index fb3c10d9..2bf77c9d 100644 --- a/benchmarks/commit0/config.py +++ b/benchmarks/commit0/config.py @@ -10,7 +10,6 @@ "split": "test", "repo_split": "lite", "num_workers": 8, - "max_iterations": 500, "max_attempts": 1, "max_retries": 1, "critic": "finish_with_patch", diff --git a/benchmarks/gaia/config.py b/benchmarks/gaia/config.py index d4d529c4..50a473f5 100644 --- a/benchmarks/gaia/config.py +++ b/benchmarks/gaia/config.py @@ -9,7 +9,6 @@ "dataset": "gaia-benchmark/GAIA", "split": "validation", "num_workers": 30, - "max_iterations": 500, "max_attempts": 3, "critic": "finish_with_patch", } diff --git a/benchmarks/swebench/config.py b/benchmarks/swebench/config.py index c265963c..4b24e297 100644 --- a/benchmarks/swebench/config.py +++ b/benchmarks/swebench/config.py @@ -9,7 +9,6 @@ "dataset": "princeton-nlp/SWE-bench_Verified", "split": "test", "num_workers": 30, - "max_iterations": 500, "max_attempts": 3, "max_retries": 3, "critic": "finish_with_patch", diff --git a/benchmarks/swebenchmultimodal/config.py b/benchmarks/swebenchmultimodal/config.py index 00db964f..de11a727 100644 --- a/benchmarks/swebenchmultimodal/config.py +++ b/benchmarks/swebenchmultimodal/config.py @@ -9,7 +9,6 @@ "dataset": "princeton-nlp/SWE-bench_Multimodal", "split": "dev", "num_workers": 30, - "max_iterations": 500, "max_attempts": 3, "max_retries": 3, "critic": "finish_with_patch", diff --git a/benchmarks/swtbench/config.py b/benchmarks/swtbench/config.py index 73a957dd..e41ee0f3 100644 --- a/benchmarks/swtbench/config.py +++ b/benchmarks/swtbench/config.py @@ -9,7 +9,6 @@ "dataset": "eth-sri/SWT-bench_Verified_bm25_27k_zsp", "split": "test", "num_workers": 30, - "max_iterations": 500, "max_attempts": 3, "max_retries": 3, "critic": "finish_with_patch", diff --git a/benchmarks/utils/args_parser.py b/benchmarks/utils/args_parser.py index 70803949..d554dfeb 100644 --- a/benchmarks/utils/args_parser.py +++ b/benchmarks/utils/args_parser.py @@ -44,7 +44,12 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: choices=["docker", "remote"], help="Type of workspace to use (default: remote)", ) - parser.add_argument("--max-iterations", type=int, help="Maximum iterations") + parser.add_argument( + "--max-iterations", + type=int, + default=500, + help="Maximum iterations (default: 500)", + ) parser.add_argument("--num-workers", type=int, help="Number of inference workers") parser.add_argument("--note", type=str, help="Optional evaluation note") parser.add_argument( From 2443e7de354a38a18764d0b240ca3617b6150318 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 11:18:16 +0000 Subject: [PATCH 28/33] Move critic default to critics.py, remove from INFER_DEFAULTS Set critic default='finish_with_patch' in critics.py since it's the same for all benchmarks. Remove critic from all INFER_DEFAULTS in config.py files. 
Co-authored-by: openhands --- benchmarks/commit0/config.py | 1 - benchmarks/gaia/config.py | 1 - benchmarks/swebench/config.py | 1 - benchmarks/swebenchmultimodal/config.py | 1 - benchmarks/swtbench/config.py | 1 - benchmarks/utils/critics.py | 3 ++- 6 files changed, 2 insertions(+), 6 deletions(-) diff --git a/benchmarks/commit0/config.py b/benchmarks/commit0/config.py index 2bf77c9d..ee83c646 100644 --- a/benchmarks/commit0/config.py +++ b/benchmarks/commit0/config.py @@ -12,5 +12,4 @@ "num_workers": 8, "max_attempts": 1, "max_retries": 1, - "critic": "finish_with_patch", } diff --git a/benchmarks/gaia/config.py b/benchmarks/gaia/config.py index 50a473f5..6208d844 100644 --- a/benchmarks/gaia/config.py +++ b/benchmarks/gaia/config.py @@ -10,5 +10,4 @@ "split": "validation", "num_workers": 30, "max_attempts": 3, - "critic": "finish_with_patch", } diff --git a/benchmarks/swebench/config.py b/benchmarks/swebench/config.py index 4b24e297..c882b7b5 100644 --- a/benchmarks/swebench/config.py +++ b/benchmarks/swebench/config.py @@ -11,7 +11,6 @@ "num_workers": 30, "max_attempts": 3, "max_retries": 3, - "critic": "finish_with_patch", } # Evaluation defaults (used by eval_infer.py) diff --git a/benchmarks/swebenchmultimodal/config.py b/benchmarks/swebenchmultimodal/config.py index de11a727..53be4375 100644 --- a/benchmarks/swebenchmultimodal/config.py +++ b/benchmarks/swebenchmultimodal/config.py @@ -11,7 +11,6 @@ "num_workers": 30, "max_attempts": 3, "max_retries": 3, - "critic": "finish_with_patch", } # Evaluation defaults (used by eval_infer.py) diff --git a/benchmarks/swtbench/config.py b/benchmarks/swtbench/config.py index e41ee0f3..1d87073c 100644 --- a/benchmarks/swtbench/config.py +++ b/benchmarks/swtbench/config.py @@ -11,7 +11,6 @@ "num_workers": 30, "max_attempts": 3, "max_retries": 3, - "critic": "finish_with_patch", } # Evaluation defaults (used by eval_infer.py) diff --git a/benchmarks/utils/critics.py b/benchmarks/utils/critics.py index b2978294..fa9f9d92 100644 --- a/benchmarks/utils/critics.py +++ b/benchmarks/utils/critics.py @@ -37,8 +37,9 @@ def add_critic_args(parser: ArgumentParser) -> None: parser.add_argument( "--critic", type=str, + default="finish_with_patch", help=( - "Name of the critic to use for evaluation. " + "Name of the critic to use for evaluation (default: finish_with_patch). " "Critics determine whether an agent's output is considered successful " "and whether another attempt should be made in iterative evaluation mode. " "Available critics: " From 231886635f3c584f19801b92f09a353aed9006eb Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 11:22:49 +0000 Subject: [PATCH 29/33] Revert constants.py, hardcode output-dir default in args_parser.py Revert benchmarks/utils/constants.py to main version. Hardcode './eval_outputs' as default for --output-dir in args_parser.py. 
Co-authored-by: openhands --- benchmarks/utils/args_parser.py | 3 +-- benchmarks/utils/constants.py | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/benchmarks/utils/args_parser.py b/benchmarks/utils/args_parser.py index d554dfeb..5e3e14af 100644 --- a/benchmarks/utils/args_parser.py +++ b/benchmarks/utils/args_parser.py @@ -8,7 +8,6 @@ import argparse -from benchmarks.utils.constants import OUTPUT_DIR from benchmarks.utils.critics import add_critic_args @@ -55,7 +54,7 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: parser.add_argument( "--output-dir", type=str, - default=OUTPUT_DIR, + default="./eval_outputs", help="Evaluation output directory", ) parser.add_argument( diff --git a/benchmarks/utils/constants.py b/benchmarks/utils/constants.py index e7f4f42b..9337b847 100644 --- a/benchmarks/utils/constants.py +++ b/benchmarks/utils/constants.py @@ -1,3 +1,2 @@ OUTPUT_FILENAME = "output.jsonl" -OUTPUT_DIR = "./eval_outputs" EVAL_AGENT_SERVER_IMAGE = "ghcr.io/openhands/eval-agent-server" From b8ad269c3dd28f0e1873e331f63ef3a7bf9f57ee Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 11:38:03 +0000 Subject: [PATCH 30/33] Add default level='2023_all' for GAIA benchmark Add level to GAIA INFER_DEFAULTS matching production configuration. Make --level argument optional since it now has a default. Co-authored-by: openhands --- benchmarks/gaia/config.py | 1 + benchmarks/gaia/run_infer.py | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/gaia/config.py b/benchmarks/gaia/config.py index 6208d844..49fad361 100644 --- a/benchmarks/gaia/config.py +++ b/benchmarks/gaia/config.py @@ -8,6 +8,7 @@ INFER_DEFAULTS = { "dataset": "gaia-benchmark/GAIA", "split": "validation", + "level": "2023_all", "num_workers": 30, "max_attempts": 3, } diff --git a/benchmarks/gaia/run_infer.py b/benchmarks/gaia/run_infer.py index 78e65581..7198ea63 100644 --- a/benchmarks/gaia/run_infer.py +++ b/benchmarks/gaia/run_infer.py @@ -549,8 +549,7 @@ def main() -> None: parser.add_argument( "--level", type=str, - required=True, - help="GAIA level to evaluate (e.g., 2023_level1, 2023_level2, 2023_level3)", + help="GAIA level to evaluate (e.g., 2023_level1, 2023_level2, 2023_level3, 2023_all)", ) parser.set_defaults(**INFER_DEFAULTS) args = parser.parse_args() From 19be07fc9b60189fe04192a39871a3099a8f9c39 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 29 Jan 2026 12:07:32 +0000 Subject: [PATCH 31/33] Simplify max_attempts and max_retries defaults - Keep default=3 for max_attempts and max_retries in args_parser.py - Remove redundant max_attempts=3 and max_retries=3 from config.py files (gaia, swebench, swebenchmultimodal, swtbench) since they match the default - Keep max_attempts=1 and max_retries=1 in commit0/config.py since it differs from the default - Remove max_retries from commit0/build_images.py set_defaults (uses global default) Co-authored-by: openhands --- benchmarks/commit0/build_images.py | 3 +- benchmarks/commit0/config.py | 6 +++ benchmarks/gaia/config.py | 6 ++- benchmarks/swebench/build_images.py | 2 + benchmarks/swebench/config.py | 7 +++- benchmarks/swebenchmultimodal/build_images.py | 2 + benchmarks/swebenchmultimodal/config.py | 7 +++- benchmarks/swtbench/build_images.py | 40 +++++++++++++++++-- benchmarks/swtbench/config.py | 7 +++- benchmarks/utils/args_parser.py | 6 ++- 10 files changed, 72 insertions(+), 14 deletions(-) diff --git a/benchmarks/commit0/build_images.py b/benchmarks/commit0/build_images.py index 
8b891d85..3f24567e 100644 --- a/benchmarks/commit0/build_images.py +++ b/benchmarks/commit0/build_images.py @@ -13,7 +13,7 @@ from commit0.harness.constants import SPLIT -from benchmarks.commit0.config import INFER_DEFAULTS +from benchmarks.commit0.config import BUILD_DEFAULTS, INFER_DEFAULTS from benchmarks.utils.build_utils import ( build_all_images, default_build_output_dir, @@ -103,6 +103,7 @@ def main(argv: list[str]) -> int: dataset=INFER_DEFAULTS["dataset"], split=INFER_DEFAULTS["split"], repo_split=INFER_DEFAULTS["repo_split"], + **BUILD_DEFAULTS, ) args = parser.parse_args(argv) diff --git a/benchmarks/commit0/config.py b/benchmarks/commit0/config.py index ee83c646..f0b1decb 100644 --- a/benchmarks/commit0/config.py +++ b/benchmarks/commit0/config.py @@ -5,6 +5,7 @@ """ # Inference defaults (used by run_infer.py) +# Note: commit0 uses max_attempts=1 and max_retries=1 (different from default of 3) INFER_DEFAULTS = { "dataset": "wentingzhao/commit0_combined", "split": "test", @@ -13,3 +14,8 @@ "max_attempts": 1, "max_retries": 1, } + +# Build defaults (used by build_images.py) +BUILD_DEFAULTS = { + "max_workers": 16, +} diff --git a/benchmarks/gaia/config.py b/benchmarks/gaia/config.py index 49fad361..dadaa20a 100644 --- a/benchmarks/gaia/config.py +++ b/benchmarks/gaia/config.py @@ -10,5 +10,9 @@ "split": "validation", "level": "2023_all", "num_workers": 30, - "max_attempts": 3, +} + +# Build defaults (used by build_images.py) +BUILD_DEFAULTS = { + "max_workers": 1, } diff --git a/benchmarks/swebench/build_images.py b/benchmarks/swebench/build_images.py index 2041ed58..cae96b87 100644 --- a/benchmarks/swebench/build_images.py +++ b/benchmarks/swebench/build_images.py @@ -13,6 +13,7 @@ from pathlib import Path from benchmarks.swebench import constants +from benchmarks.swebench.config import BUILD_DEFAULTS from benchmarks.utils.build_utils import ( BuildOutput, build_all_images, @@ -158,6 +159,7 @@ def _wrap_if_needed(result: BuildOutput, push: bool) -> BuildOutput: def main(argv: list[str]) -> int: parser = get_build_parser() + parser.set_defaults(**BUILD_DEFAULTS) args = parser.parse_args(argv) base_images: list[str] = collect_unique_base_images( diff --git a/benchmarks/swebench/config.py b/benchmarks/swebench/config.py index c882b7b5..cb3059e5 100644 --- a/benchmarks/swebench/config.py +++ b/benchmarks/swebench/config.py @@ -9,8 +9,6 @@ "dataset": "princeton-nlp/SWE-bench_Verified", "split": "test", "num_workers": 30, - "max_attempts": 3, - "max_retries": 3, } # Evaluation defaults (used by eval_infer.py) @@ -18,3 +16,8 @@ "dataset": "princeton-nlp/SWE-bench_Verified", "workers": 12, } + +# Build defaults (used by build_images.py) +BUILD_DEFAULTS = { + "max_workers": 32, +} diff --git a/benchmarks/swebenchmultimodal/build_images.py b/benchmarks/swebenchmultimodal/build_images.py index d32b5dc6..987cf7bd 100644 --- a/benchmarks/swebenchmultimodal/build_images.py +++ b/benchmarks/swebenchmultimodal/build_images.py @@ -10,6 +10,7 @@ import sys +from benchmarks.swebenchmultimodal.config import BUILD_DEFAULTS from benchmarks.utils.build_utils import ( build_all_images, default_build_output_dir, @@ -68,6 +69,7 @@ def collect_unique_base_images(dataset, split, n_limit): def main(argv: list[str]) -> int: parser = get_build_parser() + parser.set_defaults(**BUILD_DEFAULTS) args = parser.parse_args(argv) base_images: list[str] = collect_unique_base_images( diff --git a/benchmarks/swebenchmultimodal/config.py b/benchmarks/swebenchmultimodal/config.py index 53be4375..a0bcb772 100644 --- 
a/benchmarks/swebenchmultimodal/config.py +++ b/benchmarks/swebenchmultimodal/config.py @@ -9,8 +9,6 @@ "dataset": "princeton-nlp/SWE-bench_Multimodal", "split": "dev", "num_workers": 30, - "max_attempts": 3, - "max_retries": 3, } # Evaluation defaults (used by eval_infer.py) @@ -19,3 +17,8 @@ "split": "dev", "workers": 12, } + +# Build defaults (used by build_images.py) +BUILD_DEFAULTS = { + "max_workers": 32, +} diff --git a/benchmarks/swtbench/build_images.py b/benchmarks/swtbench/build_images.py index 09db613d..3fcd2d8d 100644 --- a/benchmarks/swtbench/build_images.py +++ b/benchmarks/swtbench/build_images.py @@ -5,18 +5,50 @@ SWT-Bench uses the same base environment images and build flow as SWE-Bench. This module simply forwards to the SWE-Bench build logic to avoid duplication while keeping the SWT entrypoint stable for workflows. + +Note: SWT-bench uses max_workers=16 (vs SWE-bench's 32) via BUILD_DEFAULTS. """ import sys from benchmarks.swebench.build_images import ( - main as swebench_main, + _wrap_if_needed, + collect_unique_base_images, + extract_custom_tag, +) +from benchmarks.swtbench.config import BUILD_DEFAULTS +from benchmarks.utils.build_utils import ( + build_all_images, + default_build_output_dir, + get_build_parser, ) -# Re-export the SWE-Bench logic under the SWT entrypoint -def main(argv: list[str]) -> int: # pragma: no cover - thin wrapper - return swebench_main(argv) +def main(argv: list[str]) -> int: + parser = get_build_parser() + parser.set_defaults(**BUILD_DEFAULTS) + args = parser.parse_args(argv) + + base_images: list[str] = collect_unique_base_images( + args.dataset, + args.split, + args.n_limit, + args.select, + ) + build_dir = default_build_output_dir(args.dataset, args.split) + + return build_all_images( + base_images=base_images, + target=args.target, + build_dir=build_dir, + image=args.image, + push=args.push, + max_workers=args.max_workers, + dry_run=args.dry_run, + max_retries=args.max_retries, + base_image_to_custom_tag_fn=extract_custom_tag, + post_build_fn=_wrap_if_needed, + ) if __name__ == "__main__": diff --git a/benchmarks/swtbench/config.py b/benchmarks/swtbench/config.py index 1d87073c..ad38f825 100644 --- a/benchmarks/swtbench/config.py +++ b/benchmarks/swtbench/config.py @@ -9,8 +9,6 @@ "dataset": "eth-sri/SWT-bench_Verified_bm25_27k_zsp", "split": "test", "num_workers": 30, - "max_attempts": 3, - "max_retries": 3, } # Evaluation defaults (used by eval_infer.py) @@ -20,3 +18,8 @@ "split": "test", "workers": 24, } + +# Build defaults (used by build_images.py) +BUILD_DEFAULTS = { + "max_workers": 16, +} diff --git a/benchmarks/utils/args_parser.py b/benchmarks/utils/args_parser.py index 5e3e14af..6ae98855 100644 --- a/benchmarks/utils/args_parser.py +++ b/benchmarks/utils/args_parser.py @@ -66,7 +66,8 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: parser.add_argument( "--max-attempts", type=int, - help="Maximum number of attempts for iterative mode (min: 1)", + default=3, + help="Maximum number of attempts for iterative mode (default: 3, min: 1)", ) # Add critic arguments (no default) @@ -80,6 +81,7 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: parser.add_argument( "--max-retries", type=int, - help="Maximum retries for instances that throw exceptions", + default=3, + help="Maximum retries for instances that throw exceptions (default: 3)", ) return parser From 3d3c73ad83d933ab68571637a6704803e61cd024 Mon Sep 17 00:00:00 2001 From: simonrosenberg 
<157206163+simonrosenberg@users.noreply.github.com> Date: Thu, 29 Jan 2026 14:01:18 +0100 Subject: [PATCH 32/33] Apply suggestion from @simonrosenberg --- benchmarks/commit0/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/commit0/config.py b/benchmarks/commit0/config.py index f0b1decb..e5d13e17 100644 --- a/benchmarks/commit0/config.py +++ b/benchmarks/commit0/config.py @@ -10,7 +10,7 @@ "dataset": "wentingzhao/commit0_combined", "split": "test", "repo_split": "lite", - "num_workers": 8, + "num_workers": 16, "max_attempts": 1, "max_retries": 1, } From a8052aefc2806a1ae38a78b62615c681245156e3 Mon Sep 17 00:00:00 2001 From: simonrosenberg <157206163+simonrosenberg@users.noreply.github.com> Date: Thu, 29 Jan 2026 15:53:20 +0100 Subject: [PATCH 33/33] Apply suggestion from @simonrosenberg --- benchmarks/commit0/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/commit0/config.py b/benchmarks/commit0/config.py index e5d13e17..dc5e2bc8 100644 --- a/benchmarks/commit0/config.py +++ b/benchmarks/commit0/config.py @@ -12,7 +12,7 @@ "repo_split": "lite", "num_workers": 16, "max_attempts": 1, - "max_retries": 1, + "max_retries": 3, } # Build defaults (used by build_images.py)