Changes from all 33 commits
1f3437b
Align default argument values with evaluation repository
openhands-agent Jan 28, 2026
e58ddb7
Add explicit set_defaults for swebench and update comment for swebenc…
openhands-agent Jan 28, 2026
dcb940f
Remove default dataset from args_parser.py
openhands-agent Jan 28, 2026
c34d730
Add default value for llm_config_path
openhands-agent Jan 28, 2026
6af188a
Revert "Add default value for llm_config_path"
openhands-agent Jan 28, 2026
5fcb61d
WIP: Add config.py files and refactor to use INFER_DEFAULTS
openhands-agent Jan 29, 2026
7fec81f
Fix import ordering to pass ruff lint checks
openhands-agent Jan 29, 2026
4ed5b72
Add missing workers field to GAIA and Commit0 EVAL_DEFAULTS
openhands-agent Jan 29, 2026
2c1a9e1
Use EVAL_DEFAULTS from config in eval_infer.py files
openhands-agent Jan 29, 2026
4e8cb53
Use INFER_DEFAULTS from config in commit0 and swebenchmultimodal run_…
openhands-agent Jan 29, 2026
7b8ab3d
Use EVAL_DEFAULTS from config in commit0 and gaia eval_infer.py
openhands-agent Jan 29, 2026
05d34f9
Move common defaults (note, n_limit, output_dir) to args_parser.py
openhands-agent Jan 29, 2026
3d6955a
Remove unused fields from INFER_DEFAULTS and EVAL_DEFAULTS
openhands-agent Jan 29, 2026
26d428b
Make --note optional with no default
openhands-agent Jan 29, 2026
e53928f
Use INFER_DEFAULTS for commit0 hardcoded values
openhands-agent Jan 29, 2026
9fe08b9
Revert commit0/eval_infer.py and remove EVAL_DEFAULTS
openhands-agent Jan 29, 2026
5b77dfa
Revert gaia/eval_infer.py and remove EVAL_DEFAULTS
openhands-agent Jan 29, 2026
b19fb1d
Use constants.py values in swebench/config.py
openhands-agent Jan 29, 2026
5b68a77
Move DEFAULT_DATASET, DEFAULT_EVAL_WORKERS, DEFAULT_CLI_MODEL_NAME to…
openhands-agent Jan 29, 2026
e3b2b2d
Keep DEFAULT_CLI_MODEL_NAME in constants.py, remove model_name from E…
openhands-agent Jan 29, 2026
24ba5bd
Remove model_name from swebenchmultimodal and swtbench EVAL_DEFAULTS
openhands-agent Jan 29, 2026
19b214f
Use EVAL_DEFAULTS for dataset, split, workers in swebenchmultimodal a…
openhands-agent Jan 29, 2026
c11887c
Use INFER_DEFAULTS for dataset/split in swtbench image_utils and buil…
openhands-agent Jan 29, 2026
0b22937
Fix swtbench: use EVAL_DEFAULTS for eval-related files, add split to …
openhands-agent Jan 29, 2026
98bc7b4
Revert AGENTS.md and fix commit0/build_images.py docstring
openhands-agent Jan 29, 2026
a6507ed
Move workspace default to args_parser.py, remove from INFER_DEFAULTS
openhands-agent Jan 29, 2026
ebcdec1
Move max_iterations default to args_parser.py, remove from INFER_DEFA…
openhands-agent Jan 29, 2026
2443e7d
Move critic default to critics.py, remove from INFER_DEFAULTS
openhands-agent Jan 29, 2026
2318866
Revert constants.py, hardcode output-dir default in args_parser.py
openhands-agent Jan 29, 2026
b8ad269
Add default level='2023_all' for GAIA benchmark
openhands-agent Jan 29, 2026
19be07f
Simplify max_attempts and max_retries defaults
openhands-agent Jan 29, 2026
3d3c73a
Apply suggestion from @simonrosenberg
simonrosenberg Jan 29, 2026
a8052ae
Apply suggestion from @simonrosenberg
simonrosenberg Jan 29, 2026
9 changes: 7 additions & 2 deletions benchmarks/commit0/build_images.py
@@ -13,6 +13,7 @@

from commit0.harness.constants import SPLIT

from benchmarks.commit0.config import BUILD_DEFAULTS, INFER_DEFAULTS
from benchmarks.utils.build_utils import (
build_all_images,
default_build_output_dir,
@@ -90,7 +91,6 @@ def main(argv: list[str]) -> int:
parser.add_argument(
"--repo-split",
type=str,
default="lite",
help="Commit0 repo split (lite, all, or repo name)",
)
parser.add_argument(
@@ -99,7 +99,12 @@
default="",
help="Override base image prefix (default: env EVAL_DOCKER_IMAGE_PREFIX)",
)
parser.set_defaults(dataset="wentingzhao/commit0_combined")
parser.set_defaults(
dataset=INFER_DEFAULTS["dataset"],
split=INFER_DEFAULTS["split"],
repo_split=INFER_DEFAULTS["repo_split"],
**BUILD_DEFAULTS,
)
args = parser.parse_args(argv)

docker_image_prefix = args.docker_image_prefix or None
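The change above moves the build script's defaults out of individual add_argument calls and into parser.set_defaults(...). A minimal, self-contained sketch (hypothetical toy parser, not the benchmark's actual one) of the precedence argparse applies: command-line values override parser.set_defaults, which in turn overrides any default given to add_argument.

import argparse

# Illustrative only: a toy defaults dict mirroring the INFER_DEFAULTS pattern.
DEFAULTS = {"dataset": "wentingzhao/commit0_combined", "split": "test"}

parser = argparse.ArgumentParser()
parser.add_argument("--dataset", type=str, help="dataset name")  # no default here
parser.add_argument("--split", type=str, default="train")  # argument-level default

# Parser-level defaults override argument-level defaults.
parser.set_defaults(**DEFAULTS)

print(parser.parse_args([]))  # dataset and split both come from DEFAULTS
print(parser.parse_args(["--split", "dev"]))  # explicit CLI value wins: split='dev'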
21 changes: 21 additions & 0 deletions benchmarks/commit0/config.py
@@ -0,0 +1,21 @@
"""
Commit0 benchmark configuration.

Default values aligned with evaluation repository (OpenHands/evaluation).
"""

# Inference defaults (used by run_infer.py)
# Note: commit0 uses max_attempts=1 (different from the default of 3)
INFER_DEFAULTS = {
"dataset": "wentingzhao/commit0_combined",
"split": "test",
"repo_split": "lite",
"num_workers": 16,
"max_attempts": 1,
"max_retries": 3,
}

# Build defaults (used by build_images.py)
BUILD_DEFAULTS = {
"max_workers": 16,
}
24 changes: 12 additions & 12 deletions benchmarks/commit0/run_infer.py
@@ -12,6 +12,7 @@
extract_custom_tag,
get_base_docker_image,
)
from benchmarks.commit0.config import INFER_DEFAULTS
from benchmarks.utils.args_parser import get_parser
from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
from benchmarks.utils.conversation import build_event_persistence_callback
@@ -110,29 +111,29 @@ def __init__(
self,
metadata: EvalMetadata,
num_workers: int = 1,
repo_split: str = "lite",
dataset_name: str = "wentingzhao/commit0_combined",
dataset_split: str = "test",
repo_split: str | None = None,
dataset_name: str | None = None,
dataset_split: str | None = None,
):
super().__init__(metadata=metadata, num_workers=num_workers)
# Store additional parameters in metadata.details for access in methods
if not hasattr(metadata, "details") or metadata.details is None:
metadata.details = {}
metadata.details.update(
{
"repo_split": repo_split,
"dataset_name": dataset_name,
"dataset_split": dataset_split,
"repo_split": repo_split or INFER_DEFAULTS["repo_split"],
"dataset_name": dataset_name or INFER_DEFAULTS["dataset"],
"dataset_split": dataset_split or INFER_DEFAULTS["split"],
}
)

def prepare_instances(self) -> List[EvalInstance]:
logger.info("Setting up Commit0 evaluation data")

details = self.metadata.details or {}
dataset_name = details.get("dataset_name", "wentingzhao/commit0_combined")
dataset_split = details.get("dataset_split", "test")
repo_split = details.get("repo_split", "lite")
dataset_name = details.get("dataset_name", INFER_DEFAULTS["dataset"])
dataset_split = details.get("dataset_split", INFER_DEFAULTS["split"])
repo_split = details.get("repo_split", INFER_DEFAULTS["repo_split"])

dataset = load_dataset(dataset_name, split=dataset_split)
df = commit0_setup(dataset, repo_split)
@@ -593,11 +594,10 @@ def main() -> None:
parser.add_argument(
"--repo-split",
type=str,
default="lite",
help="all, lite, or each repo name",
)
# Override the default dataset for commit0
parser.set_defaults(dataset="wentingzhao/commit0_combined")
# Apply INFER_DEFAULTS from config (matches evaluation repository values.yaml)
parser.set_defaults(**INFER_DEFAULTS)
args = parser.parse_args()

# Validate max_attempts
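The constructor change above replaces hardcoded parameter defaults with None and falls back to INFER_DEFAULTS inside the body. A short sketch of that fallback pattern with hypothetical names (not the benchmark's actual class):

# Illustrative fallback helper; names are hypothetical.
INFER_DEFAULTS = {"dataset": "wentingzhao/commit0_combined", "repo_split": "lite"}

def resolve(dataset: str | None = None, repo_split: str | None = None) -> dict:
    # `value or DEFAULT` substitutes the default for None, but also for any
    # other falsy value such as the empty string; that is acceptable here
    # because an empty dataset or split name is never meaningful.
    return {
        "dataset": dataset or INFER_DEFAULTS["dataset"],
        "repo_split": repo_split or INFER_DEFAULTS["repo_split"],
    }

print(resolve())  # both defaults applied
print(resolve(repo_split="all"))  # an explicit value wins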
18 changes: 18 additions & 0 deletions benchmarks/gaia/config.py
@@ -0,0 +1,18 @@
"""
GAIA benchmark configuration.
Default values aligned with evaluation repository (OpenHands/evaluation).
"""

# Inference defaults (used by run_infer.py)
INFER_DEFAULTS = {
"dataset": "gaia-benchmark/GAIA",
"split": "validation",
"level": "2023_all",
"num_workers": 30,
}

# Build defaults (used by build_images.py)
BUILD_DEFAULTS = {
"max_workers": 1,
}
7 changes: 4 additions & 3 deletions benchmarks/gaia/run_infer.py
@@ -11,6 +11,7 @@
from datasets import DatasetDict, load_dataset
from PIL import Image

from benchmarks.gaia.config import INFER_DEFAULTS
from benchmarks.gaia.scorer import question_scorer
from benchmarks.gaia.utils import image_to_jpg_base64_url, image_to_png_base64_url
from benchmarks.utils.args_parser import get_parser
@@ -548,9 +549,9 @@ def main() -> None:
parser.add_argument(
"--level",
type=str,
required=True,
help="GAIA level to evaluate (e.g., 2023_level1, 2023_level2, 2023_level3)",
help="GAIA level to evaluate (e.g., 2023_level1, 2023_level2, 2023_level3, 2023_all)",
)
parser.set_defaults(**INFER_DEFAULTS)
args = parser.parse_args()

# Create critic instance from parsed arguments
@@ -585,7 +586,7 @@
# Create metadata
metadata = EvalMetadata(
llm=llm,
dataset="gaia-benchmark/GAIA",
dataset=args.dataset,
dataset_split=args.split,
max_iterations=args.max_iterations,
eval_output_dir=structured_output_dir,
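Dropping required=True on --level is what lets the new 2023_all default take effect: argparse still demands a required optional on the command line even when a default exists. A small illustrative check (hypothetical parser, not the GAIA script itself):

import argparse

# With required=True, the default is ignored and parsing with no arguments
# exits with "the following arguments are required: --level".
strict = argparse.ArgumentParser()
strict.add_argument("--level", required=True)
strict.set_defaults(level="2023_all")
try:
    strict.parse_args([])
except SystemExit:
    pass  # rejected despite the default

# Without required=True, the set_defaults value applies and can still be overridden.
relaxed = argparse.ArgumentParser()
relaxed.add_argument("--level")
relaxed.set_defaults(level="2023_all")
print(relaxed.parse_args([]).level)  # 2023_all
print(relaxed.parse_args(["--level", "2023_level1"]).level)  # 2023_level1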
2 changes: 2 additions & 0 deletions benchmarks/swebench/build_images.py
@@ -13,6 +13,7 @@
from pathlib import Path

from benchmarks.swebench import constants
from benchmarks.swebench.config import BUILD_DEFAULTS
from benchmarks.utils.build_utils import (
BuildOutput,
build_all_images,
@@ -158,6 +159,7 @@ def _wrap_if_needed(result: BuildOutput, push: bool) -> BuildOutput:

def main(argv: list[str]) -> int:
parser = get_build_parser()
parser.set_defaults(**BUILD_DEFAULTS)
args = parser.parse_args(argv)

base_images: list[str] = collect_unique_base_images(
23 changes: 23 additions & 0 deletions benchmarks/swebench/config.py
@@ -0,0 +1,23 @@
"""
SWE-bench benchmark configuration.

Default values aligned with evaluation repository (OpenHands/evaluation).
"""

# Inference defaults (used by run_infer.py)
INFER_DEFAULTS = {
"dataset": "princeton-nlp/SWE-bench_Verified",
"split": "test",
"num_workers": 30,
}

# Evaluation defaults (used by eval_infer.py)
EVAL_DEFAULTS = {
"dataset": "princeton-nlp/SWE-bench_Verified",
"workers": 12,
}

# Build defaults (used by build_images.py)
BUILD_DEFAULTS = {
"max_workers": 32,
}
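INFER_DEFAULTS and EVAL_DEFAULTS above both spell out princeton-nlp/SWE-bench_Verified. If that repetition ever becomes a maintenance concern, one possible layout (purely illustrative; not what this PR does) shares the string through a module-private constant:

# Illustrative alternative only; the PR keeps the literals duplicated.
_DATASET = "princeton-nlp/SWE-bench_Verified"

INFER_DEFAULTS = {"dataset": _DATASET, "split": "test", "num_workers": 30}
EVAL_DEFAULTS = {"dataset": _DATASET, "workers": 12}
BUILD_DEFAULTS = {"max_workers": 32}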
10 changes: 2 additions & 8 deletions benchmarks/swebench/constants.py
@@ -1,16 +1,13 @@
"""
SWE-Bench hyperparameters and constant values.
This module serves as the single source of truth for all constant values
used in the SWE-Bench evaluation workflow.
This module provides constant values used in the SWE-Bench evaluation workflow.
For dataset, model, and worker defaults, see config.py (INFER_DEFAULTS, EVAL_DEFAULTS).
"""

from typing import Final, Literal


# Dataset
DEFAULT_DATASET: Final[str] = "princeton-nlp/SWE-bench_Verified"

# Docker
DOCKER_IMAGE_PREFIX: Final[str] = "docker.io/swebench/"
DOCKER_IMAGE_TAG: Final[str] = "latest"
@@ -28,9 +25,6 @@
DEFAULT_RUNTIME_API_URL: Final[str] = "https://runtime.eval.all-hands.dev"
DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT: Final[int] = 600

# Evaluation
DEFAULT_EVAL_WORKERS: Final[int] = 12

# Model - preserving original behavior: function default is "OpenHands", CLI default is "openhands"
DEFAULT_MODEL_NAME: Final[str] = "OpenHands"
DEFAULT_CLI_MODEL_NAME: Final[str] = "openhands"
21 changes: 12 additions & 9 deletions benchmarks/swebench/eval_infer.py
@@ -17,6 +17,7 @@
from pathlib import Path

from benchmarks.swebench import constants
from benchmarks.swebench.config import EVAL_DEFAULTS
from benchmarks.utils.laminar import LaminarService
from benchmarks.utils.patch_utils import remove_files_from_patch
from benchmarks.utils.report_costs import generate_cost_report
@@ -27,7 +28,9 @@


def convert_to_swebench_format(
input_file: str, output_file: str, model_name: str = constants.DEFAULT_MODEL_NAME
input_file: str,
output_file: str,
model_name: str = constants.DEFAULT_CLI_MODEL_NAME,
) -> None:
"""
Convert OpenHands output.jsonl to SWE-Bench prediction format.
@@ -116,8 +119,8 @@ def convert_to_swebench_format(

def run_swebench_evaluation(
predictions_file: str,
dataset: str = constants.DEFAULT_DATASET,
workers: int = constants.DEFAULT_EVAL_WORKERS,
dataset: str = EVAL_DEFAULTS["dataset"],
workers: int = EVAL_DEFAULTS["workers"],
) -> None:
"""
Run SWE-Bench evaluation on the predictions file.
@@ -198,8 +201,7 @@ def main() -> None:

parser.add_argument(
"--dataset",
default=constants.DEFAULT_DATASET,
help=f"SWE-Bench dataset to evaluate against (default: {constants.DEFAULT_DATASET})",
help="SWE-Bench dataset to evaluate against",
)

parser.add_argument(
@@ -216,17 +218,18 @@

parser.add_argument(
"--model-name",
default=constants.DEFAULT_CLI_MODEL_NAME,
help=f"Model name to use in the model_name_or_path field (default: {constants.DEFAULT_CLI_MODEL_NAME})",
help="Model name to use in the model_name_or_path field",
)

parser.add_argument(
"--workers",
type=int,
default=constants.DEFAULT_EVAL_WORKERS,
help=f"Number of workers to use when evaluating (default: {constants.DEFAULT_EVAL_WORKERS})",
help="Number of workers to use when evaluating",
)

# Apply EVAL_DEFAULTS from config
parser.set_defaults(**EVAL_DEFAULTS)

args = parser.parse_args()

# Validate input file
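One subtlety of the parser.set_defaults(**EVAL_DEFAULTS) pattern used above: argparse accepts keys that no argument defines, so a typo in the defaults dict would silently become an extra namespace attribute instead of raising. A hedged sanity-check sketch (hypothetical, and it relies on argparse's private _actions attribute, so treat it as illustration only):

import argparse

EVAL_DEFAULTS = {"dataset": "princeton-nlp/SWE-bench_Verified", "workers": 12}

parser = argparse.ArgumentParser()
parser.add_argument("--dataset")
parser.add_argument("--workers", type=int)

# Private API, illustration only: each action's .dest is the attribute name
# the argument writes onto the parsed namespace.
known = {action.dest for action in parser._actions}
unknown = set(EVAL_DEFAULTS) - known
if unknown:
    raise ValueError(f"defaults refer to unknown arguments: {sorted(unknown)}")
parser.set_defaults(**EVAL_DEFAULTS)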
2 changes: 2 additions & 0 deletions benchmarks/swebench/run_infer.py
@@ -11,6 +11,7 @@
should_wrap_instance_id,
wrap_image,
)
from benchmarks.swebench.config import INFER_DEFAULTS
from benchmarks.utils.args_parser import get_parser
from benchmarks.utils.build_utils import build_image
from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
@@ -334,6 +335,7 @@ def main() -> None:
choices=choices,
help="Path to prompt template file",
)
parser.set_defaults(**INFER_DEFAULTS)
args = parser.parse_args()

# Validate max_attempts
2 changes: 2 additions & 0 deletions benchmarks/swebenchmultimodal/build_images.py
@@ -10,6 +10,7 @@

import sys

from benchmarks.swebenchmultimodal.config import BUILD_DEFAULTS
from benchmarks.utils.build_utils import (
build_all_images,
default_build_output_dir,
@@ -68,6 +69,7 @@ def collect_unique_base_images(dataset, split, n_limit):

def main(argv: list[str]) -> int:
parser = get_build_parser()
parser.set_defaults(**BUILD_DEFAULTS)
args = parser.parse_args(argv)

base_images: list[str] = collect_unique_base_images(
24 changes: 24 additions & 0 deletions benchmarks/swebenchmultimodal/config.py
@@ -0,0 +1,24 @@
"""
SWE-bench Multimodal benchmark configuration.
Default values aligned with evaluation repository (OpenHands/evaluation).
"""

# Inference defaults (used by run_infer.py)
INFER_DEFAULTS = {
"dataset": "princeton-nlp/SWE-bench_Multimodal",
"split": "dev",
"num_workers": 30,
}

# Evaluation defaults (used by eval_infer.py)
EVAL_DEFAULTS = {
"dataset": "princeton-nlp/SWE-bench_Multimodal",
"split": "dev",
"workers": 12,
}

# Build defaults (used by build_images.py)
BUILD_DEFAULTS = {
"max_workers": 32,
}
12 changes: 6 additions & 6 deletions benchmarks/swebenchmultimodal/eval_infer.py
@@ -16,6 +16,7 @@
from pathlib import Path
from typing import Any

from benchmarks.swebenchmultimodal.config import EVAL_DEFAULTS
from benchmarks.utils.patch_utils import remove_files_from_patch
from benchmarks.utils.report_costs import generate_cost_report
from openhands.sdk import get_logger
@@ -375,15 +376,12 @@ def main() -> None:

parser.add_argument(
"--dataset",
default="princeton-nlp/SWE-bench_Multimodal",
help="SWE-Bench dataset to evaluate against "
"(default: princeton-nlp/SWE-bench_Multimodal)",
help="SWE-Bench dataset to evaluate against",
)

parser.add_argument(
"--split",
default="dev",
help="Dataset split to use (default: dev)",
help="Dataset split to use",
)

parser.add_argument(
@@ -406,10 +404,12 @@

parser.add_argument(
"--workers",
default="12",
type=int,
help="Number of workers to use when evaluating",
)

parser.set_defaults(**EVAL_DEFAULTS)

parser.add_argument(
"--run-id",
help="Run ID for the evaluation (default: eval_<output_filename>)",