20 changes: 20 additions & 0 deletions AGENTS.md
@@ -83,6 +83,26 @@ make build # Rebuild environment
4. Register CLI entrypoint in `pyproject.toml` under `[project.scripts]`
5. Update README.md with usage instructions

# Default Values Alignment
Default values in `benchmarks/utils/args_parser.py` are aligned with `eval-job/values.yaml`
in the evaluation repository (OpenHands/evaluation), keeping local development runs
consistent with production runs.

**Shared defaults in args_parser.py:**
- `--workspace`: "remote" (production uses remote workspaces)
- `--max-iterations`: 500 (sufficient for complex tasks)
- `--critic`: "finish_with_patch" (ensures agent produces valid patches)
- `--max-attempts`: 3 (allows retries on critic failures)
- `--max-retries`: 3 (handles transient errors)
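
A quick way to confirm these shared defaults (a sketch; it assumes the repository root is importable and that `get_parser()` adds no other required arguments):

```python
# Sketch: inspect the shared defaults without supplying any CLI arguments.
from benchmarks.utils.args_parser import get_parser

parser = get_parser(add_llm_config=False)  # skip the llm_config_path positional
args = parser.parse_args([])               # empty argv -> shared defaults only
print(args.workspace, args.max_iterations, args.critic, args.max_attempts)
# expected: remote 500 finish_with_patch 3
```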

**Benchmark-specific overrides:** Use `parser.set_defaults()` in each benchmark's
`run_infer.py` before calling `parse_args()` (see the sketch after this list):
- `gaia`: dataset="gaia-benchmark/GAIA"
- `swebench`: dataset="princeton-nlp/SWE-bench_Verified", split="test"
- `swtbench`: dataset="eth-sri/SWT-bench_Verified_bm25_27k_zsp"
- `commit0`: dataset="wentingzhao/commit0_combined", max_attempts=1, max_retries=1
- `swebenchmultimodal`: dataset="princeton-nlp/SWE-bench_Multimodal", split="dev"
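
For example, a minimal sketch of the override pattern (the `mybench` benchmark, its dataset, and the import path are hypothetical placeholders, not part of this repository):

```python
# Sketch of a hypothetical benchmarks/mybench/run_infer.py entrypoint;
# the import path mirrors the existing benchmarks and is assumed here.
from benchmarks.utils.args_parser import get_parser


def main() -> None:
    parser = get_parser()
    # Benchmark-specific defaults must be registered *before* parse_args();
    # otherwise the shared defaults from args_parser.py apply.
    parser.set_defaults(
        dataset="example-org/MyBench",  # hypothetical dataset name
        max_attempts=1,  # e.g. a single-attempt benchmark disables retries
    )
    args = parser.parse_args()
    print(args.dataset, args.max_attempts)


if __name__ == "__main__":
    main()
```

Values passed on the command line still take precedence over anything registered via `set_defaults()`.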

# LLM Configuration
LLM configs use JSON matching the [LLM class schema](https://github.com/OpenHands/software-agent-sdk/blob/main/openhands/sdk/llm/llm.py#L93):
```json
8 changes: 6 additions & 2 deletions benchmarks/commit0/run_infer.py
@@ -596,8 +596,12 @@ def main() -> None:
default="lite",
help="all, lite, or each repo name",
)
# Override the default dataset for commit0
parser.set_defaults(dataset="wentingzhao/commit0_combined")
# Override defaults for commit0 (matches evaluation repository values.yaml)
parser.set_defaults(
dataset="wentingzhao/commit0_combined",
max_attempts=1,
max_retries=1,
)
args = parser.parse_args()

# Validate max_attempts
4 changes: 3 additions & 1 deletion benchmarks/gaia/run_infer.py
@@ -551,6 +551,8 @@ def main() -> None:
required=True,
help="GAIA level to evaluate (e.g., 2023_level1, 2023_level2, 2023_level3)",
)
# Override defaults for GAIA (matches evaluation repository values.yaml)
parser.set_defaults(dataset="gaia-benchmark/GAIA")
args = parser.parse_args()

# Create critic instance from parsed arguments
@@ -585,7 +587,7 @@ def main() -> None:
# Create metadata
metadata = EvalMetadata(
llm=llm,
dataset="gaia-benchmark/GAIA",
dataset=args.dataset,
dataset_split=args.split,
max_iterations=args.max_iterations,
eval_output_dir=structured_output_dir,
3 changes: 3 additions & 0 deletions benchmarks/swebench/run_infer.py
@@ -334,6 +334,9 @@ def main() -> None:
choices=choices,
help="Path to prompt template file",
)
# Override defaults for SWE-bench (matches evaluation repository values.yaml);
# set explicitly here for consistency with the other benchmarks
parser.set_defaults(dataset="princeton-nlp/SWE-bench_Verified", split="test")
args = parser.parse_args()

# Validate max_attempts
2 changes: 1 addition & 1 deletion benchmarks/swebenchmultimodal/run_infer.py
@@ -423,7 +423,7 @@ def main() -> None:
choices=choices,
help="Path to prompt template file",
)
# Override the default dataset and split for multimodal
# Override defaults for SWE-bench Multimodal (matches evaluation repository values.yaml)
parser.set_defaults(dataset="princeton-nlp/SWE-bench_Multimodal", split="dev")
args = parser.parse_args()

2 changes: 2 additions & 0 deletions benchmarks/swtbench/run_infer.py
@@ -355,6 +355,8 @@ def main() -> None:
choices=choices,
help="Path to prompt template file",
)
# Override defaults for SWT-bench (matches evaluation repository values.yaml)
parser.set_defaults(dataset="eth-sri/SWT-bench_Verified_bm25_27k_zsp")
args = parser.parse_args()

# Validate max_attempts
32 changes: 22 additions & 10 deletions benchmarks/utils/args_parser.py
@@ -1,5 +1,11 @@
"""
Argument parsing utilities for SWE-bench benchmarks.
Argument parsing utilities for benchmarks.

Default values are aligned with the evaluation repository (OpenHands/evaluation)
to ensure consistency between local development and production runs.

Benchmark-specific values should be set via parser.set_defaults() in each
benchmark's run_infer.py to override these common defaults.
"""

import argparse
@@ -8,10 +14,17 @@


def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser:
"""Create and return argument parser.
"""Create and return argument parser with common defaults.

Default values match the most common settings used across benchmarks
in the evaluation repository. Individual benchmarks can override
these using parser.set_defaults() before calling parse_args().

Args:
add_llm_config: Whether to add the llm_config_path positional argument.

Returns:
ArgumentParser instance
ArgumentParser instance with common benchmark arguments.
"""
parser = argparse.ArgumentParser(description="Run Evaluation inference")
if add_llm_config:
@@ -23,22 +36,21 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser:
parser.add_argument(
"--dataset",
type=str,
default="princeton-nlp/SWE-bench_Verified",
help="Dataset name",
help="Dataset name (each benchmark sets its default via set_defaults)",
)
parser.add_argument("--split", type=str, default="test", help="Dataset split")
parser.add_argument(
"--workspace",
type=str,
default="docker",
default="remote",
choices=["docker", "remote"],
help="Type of workspace to use (default: docker)",
help="Type of workspace to use (default: remote)",
)
parser.add_argument(
"--max-iterations", type=int, default=100, help="Maximum iterations"
"--max-iterations", type=int, default=500, help="Maximum iterations"
)
parser.add_argument(
"--num-workers", type=int, default=1, help="Number of evaluation workers"
"--num-workers", type=int, default=1, help="Number of inference workers"
)
parser.add_argument("--note", type=str, default="initial", help="Evaluation note")
parser.add_argument(
@@ -60,7 +72,7 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser:
help="Maximum number of attempts for iterative mode (default: 3, min: 1)",
)

# Add critic arguments
# Add critic arguments (default: finish_with_patch)
add_critic_args(parser)

parser.add_argument(
8 changes: 4 additions & 4 deletions benchmarks/utils/critics.py
@@ -37,17 +37,17 @@ def add_critic_args(parser: ArgumentParser) -> None:
parser.add_argument(
"--critic",
type=str,
default="pass",
default="finish_with_patch",
help=(
"Name of the critic to use for evaluation (default: 'pass'). "
"Name of the critic to use for evaluation (default: 'finish_with_patch'). "
"Critics determine whether an agent's output is considered successful "
"and whether another attempt should be made in iterative evaluation mode. "
"Available critics: "
"'pass' - Always accepts the output (no retry logic, suitable for single-attempt runs), "
"'finish_with_patch' - Requires both AgentFinishAction and non-empty git patch, "
"'empty_patch_critic' - Only requires non-empty git patch. "
"For single-attempt runs (default), 'pass' is recommended as the actual evaluation "
"is performed by the benchmark's own scoring system."
"For production runs, 'finish_with_patch' is recommended as it ensures "
"the agent produces a valid patch before completing."
),
)
parser.add_argument(