diff --git a/benchmarks/commit0/build_images.py b/benchmarks/commit0/build_images.py
index b59704ea..3f24567e 100644
--- a/benchmarks/commit0/build_images.py
+++ b/benchmarks/commit0/build_images.py
@@ -13,6 +13,7 @@
 from commit0.harness.constants import SPLIT
 
+from benchmarks.commit0.config import BUILD_DEFAULTS, INFER_DEFAULTS
 from benchmarks.utils.build_utils import (
     build_all_images,
     default_build_output_dir,
@@ -90,7 +91,6 @@ def main(argv: list[str]) -> int:
     parser.add_argument(
         "--repo-split",
         type=str,
-        default="lite",
         help="Commit0 repo split (lite, all, or repo name)",
     )
     parser.add_argument(
@@ -99,7 +99,12 @@
         default="",
         help="Override base image prefix (default: env EVAL_DOCKER_IMAGE_PREFIX)",
     )
-    parser.set_defaults(dataset="wentingzhao/commit0_combined")
+    parser.set_defaults(
+        dataset=INFER_DEFAULTS["dataset"],
+        split=INFER_DEFAULTS["split"],
+        repo_split=INFER_DEFAULTS["repo_split"],
+        **BUILD_DEFAULTS,
+    )
     args = parser.parse_args(argv)
 
     docker_image_prefix = args.docker_image_prefix or None
diff --git a/benchmarks/commit0/config.py b/benchmarks/commit0/config.py
new file mode 100644
index 00000000..dc5e2bc8
--- /dev/null
+++ b/benchmarks/commit0/config.py
@@ -0,0 +1,21 @@
+"""
+Commit0 benchmark configuration.
+
+Default values aligned with evaluation repository (OpenHands/evaluation).
+"""
+
+# Inference defaults (used by run_infer.py)
+# Note: commit0 uses max_attempts=1 (different from the default of 3)
+INFER_DEFAULTS = {
+    "dataset": "wentingzhao/commit0_combined",
+    "split": "test",
+    "repo_split": "lite",
+    "num_workers": 16,
+    "max_attempts": 1,
+    "max_retries": 3,
+}
+
+# Build defaults (used by build_images.py)
+BUILD_DEFAULTS = {
+    "max_workers": 16,
+}
diff --git a/benchmarks/commit0/run_infer.py b/benchmarks/commit0/run_infer.py
index 2e473669..7ec21b6e 100644
--- a/benchmarks/commit0/run_infer.py
+++ b/benchmarks/commit0/run_infer.py
@@ -12,6 +12,7 @@
     extract_custom_tag,
     get_base_docker_image,
 )
+from benchmarks.commit0.config import INFER_DEFAULTS
 from benchmarks.utils.args_parser import get_parser
 from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
 from benchmarks.utils.conversation import build_event_persistence_callback
@@ -110,9 +111,9 @@ def __init__(
         self,
         metadata: EvalMetadata,
         num_workers: int = 1,
-        repo_split: str = "lite",
-        dataset_name: str = "wentingzhao/commit0_combined",
-        dataset_split: str = "test",
+        repo_split: str | None = None,
+        dataset_name: str | None = None,
+        dataset_split: str | None = None,
     ):
         super().__init__(metadata=metadata, num_workers=num_workers)
         # Store additional parameters in metadata.details for access in methods
@@ -120,9 +121,9 @@ def __init__(
             metadata.details = {}
         metadata.details.update(
             {
-                "repo_split": repo_split,
-                "dataset_name": dataset_name,
-                "dataset_split": dataset_split,
+                "repo_split": repo_split or INFER_DEFAULTS["repo_split"],
+                "dataset_name": dataset_name or INFER_DEFAULTS["dataset"],
+                "dataset_split": dataset_split or INFER_DEFAULTS["split"],
             }
         )
 
@@ -130,9 +131,9 @@ def prepare_instances(self) -> List[EvalInstance]:
         logger.info("Setting up Commit0 evaluation data")
 
         details = self.metadata.details or {}
-        dataset_name = details.get("dataset_name", "wentingzhao/commit0_combined")
-        dataset_split = details.get("dataset_split", "test")
-        repo_split = details.get("repo_split", "lite")
+        dataset_name = details.get("dataset_name", INFER_DEFAULTS["dataset"])
+        dataset_split = details.get("dataset_split", INFER_DEFAULTS["split"])
+        repo_split = details.get("repo_split", INFER_DEFAULTS["repo_split"])
 
         dataset = load_dataset(dataset_name, split=dataset_split)
         df = commit0_setup(dataset, repo_split)
@@ -593,11 +594,10 @@ def main() -> None:
     parser.add_argument(
         "--repo-split",
         type=str,
-        default="lite",
        help="all, lite, or each repo name",
     )
-    # Override the default dataset for commit0
-    parser.set_defaults(dataset="wentingzhao/commit0_combined")
+    # Apply INFER_DEFAULTS from config (matches evaluation repository values.yaml)
+    parser.set_defaults(**INFER_DEFAULTS)
     args = parser.parse_args()
 
     # Validate max_attempts
diff --git a/benchmarks/gaia/config.py b/benchmarks/gaia/config.py
new file mode 100644
index 00000000..dadaa20a
--- /dev/null
+++ b/benchmarks/gaia/config.py
@@ -0,0 +1,18 @@
+"""
+GAIA benchmark configuration.
+
+Default values aligned with evaluation repository (OpenHands/evaluation).
+"""
+
+# Inference defaults (used by run_infer.py)
+INFER_DEFAULTS = {
+    "dataset": "gaia-benchmark/GAIA",
+    "split": "validation",
+    "level": "2023_all",
+    "num_workers": 30,
+}
+
+# Build defaults (used by build_images.py)
+BUILD_DEFAULTS = {
+    "max_workers": 1,
+}
diff --git a/benchmarks/gaia/run_infer.py b/benchmarks/gaia/run_infer.py
index 9a0a700d..7198ea63 100644
--- a/benchmarks/gaia/run_infer.py
+++ b/benchmarks/gaia/run_infer.py
@@ -11,6 +11,7 @@
 from datasets import DatasetDict, load_dataset
 from PIL import Image
 
+from benchmarks.gaia.config import INFER_DEFAULTS
 from benchmarks.gaia.scorer import question_scorer
 from benchmarks.gaia.utils import image_to_jpg_base64_url, image_to_png_base64_url
 from benchmarks.utils.args_parser import get_parser
@@ -548,9 +549,9 @@ def main() -> None:
     parser.add_argument(
         "--level",
         type=str,
-        required=True,
-        help="GAIA level to evaluate (e.g., 2023_level1, 2023_level2, 2023_level3)",
+        help="GAIA level to evaluate (e.g., 2023_level1, 2023_level2, 2023_level3, 2023_all)",
     )
+    parser.set_defaults(**INFER_DEFAULTS)
     args = parser.parse_args()
 
     # Create critic instance from parsed arguments
@@ -585,7 +586,7 @@ def main() -> None:
     # Create metadata
     metadata = EvalMetadata(
         llm=llm,
-        dataset="gaia-benchmark/GAIA",
+        dataset=args.dataset,
         dataset_split=args.split,
         max_iterations=args.max_iterations,
         eval_output_dir=structured_output_dir,
diff --git a/benchmarks/swebench/build_images.py b/benchmarks/swebench/build_images.py
index 2041ed58..cae96b87 100644
--- a/benchmarks/swebench/build_images.py
+++ b/benchmarks/swebench/build_images.py
@@ -13,6 +13,7 @@
 from pathlib import Path
 
 from benchmarks.swebench import constants
+from benchmarks.swebench.config import BUILD_DEFAULTS
 from benchmarks.utils.build_utils import (
     BuildOutput,
     build_all_images,
@@ -158,6 +159,7 @@ def _wrap_if_needed(result: BuildOutput, push: bool) -> BuildOutput:
 
 def main(argv: list[str]) -> int:
     parser = get_build_parser()
+    parser.set_defaults(**BUILD_DEFAULTS)
     args = parser.parse_args(argv)
 
     base_images: list[str] = collect_unique_base_images(
diff --git a/benchmarks/swebench/config.py b/benchmarks/swebench/config.py
new file mode 100644
index 00000000..cb3059e5
--- /dev/null
+++ b/benchmarks/swebench/config.py
@@ -0,0 +1,23 @@
+"""
+SWE-bench benchmark configuration.
+
+Default values aligned with evaluation repository (OpenHands/evaluation).
+""" + +# Inference defaults (used by run_infer.py) +INFER_DEFAULTS = { + "dataset": "princeton-nlp/SWE-bench_Verified", + "split": "test", + "num_workers": 30, +} + +# Evaluation defaults (used by eval_infer.py) +EVAL_DEFAULTS = { + "dataset": "princeton-nlp/SWE-bench_Verified", + "workers": 12, +} + +# Build defaults (used by build_images.py) +BUILD_DEFAULTS = { + "max_workers": 32, +} diff --git a/benchmarks/swebench/constants.py b/benchmarks/swebench/constants.py index 88d795c8..46ca83ea 100644 --- a/benchmarks/swebench/constants.py +++ b/benchmarks/swebench/constants.py @@ -1,16 +1,13 @@ """ SWE-Bench hyperparameters and constant values. -This module serves as the single source of truth for all constant values -used in the SWE-Bench evaluation workflow. +This module provides constant values used in the SWE-Bench evaluation workflow. +For dataset, model, and worker defaults, see config.py (INFER_DEFAULTS, EVAL_DEFAULTS). """ from typing import Final, Literal -# Dataset -DEFAULT_DATASET: Final[str] = "princeton-nlp/SWE-bench_Verified" - # Docker DOCKER_IMAGE_PREFIX: Final[str] = "docker.io/swebench/" DOCKER_IMAGE_TAG: Final[str] = "latest" @@ -28,9 +25,6 @@ DEFAULT_RUNTIME_API_URL: Final[str] = "https://runtime.eval.all-hands.dev" DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT: Final[int] = 600 -# Evaluation -DEFAULT_EVAL_WORKERS: Final[int] = 12 - # Model - preserving original behavior: function default is "OpenHands", CLI default is "openhands" DEFAULT_MODEL_NAME: Final[str] = "OpenHands" DEFAULT_CLI_MODEL_NAME: Final[str] = "openhands" diff --git a/benchmarks/swebench/eval_infer.py b/benchmarks/swebench/eval_infer.py index b1c5ee69..0d688f31 100644 --- a/benchmarks/swebench/eval_infer.py +++ b/benchmarks/swebench/eval_infer.py @@ -17,6 +17,7 @@ from pathlib import Path from benchmarks.swebench import constants +from benchmarks.swebench.config import EVAL_DEFAULTS from benchmarks.utils.laminar import LaminarService from benchmarks.utils.patch_utils import remove_files_from_patch from benchmarks.utils.report_costs import generate_cost_report @@ -27,7 +28,9 @@ def convert_to_swebench_format( - input_file: str, output_file: str, model_name: str = constants.DEFAULT_MODEL_NAME + input_file: str, + output_file: str, + model_name: str = constants.DEFAULT_CLI_MODEL_NAME, ) -> None: """ Convert OpenHands output.jsonl to SWE-Bench prediction format. @@ -116,8 +119,8 @@ def convert_to_swebench_format( def run_swebench_evaluation( predictions_file: str, - dataset: str = constants.DEFAULT_DATASET, - workers: int = constants.DEFAULT_EVAL_WORKERS, + dataset: str = EVAL_DEFAULTS["dataset"], + workers: int = EVAL_DEFAULTS["workers"], ) -> None: """ Run SWE-Bench evaluation on the predictions file. 
@@ -198,8 +201,7 @@ def main() -> None:
     parser.add_argument(
         "--dataset",
-        default=constants.DEFAULT_DATASET,
-        help=f"SWE-Bench dataset to evaluate against (default: {constants.DEFAULT_DATASET})",
+        help="SWE-Bench dataset to evaluate against",
     )
 
     parser.add_argument(
@@ -216,17 +218,18 @@ def main() -> None:
     parser.add_argument(
         "--model-name",
-        default=constants.DEFAULT_CLI_MODEL_NAME,
-        help=f"Model name to use in the model_name_or_path field (default: {constants.DEFAULT_CLI_MODEL_NAME})",
+        help="Model name to use in the model_name_or_path field",
     )
 
     parser.add_argument(
         "--workers",
         type=int,
-        default=constants.DEFAULT_EVAL_WORKERS,
-        help=f"Number of workers to use when evaluating (default: {constants.DEFAULT_EVAL_WORKERS})",
+        help="Number of workers to use when evaluating",
     )
 
+    # Apply EVAL_DEFAULTS from config
+    parser.set_defaults(**EVAL_DEFAULTS)
+
     args = parser.parse_args()
 
     # Validate input file
diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py
index e19f0877..259e9163 100644
--- a/benchmarks/swebench/run_infer.py
+++ b/benchmarks/swebench/run_infer.py
@@ -11,6 +11,7 @@
     should_wrap_instance_id,
     wrap_image,
 )
+from benchmarks.swebench.config import INFER_DEFAULTS
 from benchmarks.utils.args_parser import get_parser
 from benchmarks.utils.build_utils import build_image
 from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
@@ -334,6 +335,7 @@ def main() -> None:
         choices=choices,
         help="Path to prompt template file",
     )
+    parser.set_defaults(**INFER_DEFAULTS)
     args = parser.parse_args()
 
     # Validate max_attempts
diff --git a/benchmarks/swebenchmultimodal/build_images.py b/benchmarks/swebenchmultimodal/build_images.py
index d32b5dc6..987cf7bd 100644
--- a/benchmarks/swebenchmultimodal/build_images.py
+++ b/benchmarks/swebenchmultimodal/build_images.py
@@ -10,6 +10,7 @@
 
 import sys
 
+from benchmarks.swebenchmultimodal.config import BUILD_DEFAULTS
 from benchmarks.utils.build_utils import (
     build_all_images,
     default_build_output_dir,
@@ -68,6 +69,7 @@ def collect_unique_base_images(dataset, split, n_limit):
 
 def main(argv: list[str]) -> int:
     parser = get_build_parser()
+    parser.set_defaults(**BUILD_DEFAULTS)
     args = parser.parse_args(argv)
 
     base_images: list[str] = collect_unique_base_images(
diff --git a/benchmarks/swebenchmultimodal/config.py b/benchmarks/swebenchmultimodal/config.py
new file mode 100644
index 00000000..a0bcb772
--- /dev/null
+++ b/benchmarks/swebenchmultimodal/config.py
@@ -0,0 +1,24 @@
+"""
+SWE-bench Multimodal benchmark configuration.
+
+Default values aligned with evaluation repository (OpenHands/evaluation).
+""" + +# Inference defaults (used by run_infer.py) +INFER_DEFAULTS = { + "dataset": "princeton-nlp/SWE-bench_Multimodal", + "split": "dev", + "num_workers": 30, +} + +# Evaluation defaults (used by eval_infer.py) +EVAL_DEFAULTS = { + "dataset": "princeton-nlp/SWE-bench_Multimodal", + "split": "dev", + "workers": 12, +} + +# Build defaults (used by build_images.py) +BUILD_DEFAULTS = { + "max_workers": 32, +} diff --git a/benchmarks/swebenchmultimodal/eval_infer.py b/benchmarks/swebenchmultimodal/eval_infer.py index 0984b3e5..b65b0c66 100644 --- a/benchmarks/swebenchmultimodal/eval_infer.py +++ b/benchmarks/swebenchmultimodal/eval_infer.py @@ -16,6 +16,7 @@ from pathlib import Path from typing import Any +from benchmarks.swebenchmultimodal.config import EVAL_DEFAULTS from benchmarks.utils.patch_utils import remove_files_from_patch from benchmarks.utils.report_costs import generate_cost_report from openhands.sdk import get_logger @@ -375,15 +376,12 @@ def main() -> None: parser.add_argument( "--dataset", - default="princeton-nlp/SWE-bench_Multimodal", - help="SWE-Bench dataset to evaluate against " - "(default: princeton-nlp/SWE-bench_Multimodal)", + help="SWE-Bench dataset to evaluate against", ) parser.add_argument( "--split", - default="dev", - help="Dataset split to use (default: dev)", + help="Dataset split to use", ) parser.add_argument( @@ -406,10 +404,12 @@ def main() -> None: parser.add_argument( "--workers", - default="12", + type=int, help="Number of workers to use when evaluating", ) + parser.set_defaults(**EVAL_DEFAULTS) + parser.add_argument( "--run-id", help="Run ID for the evaluation (default: eval_)", diff --git a/benchmarks/swebenchmultimodal/run_infer.py b/benchmarks/swebenchmultimodal/run_infer.py index 68e4c5b8..85fc8254 100644 --- a/benchmarks/swebenchmultimodal/run_infer.py +++ b/benchmarks/swebenchmultimodal/run_infer.py @@ -10,6 +10,7 @@ extract_custom_tag, get_official_docker_image, ) +from benchmarks.swebenchmultimodal.config import INFER_DEFAULTS from benchmarks.utils.args_parser import get_parser from benchmarks.utils.build_utils import build_image from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE @@ -423,8 +424,8 @@ def main() -> None: choices=choices, help="Path to prompt template file", ) - # Override the default dataset and split for multimodal - parser.set_defaults(dataset="princeton-nlp/SWE-bench_Multimodal", split="dev") + # Apply INFER_DEFAULTS from config (matches evaluation repository values.yaml) + parser.set_defaults(**INFER_DEFAULTS) args = parser.parse_args() # Validate max_attempts diff --git a/benchmarks/swtbench/build_eval_env_images.py b/benchmarks/swtbench/build_eval_env_images.py index 079ad66c..fde30ed9 100644 --- a/benchmarks/swtbench/build_eval_env_images.py +++ b/benchmarks/swtbench/build_eval_env_images.py @@ -9,6 +9,7 @@ import docker +from benchmarks.swtbench.config import EVAL_DEFAULTS from benchmarks.swtbench.image_utils import ensure_swt_bench_repo from benchmarks.utils.dataset import get_dataset from benchmarks.utils.image_utils import image_exists as remote_image_exists @@ -257,8 +258,12 @@ def main() -> None: parser = argparse.ArgumentParser( description="Build and push prebaked SWT-bench eval env images." 
     )
-    parser.add_argument("--dataset", required=True, help="Dataset name")
-    parser.add_argument("--split", default="test", help="Dataset split")
+    parser.add_argument("--dataset", help="Dataset name")
+    parser.add_argument("--split", help="Dataset split")
+    parser.set_defaults(
+        dataset=EVAL_DEFAULTS["dataset"],
+        split=EVAL_DEFAULTS["split"],
+    )
     parser.add_argument(
         "--eval-limit",
         type=int,
diff --git a/benchmarks/swtbench/build_images.py b/benchmarks/swtbench/build_images.py
index 09db613d..3fcd2d8d 100644
--- a/benchmarks/swtbench/build_images.py
+++ b/benchmarks/swtbench/build_images.py
@@ -5,18 +5,50 @@
 SWT-Bench uses the same base environment images and build flow as SWE-Bench.
 This module simply forwards to the SWE-Bench build logic to avoid duplication
 while keeping the SWT entrypoint stable for workflows.
+
+Note: SWT-bench uses max_workers=16 (vs SWE-bench's 32) via BUILD_DEFAULTS.
 """
 
 import sys
 
 from benchmarks.swebench.build_images import (
-    main as swebench_main,
+    _wrap_if_needed,
+    collect_unique_base_images,
+    extract_custom_tag,
+)
+from benchmarks.swtbench.config import BUILD_DEFAULTS
+from benchmarks.utils.build_utils import (
+    build_all_images,
+    default_build_output_dir,
+    get_build_parser,
 )
 
 
-# Re-export the SWE-Bench logic under the SWT entrypoint
-def main(argv: list[str]) -> int:  # pragma: no cover - thin wrapper
-    return swebench_main(argv)
+def main(argv: list[str]) -> int:
+    parser = get_build_parser()
+    parser.set_defaults(**BUILD_DEFAULTS)
+    args = parser.parse_args(argv)
+
+    base_images: list[str] = collect_unique_base_images(
+        args.dataset,
+        args.split,
+        args.n_limit,
+        args.select,
+    )
+    build_dir = default_build_output_dir(args.dataset, args.split)
+
+    return build_all_images(
+        base_images=base_images,
+        target=args.target,
+        build_dir=build_dir,
+        image=args.image,
+        push=args.push,
+        max_workers=args.max_workers,
+        dry_run=args.dry_run,
+        max_retries=args.max_retries,
+        base_image_to_custom_tag_fn=extract_custom_tag,
+        post_build_fn=_wrap_if_needed,
+    )
 
 
 if __name__ == "__main__":
diff --git a/benchmarks/swtbench/config.py b/benchmarks/swtbench/config.py
new file mode 100644
index 00000000..ad38f825
--- /dev/null
+++ b/benchmarks/swtbench/config.py
@@ -0,0 +1,25 @@
+"""
+SWT-bench benchmark configuration.
+
+Default values aligned with evaluation repository (OpenHands/evaluation).
+""" + +# Inference defaults (used by run_infer.py) +INFER_DEFAULTS = { + "dataset": "eth-sri/SWT-bench_Verified_bm25_27k_zsp", + "split": "test", + "num_workers": 30, +} + +# Evaluation defaults (used by eval_infer.py) +# Note: eval uses SWE-bench dataset, not SWT-bench dataset +EVAL_DEFAULTS = { + "dataset": "princeton-nlp/SWE-bench_Verified", + "split": "test", + "workers": 24, +} + +# Build defaults (used by build_images.py) +BUILD_DEFAULTS = { + "max_workers": 16, +} diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index 4f5f0632..8c37775f 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -18,6 +18,7 @@ from pathlib import Path from time import monotonic +from benchmarks.swtbench.config import EVAL_DEFAULTS from benchmarks.swtbench.image_utils import ( compute_required_images, ensure_swt_bench_repo, @@ -67,7 +68,7 @@ def _load_prediction_instance_ids(predictions_file: Path) -> list[str]: def try_pull_prebaked_images( predictions_file: Path, dataset: str, - split: str = "test", + split: str = EVAL_DEFAULTS["split"], registry: str = PREBAKED_REGISTRY, ) -> None: """ @@ -359,9 +360,7 @@ def main() -> None: # Must use SWE-bench dataset because SWT-bench dataset (which is based on SWE-bench) contains a bug in their harness. parser.add_argument( "--dataset", - default="princeton-nlp/SWE-bench_Verified", - help="SWT-Bench dataset to evaluate against " - "(default: princeton-nlp/SWE-bench_Verified)", + help="SWT-Bench dataset to evaluate against", ) parser.add_argument( @@ -384,10 +383,12 @@ def main() -> None: parser.add_argument( "--workers", - default="12", + type=int, help="Number of workers to use when evaluating", ) + parser.set_defaults(**EVAL_DEFAULTS) + args = parser.parse_args() # Validate input file diff --git a/benchmarks/swtbench/image_utils.py b/benchmarks/swtbench/image_utils.py index e7aae1f4..c5d34035 100644 --- a/benchmarks/swtbench/image_utils.py +++ b/benchmarks/swtbench/image_utils.py @@ -7,6 +7,7 @@ from pathlib import Path from typing import Iterable +from benchmarks.swtbench.config import EVAL_DEFAULTS from openhands.sdk import get_logger @@ -130,8 +131,9 @@ def main() -> None: description="List SWT-bench base/env images required for a predictions file." 
     )
     parser.add_argument("output_jsonl", type=Path, help="Path to output.jsonl")
-    parser.add_argument("--dataset", required=True, help="Dataset name")
-    parser.add_argument("--split", default="test", help="Dataset split")
+    parser.add_argument("--dataset", help="Dataset name")
+    parser.add_argument("--split", help="Dataset split")
+    parser.set_defaults(**EVAL_DEFAULTS)
     parser.add_argument(
         "--format",
         choices=["plain", "json"],
diff --git a/benchmarks/swtbench/run_infer.py b/benchmarks/swtbench/run_infer.py
index a454e580..7c863539 100644
--- a/benchmarks/swtbench/run_infer.py
+++ b/benchmarks/swtbench/run_infer.py
@@ -4,6 +4,7 @@
 
 from jinja2 import Environment, FileSystemLoader
 
+from benchmarks.swtbench.config import INFER_DEFAULTS
 from benchmarks.utils.args_parser import get_parser
 from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
 from benchmarks.utils.conversation import build_event_persistence_callback
@@ -355,6 +356,7 @@ def main() -> None:
         choices=choices,
         help="Path to prompt template file",
     )
+    parser.set_defaults(**INFER_DEFAULTS)
     args = parser.parse_args()
 
     # Validate max_attempts
diff --git a/benchmarks/utils/args_parser.py b/benchmarks/utils/args_parser.py
index 60f08d73..6ae98855 100644
--- a/benchmarks/utils/args_parser.py
+++ b/benchmarks/utils/args_parser.py
@@ -1,5 +1,9 @@
 """
-Argument parsing utilities for SWE-bench benchmarks.
+Argument parsing utilities for benchmarks.
+
+This module defines common arguments used across all benchmarks.
+Benchmark-specific defaults should be set via parser.set_defaults()
+to match the evaluation repository configuration.
 """
 
 import argparse
@@ -8,10 +12,16 @@
 
 
 def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser:
-    """Create and return argument parser.
+    """Create and return argument parser without defaults.
+
+    Each benchmark must call parser.set_defaults() before parse_args()
+    to set values matching the evaluation repository (OpenHands/evaluation).
+
+    Args:
+        add_llm_config: Whether to add the llm_config_path positional argument.
 
     Returns:
-        ArgumentParser instance
+        ArgumentParser instance with common benchmark arguments (no defaults).
""" parser = argparse.ArgumentParser(description="Run Evaluation inference") if add_llm_config: @@ -23,24 +33,24 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: parser.add_argument( "--dataset", type=str, - default="princeton-nlp/SWE-bench_Verified", help="Dataset name", ) - parser.add_argument("--split", type=str, default="test", help="Dataset split") + parser.add_argument("--split", type=str, help="Dataset split") parser.add_argument( "--workspace", type=str, - default="docker", + default="remote", choices=["docker", "remote"], - help="Type of workspace to use (default: docker)", + help="Type of workspace to use (default: remote)", ) parser.add_argument( - "--max-iterations", type=int, default=100, help="Maximum iterations" - ) - parser.add_argument( - "--num-workers", type=int, default=1, help="Number of evaluation workers" + "--max-iterations", + type=int, + default=500, + help="Maximum iterations (default: 500)", ) - parser.add_argument("--note", type=str, default="initial", help="Evaluation note") + parser.add_argument("--num-workers", type=int, help="Number of inference workers") + parser.add_argument("--note", type=str, help="Optional evaluation note") parser.add_argument( "--output-dir", type=str, @@ -51,7 +61,7 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: "--n-limit", type=int, default=0, - help="Limit number of instances to evaluate", + help="Limit number of instances to evaluate (0 = no limit)", ) parser.add_argument( "--max-attempts", @@ -60,13 +70,12 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: help="Maximum number of attempts for iterative mode (default: 3, min: 1)", ) - # Add critic arguments + # Add critic arguments (no default) add_critic_args(parser) parser.add_argument( "--select", type=str, - default=None, help="Path to text file containing instance IDs to select (one per line)", ) parser.add_argument( diff --git a/benchmarks/utils/critics.py b/benchmarks/utils/critics.py index af9c55ae..fa9f9d92 100644 --- a/benchmarks/utils/critics.py +++ b/benchmarks/utils/critics.py @@ -37,17 +37,15 @@ def add_critic_args(parser: ArgumentParser) -> None: parser.add_argument( "--critic", type=str, - default="pass", + default="finish_with_patch", help=( - "Name of the critic to use for evaluation (default: 'pass'). " + "Name of the critic to use for evaluation (default: finish_with_patch). " "Critics determine whether an agent's output is considered successful " "and whether another attempt should be made in iterative evaluation mode. " "Available critics: " - "'pass' - Always accepts the output (no retry logic, suitable for single-attempt runs), " + "'pass' - Always accepts the output (no retry logic), " "'finish_with_patch' - Requires both AgentFinishAction and non-empty git patch, " - "'empty_patch_critic' - Only requires non-empty git patch. " - "For single-attempt runs (default), 'pass' is recommended as the actual evaluation " - "is performed by the benchmark's own scoring system." + "'empty_patch_critic' - Only requires non-empty git patch." 
         ),
     )
     parser.add_argument(
diff --git a/benchmarks/utils/evaluation_utils.py b/benchmarks/utils/evaluation_utils.py
index 517b85d3..030457ea 100644
--- a/benchmarks/utils/evaluation_utils.py
+++ b/benchmarks/utils/evaluation_utils.py
@@ -18,7 +18,7 @@ def construct_eval_output_dir(
     dataset_name: str,
     model_name: str,
     max_iterations: int,
-    eval_note: str,
+    eval_note: str | None,
 ) -> str:
     """Construct the structured evaluation output directory path."""
     # Format: eval_out/-//
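
For context, a minimal sketch (not part of the patch) of how a benchmark entrypoint is expected to combine the shared parser with its per-benchmark defaults under this scheme; the "examplebench" name and the default values below are hypothetical:

    # Illustrative only: the run_infer.py pattern introduced by this change.
    from benchmarks.utils.args_parser import get_parser

    # Would normally live in benchmarks/examplebench/config.py (hypothetical benchmark).
    INFER_DEFAULTS = {
        "dataset": "org/Example-Bench",  # hypothetical dataset name
        "split": "test",
        "num_workers": 8,
    }


    def main() -> None:
        # get_parser() deliberately leaves --dataset/--split/--num-workers without
        # defaults; each benchmark supplies them via set_defaults() before parsing.
        parser = get_parser(add_llm_config=False)
        parser.set_defaults(**INFER_DEFAULTS)
        args = parser.parse_args()
        print(args.dataset, args.split, args.num_workers)


    if __name__ == "__main__":
        main()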