diff --git a/benchmarks/gaia/build_images.py b/benchmarks/gaia/build_images.py index 9f6b90cd..95f00983 100644 --- a/benchmarks/gaia/build_images.py +++ b/benchmarks/gaia/build_images.py @@ -13,6 +13,7 @@ import sys from pathlib import Path +from benchmarks.gaia import constants from benchmarks.utils.build_utils import ( BuildOutput, _get_sdk_submodule_info, @@ -26,8 +27,6 @@ logger = get_logger(__name__) -# GAIA base image: Python 3.12 + Node.js 22 (default for agent server) -GAIA_BASE_IMAGE = "nikolaik/python-nodejs:python3.12-nodejs22" # MCP layer Dockerfile MCP_DOCKERFILE = Path(__file__).with_name("Dockerfile.gaia") @@ -63,18 +62,22 @@ def main(argv: list[str]) -> int: args = parser.parse_args(argv) # GAIA only needs one universal image for all instances - base_images = [GAIA_BASE_IMAGE] + base_images = [constants.GAIA_BASE_IMAGE] - logger.info(f"Building GAIA agent server image from base: {GAIA_BASE_IMAGE}") + logger.info( + f"Building GAIA agent server image from base: {constants.GAIA_BASE_IMAGE}" + ) logger.info(f"Target: {args.target}") logger.info(f"Image: {args.image}") logger.info(f"Push: {args.push}") def tag_fn(_base: str) -> str: - return f"gaia-{args.target}" + return f"{constants.IMAGE_TAG_PREFIX}-{args.target}" # Build base GAIA image - build_dir = default_build_output_dir("gaia", "validation") + build_dir = default_build_output_dir( + constants.IMAGE_TAG_PREFIX, constants.DATASET_SPLIT_VALIDATION + ) exit_code = build_all_images( base_images=base_images, target=args.target, @@ -93,7 +96,9 @@ def tag_fn(_base: str) -> str: # Build MCP-enhanced layer after base image succeeds git_ref, git_sha, sdk_version = _get_sdk_submodule_info() - base_gaia_image = f"{args.image}:{git_sha[:7]}-gaia-{args.target}" + base_gaia_image = ( + f"{args.image}:{git_sha[:7]}-{constants.IMAGE_TAG_PREFIX}-{args.target}" + ) logger.info("Building MCP-enhanced GAIA image from base: %s", base_gaia_image) mcp_result = build_gaia_mcp_layer(base_gaia_image, push=args.push) diff 
--git a/benchmarks/gaia/constants.py b/benchmarks/gaia/constants.py
new file mode 100644
index 00000000..c7f9c714
--- /dev/null
+++ b/benchmarks/gaia/constants.py
@@ -0,0 +1,38 @@
+"""
+GAIA benchmark constants and hyperparameters.
+
+This module serves as the single source of truth for all constant values
+used throughout the GAIA benchmark implementation.
+
+Note: Default values for CLI arguments (max_iterations, num_workers, output_dir,
+max_attempts, critic, split) are defined in benchmarks/utils/args_parser.py
+which is the shared argument parser used by all benchmarks.
+"""
+
+from typing import Final, Literal
+
+
+# =============================================================================
+# Dataset Configuration (consumed by run_infer.py and build_images.py)
+# =============================================================================
+DATASET_NAME: Final[str] = "gaia-benchmark/GAIA"  # Hugging Face dataset repo id
+DATASET_YEAR: Final[str] = "2023"  # sub-directory of the dataset cache dir
+DATASET_SPLIT_VALIDATION: Final[str] = "validation"  # names the build output dir
+
+# =============================================================================
+# Docker/Image Configuration (base image: Python 3.12 + Node.js 22)
+# =============================================================================
+GAIA_BASE_IMAGE: Final[str] = "nikolaik/python-nodejs:python3.12-nodejs22"
+TARGET_TYPE: Final[Literal["binary", "source"]] = "binary"  # remote image target
+IMAGE_TAG_PREFIX: Final[str] = "gaia"  # tags look like <sdk-sha>-gaia-<target>
+
+# =============================================================================
+# Runtime Configuration: fallbacks used when the env-var override is unset
+# =============================================================================
+DEFAULT_RUNTIME_API_URL: Final[str] = "https://runtime.eval.all-hands.dev"
+DEFAULT_STARTUP_TIMEOUT: Final[float] = 600.0  # presumably seconds; TODO confirm
+
+# =============================================================================
+# Default Values
+# =============================================================================
+DEFAULT_MODEL_NAME: Final[str] = "openhands"  # eval_infer.py --model-name default
diff --git a/benchmarks/gaia/eval_infer.py b/benchmarks/gaia/eval_infer.py
index 889d132d..4b3b47d0 100644
--- 
a/benchmarks/gaia/eval_infer.py +++ b/benchmarks/gaia/eval_infer.py @@ -18,6 +18,7 @@ import sys from pathlib import Path +from benchmarks.gaia import constants from benchmarks.utils.laminar import LaminarService from benchmarks.utils.report_costs import generate_cost_report from openhands.sdk import get_logger @@ -29,7 +30,7 @@ def process_gaia_results( input_file: str, output_file: str, - model_name: str = "openhands", + model_name: str = constants.DEFAULT_MODEL_NAME, ) -> None: """ Process GAIA output.jsonl and generate evaluation report. @@ -197,8 +198,8 @@ def main() -> None: parser.add_argument( "--model-name", - default="openhands", - help="Model name to use in the model_name_or_path field (default: openhands)", + default=constants.DEFAULT_MODEL_NAME, + help=f"Model name to use in the model_name_or_path field (default: {constants.DEFAULT_MODEL_NAME})", ) args = parser.parse_args() diff --git a/benchmarks/gaia/run_infer.py b/benchmarks/gaia/run_infer.py index 9a0a700d..c6856847 100644 --- a/benchmarks/gaia/run_infer.py +++ b/benchmarks/gaia/run_infer.py @@ -11,6 +11,7 @@ from datasets import DatasetDict, load_dataset from PIL import Image +from benchmarks.gaia import constants from benchmarks.gaia.scorer import question_scorer from benchmarks.gaia.utils import image_to_jpg_base64_url, image_to_png_base64_url from benchmarks.utils.args_parser import get_parser @@ -75,13 +76,13 @@ def prepare_instances(self) -> List[EvalInstance]: logger.info( f"Loading GAIA dataset: {level}, split: {self.metadata.dataset_split}" ) - dataset = cast(DatasetDict, load_dataset("gaia-benchmark/GAIA", level)) + dataset = cast(DatasetDict, load_dataset(constants.DATASET_NAME, level)) # Download dataset files logger.info(f"Downloading GAIA dataset files to {DATASET_CACHE_DIR}") DATASET_CACHE_DIR.mkdir(parents=True, exist_ok=True) huggingface_hub.snapshot_download( - "gaia-benchmark/GAIA", + constants.DATASET_NAME, repo_type="dataset", local_dir=str(DATASET_CACHE_DIR), ) @@ -151,14 
+152,14 @@ def prepare_workspace( if self.metadata.workspace_type == "docker": # Use DockerDevWorkspace with base image (same as main branch) workspace = DockerDevWorkspace( - base_image="nikolaik/python-nodejs:python3.12-nodejs22", + base_image=constants.GAIA_BASE_IMAGE, working_dir="/workspace", forward_env=forward_env or [], ) elif self.metadata.workspace_type == "remote": # For workflow, use APIRemoteWorkspace with pre-built GAIA image # GAIA uses a universal agent server image (one image for all instances) - # Built from nikolaik/python-nodejs:python3.12-nodejs22 base + # Built from constants.GAIA_BASE_IMAGE base # Using binary target (not binary-minimal) to include Chromium for browser operations # Image includes pre-cached MCP server to eliminate startup delays runtime_api_key = os.getenv("RUNTIME_API_KEY") @@ -168,9 +169,7 @@ def prepare_workspace( ) sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA) - agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-gaia-binary" - ) + agent_server_image = f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{constants.IMAGE_TAG_PREFIX}-{constants.TARGET_TYPE}" if not image_exists(agent_server_image): raise RuntimeError( @@ -182,16 +181,21 @@ def prepare_workspace( f"Using remote workspace with GAIA image {agent_server_image} " f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})" ) - startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600")) + startup_timeout = float( + os.getenv( + "REMOTE_RUNTIME_STARTUP_TIMEOUT", + str(constants.DEFAULT_STARTUP_TIMEOUT), + ) + ) workspace = APIRemoteWorkspace( runtime_api_url=os.getenv( - "RUNTIME_API_URL", "https://runtime.eval.all-hands.dev" + "RUNTIME_API_URL", constants.DEFAULT_RUNTIME_API_URL ), runtime_api_key=runtime_api_key, server_image=agent_server_image, init_timeout=startup_timeout, startup_wait_timeout=startup_timeout, - target_type="binary", # GAIA images use binary target + target_type=constants.TARGET_TYPE, # GAIA images use binary target 
forward_env=forward_env or [], resource_factor=resource_factor, ) @@ -211,7 +215,10 @@ def prepare_workspace( # Construct source file path src_file = ( - DATASET_CACHE_DIR / "2023" / self.metadata.dataset_split / file_name + DATASET_CACHE_DIR + / constants.DATASET_YEAR + / self.metadata.dataset_split + / file_name ) if not src_file.exists(): @@ -289,7 +296,10 @@ def evaluate_instance( # Load image and encode as base64 assert self.metadata.details is not None src_file = ( - DATASET_CACHE_DIR / "2023" / self.metadata.dataset_split / file_name + DATASET_CACHE_DIR + / constants.DATASET_YEAR + / self.metadata.dataset_split + / file_name ) if src_file.exists(): image = Image.open(src_file) @@ -400,7 +410,10 @@ def _build_instruction(self, instance: EvalInstance) -> str: if extension_name == "zip": # List files from zip src_file = ( - DATASET_CACHE_DIR / "2023" / self.metadata.dataset_split / file_name + DATASET_CACHE_DIR + / constants.DATASET_YEAR + / self.metadata.dataset_split + / file_name ) if src_file.exists(): with zipfile.ZipFile(src_file, "r") as zip_ref: @@ -585,7 +598,7 @@ def main() -> None: # Create metadata metadata = EvalMetadata( llm=llm, - dataset="gaia-benchmark/GAIA", + dataset=constants.DATASET_NAME, dataset_split=args.split, max_iterations=args.max_iterations, eval_output_dir=structured_output_dir,