Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 12 additions & 7 deletions benchmarks/gaia/build_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import sys
from pathlib import Path

from benchmarks.gaia import constants
from benchmarks.utils.build_utils import (
BuildOutput,
_get_sdk_submodule_info,
Expand All @@ -26,8 +27,6 @@

logger = get_logger(__name__)

# GAIA base image: Python 3.12 + Node.js 22 (default for agent server)
GAIA_BASE_IMAGE = "nikolaik/python-nodejs:python3.12-nodejs22"
# MCP layer Dockerfile
MCP_DOCKERFILE = Path(__file__).with_name("Dockerfile.gaia")

Expand Down Expand Up @@ -63,18 +62,22 @@ def main(argv: list[str]) -> int:
args = parser.parse_args(argv)

# GAIA only needs one universal image for all instances
base_images = [GAIA_BASE_IMAGE]
base_images = [constants.GAIA_BASE_IMAGE]

logger.info(f"Building GAIA agent server image from base: {GAIA_BASE_IMAGE}")
logger.info(
f"Building GAIA agent server image from base: {constants.GAIA_BASE_IMAGE}"
)
logger.info(f"Target: {args.target}")
logger.info(f"Image: {args.image}")
logger.info(f"Push: {args.push}")

def tag_fn(_base: str) -> str:
return f"gaia-{args.target}"
return f"{constants.IMAGE_TAG_PREFIX}-{args.target}"

# Build base GAIA image
build_dir = default_build_output_dir("gaia", "validation")
build_dir = default_build_output_dir(
constants.IMAGE_TAG_PREFIX, constants.DATASET_SPLIT_VALIDATION
)
exit_code = build_all_images(
base_images=base_images,
target=args.target,
Expand All @@ -93,7 +96,9 @@ def tag_fn(_base: str) -> str:

# Build MCP-enhanced layer after base image succeeds
git_ref, git_sha, sdk_version = _get_sdk_submodule_info()
base_gaia_image = f"{args.image}:{git_sha[:7]}-gaia-{args.target}"
base_gaia_image = (
f"{args.image}:{git_sha[:7]}-{constants.IMAGE_TAG_PREFIX}-{args.target}"
)

logger.info("Building MCP-enhanced GAIA image from base: %s", base_gaia_image)
mcp_result = build_gaia_mcp_layer(base_gaia_image, push=args.push)
Expand Down
38 changes: 38 additions & 0 deletions benchmarks/gaia/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
"""
Central constants for the GAIA benchmark.

Every fixed value the GAIA benchmark code relies on is declared in this
module, so the image build, inference, and evaluation scripts all share a
single definition instead of repeating literals.

Note: defaults for CLI arguments (max_iterations, num_workers, output_dir,
max_attempts, critic, split) are NOT declared here. They are defined in
benchmarks/utils/args_parser.py, the shared argument parser used by all
benchmarks.
"""

from typing import Final, Literal


# ---------------------------------------------------------------------------
# Dataset
# ---------------------------------------------------------------------------
# Hugging Face dataset identifier used both to load GAIA and to download its
# attached files.
DATASET_NAME: Final[str] = "gaia-benchmark/GAIA"
# Name of the validation split; used when naming the image build output dir.
DATASET_SPLIT_VALIDATION: Final[str] = "validation"
# Directory level inside the downloaded dataset cache: attached files are
# looked up at <cache dir>/<DATASET_YEAR>/<split>/<file_name>.
DATASET_YEAR: Final[str] = "2023"

# ---------------------------------------------------------------------------
# Docker / image
# ---------------------------------------------------------------------------
# Base image for the GAIA agent server: Python 3.12 + Node.js 22.
GAIA_BASE_IMAGE: Final[str] = "nikolaik/python-nodejs:python3.12-nodejs22"
# Prefix placed in front of the build target when tagging GAIA images.
IMAGE_TAG_PREFIX: Final[str] = "gaia"
# Build target of the pre-built image used by the remote workspace. The
# inference code notes that "binary" (not binary-minimal) is used so the image
# includes Chromium for browser operations.
TARGET_TYPE: Final[Literal["binary", "source"]] = "binary"

# ---------------------------------------------------------------------------
# Remote runtime
# ---------------------------------------------------------------------------
# Fallback for the REMOTE_RUNTIME_STARTUP_TIMEOUT environment variable.
# NOTE(review): presumably expressed in seconds -- confirm against the
# workspace API.
DEFAULT_STARTUP_TIMEOUT: Final[float] = 600.0
# Fallback for the RUNTIME_API_URL environment variable.
DEFAULT_RUNTIME_API_URL: Final[str] = "https://runtime.eval.all-hands.dev"

# ---------------------------------------------------------------------------
# Evaluation defaults
# ---------------------------------------------------------------------------
# Default for the --model-name option of the evaluation script (written to the
# model_name_or_path field).
DEFAULT_MODEL_NAME: Final[str] = "openhands"
7 changes: 4 additions & 3 deletions benchmarks/gaia/eval_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import sys
from pathlib import Path

from benchmarks.gaia import constants
from benchmarks.utils.laminar import LaminarService
from benchmarks.utils.report_costs import generate_cost_report
from openhands.sdk import get_logger
Expand All @@ -29,7 +30,7 @@
def process_gaia_results(
input_file: str,
output_file: str,
model_name: str = "openhands",
model_name: str = constants.DEFAULT_MODEL_NAME,
) -> None:
"""
Process GAIA output.jsonl and generate evaluation report.
Expand Down Expand Up @@ -197,8 +198,8 @@ def main() -> None:

parser.add_argument(
"--model-name",
default="openhands",
help="Model name to use in the model_name_or_path field (default: openhands)",
default=constants.DEFAULT_MODEL_NAME,
help=f"Model name to use in the model_name_or_path field (default: {constants.DEFAULT_MODEL_NAME})",
)

args = parser.parse_args()
Expand Down
41 changes: 27 additions & 14 deletions benchmarks/gaia/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from datasets import DatasetDict, load_dataset
from PIL import Image

from benchmarks.gaia import constants
from benchmarks.gaia.scorer import question_scorer
from benchmarks.gaia.utils import image_to_jpg_base64_url, image_to_png_base64_url
from benchmarks.utils.args_parser import get_parser
Expand Down Expand Up @@ -75,13 +76,13 @@ def prepare_instances(self) -> List[EvalInstance]:
logger.info(
f"Loading GAIA dataset: {level}, split: {self.metadata.dataset_split}"
)
dataset = cast(DatasetDict, load_dataset("gaia-benchmark/GAIA", level))
dataset = cast(DatasetDict, load_dataset(constants.DATASET_NAME, level))

# Download dataset files
logger.info(f"Downloading GAIA dataset files to {DATASET_CACHE_DIR}")
DATASET_CACHE_DIR.mkdir(parents=True, exist_ok=True)
huggingface_hub.snapshot_download(
"gaia-benchmark/GAIA",
constants.DATASET_NAME,
repo_type="dataset",
local_dir=str(DATASET_CACHE_DIR),
)
Expand Down Expand Up @@ -151,14 +152,14 @@ def prepare_workspace(
if self.metadata.workspace_type == "docker":
# Use DockerDevWorkspace with base image (same as main branch)
workspace = DockerDevWorkspace(
base_image="nikolaik/python-nodejs:python3.12-nodejs22",
base_image=constants.GAIA_BASE_IMAGE,
working_dir="/workspace",
forward_env=forward_env or [],
)
elif self.metadata.workspace_type == "remote":
# For workflow, use APIRemoteWorkspace with pre-built GAIA image
# GAIA uses a universal agent server image (one image for all instances)
# Built from nikolaik/python-nodejs:python3.12-nodejs22 base
# Built on top of the base image named by constants.GAIA_BASE_IMAGE
# Using binary target (not binary-minimal) to include Chromium for browser operations
# Image includes pre-cached MCP server to eliminate startup delays
runtime_api_key = os.getenv("RUNTIME_API_KEY")
Expand All @@ -168,9 +169,7 @@ def prepare_workspace(
)

sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA)
agent_server_image = (
f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-gaia-binary"
)
agent_server_image = f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-gaia-{constants.TARGET_TYPE}"

if not image_exists(agent_server_image):
raise RuntimeError(
Expand All @@ -182,16 +181,21 @@ def prepare_workspace(
f"Using remote workspace with GAIA image {agent_server_image} "
f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})"
)
startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600"))
startup_timeout = float(
os.getenv(
"REMOTE_RUNTIME_STARTUP_TIMEOUT",
str(constants.DEFAULT_STARTUP_TIMEOUT),
)
)
workspace = APIRemoteWorkspace(
runtime_api_url=os.getenv(
"RUNTIME_API_URL", "https://runtime.eval.all-hands.dev"
"RUNTIME_API_URL", constants.DEFAULT_RUNTIME_API_URL
),
runtime_api_key=runtime_api_key,
server_image=agent_server_image,
init_timeout=startup_timeout,
startup_wait_timeout=startup_timeout,
target_type="binary", # GAIA images use binary target
target_type=constants.TARGET_TYPE, # GAIA images use binary target
forward_env=forward_env or [],
resource_factor=resource_factor,
)
Expand All @@ -211,7 +215,10 @@ def prepare_workspace(

# Construct source file path
src_file = (
DATASET_CACHE_DIR / "2023" / self.metadata.dataset_split / file_name
DATASET_CACHE_DIR
/ constants.DATASET_YEAR
/ self.metadata.dataset_split
/ file_name
)

if not src_file.exists():
Expand Down Expand Up @@ -289,7 +296,10 @@ def evaluate_instance(
# Load image and encode as base64
assert self.metadata.details is not None
src_file = (
DATASET_CACHE_DIR / "2023" / self.metadata.dataset_split / file_name
DATASET_CACHE_DIR
/ constants.DATASET_YEAR
/ self.metadata.dataset_split
/ file_name
)
if src_file.exists():
image = Image.open(src_file)
Expand Down Expand Up @@ -400,7 +410,10 @@ def _build_instruction(self, instance: EvalInstance) -> str:
if extension_name == "zip":
# List files from zip
src_file = (
DATASET_CACHE_DIR / "2023" / self.metadata.dataset_split / file_name
DATASET_CACHE_DIR
/ constants.DATASET_YEAR
/ self.metadata.dataset_split
/ file_name
)
if src_file.exists():
with zipfile.ZipFile(src_file, "r") as zip_ref:
Expand Down Expand Up @@ -585,7 +598,7 @@ def main() -> None:
# Create metadata
metadata = EvalMetadata(
llm=llm,
dataset="gaia-benchmark/GAIA",
dataset=constants.DATASET_NAME,
dataset_split=args.split,
max_iterations=args.max_iterations,
eval_output_dir=structured_output_dir,
Expand Down