3 changes: 2 additions & 1 deletion benchmarks/swebenchmultimodal/build_images.py
@@ -10,6 +10,7 @@

import sys

from benchmarks.swebenchmultimodal.constants import DOCKER_IMAGE_PREFIX
from benchmarks.utils.build_utils import (
build_all_images,
default_build_output_dir,
@@ -24,7 +25,7 @@

def get_official_docker_image(
instance_id: str,
docker_image_prefix="docker.io/swebench/",
docker_image_prefix: str = DOCKER_IMAGE_PREFIX,
) -> str:
# For multimodal benchmark, we use regular SWE-bench images as base
# since multimodal-specific images (sweb.mm.eval.*) are not available
55 changes: 55 additions & 0 deletions benchmarks/swebenchmultimodal/constants.py
@@ -0,0 +1,55 @@
"""
Constants for SWE-Bench Multimodal benchmark.

This module serves as the single source of truth for all hyperparameters
and constant values used in the SWE-Bench Multimodal evaluation workflow.
"""

# Dataset configuration
DEFAULT_DATASET = "princeton-nlp/SWE-bench_Multimodal"
DEFAULT_SPLIT = "dev"

# Docker image configuration
DOCKER_IMAGE_PREFIX = "docker.io/swebench/"

# Build configuration
BUILD_TARGET = "source-minimal"

# Workspace configuration
WORKSPACE_DIR = "/workspace"

# Environment variable names
ENV_SKIP_BUILD = "SKIP_BUILD"
ENV_RUNTIME_API_KEY = "RUNTIME_API_KEY"
ENV_SDK_SHORT_SHA = "SDK_SHORT_SHA"
ENV_REMOTE_RUNTIME_STARTUP_TIMEOUT = "REMOTE_RUNTIME_STARTUP_TIMEOUT"
ENV_RUNTIME_API_URL = "RUNTIME_API_URL"

# Default values for environment variables
DEFAULT_SKIP_BUILD = "1"
DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT = "600"
DEFAULT_RUNTIME_API_URL = "https://runtime.eval.all-hands.dev"

# Git configuration
GIT_USER_EMAIL = "evaluation@openhands.dev"
GIT_USER_NAME = "OpenHands Evaluation"
GIT_COMMIT_MESSAGE = "patch"

# Environment setup commands
ENV_SETUP_COMMANDS = ["export PIP_CACHE_DIR=~/.cache/pip"]

# Image validation
ALLOWED_IMAGE_TYPES = ["image/jpeg", "image/png", "image/gif", "image/webp"]

# Evaluation configuration
DEFAULT_EVAL_WORKERS = "12"
DEFAULT_MODEL_NAME = "openhands"

# Annotation keywords
SOLVEABLE_KEYWORD = "SOLVEABLE"

# Files to remove from patches during evaluation
SETUP_FILES_TO_REMOVE = ["pyproject.toml", "tox.ini", "setup.py"]

# Annotations file name
ANNOTATIONS_FILENAME = "ambiguity_annotations.json"
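For reference, the environment-variable names pair with their string defaults via `os.getenv`; the run_infer.py changes further down use exactly this pattern. A minimal standalone sketch (illustrative, not part of the diff):

```python
import os

from benchmarks.swebenchmultimodal.constants import (
    DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT,
    DEFAULT_SKIP_BUILD,
    ENV_REMOTE_RUNTIME_STARTUP_TIMEOUT,
    ENV_SKIP_BUILD,
)

# Boolean toggle: an unset env var falls back to the module-level default ("1").
skip_build = os.getenv(ENV_SKIP_BUILD, DEFAULT_SKIP_BUILD).lower() in ("1", "true", "yes")

# Numeric timeout: defaults are stored as strings, so convert after the lookup.
startup_timeout = float(
    os.getenv(ENV_REMOTE_RUNTIME_STARTUP_TIMEOUT, DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT)
)
```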
43 changes: 25 additions & 18 deletions benchmarks/swebenchmultimodal/eval_infer.py
@@ -16,6 +16,15 @@
from pathlib import Path
from typing import Any

from benchmarks.swebenchmultimodal.constants import (
ANNOTATIONS_FILENAME,
DEFAULT_DATASET,
DEFAULT_EVAL_WORKERS,
DEFAULT_MODEL_NAME,
DEFAULT_SPLIT,
SETUP_FILES_TO_REMOVE,
SOLVEABLE_KEYWORD,
)
from benchmarks.utils.patch_utils import remove_files_from_patch
from benchmarks.utils.report_costs import generate_cost_report
from openhands.sdk import get_logger
@@ -24,7 +33,7 @@
logger = get_logger(__name__)

# Path to ambiguity annotations relative to this file
ANNOTATIONS_FILE = Path(__file__).parent / "ambiguity_annotations.json"
ANNOTATIONS_FILE = Path(__file__).parent / ANNOTATIONS_FILENAME


def load_ambiguity_annotations() -> dict[str, Any]:
@@ -77,7 +86,7 @@ def calculate_component_scores(

for instance_id, annotation in annotations.items():
keywords = annotation.get("keywords", [])
if "SOLVEABLE" in keywords:
if SOLVEABLE_KEYWORD in keywords:
solveable_ids.add(instance_id)
else:
unsolveable_ids.add(instance_id)
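To make the partition above concrete, a small self-contained sketch with made-up instance IDs and keywords (only `SOLVEABLE_KEYWORD` comes from the PR):

```python
from benchmarks.swebenchmultimodal.constants import SOLVEABLE_KEYWORD

# Hypothetical annotation entries; real ones come from ambiguity_annotations.json.
annotations = {
    "example__repo-1": {"keywords": [SOLVEABLE_KEYWORD]},
    "example__repo-2": {"keywords": ["AMBIGUOUS"]},
}

solveable_ids = {
    iid
    for iid, ann in annotations.items()
    if SOLVEABLE_KEYWORD in ann.get("keywords", [])
}
unsolveable_ids = set(annotations) - solveable_ids
# -> solveable_ids == {"example__repo-1"}, unsolveable_ids == {"example__repo-2"}
```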
@@ -213,8 +222,7 @@ def convert_to_swebench_format(
git_patch = ""

# postprocess git_patch
setup_files = ["pyproject.toml", "tox.ini", "setup.py"]
git_patch = remove_files_from_patch(git_patch, setup_files)
git_patch = remove_files_from_patch(git_patch, SETUP_FILES_TO_REMOVE)

# Create SWE-Bench format entry
swebench_entry = {
@@ -269,9 +277,9 @@ def find_report_json(predictions_dir: Path, run_id: str) -> Path | None:

def run_swebench_multimodal_evaluation(
predictions_file: str,
dataset: str = "princeton-nlp/SWE-bench_Multimodal",
split: str = "dev",
workers: str = "12",
dataset: str = DEFAULT_DATASET,
split: str = DEFAULT_SPLIT,
workers: str = DEFAULT_EVAL_WORKERS,
run_id: str | None = None,
) -> Path | None:
"""
@@ -363,10 +371,10 @@ def main() -> None:
parser = argparse.ArgumentParser(
description="Convert OpenHands output to SWE-Bench format and run multimodal evaluation",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
epilog=f"""
Examples:
uv run swebenchmultimodal-eval output.jsonl
uv run swebenchmultimodal-eval /path/to/output.jsonl --dataset princeton-nlp/SWE-bench_Multimodal
uv run swebenchmultimodal-eval /path/to/output.jsonl --dataset {DEFAULT_DATASET}
uv run swebenchmultimodal-eval output.jsonl --model-name "MyModel-v1.0"
""",
)
@@ -375,15 +383,14 @@ def main() -> None:

parser.add_argument(
"--dataset",
default="princeton-nlp/SWE-bench_Multimodal",
help="SWE-Bench dataset to evaluate against "
"(default: princeton-nlp/SWE-bench_Multimodal)",
default=DEFAULT_DATASET,
help=f"SWE-Bench dataset to evaluate against (default: {DEFAULT_DATASET})",
)

parser.add_argument(
"--split",
default="dev",
help="Dataset split to use (default: dev)",
default=DEFAULT_SPLIT,
help=f"Dataset split to use (default: {DEFAULT_SPLIT})",
)

parser.add_argument(
@@ -400,14 +407,14 @@ def main() -> None:

parser.add_argument(
"--model-name",
default="openhands",
help="Model name to use in the model_name_or_path field (default: openhands)",
default=DEFAULT_MODEL_NAME,
help=f"Model name to use in the model_name_or_path field (default: {DEFAULT_MODEL_NAME})",
)

parser.add_argument(
"--workers",
default="12",
help="Number of workers to use when evaluating",
default=DEFAULT_EVAL_WORKERS,
help=f"Number of workers to use when evaluating (default: {DEFAULT_EVAL_WORKERS})",
)

parser.add_argument(
67 changes: 46 additions & 21 deletions benchmarks/swebenchmultimodal/run_infer.py
@@ -10,6 +10,25 @@
extract_custom_tag,
get_official_docker_image,
)
from benchmarks.swebenchmultimodal.constants import (
ALLOWED_IMAGE_TYPES,
BUILD_TARGET,
DEFAULT_DATASET,
DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT,
DEFAULT_RUNTIME_API_URL,
DEFAULT_SKIP_BUILD,
DEFAULT_SPLIT,
ENV_REMOTE_RUNTIME_STARTUP_TIMEOUT,
ENV_RUNTIME_API_KEY,
ENV_RUNTIME_API_URL,
ENV_SDK_SHORT_SHA,
ENV_SETUP_COMMANDS,
ENV_SKIP_BUILD,
GIT_COMMIT_MESSAGE,
GIT_USER_EMAIL,
GIT_USER_NAME,
WORKSPACE_DIR,
)
from benchmarks.utils.args_parser import get_parser
from benchmarks.utils.build_utils import build_image
from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
@@ -58,7 +77,7 @@ def is_valid_image_url(url: str, allowed_types: list | None = None) -> bool:
True if URL points to a valid image type, False otherwise
"""
if allowed_types is None:
allowed_types = ["image/jpeg", "image/png", "image/gif", "image/webp"]
allowed_types = ALLOWED_IMAGE_TYPES

try:
# Send a HEAD request first to check headers without downloading the entire file
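The remainder of the function body is collapsed in this view. A plausible minimal sketch of the check the comment describes, assuming the `requests` library; the helper name `_head_content_type_is_image` is hypothetical, not from the PR:

```python
import requests

from benchmarks.swebenchmultimodal.constants import ALLOWED_IMAGE_TYPES


def _head_content_type_is_image(url: str, allowed_types: list | None = None) -> bool:
    """Return True if a HEAD request reports an allowed image Content-Type."""
    if allowed_types is None:
        allowed_types = ALLOWED_IMAGE_TYPES
    try:
        # HEAD avoids downloading the body; follow redirects to the final resource.
        response = requests.head(url, allow_redirects=True, timeout=10)
        content_type = response.headers.get("Content-Type", "").split(";")[0].strip().lower()
        return content_type in allowed_types
    except requests.RequestException:
        return False
```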
@@ -152,18 +171,21 @@ def prepare_workspace(
"""
# Use multimodal image
official_docker_image = get_official_docker_image(instance.id)
build_target = "source-minimal"
custom_tag = extract_custom_tag(official_docker_image)
# For non-binary targets, append target suffix
suffix = f"-{build_target}" if build_target != "binary" else ""
suffix = f"-{BUILD_TARGET}" if BUILD_TARGET != "binary" else ""

if self.metadata.workspace_type == "docker":
agent_server_image = (
f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}"
)
SKIP_BUILD = os.getenv("SKIP_BUILD", "1").lower() in ("1", "true", "yes")
logger.info(f"SKIP_BUILD={SKIP_BUILD}")
if not SKIP_BUILD:
skip_build = os.getenv(ENV_SKIP_BUILD, DEFAULT_SKIP_BUILD).lower() in (
"1",
"true",
"yes",
)
logger.info(f"SKIP_BUILD={skip_build}")
if not skip_build:
logger.info(
f"Building workspace from {official_docker_image} "
f"for instance {instance.id}. "
@@ -177,7 +199,7 @@ def prepare_workspace(
base_image=official_docker_image,
target_image=EVAL_AGENT_SERVER_IMAGE,
custom_tag=custom_tag,
target=build_target,
target=BUILD_TARGET,
push=False,
)
logger.info(f"Image build output: {output}")
@@ -190,15 +212,15 @@ def prepare_workspace(

workspace = DockerWorkspace(
server_image=agent_server_image,
working_dir="/workspace",
working_dir=WORKSPACE_DIR,
forward_env=forward_env or [],
)
elif self.metadata.workspace_type == "remote":
runtime_api_key = os.getenv("RUNTIME_API_KEY")
sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA)
runtime_api_key = os.getenv(ENV_RUNTIME_API_KEY)
sdk_short_sha = os.getenv(ENV_SDK_SHORT_SHA, SDK_SHORT_SHA)
if not runtime_api_key:
raise ValueError(
"RUNTIME_API_KEY environment variable is not set for remote workspace"
f"{ENV_RUNTIME_API_KEY} environment variable is not set for remote workspace"
)

agent_server_image = (
@@ -213,16 +235,19 @@ def prepare_workspace(
f"Using remote workspace with image {agent_server_image} "
f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})"
)
startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600"))
startup_timeout = float(
os.getenv(
ENV_REMOTE_RUNTIME_STARTUP_TIMEOUT,
DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT,
)
)
workspace = APIRemoteWorkspace(
runtime_api_url=os.getenv(
"RUNTIME_API_URL", "https://runtime.eval.all-hands.dev"
),
runtime_api_url=os.getenv(ENV_RUNTIME_API_URL, DEFAULT_RUNTIME_API_URL),
runtime_api_key=runtime_api_key,
server_image=agent_server_image,
init_timeout=startup_timeout,
startup_wait_timeout=startup_timeout,
target_type="source" if "source" in build_target else "binary",
target_type="source" if "source" in BUILD_TARGET else "binary",
forward_env=forward_env or [],
resource_factor=resource_factor,
)
@@ -377,9 +402,9 @@ def evaluate_instance(
# and prevent the commit from being created
workspace.execute_command(
f"cd {repo_path} && "
"git config --global user.email 'evaluation@openhands.dev' && "
"git config --global user.name 'OpenHands Evaluation' && "
"git commit --no-verify -m 'patch'"
f"git config --global user.email '{GIT_USER_EMAIL}' && "
f"git config --global user.name '{GIT_USER_NAME}' && "
f"git commit --no-verify -m '{GIT_COMMIT_MESSAGE}'"
)

# Get git patch (same as regular swebench - use base_commit)
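The patch-extraction code itself is collapsed here. A rough sketch of the idea only; `instance.base_commit` and the attribute used to read the command output are assumptions about the surrounding API, not confirmed by the diff:

```python
# Illustrative only: diff the committed state against the instance's base commit.
result = workspace.execute_command(
    f"cd {repo_path} && git --no-pager diff --no-color {instance.base_commit} HEAD"
)
git_patch = result.stdout  # assumption: the command result exposes captured stdout
```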
@@ -424,7 +449,7 @@ def main() -> None:
help="Path to prompt template file",
)
# Override the default dataset and split for multimodal
parser.set_defaults(dataset="princeton-nlp/SWE-bench_Multimodal", split="dev")
parser.set_defaults(dataset=DEFAULT_DATASET, split=DEFAULT_SPLIT)
args = parser.parse_args()

# Validate max_attempts
@@ -464,7 +489,7 @@ def main() -> None:
details={},
prompt_path=args.prompt_path,
eval_limit=args.n_limit,
env_setup_commands=["export PIP_CACHE_DIR=~/.cache/pip"],
env_setup_commands=ENV_SETUP_COMMANDS,
max_attempts=args.max_attempts,
critic=critic,
selected_instances_file=args.select,