12 changes: 10 additions & 2 deletions benchmarks/multiswebench/build_images.py
@@ -11,6 +11,12 @@
import os
from pathlib import Path

from benchmarks.multiswebench.constants import (
DEFAULT_DOCKER_IMAGE_PREFIX,
DEFAULT_LANGUAGE,
DOCKER_IMAGE_PREFIX_ENV_VAR,
LANGUAGE_ENV_VAR,
)
from benchmarks.utils.build_utils import (
build_all_images,
default_build_output_dir,
@@ -23,8 +29,10 @@
logger = get_logger(__name__)

# Environment variables for multi-language support
DOCKER_IMAGE_PREFIX = os.environ.get("EVAL_DOCKER_IMAGE_PREFIX", "mswebench")
LANGUAGE = os.environ.get("LANGUAGE", "java")
DOCKER_IMAGE_PREFIX = os.environ.get(
DOCKER_IMAGE_PREFIX_ENV_VAR, DEFAULT_DOCKER_IMAGE_PREFIX
)
LANGUAGE = os.environ.get(LANGUAGE_ENV_VAR, DEFAULT_LANGUAGE)


def get_official_docker_image(
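
Note: the two module-level knobs now resolve in a fixed order: an exported environment variable wins, otherwise the shared default from constants.py applies. A minimal sketch of that precedence, assuming the benchmarks package is importable (the override value is illustrative):

import os

from benchmarks.multiswebench.constants import (
    DEFAULT_DOCKER_IMAGE_PREFIX,
    DOCKER_IMAGE_PREFIX_ENV_VAR,
)

# Unset: the shared default from constants.py applies.
os.environ.pop(DOCKER_IMAGE_PREFIX_ENV_VAR, None)
assert (
    os.environ.get(DOCKER_IMAGE_PREFIX_ENV_VAR, DEFAULT_DOCKER_IMAGE_PREFIX)
    == "mswebench"
)

# Exported: the environment variable wins ("myregistry/mswebench" is illustrative).
os.environ[DOCKER_IMAGE_PREFIX_ENV_VAR] = "myregistry/mswebench"
assert (
    os.environ.get(DOCKER_IMAGE_PREFIX_ENV_VAR, DEFAULT_DOCKER_IMAGE_PREFIX)
    == "myregistry/mswebench"
)
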
98 changes: 98 additions & 0 deletions benchmarks/multiswebench/constants.py
@@ -0,0 +1,98 @@
"""
Constants and hyperparameters for Multi-SWE-Bench evaluation.

This module serves as the single source of truth for all constant values
used throughout the Multi-SWE-Bench benchmark implementation.
"""

# =============================================================================
# Dataset Configuration
# =============================================================================

# Default dataset name on HuggingFace
DEFAULT_DATASET = "bytedance-research/Multi-SWE-Bench"

# Default dataset split
DEFAULT_SPLIT = "test"

# Default programming language
DEFAULT_LANGUAGE = "java"

# Default model name for predictions
DEFAULT_MODEL_NAME = "OpenHands"

# =============================================================================
# Docker/Image Configuration
# =============================================================================

# Default Docker image prefix for Multi-SWE-Bench
DEFAULT_DOCKER_IMAGE_PREFIX = "mswebench"

# Default build target for agent server images
DEFAULT_BUILD_TARGET = "source-minimal"

# Environment variable names
DOCKER_IMAGE_PREFIX_ENV_VAR = "EVAL_DOCKER_IMAGE_PREFIX"
LANGUAGE_ENV_VAR = "LANGUAGE"
SKIP_BUILD_ENV_VAR = "MULTI_SWE_BENCH_SKIP_BUILD"

# =============================================================================
# Runtime Configuration
# =============================================================================

# Default runtime API URL for remote workspace
DEFAULT_RUNTIME_API_URL = "https://runtime.eval.all-hands.dev"

# Default startup timeout in seconds
DEFAULT_STARTUP_TIMEOUT = 600

# Environment variable names for runtime configuration
USE_HINT_TEXT_ENV_VAR = "USE_HINT_TEXT"
USE_INSTANCE_IMAGE_ENV_VAR = "USE_INSTANCE_IMAGE"
RUN_WITH_BROWSING_ENV_VAR = "RUN_WITH_BROWSING"
RUNTIME_API_KEY_ENV_VAR = "RUNTIME_API_KEY"
RUNTIME_API_URL_ENV_VAR = "RUNTIME_API_URL"
SDK_SHORT_SHA_ENV_VAR = "SDK_SHORT_SHA"
REMOTE_RUNTIME_STARTUP_TIMEOUT_ENV_VAR = "REMOTE_RUNTIME_STARTUP_TIMEOUT"

# Default values for boolean environment variables
DEFAULT_USE_HINT_TEXT = False
DEFAULT_USE_INSTANCE_IMAGE = True
DEFAULT_RUN_WITH_BROWSING = False

# =============================================================================
# Evaluation Harness Configuration
# =============================================================================

# Default configuration template for Multi-SWE-Bench evaluation harness.
# Dynamic values (paths) are added at runtime.
DEFAULT_EVAL_HARNESS_CONFIG = {
"mode": "evaluation",
"force_build": True,
"need_clone": True,
"clear_env": True,
"stop_on_error": False,
"max_workers": 5,
"max_workers_build_image": 5,
"max_workers_run_instance": 5,
"log_level": "DEBUG",
"fix_patch_run_cmd": (
'bash -c "apt update ; apt install -y patch ; '
"sed -i 's@git apply.*@patch --batch --fuzz=5 -p1 -i /home/test.patch;"
"patch --batch --fuzz=5 -p1 -i /home/fix.patch@g' /home/fix-run.sh ; "
'chmod +x /home/*.sh ; /home/fix-run.sh"'
),
"specifics": [],
"skips": [],
"global_env": [],
}

# =============================================================================
# Workspace Configuration
# =============================================================================

# Default working directory in container
DEFAULT_WORKING_DIR = "/workspace"

# Default environment setup commands
DEFAULT_ENV_SETUP_COMMANDS = ["export PIP_CACHE_DIR=~/.cache/pip"]
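
One subtlety worth noting: the boolean defaults above are stored as Python bools, but the consumers in run_infer.py (later in this diff) serialize them with str(default).lower() before comparing the final value against the literal string "true". A hypothetical helper, not part of this PR, that captures the round-trip in one place:

import os

from benchmarks.multiswebench.constants import (
    DEFAULT_USE_INSTANCE_IMAGE,
    USE_INSTANCE_IMAGE_ENV_VAR,
)

def env_bool(var_name: str, default: bool) -> bool:
    # str(True).lower() -> "true", str(False).lower() -> "false", so an
    # unset variable reproduces the default; anything but "true" is False.
    return os.environ.get(var_name, str(default).lower()).lower() == "true"

# Mirrors how USE_INSTANCE_IMAGE is computed in run_infer.py below.
use_instance_image = env_bool(USE_INSTANCE_IMAGE_ENV_VAR, DEFAULT_USE_INSTANCE_IMAGE)
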
22 changes: 13 additions & 9 deletions benchmarks/multiswebench/eval_infer.py
@@ -14,6 +14,12 @@
import subprocess
from pathlib import Path

from benchmarks.multiswebench.constants import (
DEFAULT_DATASET,
DEFAULT_LANGUAGE,
DEFAULT_MODEL_NAME,
DEFAULT_SPLIT,
)
from benchmarks.multiswebench.download_dataset import download_and_concat_dataset
from benchmarks.multiswebench.scripts.eval.update_multi_swe_bench_config import (
update_multi_swe_config,
@@ -29,7 +35,7 @@ def run_multi_swebench_evaluation(
dataset_name: str | None = None,
split: str | None = None,
input_file: str | None = None,
lang: str = "java",
lang: str = DEFAULT_LANGUAGE,
):
"""
Run Multi-SWE-Bench evaluation using the predictions file.
@@ -46,9 +52,9 @@

# Default dataset and split if not provided
if dataset_name is None:
dataset_name = "bytedance-research/Multi-SWE-Bench"
dataset_name = DEFAULT_DATASET
if split is None:
split = "test"
split = DEFAULT_SPLIT

try:
if input_file is None:
@@ -108,14 +114,12 @@ def main():
parser = argparse.ArgumentParser(description="Multi-SWE-Bench Evaluation")
parser.add_argument("input_file", help="Path to OpenHands output.jsonl file")
parser.add_argument(
"--model-name", default="OpenHands", help="Model name for predictions"
)
parser.add_argument(
"--dataset", default="bytedance-research/Multi-SWE-Bench", help="Dataset name"
"--model-name", default=DEFAULT_MODEL_NAME, help="Model name for predictions"
)
parser.add_argument("--split", default="test", help="Dataset split")
parser.add_argument("--dataset", default=DEFAULT_DATASET, help="Dataset name")
parser.add_argument("--split", default=DEFAULT_SPLIT, help="Dataset split")
parser.add_argument(
"--lang", default="java", help="Language for Multi-SWE-bench dataset"
"--lang", default=DEFAULT_LANGUAGE, help="Language for Multi-SWE-bench dataset"
)
parser.add_argument(
"--skip-evaluation",
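
The None-sentinel pattern above keeps the argparse defaults and the programmatic fallbacks pointing at the same constants. In isolation the resolution logic looks like the sketch below (resolve_dataset is an illustrative name, not part of the PR):

from benchmarks.multiswebench.constants import DEFAULT_DATASET, DEFAULT_SPLIT

def resolve_dataset(dataset_name: str | None = None, split: str | None = None):
    # None means "caller did not specify"; fall back to the shared constants
    # so CLI invocations and direct calls agree on the same defaults.
    if dataset_name is None:
        dataset_name = DEFAULT_DATASET
    if split is None:
        split = DEFAULT_SPLIT
    return dataset_name, split

assert resolve_dataset() == ("bytedance-research/Multi-SWE-Bench", "test")
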
70 changes: 55 additions & 15 deletions benchmarks/multiswebench/run_infer.py
@@ -11,6 +11,27 @@
extract_custom_tag,
get_official_docker_image,
)
from benchmarks.multiswebench.constants import (
DEFAULT_BUILD_TARGET,
DEFAULT_DOCKER_IMAGE_PREFIX,
DEFAULT_ENV_SETUP_COMMANDS,
DEFAULT_LANGUAGE,
DEFAULT_RUN_WITH_BROWSING,
DEFAULT_RUNTIME_API_URL,
DEFAULT_STARTUP_TIMEOUT,
DEFAULT_USE_HINT_TEXT,
DEFAULT_USE_INSTANCE_IMAGE,
DEFAULT_WORKING_DIR,
DOCKER_IMAGE_PREFIX_ENV_VAR,
REMOTE_RUNTIME_STARTUP_TIMEOUT_ENV_VAR,
RUN_WITH_BROWSING_ENV_VAR,
RUNTIME_API_KEY_ENV_VAR,
RUNTIME_API_URL_ENV_VAR,
SDK_SHORT_SHA_ENV_VAR,
SKIP_BUILD_ENV_VAR,
USE_HINT_TEXT_ENV_VAR,
USE_INSTANCE_IMAGE_ENV_VAR,
)
from benchmarks.multiswebench.download_dataset import download_and_concat_dataset
from benchmarks.multiswebench.scripts.data.data_change import format_data_for_inference
from benchmarks.utils.args_parser import get_parser
@@ -42,18 +63,33 @@ class MultiSWEBenchEvalMetadata(EvalMetadata):
"""Extended metadata for Multi-SWE-bench evaluation with language support."""

lang: str = Field(
default="java", description="Language for Multi-SWE-bench dataset"
default=DEFAULT_LANGUAGE, description="Language for Multi-SWE-bench dataset"
)


logger = get_logger(__name__)

# Environment variables for Multi-SWE-Bench configuration
USE_HINT_TEXT = os.environ.get("USE_HINT_TEXT", "false").lower() == "true"
USE_INSTANCE_IMAGE = os.environ.get("USE_INSTANCE_IMAGE", "true").lower() == "true"
RUN_WITH_BROWSING = os.environ.get("RUN_WITH_BROWSING", "false").lower() == "true"
USE_HINT_TEXT = (
os.environ.get(USE_HINT_TEXT_ENV_VAR, str(DEFAULT_USE_HINT_TEXT).lower()).lower()
== "true"
)
USE_INSTANCE_IMAGE = (
os.environ.get(
USE_INSTANCE_IMAGE_ENV_VAR, str(DEFAULT_USE_INSTANCE_IMAGE).lower()
).lower()
== "true"
)
RUN_WITH_BROWSING = (
os.environ.get(
RUN_WITH_BROWSING_ENV_VAR, str(DEFAULT_RUN_WITH_BROWSING).lower()
).lower()
== "true"
)
# For Multi-SWE-Bench, default to the mswebench prefix instead of the general SWE-Bench prefix
DOCKER_IMAGE_PREFIX = os.environ.get("EVAL_DOCKER_IMAGE_PREFIX", "mswebench")
DOCKER_IMAGE_PREFIX = os.environ.get(
DOCKER_IMAGE_PREFIX_ENV_VAR, DEFAULT_DOCKER_IMAGE_PREFIX
)

logger.info(f"Using docker image prefix: {DOCKER_IMAGE_PREFIX}")

@@ -200,7 +236,7 @@ def prepare_workspace(
instance.data, docker_image_prefix=DOCKER_IMAGE_PREFIX
)
logger.info(f"Using official docker image: {official_docker_image}")
build_target = "source-minimal"
build_target = DEFAULT_BUILD_TARGET
custom_tag = extract_custom_tag(official_docker_image)
# For non-binary targets, append target suffix
suffix = f"-{build_target}" if build_target != "binary" else ""
@@ -209,7 +245,7 @@
agent_server_image = (
f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}"
)
SKIP_BUILD = os.getenv("MULTI_SWE_BENCH_SKIP_BUILD", "0").lower() in (
SKIP_BUILD = os.getenv(SKIP_BUILD_ENV_VAR, "0").lower() in (
"1",
"true",
"yes",
@@ -241,15 +277,15 @@ def prepare_workspace(

workspace = DockerWorkspace(
server_image=agent_server_image,
working_dir="/workspace",
working_dir=DEFAULT_WORKING_DIR,
forward_env=forward_env or [],
)
elif self.metadata.workspace_type == "remote":
runtime_api_key = os.getenv("RUNTIME_API_KEY")
sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA)
runtime_api_key = os.getenv(RUNTIME_API_KEY_ENV_VAR)
sdk_short_sha = os.getenv(SDK_SHORT_SHA_ENV_VAR, SDK_SHORT_SHA)
if not runtime_api_key:
raise ValueError(
"RUNTIME_API_KEY environment variable is not set for remote workspace"
f"{RUNTIME_API_KEY_ENV_VAR} environment variable is not set for remote workspace"
)

agent_server_image = (
@@ -264,10 +300,14 @@
f"Using remote workspace with image {agent_server_image} "
f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})"
)
startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600"))
startup_timeout = float(
os.getenv(
REMOTE_RUNTIME_STARTUP_TIMEOUT_ENV_VAR, str(DEFAULT_STARTUP_TIMEOUT)
)
)
workspace = APIRemoteWorkspace(
runtime_api_url=os.getenv(
"RUNTIME_API_URL", "https://runtime.eval.all-hands.dev"
RUNTIME_API_URL_ENV_VAR, DEFAULT_RUNTIME_API_URL
),
runtime_api_key=runtime_api_key,
server_image=agent_server_image,
@@ -432,7 +472,7 @@ def main() -> None:
parser.add_argument(
"--lang",
type=str,
default="java",
default=DEFAULT_LANGUAGE,
help="Language for Multi-SWE-bench dataset",
)
args = parser.parse_args()
@@ -475,7 +515,7 @@ def main() -> None:
details={},
prompt_path=args.prompt_path,
eval_limit=args.n_limit,
env_setup_commands=["export PIP_CACHE_DIR=~/.cache/pip"],
env_setup_commands=DEFAULT_ENV_SETUP_COMMANDS,
max_attempts=args.max_attempts,
critic=critic,
selected_instances_file=args.select,
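
Two parsing conventions coexist in this file: the strict booleans above compare against the single literal "true", while SKIP_BUILD accepts any of "1"/"true"/"yes", and numeric settings round-trip their defaults through str() so os.getenv stays string-only. Hypothetical helpers, shown only to make the two conventions explicit:

import os

from benchmarks.multiswebench.constants import (
    DEFAULT_STARTUP_TIMEOUT,
    REMOTE_RUNTIME_STARTUP_TIMEOUT_ENV_VAR,
    SKIP_BUILD_ENV_VAR,
)

def env_float(var_name: str, default: float) -> float:
    # The default is serialized to a string for os.getenv, then parsed back;
    # an unset variable therefore yields the constant unchanged.
    return float(os.getenv(var_name, str(default)))

def env_truthy(var_name: str, default: str = "0") -> bool:
    # SKIP_BUILD's wider truthy set, unlike the strict == "true" booleans above.
    return os.getenv(var_name, default).lower() in ("1", "true", "yes")

startup_timeout = env_float(
    REMOTE_RUNTIME_STARTUP_TIMEOUT_ENV_VAR, DEFAULT_STARTUP_TIMEOUT
)
skip_build = env_truthy(SKIP_BUILD_ENV_VAR)
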
benchmarks/multiswebench/scripts/eval/update_multi_swe_bench_config.py
@@ -2,6 +2,7 @@
import json
import os

from benchmarks.multiswebench.constants import DEFAULT_EVAL_HARNESS_CONFIG
from benchmarks.multiswebench.scripts.eval.convert import convert_to_eval_format


@@ -18,32 +19,14 @@ def update_multi_swe_config(output_jsonl_path, config_path, dataset):
os.makedirs(os.path.join(path_to_parent, "eval_files", "repos"), exist_ok=True)
os.makedirs(os.path.join(path_to_parent, "eval_files", "logs"), exist_ok=True)

# Prepare config dict
config = {
"mode": "evaluation",
"workdir": os.path.join(path_to_parent, "eval_files", "workdir"),
"patch_files": [converted_path],
"dataset_files": [dataset],
"force_build": True,
"output_dir": os.path.join(path_to_parent, "eval_files", "dataset"),
"specifics": [],
"skips": [],
"repo_dir": os.path.join(path_to_parent, "eval_files", "repos"),
"need_clone": True,
"global_env": [],
"clear_env": True,
"stop_on_error": False,
"max_workers": 5,
"max_workers_build_image": 5,
"max_workers_run_instance": 5,
"log_dir": os.path.join(path_to_parent, "eval_files", "logs"),
"log_level": "DEBUG",
"fix_patch_run_cmd": (
'bash -c "apt update ; apt install -y patch ; '
"sed -i 's@git apply.*@patch --batch --fuzz=5 -p1 -i /home/test.patch;"
"patch --batch --fuzz=5 -p1 -i /home/fix.patch@g' /home/fix-run.sh ; chmod +x /home/*.sh ; /home/fix-run.sh\""
),
}
# Start with default config and add dynamic paths
config = DEFAULT_EVAL_HARNESS_CONFIG.copy()
config["workdir"] = os.path.join(path_to_parent, "eval_files", "workdir")
config["patch_files"] = [converted_path]
config["dataset_files"] = [dataset]
config["output_dir"] = os.path.join(path_to_parent, "eval_files", "dataset")
config["repo_dir"] = os.path.join(path_to_parent, "eval_files", "repos")
config["log_dir"] = os.path.join(path_to_parent, "eval_files", "logs")

# Save to multibench.config
os.makedirs(os.path.dirname(config_path), exist_ok=True)
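
One caveat on the .copy() call above: dict.copy() is shallow, so the "specifics", "skips", and "global_env" lists are still shared with the module-level DEFAULT_EVAL_HARNESS_CONFIG. The function only assigns new keys, so this is safe today, but any future in-place mutation (an append to config["skips"], say) would silently edit the template for every later caller. A deep copy severs that link; a minimal sketch:

import copy

from benchmarks.multiswebench.constants import DEFAULT_EVAL_HARNESS_CONFIG

# deepcopy duplicates the nested lists as well, so mutating the copy can
# never leak back into the shared template.
config = copy.deepcopy(DEFAULT_EVAL_HARNESS_CONFIG)
config["skips"].append("example-instance-id")  # instance id is illustrative
assert DEFAULT_EVAL_HARNESS_CONFIG["skips"] == []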