13 changes: 7 additions & 6 deletions benchmarks/swebench/build_images.py
@@ -12,6 +12,7 @@
 import sys
 from pathlib import Path
 
+from benchmarks.swebench import constants
 from benchmarks.utils.build_utils import (
     BuildOutput,
     build_all_images,
@@ -26,19 +27,19 @@

 logger = get_logger(__name__)
 WRAPPER_DOCKERFILE = Path(__file__).with_name("Dockerfile.swebench-deps")
-# Repos that require the docutils/roman wrapper layer
-WRAPPED_REPOS = {"sphinx-doc"}
 
 
 def get_official_docker_image(
     instance_id: str,
-    docker_image_prefix="docker.io/swebench/",
+    docker_image_prefix: str = constants.DOCKER_IMAGE_PREFIX,
 ) -> str:
     # Official SWE-Bench image
     # swebench/sweb.eval.x86_64.django_1776_django-11333:v1
     repo, name = instance_id.split("__")
     official_image_name = docker_image_prefix.rstrip("/")
-    official_image_name += f"/sweb.eval.x86_64.{repo}_1776_{name}:latest".lower()
+    official_image_name += (
+        f"/sweb.eval.x86_64.{repo}_1776_{name}:{constants.DOCKER_IMAGE_TAG}".lower()
+    )
     logger.debug(f"Official SWE-Bench image: {official_image_name}")
     return official_image_name

@@ -60,12 +61,12 @@ def should_wrap_custom_tag(custom_tag: str) -> bool:
prefix = "sweb.eval.x86_64."
if custom_tag.startswith(prefix):
custom_tag = custom_tag[len(prefix) :]
return custom_tag.split("_", 1)[0] in WRAPPED_REPOS
return custom_tag.split("_", 1)[0] in constants.WRAPPED_REPOS


def should_wrap_instance_id(instance_id: str) -> bool:
repo = instance_id.split("__")[0]
return repo in WRAPPED_REPOS
return repo in constants.WRAPPED_REPOS


def collect_unique_base_images(
Expand Down
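For reference, a minimal sketch of the image-name construction these changes centralize, with the two Docker constants inlined so it runs standalone; the example instance id comes from the comment in the diff, and the function name here is illustrative:

    # Inlined stand-ins for constants.DOCKER_IMAGE_PREFIX / constants.DOCKER_IMAGE_TAG
    DOCKER_IMAGE_PREFIX = "docker.io/swebench/"
    DOCKER_IMAGE_TAG = "latest"


    def get_official_docker_image_sketch(instance_id: str) -> str:
        # "django__django-11333" splits into repo="django", name="django-11333"
        repo, name = instance_id.split("__")
        image = DOCKER_IMAGE_PREFIX.rstrip("/")
        image += f"/sweb.eval.x86_64.{repo}_1776_{name}:{DOCKER_IMAGE_TAG}".lower()
        return image


    # -> docker.io/swebench/sweb.eval.x86_64.django_1776_django-11333:latest
    print(get_official_docker_image_sketch("django__django-11333"))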
48 changes: 48 additions & 0 deletions benchmarks/swebench/constants.py
@@ -0,0 +1,48 @@
"""
SWE-Bench hyperparameters and constant values.

This module serves as the single source of truth for all constant values
used in the SWE-Bench evaluation workflow.
"""

from typing import Final, Literal


# Dataset
DEFAULT_DATASET: Final[str] = "princeton-nlp/SWE-bench_Verified"

# Docker
DOCKER_IMAGE_PREFIX: Final[str] = "docker.io/swebench/"
DOCKER_IMAGE_TAG: Final[str] = "latest"
WRAPPED_REPOS: Final[frozenset[str]] = frozenset(
{"sphinx-doc"}
) # Repos requiring docutils/roman wrapper

# Build target type (matches openhands.agent_server.docker.build.TargetType)
TargetType = Literal["binary", "binary-minimal", "source", "source-minimal"]
BUILD_TARGET_SOURCE_MINIMAL: Final[TargetType] = "source-minimal"
BUILD_TARGET_BINARY: Final[TargetType] = "binary"
DEFAULT_BUILD_TARGET: Final[TargetType] = BUILD_TARGET_SOURCE_MINIMAL

# Runtime
DEFAULT_RUNTIME_API_URL: Final[str] = "https://runtime.eval.all-hands.dev"
DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT: Final[int] = 600

# Evaluation
DEFAULT_EVAL_WORKERS: Final[int] = 12

# Model - preserving original behavior: function default is "OpenHands", CLI default is "openhands"
DEFAULT_MODEL_NAME: Final[str] = "OpenHands"
DEFAULT_CLI_MODEL_NAME: Final[str] = "openhands"

# Git
GIT_USER_EMAIL: Final[str] = "evaluation@openhands.dev"
GIT_USER_NAME: Final[str] = "OpenHands Evaluation"
GIT_COMMIT_MESSAGE: Final[str] = "patch"

# Patch Processing
SETUP_FILES_TO_REMOVE: Final[tuple[str, ...]] = (
"pyproject.toml",
"tox.ini",
"setup.py",
)
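A quick illustration of what the Final and Literal annotations buy here (a hedged sketch, not part of the PR): a static type checker such as mypy rejects both reassignment of a Final name and a target string outside the TargetType union, while runtime behavior is unchanged:

    from benchmarks.swebench import constants
    from benchmarks.swebench.constants import TargetType

    ok_target: TargetType = constants.DEFAULT_BUILD_TARGET  # "source-minimal", accepted

    # Both lines below run fine but fail under mypy:
    constants.DEFAULT_EVAL_WORKERS = 24        # error: cannot assign to a Final name
    bad_target: TargetType = "source-maximal"  # error: not a member of the Literal union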
26 changes: 14 additions & 12 deletions benchmarks/swebench/eval_infer.py
@@ -16,6 +16,7 @@
 import sys
 from pathlib import Path
 
+from benchmarks.swebench import constants
 from benchmarks.utils.laminar import LaminarService
 from benchmarks.utils.patch_utils import remove_files_from_patch
 from benchmarks.utils.report_costs import generate_cost_report
@@ -26,7 +27,7 @@


 def convert_to_swebench_format(
-    input_file: str, output_file: str, model_name: str = "OpenHands"
+    input_file: str, output_file: str, model_name: str = constants.DEFAULT_MODEL_NAME
 ) -> None:
     """
     Convert OpenHands output.jsonl to SWE-Bench prediction format.
@@ -82,8 +83,9 @@ def convert_to_swebench_format
             git_patch = ""
 
         # postprocess git_patch
-        setup_files = ["pyproject.toml", "tox.ini", "setup.py"]
-        git_patch = remove_files_from_patch(git_patch, setup_files)
+        git_patch = remove_files_from_patch(
+            git_patch, constants.SETUP_FILES_TO_REMOVE
+        )
 
         # Create SWE-Bench format entry
         swebench_entry = {
@@ -114,8 +116,8 @@

 def run_swebench_evaluation(
     predictions_file: str,
-    dataset: str = "princeton-nlp/SWE-bench_Verified",
-    workers: str = "12",
+    dataset: str = constants.DEFAULT_DATASET,
+    workers: int = constants.DEFAULT_EVAL_WORKERS,
 ) -> None:
     """
     Run SWE-Bench evaluation on the predictions file.
@@ -196,9 +198,8 @@ def main() -> None:

     parser.add_argument(
         "--dataset",
-        default="princeton-nlp/SWE-bench_Verified",
-        help="SWE-Bench dataset to evaluate against "
-        "(default: princeton-nlp/SWE-bench_Verified)",
+        default=constants.DEFAULT_DATASET,
+        help=f"SWE-Bench dataset to evaluate against (default: {constants.DEFAULT_DATASET})",
     )
 
     parser.add_argument(
@@ -215,14 +216,15 @@

     parser.add_argument(
         "--model-name",
-        default="openhands",
-        help="Model name to use in the model_name_or_path field (default: openhands)",
+        default=constants.DEFAULT_CLI_MODEL_NAME,
+        help=f"Model name to use in the model_name_or_path field (default: {constants.DEFAULT_CLI_MODEL_NAME})",
     )
 
     parser.add_argument(
         "--workers",
-        default="12",
-        help="Number of workers to use when evaluating",
+        type=int,
+        default=constants.DEFAULT_EVAL_WORKERS,
+        help=f"Number of workers to use when evaluating (default: {constants.DEFAULT_EVAL_WORKERS})",
     )
 
     args = parser.parse_args()
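One behavioral detail worth noting in the --workers change: the old code kept workers as the string "12", while type=int makes argparse convert and validate the value before it ever reaches run_swebench_evaluation. A small self-contained sketch of that argparse behavior:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--workers", type=int, default=12)

    args = parser.parse_args(["--workers", "8"])
    assert args.workers == 8  # argparse converted the CLI string to int

    # A non-numeric value now fails fast instead of propagating a bad string:
    # parser.parse_args(["--workers", "eight"])  -> error: invalid int value: 'eight'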
22 changes: 15 additions & 7 deletions benchmarks/swebench/run_infer.py
@@ -4,6 +4,7 @@

 from jinja2 import Environment, FileSystemLoader
 
+from benchmarks.swebench import constants
 from benchmarks.swebench.build_images import (
     extract_custom_tag,
     get_official_docker_image,
@@ -114,10 +115,12 @@ def prepare_workspace(
     Used by APIRemoteWorkspace for remote runtime allocation.
     """
     official_docker_image = get_official_docker_image(instance.id)
-    build_target = "source-minimal"
+    build_target = constants.DEFAULT_BUILD_TARGET
     custom_tag = extract_custom_tag(official_docker_image)
     # For non-binary targets, append target suffix
-    suffix = f"-{build_target}" if build_target != "binary" else ""
+    suffix = (
+        f"-{build_target}" if build_target != constants.BUILD_TARGET_BINARY else ""
+    )
     base_agent_image = (
         f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}"
     )
@@ -183,10 +186,15 @@ def prepare_workspace(
f"Using remote workspace with image {agent_server_image} "
f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})"
)
startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600"))
startup_timeout = float(
os.getenv(
"REMOTE_RUNTIME_STARTUP_TIMEOUT",
str(constants.DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT),
)
)
workspace = APIRemoteWorkspace(
runtime_api_url=os.getenv(
"RUNTIME_API_URL", "https://runtime.eval.all-hands.dev"
"RUNTIME_API_URL", constants.DEFAULT_RUNTIME_API_URL
),
runtime_api_key=runtime_api_key,
server_image=agent_server_image,
@@ -280,9 +288,9 @@ def evaluate_instance(
         # Use --no-verify to bypass pre-commit hooks (e.g., husky) that can fail
         workspace.execute_command(
             f"cd {repo_path} && "
-            "git config --global user.email 'evaluation@openhands.dev' && "
-            "git config --global user.name 'OpenHands Evaluation' && "
-            "git commit --no-verify -m 'patch'"
+            f"git config --global user.email '{constants.GIT_USER_EMAIL}' && "
+            f"git config --global user.name '{constants.GIT_USER_NAME}' && "
+            f"git commit --no-verify -m '{constants.GIT_COMMIT_MESSAGE}'"
         )
 
         # Get git patch
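The startup_timeout change keeps the environment override while sourcing the fallback from constants; the int constant is wrapped in str() so os.getenv's default matches its usual str return type, and float() then converts whichever value wins. A standalone sketch with the constant inlined:

    import os

    # Inlined stand-in for constants.DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT
    DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT = 600

    # Without str(), os.getenv would return str | int depending on whether the
    # env var is set; wrapping keeps the types uniform before float() converts.
    startup_timeout = float(
        os.getenv(
            "REMOTE_RUNTIME_STARTUP_TIMEOUT",
            str(DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT),
        )
    )
    print(startup_timeout)  # 600.0 unless REMOTE_RUNTIME_STARTUP_TIMEOUT is set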