26 changes: 18 additions & 8 deletions benchmarks/swtbench/build_eval_env_images.py
@@ -9,6 +9,16 @@

import docker

from benchmarks.swtbench.constants import (
BUILD_MODE_CHOICES,
DEFAULT_BUILD_BATCH_SIZE,
DEFAULT_BUILD_MAX_RETRIES,
DEFAULT_BUILD_MAX_WORKERS,
DEFAULT_BUILD_MODE,
DEFAULT_EVAL_LIMIT,
DEFAULT_SPLIT,
PREBAKED_REGISTRY,
)
from benchmarks.swtbench.image_utils import ensure_swt_bench_repo
from benchmarks.utils.dataset import get_dataset
from benchmarks.utils.image_utils import image_exists as remote_image_exists
@@ -258,11 +268,11 @@ def main() -> None:
description="Build and push prebaked SWT-bench eval env images."
)
parser.add_argument("--dataset", required=True, help="Dataset name")
parser.add_argument("--split", default="test", help="Dataset split")
parser.add_argument("--split", default=DEFAULT_SPLIT, help="Dataset split")
parser.add_argument(
"--eval-limit",
type=int,
default=1,
default=DEFAULT_EVAL_LIMIT,
help="Match inference sampling by limiting instances (0 to disable)",
)
parser.add_argument(
@@ -277,31 +287,31 @@
)
parser.add_argument(
"--image-prefix",
default="ghcr.io/openhands/swtbench-eval",
default=PREBAKED_REGISTRY,
help="Registry prefix for pushed images",
)
parser.add_argument(
"--max-workers",
type=int,
default=4,
default=DEFAULT_BUILD_MAX_WORKERS,
help="Parallel builds for env images",
)
parser.add_argument(
"--max-retries",
type=int,
default=2,
default=DEFAULT_BUILD_MAX_RETRIES,
help="Retries per batch for env image builds",
)
parser.add_argument(
"--build-batch-size",
type=int,
default=10,
default=DEFAULT_BUILD_BATCH_SIZE,
help="Number of env images to build per batch",
)
parser.add_argument(
"--build-mode",
choices=["api", "cli"],
default="cli",
choices=BUILD_MODE_CHOICES,
default=DEFAULT_BUILD_MODE,
help="swt-bench build mode",
)
parser.add_argument(
140 changes: 140 additions & 0 deletions benchmarks/swtbench/constants.py
@@ -0,0 +1,140 @@
"""
SWTBench Constants

This module serves as the single source of truth for all hyperparameters
and constant values used in the SWTBench evaluation workflow.
"""

from typing import Final, Literal, Tuple


# Type alias for build targets (matches openhands.sdk.workspace.TargetType)
TargetType = Literal["binary", "binary-minimal", "source", "source-minimal"]


# =============================================================================
# Docker/Image Related Constants
# =============================================================================

# Docker image prefixes
SWEBENCH_DOCKER_IMAGE_PREFIX: Final[str] = "docker.io/swebench/"
SWTBENCH_DOCKER_IMAGE_PREFIX: Final[str] = "docker.io/swtbench/"

# Agent server image base
AGENT_SERVER_IMAGE_BASE: Final[str] = "ghcr.io/all-hands-ai/agent-server"

# Prebaked evaluation images registry
PREBAKED_REGISTRY: Final[str] = "ghcr.io/openhands/swtbench-eval"

# Build target for agent server images
DEFAULT_BUILD_TARGET: Final[TargetType] = "source-minimal"

# Image tag constants
IMAGE_TAG_LATEST: Final[str] = "latest"
IMAGE_NAME_SEPARATOR: Final[str] = "1776"

# Build mode choices and default
BUILD_MODE_CHOICES: Final[Tuple[str, ...]] = ("api", "cli")
DEFAULT_BUILD_MODE: Final[str] = "cli"

# =============================================================================
# Dataset Related Constants
# =============================================================================

# Default dataset for evaluation
DEFAULT_DATASET: Final[str] = "princeton-nlp/SWE-bench_Verified"

# Default dataset split
DEFAULT_SPLIT: Final[str] = "test"

# Default model name for predictions
DEFAULT_MODEL_NAME: Final[str] = "OpenHands"

# =============================================================================
# Environment Variable Names
# =============================================================================

ENV_SKIP_BUILD: Final[str] = "SKIP_BUILD"
ENV_RUNTIME_API_KEY: Final[str] = "RUNTIME_API_KEY"
ENV_SDK_SHORT_SHA: Final[str] = "SDK_SHORT_SHA"
ENV_RUNTIME_API_URL: Final[str] = "RUNTIME_API_URL"
ENV_REMOTE_RUNTIME_STARTUP_TIMEOUT: Final[str] = "REMOTE_RUNTIME_STARTUP_TIMEOUT"
ENV_SWTBENCH_FORCE_CONDA: Final[str] = "SWTBENCH_FORCE_CONDA"

# =============================================================================
# Default Values
# =============================================================================

# Default value for SKIP_BUILD environment variable (truthy string)
DEFAULT_SKIP_BUILD: Final[str] = "1"

# Default runtime API URL
DEFAULT_RUNTIME_API_URL: Final[str] = "https://runtime.eval.all-hands.dev"

# Default startup timeout in seconds
DEFAULT_STARTUP_TIMEOUT: Final[int] = 600

# Default number of workers for evaluation
DEFAULT_EVAL_WORKERS: Final[int] = 12

# Default eval limit for image building
DEFAULT_EVAL_LIMIT: Final[int] = 1

# Default max workers for image building
DEFAULT_BUILD_MAX_WORKERS: Final[int] = 4

# Default max retries for image building
DEFAULT_BUILD_MAX_RETRIES: Final[int] = 2

# Default batch size for image building
DEFAULT_BUILD_BATCH_SIZE: Final[int] = 10

# =============================================================================
# File/Directory Paths
# =============================================================================

# SWT-bench repository directory name
SWT_BENCH_REPO_DIR: Final[str] = "swt-bench"

# Evaluation results directory name
EVALUATION_RESULTS_DIR: Final[str] = "evaluation_results"

# Report filename
REPORT_FILENAME: Final[str] = "output.report.json"

# Run ID prefix for evaluation
EVAL_RUN_ID_PREFIX: Final[str] = "eval_"

# Eval note prefix
EVAL_NOTE_PREFIX: Final[str] = "SWT-"

# =============================================================================
# Git/Repository Related Constants
# =============================================================================

# SWT-bench repository URL
SWT_BENCH_REPO_URL: Final[str] = "https://github.com/logic-star-ai/swt-bench.git"

# Git user configuration for commits
GIT_USER_EMAIL: Final[str] = "evaluation@openhands.dev"
GIT_USER_NAME: Final[str] = "OpenHands Evaluation"

# =============================================================================
# Patch Processing Constants
# =============================================================================

# Files to remove from patches during post-processing (tuple for immutability)
SETUP_FILES_TO_REMOVE: Final[Tuple[str, ...]] = (
"pyproject.toml",
"tox.ini",
"setup.py",
)

# =============================================================================
# Environment Setup Commands
# =============================================================================

# Default environment setup commands (tuple for immutability)
DEFAULT_ENV_SETUP_COMMANDS: Final[Tuple[str, ...]] = (
"export PIP_CACHE_DIR=~/.cache/pip",
)
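
For orientation, here is a minimal consumption sketch (a hypothetical script, not part of this PR) of the "single source of truth" pattern the new constants module is meant to support. The specific override pattern — environment variable first, constant as fallback — is an assumption for illustration; only the names are taken from constants.py above.

# Hypothetical consumer of benchmarks/swtbench/constants.py (not part of this PR).
import os

from benchmarks.swtbench.constants import (
    DEFAULT_RUNTIME_API_URL,
    DEFAULT_STARTUP_TIMEOUT,
    ENV_REMOTE_RUNTIME_STARTUP_TIMEOUT,
    ENV_RUNTIME_API_URL,
)

# Assumed pattern: environment variables take precedence, constants provide defaults.
runtime_api_url = os.getenv(ENV_RUNTIME_API_URL, DEFAULT_RUNTIME_API_URL)
startup_timeout = int(
    os.getenv(ENV_REMOTE_RUNTIME_STARTUP_TIMEOUT, str(DEFAULT_STARTUP_TIMEOUT))
)
print(f"runtime={runtime_api_url} timeout={startup_timeout}s")
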
58 changes: 34 additions & 24 deletions benchmarks/swtbench/eval_infer.py
@@ -18,6 +18,19 @@
from pathlib import Path
from time import monotonic

from benchmarks.swtbench.constants import (
DEFAULT_DATASET,
DEFAULT_EVAL_WORKERS,
DEFAULT_MODEL_NAME,
DEFAULT_SPLIT,
ENV_SWTBENCH_FORCE_CONDA,
EVAL_RUN_ID_PREFIX,
EVALUATION_RESULTS_DIR,
PREBAKED_REGISTRY,
REPORT_FILENAME,
SETUP_FILES_TO_REMOVE,
SWT_BENCH_REPO_DIR,
)
from benchmarks.swtbench.image_utils import (
compute_required_images,
ensure_swt_bench_repo,
@@ -30,8 +43,6 @@

logger = get_logger(__name__)

PREBAKED_REGISTRY = "ghcr.io/openhands/swtbench-eval"


def _load_prediction_instance_ids(predictions_file: Path) -> list[str]:
instance_ids: list[str] = []
@@ -67,7 +78,7 @@ def _load_prediction_instance_ids(predictions_file: Path) -> list[str]:
def try_pull_prebaked_images(
predictions_file: Path,
dataset: str,
split: str = "test",
split: str = DEFAULT_SPLIT,
registry: str = PREBAKED_REGISTRY,
) -> None:
"""
@@ -147,7 +158,7 @@ def update_report_with_submitted_instances(


def convert_to_swtbench_format(
input_file: str, output_file: str, model_name: str = "OpenHands"
input_file: str, output_file: str, model_name: str = DEFAULT_MODEL_NAME
) -> None:
"""
Convert OpenHands output.jsonl to SWT-Bench prediction format.
@@ -203,8 +214,7 @@ def convert_to_swtbench_format(
git_patch = ""

# postprocess git_patch
setup_files = ["pyproject.toml", "tox.ini", "setup.py"]
git_patch = remove_files_from_patch(git_patch, setup_files)
git_patch = remove_files_from_patch(git_patch, SETUP_FILES_TO_REMOVE)

# Create SWT-Bench format entry
swtbench_entry = {
@@ -236,8 +246,8 @@
def run_swtbench_evaluation(
predictions_file: str,
# Must use SWE-bench dataset because SWT-bench dataset (which is based on SWE-bench) contains a bug in their harness.
dataset: str = "princeton-nlp/SWE-bench_Verified",
workers: str = "12",
dataset: str = DEFAULT_DATASET,
workers: int = DEFAULT_EVAL_WORKERS,
) -> None:
"""
Run SWT-Bench evaluation on the predictions file.
@@ -252,7 +262,7 @@
dataset: SWT-Bench dataset to evaluate against
workers: Number of workers to use for evaluation
"""
use_legacy = os.getenv("SWTBENCH_FORCE_CONDA", "").lower() in ("1", "true", "yes")
use_legacy = os.getenv(ENV_SWTBENCH_FORCE_CONDA, "").lower() in ("1", "true", "yes")
mode = "legacy-conda" if use_legacy else "prebaked-images"
logger.info("Running SWT-Bench evaluation on %s (mode=%s)", predictions_file, mode)

@@ -301,7 +311,7 @@
"--max_workers",
str(workers),
"--run_id",
f"eval_{predictions_path.stem}",
f"{EVAL_RUN_ID_PREFIX}{predictions_path.stem}",
]

logger.info(f"Using Python executable: {python_executable}")
@@ -359,9 +369,8 @@ def main() -> None:
# Must use SWE-bench dataset because SWT-bench dataset (which is based on SWE-bench) contains a bug in their harness.
parser.add_argument(
"--dataset",
default="princeton-nlp/SWE-bench_Verified",
help="SWT-Bench dataset to evaluate against "
"(default: princeton-nlp/SWE-bench_Verified)",
default=DEFAULT_DATASET,
help=f"SWT-Bench dataset to evaluate against (default: {DEFAULT_DATASET})",
)

parser.add_argument(
@@ -378,14 +387,15 @@

parser.add_argument(
"--model-name",
default="OpenHands",
help="Model name to use in the model_name_or_path field (default: OpenHands)",
default=DEFAULT_MODEL_NAME,
help=f"Model name to use in the model_name_or_path field (default: {DEFAULT_MODEL_NAME})",
)

parser.add_argument(
"--workers",
default="12",
help="Number of workers to use when evaluating",
type=int,
default=DEFAULT_EVAL_WORKERS,
help=f"Number of workers to use when evaluating (default: {DEFAULT_EVAL_WORKERS})",
)

args = parser.parse_args()
@@ -414,8 +424,8 @@ def main() -> None:
# Convert format
convert_to_swtbench_format(str(input_file), str(output_file), args.model_name)

# Default: use prebaked images; SWTbenCH_FORCE_CONDA opts into legacy flow.
use_prebaked = os.getenv("SWTBENCH_FORCE_CONDA", "").lower() not in (
# Default: use prebaked images; SWTBENCH_FORCE_CONDA opts into legacy flow.
use_prebaked = os.getenv(ENV_SWTBENCH_FORCE_CONDA, "").lower() not in (
"1",
"true",
"yes",
@@ -427,7 +437,7 @@
)
else:
logger.info(
"SWTBENCH_FORCE_CONDA set; skipping prebaked image pull "
f"{ENV_SWTBENCH_FORCE_CONDA} set; skipping prebaked image pull "
"and using legacy (pre-mamba) evaluation flow"
)

@@ -440,14 +450,14 @@
cleanup_phase_start = monotonic()
# Move SWT-Bench evaluation report to same folder as output.jsonl
cache_dir = Path.home() / ".cache" / "openhands" / "swt-bench"
swt_bench_dir = cache_dir / "swt-bench"
report_dir = swt_bench_dir / "evaluation_results"
run_id = f"eval_{output_file.stem}"
swt_bench_dir = cache_dir / SWT_BENCH_REPO_DIR
report_dir = swt_bench_dir / EVALUATION_RESULTS_DIR
run_id = f"{EVAL_RUN_ID_PREFIX}{output_file.stem}"
model_name_safe = args.model_name.replace("/", "__")
report_file = report_dir / f"{model_name_safe}.{run_id}.json"

target_dir = input_file.parent
target_file = target_dir / "output.report.json"
target_file = target_dir / REPORT_FILENAME
shutil.move(str(report_file), str(target_file))
logger.info(f"Moved evaluation report to: {target_file}")
update_report_with_submitted_instances(target_file, output_file)
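
The legacy-conda opt-in hinges on a simple truthy-string check of SWTBENCH_FORCE_CONDA. Below is a self-contained sketch of that pattern as it appears in eval_infer.py; the helper function name is hypothetical and added only for illustration.

# Sketch of the SWTBENCH_FORCE_CONDA opt-in check used in eval_infer.py.
import os

from benchmarks.swtbench.constants import ENV_SWTBENCH_FORCE_CONDA


def force_conda_requested() -> bool:
    # True when the variable is set to a truthy value: "1", "true", or "yes"
    # (case-insensitive), matching the check in run_swtbench_evaluation().
    return os.getenv(ENV_SWTBENCH_FORCE_CONDA, "").lower() in ("1", "true", "yes")


if __name__ == "__main__":
    # Mirrors the mode selection: legacy conda flow vs. prebaked eval images.
    print("legacy-conda" if force_conda_requested() else "prebaked-images")
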