34 commits
025a941
Fix dataset loading schema validation issue in CI
juanmichelini Jan 13, 2026
6da366d
Add trust_remote_code=True to dataset loading
juanmichelini Jan 13, 2026
59190a1
Add manual dataset loading fallback for schema validation issues
juanmichelini Jan 13, 2026
7f3373a
Fix UnboundLocalError for os import
juanmichelini Jan 13, 2026
7469400
fix: correct exception handling in dataset loading retry logic
juanmichelini Jan 13, 2026
4f68c31
fix: load parquet files directly to bypass schema validation issues
juanmichelini Jan 13, 2026
ca511d3
fix: use streaming dataset to bypass schema validation
juanmichelini Jan 13, 2026
78e6165
fix: fallback to 'train' split when requested split doesn't exist
juanmichelini Jan 13, 2026
4d13d26
fix: use 'number' field instead of 'version' in Multi-SWE-Bench dataset
juanmichelini Jan 13, 2026
caae3e5
fix: lowercase Docker repository names for Multi-SWE-Bench
juanmichelini Jan 13, 2026
85a8297
Allow up to 5 image build failures in multiswebench build workflow
juanmichelini Jan 13, 2026
9a4a56f
fix: specify --output-dir builds in multiswebench build workflow
juanmichelini Jan 13, 2026
191a1bd
feat: add real-time build progress logging
juanmichelini Jan 13, 2026
d8b3ecd
fix: respect n-limit parameter in multiswebench build_images.py
juanmichelini Jan 13, 2026
e854657
fix: use get_dataset() utility in multiswebench run_infer.py
juanmichelini Jan 13, 2026
7e6d413
fix: correct attribute name from eval_n_limit to eval_limit in run_in…
juanmichelini Jan 14, 2026
403e7cc
fix: restore language filtering in multiswebench run_infer.py
juanmichelini Jan 14, 2026
3e0cce0
fix: apply language filtering to build_images.py
juanmichelini Jan 14, 2026
de7ee86
fix: correct import path for format_data_for_inference in build_image…
juanmichelini Jan 14, 2026
4b59f19
fix(multiswebench): handle bytedance-research dataset name and Path c…
juanmichelini Jan 14, 2026
2e200f1
fix(multiswebench): apply n_limit before extracting base images
juanmichelini Jan 14, 2026
5ff73fe
fix(multiswebench): pass push and custom_tag_fn to build_all_images
juanmichelini Jan 14, 2026
c934cb0
Fix instance selection inconsistency between build and evaluation
juanmichelini Jan 14, 2026
39abd36
Fix multiswebench build_images.py to use correct worker and retry arg…
juanmichelini Jan 15, 2026
c21c615
Add workaround for apt repository metadata changes in Multi-SWE-Bench…
juanmichelini Jan 15, 2026
970f0ad
Merge remote-tracking branch 'origin/main' into fix-dataset-schema-va…
juanmichelini Jan 16, 2026
09e8b4d
Fix race condition in Dockerfile patching with file locking
juanmichelini Jan 16, 2026
1fdf9ed
Fix: Reset repo to base_commit to prevent including PR changes in patch
juanmichelini Jan 17, 2026
e285f3b
Change eval_infer default split from test to train
juanmichelini Jan 18, 2026
0f2c515
Fix Docker-in-Docker file mounting issue
juanmichelini Jan 18, 2026
2c9c0e8
Switch to GitHub-hosted runners for reliable capacity
juanmichelini Jan 19, 2026
27df41d
Switch to 8-core GitHub runners for better availability
juanmichelini Jan 19, 2026
e382829
Merge remote-tracking branch 'origin/main' into fix-dataset-schema-va…
juanmichelini Jan 20, 2026
0a1f796
fix: Use Blacksmith 32vCPU runners for multiswebench image builds
juanmichelini Jan 20, 2026
16 changes: 13 additions & 3 deletions .github/workflows/build-multiswebench-images.yml
@@ -235,7 +235,8 @@ jobs:
--image ghcr.io/openhands/eval-agent-server \
--push \
--max-workers '${MAX_WORKERS}' \
--max-retries '${MAX_RETRIES}'"
--max-retries '${MAX_RETRIES}' \
--output-dir builds"

# Only include --n-limit if provided (non-empty)
if [ -n "${N_LIMIT}" ]; then
@@ -252,6 +253,7 @@
BUILDKIT_PROGRESS: plain
BUILDKIT_RESET_ON_FAILURE: 1
LANGUAGE: ${{ env.LANGUAGE }}
PYTHONUNBUFFERED: 1

- name: Archive build logs
if: always()
@@ -324,9 +326,17 @@ jobs:
PY
fi

if [ "$FAILURES" -gt 0 ]; then
echo "::error::Detected $FAILURES failed or missing agent-server images out of $TOTAL"
# Allow up to 5 failures or 85% success rate (whichever is more lenient)
MAX_ALLOWED_FAILURES=5
MIN_SUCCESS_RATE=85
SUCCESS_RATE=$((SUCCESSES * 100 / TOTAL))

if [ "$FAILURES" -gt "$MAX_ALLOWED_FAILURES" ] && [ "$SUCCESS_RATE" -lt "$MIN_SUCCESS_RATE" ]; then
echo "::error::Too many failures: $FAILURES failed out of $TOTAL (success rate: $SUCCESS_RATE%)"
echo "::error::Maximum allowed failures: $MAX_ALLOWED_FAILURES or minimum success rate: $MIN_SUCCESS_RATE%"
exit 1
elif [ "$FAILURES" -gt 0 ]; then
echo "::warning::Detected $FAILURES failed images out of $TOTAL (success rate: $SUCCESS_RATE%), but within acceptable threshold"
fi

- name: Comment on tracker issue
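The threshold step above fails the job only when both limits are exceeded, i.e. the build passes if either the absolute-failure cap or the success-rate floor is satisfied. A minimal Python sketch of that gate (function and variable names invented for illustration; it assumes SUCCESSES is TOTAL minus FAILURES):

def build_should_fail(failures: int, successes: int, total: int,
                      max_allowed_failures: int = 5,
                      min_success_rate: int = 85) -> bool:
    """Mirror of the workflow gate: fail only when BOTH limits are exceeded."""
    success_rate = successes * 100 // total  # integer percent, like the shell arithmetic
    return failures > max_allowed_failures and success_rate < min_success_rate

# 6 failures out of 50 builds (88% success) is tolerated: the success-rate floor still holds.
assert build_should_fail(failures=6, successes=44, total=50) is False
# 6 failures out of 20 builds (70% success) exceeds both limits, so the job fails.
assert build_should_fail(failures=6, successes=14, total=20) is True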
75 changes: 67 additions & 8 deletions benchmarks/multiswebench/build_images.py
@@ -8,15 +8,21 @@
--image ghcr.io/openhands/eval-agent-server --target source-minimal
"""

import json
import os
import tempfile
from pathlib import Path

import pandas as pd

from benchmarks.multiswebench.download_dataset import download_and_concat_dataset
from benchmarks.multiswebench.scripts.data.data_change import format_data_for_inference
from benchmarks.utils.build_utils import (
build_all_images,
default_build_output_dir,
get_build_parser,
)
from benchmarks.utils.dataset import get_dataset
from benchmarks.utils.dataset import get_dataset, prepare_dataset
from openhands.sdk import get_logger


@@ -37,7 +43,8 @@ def get_official_docker_image(

# For Multi-SWE-Bench, the image naming depends on the language
repo = instance["repo"]
version = instance["version"]
# Multi-SWE-Bench dataset uses "number" instead of "version"
version = instance.get("version", str(instance.get("number", "")))

if LANGUAGE == "python":
# Use SWE-bench style naming for Python
@@ -52,7 +59,7 @@
else:
org = instance.get("org", repo)
repo_name = repo
official_image_name = f"{docker_image_prefix}/{org}_m_{repo_name}:base"
official_image_name = f"{docker_image_prefix}/{org}_m_{repo_name}:base".lower()

logger.debug(f"Multi-SWE-Bench image: {official_image_name}")
return official_image_name
@@ -77,9 +84,53 @@ def extract_custom_tag(base_image: str) -> str:
return name


def get_base_images_from_dataset(dataset_name: str, split: str) -> list[str]:
def get_base_images_from_dataset(
dataset_name: str,
split: str,
n_limit: int | None = None,
selected_instances_file: str | None = None,
) -> list[str]:
"""Get all unique base images from the dataset."""
dataset = get_dataset(dataset_name, split)
# Check if this is a Multi-SWE-bench dataset that needs language filtering
if "Multi-SWE-bench" in dataset_name or "Multi-SWE-Bench" in dataset_name:
logger.info(
f"Downloading Multi-SWE-bench dataset for language: {LANGUAGE}"
)
downloaded_path = download_and_concat_dataset(dataset_name, LANGUAGE)

# Create a temporary formatted file
with tempfile.NamedTemporaryFile(
mode="w", suffix=".jsonl", delete=False
) as temp_file:
formatted_path = temp_file.name

format_data_for_inference(downloaded_path, formatted_path)
logger.info(f"Using formatted dataset: {formatted_path}")

# Load dataset from the local file
logger.info(f"Loading dataset {formatted_path}")
data = []
with open(formatted_path, "r") as f:
for line in f:
data.append(json.loads(line))

dataset = pd.DataFrame(data)

# Apply n_limit using prepare_dataset for consistency with evaluation
dataset = prepare_dataset(
dataset,
n_limit=n_limit,
selected_instances_file=selected_instances_file,
)
else:
# For non-Multi-SWE-bench datasets, use get_dataset
dataset = get_dataset(
dataset_name,
split,
eval_limit=n_limit if n_limit else None,
selected_instances_file=selected_instances_file,
)

base_images = set()

for _, row in dataset.iterrows():
@@ -95,7 +146,12 @@ def main():
args = parser.parse_args()

# Get base images from dataset
base_images = get_base_images_from_dataset(args.dataset, args.split)
base_images = get_base_images_from_dataset(
args.dataset,
args.split,
n_limit=args.n_limit if args.n_limit > 0 else None,
selected_instances_file=args.select,
)

logger.info(f"Found {len(base_images)} unique base images")

Expand All @@ -107,8 +163,11 @@ def main():
build_dir=Path(
args.output_dir or default_build_output_dir(args.dataset, args.split)
),
max_workers=args.num_workers,
dry_run=False,
max_workers=args.max_workers,
push=args.push,
max_retries=args.max_retries,
base_image_to_custom_tag_fn=extract_custom_tag,
dry_run=args.dry_run,
)


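Two of the image-naming fixes above are easy to miss in the hunk: the Multi-SWE-Bench rows expose "number" rather than "version", and Docker rejects uppercase repository names, hence the trailing .lower(). A minimal sketch of the non-Python naming branch (the prefix default is a stand-in for the docker_image_prefix argument, and the sample instance is hypothetical):

def official_image_name(instance: dict, prefix: str = "mswebench") -> str:
    """Illustrative mirror of the non-Python naming branch above."""
    repo = instance["repo"]
    org = instance.get("org", repo)
    # Docker repository names must be lowercase, hence the final .lower()
    return f"{prefix}/{org}_m_{repo}:base".lower()

# Hypothetical Java row with a mixed-case org name:
print(official_image_name({"org": "Alibaba", "repo": "fastjson2"}))
# -> mswebench/alibaba_m_fastjson2:base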
38 changes: 33 additions & 5 deletions benchmarks/multiswebench/eval_infer.py
@@ -48,19 +48,35 @@ def run_multi_swebench_evaluation(
if dataset_name is None:
dataset_name = "bytedance-research/Multi-SWE-Bench"
if split is None:
split = "test"
split = "train"

try:
if input_file is None:
raise ValueError("input_file cannot be None")
input_path = Path(input_file)
work_dir = input_path.parent
original_work_dir = work_dir # Save original for copying back results

# Check if running in K8s with Docker-in-Docker shared volume
shared_dir = Path("/shared")
using_shared = False
if shared_dir.exists() and shared_dir.is_dir():
logger.info("Detected /shared volume (Docker-in-Docker), copying eval outputs...")
# Copy work_dir to /shared so DinD can access it
shared_work_dir = shared_dir / work_dir.name
if shared_work_dir.exists():
shutil.rmtree(shared_work_dir)
shutil.copytree(work_dir, shared_work_dir, symlinks=True)
work_dir = shared_work_dir
input_file = str(shared_work_dir / input_path.name)
using_shared = True
logger.info(f"Using shared work_dir: {work_dir}")

# Create config file for Multi-SWE-Bench
config_file = work_dir / "config.json"

# Handle dataset path - download if it's a ByteDance-Seed/Multi-SWE-bench dataset
if dataset_name.startswith("ByteDance-Seed/Multi-SWE-bench"):
# Handle dataset path - download if it's a Multi-SWE-Bench HuggingFace dataset
if dataset_name.startswith(("ByteDance-Seed/Multi-SWE-bench", "bytedance-research/Multi-SWE-Bench")):
logger.info(f"Downloading Multi-SWE-bench dataset for language: {lang}")
dataset_path = download_and_concat_dataset(dataset_name, lang)
else:
Expand Down Expand Up @@ -92,6 +108,18 @@ def run_multi_swebench_evaluation(

logger.info(f"Return code: {result.returncode}")

# Copy results back from /shared to original location
if using_shared:
logger.info(f"Copying results back from {work_dir} to {original_work_dir}")
# Only copy back the eval_files directory (contains results)
eval_files_src = work_dir / "eval_files"
eval_files_dst = original_work_dir / "eval_files"
if eval_files_src.exists():
if eval_files_dst.exists():
shutil.rmtree(eval_files_dst)
shutil.copytree(eval_files_src, eval_files_dst, symlinks=True)
logger.info("Results copied back successfully")

if result.returncode != 0:
error_msg = f"Evaluation failed with return code {result.returncode}"
print(f"ERROR: {error_msg}")
@@ -113,7 +141,7 @@ def main():
parser.add_argument(
"--dataset", default="bytedance-research/Multi-SWE-Bench", help="Dataset name"
)
parser.add_argument("--split", default="test", help="Dataset split")
parser.add_argument("--split", default="train", help="Dataset split")
parser.add_argument(
"--lang", default="java", help="Language for Multi-SWE-bench dataset"
)
Expand All @@ -140,7 +168,7 @@ def main():
logger.info(f"Results saved to {results_file}")

# Move the report file to the output location
output_report_path = args.input_file.with_suffix(".report.json")
output_report_path = Path(args.input_file).with_suffix(".report.json")
shutil.move(str(results_file), str(output_report_path))
logger.info(f"Report moved to {output_report_path}")

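The /shared handling above stages the working directory into a volume the Docker-in-Docker daemon can actually see, then copies only the eval_files directory back once the run finishes. A minimal sketch of that round trip (helper names invented for illustration; it assumes the shared volume is mounted at /shared as in the diff):

import shutil
from pathlib import Path

def stage_for_dind(work_dir: Path, shared_root: Path = Path("/shared")) -> Path:
    """Copy work_dir into the shared volume if one is mounted; otherwise leave it in place."""
    if not shared_root.is_dir():
        return work_dir
    staged = shared_root / work_dir.name
    if staged.exists():
        shutil.rmtree(staged)
    shutil.copytree(work_dir, staged, symlinks=True)
    return staged

def collect_results(staged: Path, original: Path) -> None:
    """Copy only the eval_files directory (the results) back to the original location."""
    src, dst = staged / "eval_files", original / "eval_files"
    if src.exists():
        if dst.exists():
            shutil.rmtree(dst)
        shutil.copytree(src, dst, symlinks=True)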
65 changes: 41 additions & 24 deletions benchmarks/multiswebench/run_infer.py
@@ -118,10 +118,11 @@ def __init__(self, metadata: MultiSWEBenchEvalMetadata, **kwargs):
def prepare_instances(self) -> List[EvalInstance]:
logger.info("Setting up Multi-SWE-bench evaluation data")

# Check if this is a ByteDance-Seed/Multi-SWE-bench dataset that needs downloading
dataset_path = self.metadata.dataset
if dataset_path.startswith("ByteDance-Seed/Multi-SWE-bench"):
metadata = cast(MultiSWEBenchEvalMetadata, self.metadata)
metadata = cast(MultiSWEBenchEvalMetadata, self.metadata)
dataset_path = metadata.dataset

# Check if this is a Multi-SWE-bench dataset that needs language filtering
if "Multi-SWE-bench" in dataset_path or "Multi-SWE-Bench" in dataset_path:
logger.info(
f"Downloading Multi-SWE-bench dataset for language: {metadata.lang}"
)
Expand All @@ -138,15 +139,27 @@ def prepare_instances(self) -> List[EvalInstance]:
format_data_for_inference(downloaded_path, formatted_path)
dataset_path = formatted_path
logger.info(f"Using formatted dataset: {dataset_path}")
else:
# For non-Multi-SWE-bench datasets (e.g., local files), use get_dataset
from benchmarks.utils.dataset import get_dataset
logger.info(f"Loading dataset {metadata.dataset}")

df = get_dataset(
dataset_name=metadata.dataset,
split=metadata.dataset_split,
eval_limit=self.metadata.eval_limit if self.metadata.eval_limit > 0 else None,
selected_instances_file=metadata.selected_instances_file,
)

# Load dataset using direct JSON loading to handle complex nested structures
logger.info(f"Loading dataset {dataset_path}")
data = []
with open(dataset_path, "r") as f:
for line in f:
data.append(json.loads(line))
# Load dataset from the local file (for Multi-SWE-bench path)
if "Multi-SWE-bench" in metadata.dataset or "Multi-SWE-Bench" in metadata.dataset:
logger.info(f"Loading dataset {dataset_path}")
data = []
with open(dataset_path, "r") as f:
for line in f:
data.append(json.loads(line))

df = pd.DataFrame(data)
df = pd.DataFrame(data)

# Filter out instances with NaN instance_id before applying limits
original_count = len(df)
Expand Down Expand Up @@ -353,8 +366,22 @@ def evaluate_instance(
f"cp_testebed_repo failed: {cp_testebed_repo.stderr}"
)

# git reset
git_reset = workspace.execute_command(f"cd {repo_path} ; git reset --hard")
# Get base_commit first - handle both SWE-Bench and Multi-SWE-Bench data formats
if "base" in instance.data and isinstance(instance.data["base"], dict):
# SWE-Bench format: {"base": {"sha": "..."}}
base_commit = instance.data["base"]["sha"]
elif "base_commit" in instance.data:
# Multi-SWE-Bench format: {"base_commit": "..."}
base_commit = instance.data["base_commit"]
else:
raise ValueError(
f"No base commit found in instance data. Available keys: {list(instance.data.keys())}"
)

logger.info("base_commit: %s", base_commit)

# git reset to base_commit (not just --hard which stays on current commit)
git_reset = workspace.execute_command(f"cd {repo_path} ; git reset --hard {base_commit}")
assert git_reset.exit_code == 0, f"git reset failed: {git_reset.stderr}"

metadata = cast(MultiSWEBenchEvalMetadata, self.metadata)
Expand All @@ -378,17 +405,7 @@ def evaluate_instance(
"git commit -m 'patch'"
)

# Get git patch - handle both SWE-Bench and Multi-SWE-Bench data formats
if "base" in instance.data and isinstance(instance.data["base"], dict):
# SWE-Bench format: {"base": {"sha": "..."}}
base_commit = instance.data["base"]["sha"]
elif "base_commit" in instance.data:
# Multi-SWE-Bench format: {"base_commit": "..."}
base_commit = instance.data["base_commit"]
else:
raise ValueError(
f"No base commit found in instance data. Available keys: {list(instance.data.keys())}"
)
# Get git patch (base_commit already extracted earlier)
git_patch_result = workspace.execute_command(
(f"cd {repo_path} ; git --no-pager diff --no-color {base_commit} HEAD")
)
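The base-commit lookup above is moved ahead of the git reset so the repo is pinned to the right commit before the patch is generated, and it accepts both row layouts. A minimal helper-style sketch of that branching (the function name is invented for illustration):

def resolve_base_commit(instance_data: dict) -> str:
    """Return the base commit SHA from either a SWE-Bench or Multi-SWE-Bench row."""
    if isinstance(instance_data.get("base"), dict):
        return instance_data["base"]["sha"]   # SWE-Bench format: {"base": {"sha": "..."}}
    if "base_commit" in instance_data:
        return instance_data["base_commit"]   # Multi-SWE-Bench format: {"base_commit": "..."}
    raise ValueError(
        f"No base commit found; available keys: {list(instance_data.keys())}"
    )

# The workspace then runs `git reset --hard <base_commit>` before applying the model
# patch, so the generated diff cannot accidentally include the gold PR changes.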