Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 11 additions & 5 deletions benchmarks/commit0/build_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,13 @@

from commit0.harness.constants import SPLIT

from benchmarks.commit0.constants import (
CUSTOM_TAG_PREFIX,
DEFAULT_DATASET,
DEFAULT_DOCKER_IMAGE_PREFIX,
DEFAULT_IMAGE_TAG,
DEFAULT_REPO_SPLIT,
)
from benchmarks.utils.build_utils import (
build_all_images,
default_build_output_dir,
Expand All @@ -22,7 +29,6 @@


logger = get_logger(__name__)
DEFAULT_DOCKER_IMAGE_PREFIX = "docker.io/wentingzhao/"


def get_base_docker_image(
Expand All @@ -33,14 +39,14 @@ def get_base_docker_image(
prefix = docker_image_prefix or os.getenv(
"EVAL_DOCKER_IMAGE_PREFIX", DEFAULT_DOCKER_IMAGE_PREFIX
)
return (prefix.rstrip("/") + "/" + repo_name).lower() + ":v0"
return (prefix.rstrip("/") + "/" + repo_name).lower() + f":{DEFAULT_IMAGE_TAG}"


def extract_custom_tag(base_image: str) -> str:
"""Extract Commit0 custom tag from a base image name.

Takes the text after the last "/" in ``base_image``, drops everything from
the first ":" onward (the image tag), lowercases what is left, and prefixes
it with "commit0-".

Example: "docker.io/wentingzhao/Foo:v0" -> "commit0-foo".
"""
repo_tag = base_image.rsplit("/", 1)[-1]
repo_name = repo_tag.split(":", 1)[0].lower()
return f"commit0-{repo_name}"
return f"{CUSTOM_TAG_PREFIX}{repo_name}"


def _load_selected_instances(selected_instances_file: str) -> list[str]:
Expand Down Expand Up @@ -90,7 +96,7 @@ def main(argv: list[str]) -> int:
parser.add_argument(
"--repo-split",
type=str,
default="lite",
default=DEFAULT_REPO_SPLIT,
help="Commit0 repo split (lite, all, or repo name)",
)
parser.add_argument(
Expand All @@ -99,7 +105,7 @@ def main(argv: list[str]) -> int:
default="",
help="Override base image prefix (default: env EVAL_DOCKER_IMAGE_PREFIX)",
)
parser.set_defaults(dataset="wentingzhao/commit0_combined")
parser.set_defaults(dataset=DEFAULT_DATASET)
args = parser.parse_args(argv)

docker_image_prefix = args.docker_image_prefix or None
Expand Down
35 changes: 35 additions & 0 deletions benchmarks/commit0/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
"""
Commit0 Benchmark Constants

This module serves as the single source of truth for all hyperparameters
and constant values used in the Commit0 benchmark evaluation workflow.
"""

# Dataset configuration
# Dataset identifier used as the default `dataset` argument of the build and
# inference CLIs and handed to `load_dataset` in run_infer.py.
DEFAULT_DATASET = "wentingzhao/commit0_combined"
# Value passed as `split=` to `load_dataset`.
DEFAULT_DATASET_SPLIT = "test"
# Default for the `--repo-split` CLI option; per its help text the accepted
# values are "lite", "all", or a single repo name.
DEFAULT_REPO_SPLIT = "lite"

# Docker image configuration
# Fallback registry/namespace prefix, used when no prefix argument is given and
# the EVAL_DOCKER_IMAGE_PREFIX environment variable is unset. The trailing "/"
# is harmless: get_base_docker_image() strips it before joining.
DEFAULT_DOCKER_IMAGE_PREFIX = "docker.io/wentingzhao/"
# Appended as ":<tag>" to the lowercased "<prefix>/<repo_name>" image name.
DEFAULT_IMAGE_TAG = "v0"
# Prepended to the lowercased repo name by extract_custom_tag().
CUSTOM_TAG_PREFIX = "commit0-"

# Build configuration
# run_infer.py checks whether the substring "source" occurs in this value to
# choose between target_type "source" and "binary" for the remote workspace.
BUILD_TARGET = "source-minimal"

# Git configuration
# Branch checked out by `git clone -b ...` when cloning the instance repo.
GIT_BRANCH_NAME = "commit0_combined"
# Branch created with `git checkout -b ...` for the agent's edits; the same
# value is also interpolated into the evaluation commit message.
AGENT_BRANCH_NAME = "openhands"

# Model configuration
# Default for the report's "model_name_or_path" field and the `--model-name`
# CLI option in eval_infer.py.
DEFAULT_MODEL_NAME = "openhands"

# Runtime configuration
# Fallback when the RUNTIME_API_URL environment variable is unset.
DEFAULT_RUNTIME_API_URL = "https://runtime.eval.all-hands.dev"
# Fallback for the REMOTE_RUNTIME_STARTUP_TIMEOUT environment variable
# (stringified, then parsed with float()). Presumably seconds -- TODO confirm.
DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT = 600
# Fallback for the CONVERSATION_TIMEOUT environment variable (stringified, then
# parsed with int()) and passed to conversation.run(timeout=...).
# Presumably seconds -- TODO confirm.
DEFAULT_CONVERSATION_TIMEOUT = 3600
# `timeout=` for workspace.execute_command() calls in run_infer.py; the git
# diff retry loop adds 100 per retry on top of this base value.
DEFAULT_COMMAND_TIMEOUT = 600

# Evaluation configuration
# Written verbatim as "total_instances" in the evaluation report, independent
# of how many instances were actually processed.
# NOTE(review): presumably the size of the "lite" repo split -- confirm; the
# report will be off for any other --repo-split value.
TOTAL_INSTANCES = 16
9 changes: 5 additions & 4 deletions benchmarks/commit0/eval_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import sys
from pathlib import Path

from benchmarks.commit0.constants import DEFAULT_MODEL_NAME, TOTAL_INSTANCES
from benchmarks.utils.laminar import LaminarService
from benchmarks.utils.report_costs import generate_cost_report

Expand All @@ -27,7 +28,7 @@


def process_commit0_results(
input_file: str, output_file: str, model_name: str = "openhands"
input_file: str, output_file: str, model_name: str = DEFAULT_MODEL_NAME
) -> None:
"""
Process Commit0 output.jsonl and generate evaluation report.
Expand Down Expand Up @@ -123,7 +124,7 @@ def process_commit0_results(
# Generate report
report = {
"model_name_or_path": model_name,
"total_instances": 16, # Fixed as per requirement
"total_instances": TOTAL_INSTANCES,
"submitted_instances": len(completed_ids),
"completed_instances": len(completed_ids),
"resolved_instances": len(resolved_ids),
Expand Down Expand Up @@ -174,8 +175,8 @@ def main() -> None:

parser.add_argument(
"--model-name",
default="openhands",
help="Model name to use in the model_name_or_path field (default: openhands)",
default=DEFAULT_MODEL_NAME,
help=f"Model name to use in the model_name_or_path field (default: {DEFAULT_MODEL_NAME})",
)

args = parser.parse_args()
Expand Down
83 changes: 54 additions & 29 deletions benchmarks/commit0/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,18 @@
extract_custom_tag,
get_base_docker_image,
)
from benchmarks.commit0.constants import (
AGENT_BRANCH_NAME,
BUILD_TARGET,
DEFAULT_COMMAND_TIMEOUT,
DEFAULT_CONVERSATION_TIMEOUT,
DEFAULT_DATASET,
DEFAULT_DATASET_SPLIT,
DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT,
DEFAULT_REPO_SPLIT,
DEFAULT_RUNTIME_API_URL,
GIT_BRANCH_NAME,
)
from benchmarks.utils.args_parser import get_parser
from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
from benchmarks.utils.conversation import build_event_persistence_callback
Expand Down Expand Up @@ -110,9 +122,9 @@ def __init__(
self,
metadata: EvalMetadata,
num_workers: int = 1,
repo_split: str = "lite",
dataset_name: str = "wentingzhao/commit0_combined",
dataset_split: str = "test",
repo_split: str = DEFAULT_REPO_SPLIT,
dataset_name: str = DEFAULT_DATASET,
dataset_split: str = DEFAULT_DATASET_SPLIT,
):
super().__init__(metadata=metadata, num_workers=num_workers)
# Store additional parameters in metadata.details for access in methods
Expand All @@ -130,9 +142,9 @@ def prepare_instances(self) -> List[EvalInstance]:
logger.info("Setting up Commit0 evaluation data")

details = self.metadata.details or {}
dataset_name = details.get("dataset_name", "wentingzhao/commit0_combined")
dataset_split = details.get("dataset_split", "test")
repo_split = details.get("repo_split", "lite")
dataset_name = details.get("dataset_name", DEFAULT_DATASET)
dataset_split = details.get("dataset_split", DEFAULT_DATASET_SPLIT)
repo_split = details.get("repo_split", DEFAULT_REPO_SPLIT)

dataset = load_dataset(dataset_name, split=dataset_split)
df = commit0_setup(dataset, repo_split)
Expand Down Expand Up @@ -180,7 +192,7 @@ def prepare_workspace(
"""
repo_name = instance.data["repo"].split("/")[1]
base_docker_image = get_base_docker_image(repo_name)
build_target = "source-minimal"
build_target = BUILD_TARGET
logger.info(f"Using base docker image: {base_docker_image}")

if self.metadata.workspace_type == "docker":
Expand Down Expand Up @@ -218,11 +230,14 @@ def prepare_workspace(
f"Using remote workspace with image {agent_server_image} "
f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})"
)
startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600"))
startup_timeout = float(
os.getenv(
"REMOTE_RUNTIME_STARTUP_TIMEOUT",
str(DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT),
)
)
workspace = APIRemoteWorkspace(
runtime_api_url=os.getenv(
"RUNTIME_API_URL", "https://runtime.eval.all-hands.dev"
),
runtime_api_url=os.getenv("RUNTIME_API_URL", DEFAULT_RUNTIME_API_URL),
runtime_api_key=runtime_api_key,
server_image=agent_server_image,
target_type="source" if "source" in build_target else "binary",
Expand All @@ -238,30 +253,34 @@ def prepare_workspace(

# Clone the repository to the specific directory
workspace_dir_name = instance.data["repo"].split("/")[1]
clone_cmd = f"cd /workspace/ && git clone -b commit0_combined https://github.com/{instance.data['repo']}.git {workspace_dir_name}"
res = workspace.execute_command(clone_cmd, timeout=600)
clone_cmd = f"cd /workspace/ && git clone -b {GIT_BRANCH_NAME} https://github.com/{instance.data['repo']}.git {workspace_dir_name}"
res = workspace.execute_command(clone_cmd, timeout=DEFAULT_COMMAND_TIMEOUT)
if res.exit_code != 0:
raise RuntimeError(f"Failed to clone repo: {res.stderr}")
logger.info(f"Cloned repository: {instance.data['repo']}")

# Create new branch
branch_cmd = f"cd /workspace/{workspace_dir_name} && git checkout -b openhands"
res = workspace.execute_command(branch_cmd, timeout=600)
branch_cmd = (
f"cd /workspace/{workspace_dir_name} && git checkout -b {AGENT_BRANCH_NAME}"
)
res = workspace.execute_command(branch_cmd, timeout=DEFAULT_COMMAND_TIMEOUT)
if res.exit_code != 0:
raise RuntimeError(f"Failed to create branch: {res.stderr}")
logger.info("Created new branch: openhands")
logger.info(f"Created new branch: {AGENT_BRANCH_NAME}")

# Install commit0
# Try uv first, fall back to pip if uv is not available
install_cmd = f"cd /workspace/{workspace_dir_name} && (uv pip install commit0 || pip install commit0)"
res = workspace.execute_command(install_cmd, timeout=600)
res = workspace.execute_command(install_cmd, timeout=DEFAULT_COMMAND_TIMEOUT)
if res.exit_code != 0:
raise RuntimeError(f"Failed to install commit0: {res.stderr}")
logger.info("Installed commit0")

# Install pytest and required plugins for test reporting
plugin_install_cmd = f"cd /workspace/{workspace_dir_name} && (uv pip install pytest pytest-json-report pytest-cov || pip install pytest pytest-json-report pytest-cov)"
res = workspace.execute_command(plugin_install_cmd, timeout=600)
res = workspace.execute_command(
plugin_install_cmd, timeout=DEFAULT_COMMAND_TIMEOUT
)
if res.exit_code != 0:
raise RuntimeError(f"Failed to install pytest and plugins: {res.stderr}")
logger.info("Installed pytest and required plugins")
Expand Down Expand Up @@ -323,20 +342,24 @@ def evaluate_instance(
metadata=self.metadata,
)
conversation.send_message(instruction)
run_timeout = int(os.getenv("CONVERSATION_TIMEOUT", "3600"))
run_timeout = int(
os.getenv("CONVERSATION_TIMEOUT", str(DEFAULT_CONVERSATION_TIMEOUT))
)
conversation.run(timeout=run_timeout)

history = list(conversation.state.events)

# Complete runtime: git add, commit, diff, run tests
workspace.execute_command(f"cd {repo_path} && git add .", timeout=600)
workspace.execute_command(
f"cd {repo_path} && git add .", timeout=DEFAULT_COMMAND_TIMEOUT
)
# Use --no-verify to bypass pre-commit hooks (e.g., husky) that can fail
workspace.execute_command(
f"cd {repo_path} && "
'git config --global user.email "evaluation@openhands.dev" && '
'git config --global user.name "OpenHands Evaluation" && '
'git commit --no-verify -m "openhands edits"',
timeout=600,
f'git commit --no-verify -m "{AGENT_BRANCH_NAME} edits"',
timeout=DEFAULT_COMMAND_TIMEOUT,
)

# Get git patch
Expand All @@ -345,7 +368,7 @@ def evaluate_instance(
for retry in range(5):
patch_result = workspace.execute_command(
f"cd {repo_path} && git diff {base_commit} HEAD -- . ':(exclude)spec.pdf.bz2'",
timeout=600 + 100 * retry,
timeout=DEFAULT_COMMAND_TIMEOUT + 100 * retry,
)
if patch_result.exit_code == 0:
git_patch = patch_result.stdout.strip()
Expand All @@ -363,7 +386,9 @@ def evaluate_instance(
test_cmd = "python -m pytest"
full_test_cmd = f"cd {repo_path} && {test_cmd} --json-report --json-report-file=report.json --continue-on-collection-errors {test_dir} > test_output.txt 2>&1"
logger.info(f"Running test command: {full_test_cmd}")
test_result = workspace.execute_command(full_test_cmd, timeout=600)
test_result = workspace.execute_command(
full_test_cmd, timeout=DEFAULT_COMMAND_TIMEOUT
)
logger.info(f"Test command exit code: {test_result.exit_code}")
if test_result.exit_code != 0:
logger.warning(f"Test command failed with stderr: {test_result.stderr}")
Expand All @@ -372,7 +397,7 @@ def evaluate_instance(
# Read test output
test_output_result = workspace.execute_command(
f"cd {repo_path} && cat test_output.txt",
timeout=600,
timeout=DEFAULT_COMMAND_TIMEOUT,
)
test_output = (
test_output_result.stdout.strip()
Expand All @@ -388,7 +413,7 @@ def evaluate_instance(
repo_name_normalized = repo_name.replace(".", "-")
test_ids_result = workspace.execute_command(
f"cd {repo_path} && commit0 get-tests {repo_name_normalized}",
timeout=600,
timeout=DEFAULT_COMMAND_TIMEOUT,
)
test_ids = (
test_ids_result.stdout.strip().split("\n")
Expand All @@ -405,7 +430,7 @@ def evaluate_instance(
# Read test report
report_result = workspace.execute_command(
f"cd {repo_path} && cat report.json",
timeout=600,
timeout=DEFAULT_COMMAND_TIMEOUT,
)

# Debug logging for report
Expand Down Expand Up @@ -593,11 +618,11 @@ def main() -> None:
parser.add_argument(
"--repo-split",
type=str,
default="lite",
default=DEFAULT_REPO_SPLIT,
help="all, lite, or each repo name",
)
# Override the default dataset for commit0
parser.set_defaults(dataset="wentingzhao/commit0_combined")
parser.set_defaults(dataset=DEFAULT_DATASET)
args = parser.parse_args()

# Validate max_attempts
Expand Down