refactor(commit0): consolidate hyperparameters in constants.py #370

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open

simonrosenberg wants to merge 3 commits into main from openhands/commit0-constants-refactor

benchmarks/commit0/build_images.py

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -13,6 +13,13 @@
  
    from commit0.harness.constants import SPLIT

    from benchmarks.commit0.constants import (

        CUSTOM_TAG_PREFIX,

        DEFAULT_DATASET,

        DEFAULT_DOCKER_IMAGE_PREFIX,

        DEFAULT_IMAGE_TAG,

        DEFAULT_REPO_SPLIT,

    )

    from benchmarks.utils.build_utils import (

        build_all_images,

        default_build_output_dir,

    @@ -22,7 +29,6 @@
  
    logger = get_logger(__name__)

    DEFAULT_DOCKER_IMAGE_PREFIX = "docker.io/wentingzhao/"

    def get_base_docker_image(

    @@ -33,14 +39,14 @@ def get_base_docker_image(
  
        prefix = docker_image_prefix or os.getenv(

            "EVAL_DOCKER_IMAGE_PREFIX", DEFAULT_DOCKER_IMAGE_PREFIX

        )

        return (prefix.rstrip("/") + "/" + repo_name).lower() + ":v0"

        return (prefix.rstrip("/") + "/" + repo_name).lower() + f":{DEFAULT_IMAGE_TAG}"

    def extract_custom_tag(base_image: str) -> str:
        """Extract the Commit0 custom tag from a base image name.

        The tag is ``CUSTOM_TAG_PREFIX`` followed by the lower-cased repository
        name, i.e. the last ``/``-separated component of ``base_image`` with any
        ``:tag`` suffix removed.

        Args:
            base_image: Image reference such as ``"docker.io/org/Repo:v0"``.

        Returns:
            The custom tag, e.g. ``f"{CUSTOM_TAG_PREFIX}repo"``.
        """
        # Split on the last "/" first so a registry port ("host:5000/repo")
        # is never mistaken for the image tag separator.
        repo_tag = base_image.rsplit("/", 1)[-1]
        # Drop the ":tag" suffix (if present) and normalise case.
        repo_name = repo_tag.split(":", 1)[0].lower()
        # Single return: the stale hard-coded "commit0-" return that preceded
        # this line made the constant-based return unreachable.
        return f"{CUSTOM_TAG_PREFIX}{repo_name}"

    def _load_selected_instances(selected_instances_file: str) -> list[str]:

    @@ -90,7 +96,7 @@ def main(argv: list[str]) -> int:
  
        parser.add_argument(

            "--repo-split",

            type=str,

            default="lite",

            default=DEFAULT_REPO_SPLIT,

            help="Commit0 repo split (lite, all, or repo name)",

        )

        parser.add_argument(

    @@ -99,7 +105,7 @@ def main(argv: list[str]) -> int:
  
            default="",

            help="Override base image prefix (default: env EVAL_DOCKER_IMAGE_PREFIX)",

        )

        parser.set_defaults(dataset="wentingzhao/commit0_combined")

        parser.set_defaults(dataset=DEFAULT_DATASET)

        args = parser.parse_args(argv)

        docker_image_prefix = args.docker_image_prefix or None

benchmarks/commit0/constants.py

-Original file line number
+Diff line change
@@ -0,0 +1,35 @@
+    """
+    Commit0 Benchmark Constants
+    This module serves as the single source of truth for all hyperparameters
+    and constant values used in the Commit0 benchmark evaluation workflow.
+    """
+    # Dataset configuration
+    DEFAULT_DATASET = "wentingzhao/commit0_combined"  # default for the CLI `dataset` value and for `dataset_name`
+    DEFAULT_DATASET_SPLIT = "test"  # passed as `split=` to load_dataset
+    DEFAULT_REPO_SPLIT = "lite"  # default for --repo-split ("lite", "all", or a repo name)
+    # Docker image configuration
+    DEFAULT_DOCKER_IMAGE_PREFIX = "docker.io/wentingzhao/"  # fallback when no prefix argument and no EVAL_DOCKER_IMAGE_PREFIX env var is given
+    DEFAULT_IMAGE_TAG = "v0"  # appended as ":<tag>" to the base image name
+    CUSTOM_TAG_PREFIX = "commit0-"  # prepended to the lower-cased repo name by extract_custom_tag
+    # Build configuration
+    BUILD_TARGET = "source-minimal"  # callers choose target_type "source" when this contains "source", else "binary"
+    # Git configuration
+    GIT_BRANCH_NAME = "commit0_combined"  # branch checked out via `git clone -b`
+    AGENT_BRANCH_NAME = "openhands"  # branch created via `git checkout -b`; NOTE(review): also interpolated into the commit message
+    # Model configuration
+    DEFAULT_MODEL_NAME = "openhands"  # default for --model-name / the report's "model_name_or_path"
+    # Runtime configuration (timeouts are passed as `timeout=` values; presumably seconds -- TODO confirm)
+    DEFAULT_RUNTIME_API_URL = "https://runtime.eval.all-hands.dev"  # fallback for the RUNTIME_API_URL env var
+    DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT = 600  # fallback for the REMOTE_RUNTIME_STARTUP_TIMEOUT env var
+    DEFAULT_CONVERSATION_TIMEOUT = 3600  # fallback for the CONVERSATION_TIMEOUT env var
+    DEFAULT_COMMAND_TIMEOUT = 600  # timeout for workspace.execute_command calls (git diff retries add 100 per attempt)
+    # Evaluation configuration
+    TOTAL_INSTANCES = 16  # written verbatim to the report's "total_instances" field

benchmarks/commit0/eval_infer.py

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -15,6 +15,7 @@
  
    import sys

    from pathlib import Path

    from benchmarks.commit0.constants import DEFAULT_MODEL_NAME, TOTAL_INSTANCES

    from benchmarks.utils.laminar import LaminarService

    from benchmarks.utils.report_costs import generate_cost_report

    @@ -27,7 +28,7 @@
  
    def process_commit0_results(

        input_file: str, output_file: str, model_name: str = "openhands"

        input_file: str, output_file: str, model_name: str = DEFAULT_MODEL_NAME

    ) -> None:

        """

        Process Commit0 output.jsonl and generate evaluation report.

    @@ -123,7 +124,7 @@ def process_commit0_results(
  
        # Generate report

        report = {

            "model_name_or_path": model_name,

            "total_instances": 16,  # Fixed as per requirement

            "total_instances": TOTAL_INSTANCES,

            "submitted_instances": len(completed_ids),

            "completed_instances": len(completed_ids),

            "resolved_instances": len(resolved_ids),

    @@ -174,8 +175,8 @@ def main() -> None:
  
        parser.add_argument(

            "--model-name",

            default="openhands",

            help="Model name to use in the model_name_or_path field (default: openhands)",

            default=DEFAULT_MODEL_NAME,

            help=f"Model name to use in the model_name_or_path field (default: {DEFAULT_MODEL_NAME})",

        )

        args = parser.parse_args()

benchmarks/commit0/run_infer.py

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -12,6 +12,18 @@
  
        extract_custom_tag,

        get_base_docker_image,

    )

    from benchmarks.commit0.constants import (

        AGENT_BRANCH_NAME,

        BUILD_TARGET,

        DEFAULT_COMMAND_TIMEOUT,

        DEFAULT_CONVERSATION_TIMEOUT,

        DEFAULT_DATASET,

        DEFAULT_DATASET_SPLIT,

        DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT,

        DEFAULT_REPO_SPLIT,

        DEFAULT_RUNTIME_API_URL,

        GIT_BRANCH_NAME,

    )

    from benchmarks.utils.args_parser import get_parser

    from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE

    from benchmarks.utils.conversation import build_event_persistence_callback

    @@ -110,9 +122,9 @@ def __init__(
  
            self,

            metadata: EvalMetadata,

            num_workers: int = 1,

            repo_split: str = "lite",

            dataset_name: str = "wentingzhao/commit0_combined",

            dataset_split: str = "test",

            repo_split: str = DEFAULT_REPO_SPLIT,

            dataset_name: str = DEFAULT_DATASET,

            dataset_split: str = DEFAULT_DATASET_SPLIT,

        ):

            super().__init__(metadata=metadata, num_workers=num_workers)

            # Store additional parameters in metadata.details for access in methods

    @@ -130,9 +142,9 @@ def prepare_instances(self) -> List[EvalInstance]:
  
            logger.info("Setting up Commit0 evaluation data")

            details = self.metadata.details or {}

            dataset_name = details.get("dataset_name", "wentingzhao/commit0_combined")

            dataset_split = details.get("dataset_split", "test")

            repo_split = details.get("repo_split", "lite")

            dataset_name = details.get("dataset_name", DEFAULT_DATASET)

            dataset_split = details.get("dataset_split", DEFAULT_DATASET_SPLIT)

            repo_split = details.get("repo_split", DEFAULT_REPO_SPLIT)

            dataset = load_dataset(dataset_name, split=dataset_split)

            df = commit0_setup(dataset, repo_split)

    @@ -180,7 +192,7 @@ def prepare_workspace(
  
            """

            repo_name = instance.data["repo"].split("/")[1]

            base_docker_image = get_base_docker_image(repo_name)

            build_target = "source-minimal"

            build_target = BUILD_TARGET

            logger.info(f"Using base docker image: {base_docker_image}")

            if self.metadata.workspace_type == "docker":

    @@ -218,11 +230,14 @@ def prepare_workspace(
  
                    f"Using remote workspace with image {agent_server_image} "

                    f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})"

                )

                startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600"))

                startup_timeout = float(

                    os.getenv(

                        "REMOTE_RUNTIME_STARTUP_TIMEOUT",

                        str(DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT),

                    )

                )

                workspace = APIRemoteWorkspace(

                    runtime_api_url=os.getenv(

                        "RUNTIME_API_URL", "https://runtime.eval.all-hands.dev"

                    ),

                    runtime_api_url=os.getenv("RUNTIME_API_URL", DEFAULT_RUNTIME_API_URL),

                    runtime_api_key=runtime_api_key,

                    server_image=agent_server_image,

                    target_type="source" if "source" in build_target else "binary",

    @@ -238,30 +253,34 @@ def prepare_workspace(
  
            # Clone the repository to the specific directory

            workspace_dir_name = instance.data["repo"].split("/")[1]

            clone_cmd = f"cd /workspace/ && git clone -b commit0_combined https://github.com/{instance.data['repo']}.git {workspace_dir_name}"

            res = workspace.execute_command(clone_cmd, timeout=600)

            clone_cmd = f"cd /workspace/ && git clone -b {GIT_BRANCH_NAME} https://github.com/{instance.data['repo']}.git {workspace_dir_name}"

            res = workspace.execute_command(clone_cmd, timeout=DEFAULT_COMMAND_TIMEOUT)

            if res.exit_code != 0:

                raise RuntimeError(f"Failed to clone repo: {res.stderr}")

            logger.info(f"Cloned repository: {instance.data['repo']}")

            # Create new branch

            branch_cmd = f"cd /workspace/{workspace_dir_name} && git checkout -b openhands"

            res = workspace.execute_command(branch_cmd, timeout=600)

            branch_cmd = (

                f"cd /workspace/{workspace_dir_name} && git checkout -b {AGENT_BRANCH_NAME}"

            )

            res = workspace.execute_command(branch_cmd, timeout=DEFAULT_COMMAND_TIMEOUT)

            if res.exit_code != 0:

                raise RuntimeError(f"Failed to create branch: {res.stderr}")

            logger.info("Created new branch: openhands")

            logger.info(f"Created new branch: {AGENT_BRANCH_NAME}")

            # Install commit0

            # Try uv first, fall back to pip if uv is not available

            install_cmd = f"cd /workspace/{workspace_dir_name} && (uv pip install commit0 || pip install commit0)"

            res = workspace.execute_command(install_cmd, timeout=600)

            res = workspace.execute_command(install_cmd, timeout=DEFAULT_COMMAND_TIMEOUT)

            if res.exit_code != 0:

                raise RuntimeError(f"Failed to install commit0: {res.stderr}")

            logger.info("Installed commit0")

            # Install pytest and required plugins for test reporting

            plugin_install_cmd = f"cd /workspace/{workspace_dir_name} && (uv pip install pytest pytest-json-report pytest-cov || pip install pytest pytest-json-report pytest-cov)"

            res = workspace.execute_command(plugin_install_cmd, timeout=600)

            res = workspace.execute_command(

                plugin_install_cmd, timeout=DEFAULT_COMMAND_TIMEOUT

            )

            if res.exit_code != 0:

                raise RuntimeError(f"Failed to install pytest and plugins: {res.stderr}")

            logger.info("Installed pytest and required plugins")

    @@ -323,20 +342,24 @@ def evaluate_instance(
  
                metadata=self.metadata,

            )

            conversation.send_message(instruction)

            run_timeout = int(os.getenv("CONVERSATION_TIMEOUT", "3600"))

            run_timeout = int(

                os.getenv("CONVERSATION_TIMEOUT", str(DEFAULT_CONVERSATION_TIMEOUT))

            )

            conversation.run(timeout=run_timeout)

            history = list(conversation.state.events)

            # Complete runtime: git add, commit, diff, run tests

            workspace.execute_command(f"cd {repo_path} && git add .", timeout=600)

            workspace.execute_command(

                f"cd {repo_path} && git add .", timeout=DEFAULT_COMMAND_TIMEOUT

            )

            # Use --no-verify to bypass pre-commit hooks (e.g., husky) that can fail

            workspace.execute_command(

                f"cd {repo_path} && "

                'git config --global user.email "evaluation@openhands.dev" && '

                'git config --global user.name "OpenHands Evaluation" && '

                'git commit --no-verify -m "openhands edits"',

                timeout=600,

                f'git commit --no-verify -m "{AGENT_BRANCH_NAME} edits"',

                timeout=DEFAULT_COMMAND_TIMEOUT,

            )

            # Get git patch

    @@ -345,7 +368,7 @@ def evaluate_instance(
  
            for retry in range(5):

                patch_result = workspace.execute_command(

                    f"cd {repo_path} && git diff {base_commit} HEAD -- . ':(exclude)spec.pdf.bz2'",

                    timeout=600 + 100 * retry,

                    timeout=DEFAULT_COMMAND_TIMEOUT + 100 * retry,

                )

                if patch_result.exit_code == 0:

                    git_patch = patch_result.stdout.strip()

    @@ -363,7 +386,9 @@ def evaluate_instance(
  
                test_cmd = "python -m pytest"

            full_test_cmd = f"cd {repo_path} && {test_cmd} --json-report --json-report-file=report.json --continue-on-collection-errors {test_dir} > test_output.txt 2>&1"

            logger.info(f"Running test command: {full_test_cmd}")

            test_result = workspace.execute_command(full_test_cmd, timeout=600)

            test_result = workspace.execute_command(

                full_test_cmd, timeout=DEFAULT_COMMAND_TIMEOUT

            )

            logger.info(f"Test command exit code: {test_result.exit_code}")

            if test_result.exit_code != 0:

                logger.warning(f"Test command failed with stderr: {test_result.stderr}")

    @@ -372,7 +397,7 @@ def evaluate_instance(
  
            # Read test output

            test_output_result = workspace.execute_command(

                f"cd {repo_path} && cat test_output.txt",

                timeout=600,

                timeout=DEFAULT_COMMAND_TIMEOUT,

            )

            test_output = (

                test_output_result.stdout.strip()

    @@ -388,7 +413,7 @@ def evaluate_instance(
  
            repo_name_normalized = repo_name.replace(".", "-")

            test_ids_result = workspace.execute_command(

                f"cd {repo_path} && commit0 get-tests {repo_name_normalized}",

                timeout=600,

                timeout=DEFAULT_COMMAND_TIMEOUT,

            )

            test_ids = (

                test_ids_result.stdout.strip().split("\n")

    @@ -405,7 +430,7 @@ def evaluate_instance(
  
            # Read test report

            report_result = workspace.execute_command(

                f"cd {repo_path} && cat report.json",

                timeout=600,

                timeout=DEFAULT_COMMAND_TIMEOUT,

            )

            # Debug logging for report

    @@ -593,11 +618,11 @@ def main() -> None:
  
        parser.add_argument(

            "--repo-split",

            type=str,

            default="lite",

            default=DEFAULT_REPO_SPLIT,

            help="all, lite, or each repo name",

        )

        # Override the default dataset for commit0

        parser.set_defaults(dataset="wentingzhao/commit0_combined")

        parser.set_defaults(dataset=DEFAULT_DATASET)

        args = parser.parse_args()

        # Validate max_attempts

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

refactor(commit0): consolidate hyperparameters in constants.py #370

Uh oh!

Diff view

Diff view

There are no files selected for viewing

refactor(commit0): consolidate hyperparameters in constants.py #370

Are you sure you want to change the base?

Uh oh!

refactor(commit0): consolidate hyperparameters in constants.py #370

Uh oh!

Uh oh!

Diff view

Diff view

There are no files selected for viewing