From 92e98f65239677a2bd241abae9a15749eca4fa66 Mon Sep 17 00:00:00 2001 From: openhands Date: Tue, 25 Feb 2025 04:35:27 +0000 Subject: [PATCH 01/22] feat: Enable llm_completions logging in aider_bench - Added update_llm_config_for_completions_logging to imports - Modified get_config to accept instance parameter - Updated llm_config to enable completions logging - Updated process_instance to pass instance to get_config This change makes aider_bench save llm_completions in the same way as swe_bench, with completions being saved in {eval_output_dir}/llm_completions/{instance_id}/ --- evaluation/benchmarks/aider_bench/run_infer.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/evaluation/benchmarks/aider_bench/run_infer.py b/evaluation/benchmarks/aider_bench/run_infer.py index 8045f948d3f9..1ee68c21c2f0 100644 --- a/evaluation/benchmarks/aider_bench/run_infer.py +++ b/evaluation/benchmarks/aider_bench/run_infer.py @@ -20,6 +20,7 @@ prepare_dataset, reset_logger_for_multiprocessing, run_evaluation, + update_llm_config_for_completions_logging, ) from openhands.controller.state.state import State from openhands.core.config import ( @@ -45,6 +46,7 @@ def get_config( + instance: pd.Series, metadata: EvalMetadata, ) -> AppConfig: config = AppConfig( @@ -67,7 +69,13 @@ def get_config( workspace_base=None, workspace_mount_path=None, ) - config.set_llm_config(metadata.llm_config) + # Update llm_config to enable completions logging + llm_config = update_llm_config_for_completions_logging( + metadata.llm_config, + metadata.eval_output_dir, + str(instance.instance_id) + ) + config.set_llm_config(llm_config) agent_config = config.get_agent_config(metadata.agent_class) agent_config.enable_prompt_extensions = False @@ -170,7 +178,7 @@ def process_instance( metadata: EvalMetadata, reset_logger: bool = True, ) -> EvalOutput: - config = get_config(metadata) + config = get_config(instance, metadata) # Setup the logger properly, so you can run multi-processing to parallelize the evaluation if reset_logger: From bc8f20d35a6639ee1789832b3d1c4fe830caef3c Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 26 Feb 2025 06:22:02 +0000 Subject: [PATCH 02/22] Add polyglot benchmark implementation --- .../benchmarks/polyglot_benchmark/Dockerfile | 63 +++ .../benchmarks/polyglot_benchmark/README.md | 90 ++++ .../polyglot_benchmark/helper/__init__.py | 0 .../polyglot_benchmark/helper/prompts.py | 28 + .../polyglot_benchmark/run_infer.py | 487 ++++++++++++++++++ .../scripts/build_docker.sh | 12 + .../polyglot_benchmark/scripts/run_infer.sh | 35 ++ .../scripts/summarize_results.py | 84 +++ .../polyglot_benchmark/test_load_dataset.py | 40 ++ .../benchmarks/polyglot_benchmark/test_run.py | 73 +++ 10 files changed, 912 insertions(+) create mode 100644 evaluation/benchmarks/polyglot_benchmark/Dockerfile create mode 100644 evaluation/benchmarks/polyglot_benchmark/README.md create mode 100644 evaluation/benchmarks/polyglot_benchmark/helper/__init__.py create mode 100644 evaluation/benchmarks/polyglot_benchmark/helper/prompts.py create mode 100644 evaluation/benchmarks/polyglot_benchmark/run_infer.py create mode 100755 evaluation/benchmarks/polyglot_benchmark/scripts/build_docker.sh create mode 100755 evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh create mode 100755 evaluation/benchmarks/polyglot_benchmark/scripts/summarize_results.py create mode 100755 evaluation/benchmarks/polyglot_benchmark/test_load_dataset.py create mode 100755 evaluation/benchmarks/polyglot_benchmark/test_run.py 
diff --git a/evaluation/benchmarks/polyglot_benchmark/Dockerfile b/evaluation/benchmarks/polyglot_benchmark/Dockerfile new file mode 100644 index 000000000000..ed789e6d8000 --- /dev/null +++ b/evaluation/benchmarks/polyglot_benchmark/Dockerfile @@ -0,0 +1,63 @@ +FROM ubuntu:22.04 + +# Avoid prompts from apt +ENV DEBIAN_FRONTEND=noninteractive + +# Install common dependencies +RUN apt-get update && apt-get install -y \ + build-essential \ + curl \ + git \ + python3 \ + python3-pip \ + python3-dev \ + python3-venv \ + wget \ + software-properties-common \ + apt-transport-https \ + ca-certificates \ + gnupg \ + lsb-release \ + libboost-all-dev \ + cmake \ + && rm -rf /var/lib/apt/lists/* + +# Install Python packages +RUN pip3 install --no-cache-dir pytest pytest-timeout + +# Install Node.js and npm +RUN curl -fsSL https://deb.nodesource.com/setup_18.x | bash - \ + && apt-get install -y nodejs \ + && rm -rf /var/lib/apt/lists/* + +# Install Rust +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y +ENV PATH="/root/.cargo/bin:${PATH}" + +# Install Go +RUN wget https://go.dev/dl/go1.20.5.linux-amd64.tar.gz \ + && tar -C /usr/local -xzf go1.20.5.linux-amd64.tar.gz \ + && rm go1.20.5.linux-amd64.tar.gz +ENV PATH="/usr/local/go/bin:${PATH}" + +# Install Java +RUN apt-get update && apt-get install -y openjdk-17-jdk \ + && rm -rf /var/lib/apt/lists/* +ENV JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64 + +# Install Gradle +RUN wget https://services.gradle.org/distributions/gradle-7.6-bin.zip \ + && mkdir /opt/gradle \ + && unzip -d /opt/gradle gradle-7.6-bin.zip \ + && rm gradle-7.6-bin.zip +ENV PATH="/opt/gradle/gradle-7.6/bin:${PATH}" + +# Create workspace directory +RUN mkdir -p /workspace +WORKDIR /workspace + +# Set environment variables +ENV PYTHONUNBUFFERED=1 +ENV PYTHONIOENCODING=UTF-8 + +CMD ["/bin/bash"] \ No newline at end of file diff --git a/evaluation/benchmarks/polyglot_benchmark/README.md b/evaluation/benchmarks/polyglot_benchmark/README.md new file mode 100644 index 000000000000..d92251acb9f7 --- /dev/null +++ b/evaluation/benchmarks/polyglot_benchmark/README.md @@ -0,0 +1,90 @@ +# Polyglot Benchmark + +This benchmark is based on the [Aider Polyglot Benchmark](https://github.com/Aider-AI/polyglot-benchmark), which evaluates how effectively an agent can translate natural language coding requests into executable code that passes unit tests across multiple programming languages. + +## Features + +- Supports multiple programming languages (Python, JavaScript, Rust, Go, C++, Java) +- End-to-end evaluation of code editing capabilities +- Automated test execution and validation +- Parallel evaluation with multiple workers +- Detailed metrics and logging + +## Setup + +1. Clone the polyglot-benchmark repository: + ```bash + git clone https://github.com/Aider-AI/polyglot-benchmark.git /workspace/polyglot-benchmark + ``` + +2. Build the Docker image for the benchmark: + ```bash + ./evaluation/benchmarks/polyglot_benchmark/scripts/build_docker.sh + ``` + +## Usage + +1. Make sure you have the required dependencies installed: + ```bash + pip install -e .[dev] + ``` + +2. 
Run the benchmark: + ```bash + ./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh + ``` + +### Command Line Arguments + +- `model_config`: The LLM configuration to use (e.g., `eval_gpt4_1106_preview`) +- `git-version`: Git commit or note to append to output directory (e.g., `HEAD`) +- `agent`: Agent class name (e.g., `CodeActAgent`) +- `eval_limit`: Limit the number of examples to evaluate (default: `-1` for all) +- `eval-num-workers`: Number of parallel workers (default: `1`) +- `eval_ids`: Comma-separated list of specific test IDs to run (e.g., `"1,3,10"`) +- `eval_languages`: Comma-separated list of languages to test (e.g., `"python,javascript,rust"`) + +### Environment Variables + +You can also set the following environment variables: + +```bash +export POLYGLOT_BENCHMARK_PATH="/path/to/polyglot-benchmark" # Path to the polyglot-benchmark repository +export USE_UNIT_TESTS="true" # Whether to run unit tests (default: true) +``` + +### Example + +```bash +# Run evaluation on CodeActAgent for all Python instances with 2 workers +export POLYGLOT_BENCHMARK_PATH="/workspace/polyglot-benchmark" +./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh eval_gpt4_1106_preview HEAD CodeActAgent -1 2 "" "python" +``` + +## Summarize Results + +After running the benchmark, you can summarize the results: + +```bash +poetry run python ./evaluation/benchmarks/polyglot_benchmark/scripts/summarize_results.py +``` + +Example: + +```bash +poetry run python ./evaluation/benchmarks/polyglot_benchmark/scripts/summarize_results.py evaluation/evaluation_outputs/outputs/PolyglotBenchmark/CodeActAgent/gpt-4-1106-preview_maxiter_30/output.jsonl +``` + +## Supported Languages + +The benchmark supports the following languages and test frameworks: +- Python: pytest +- JavaScript: npm test +- Rust: cargo test +- Go: go test +- C++: make test +- Java: Gradle test + +## Docker Support + +The benchmark runs in a Docker container to safely execute untrusted code. The container image includes all necessary language toolchains and test frameworks. \ No newline at end of file diff --git a/evaluation/benchmarks/polyglot_benchmark/helper/__init__.py b/evaluation/benchmarks/polyglot_benchmark/helper/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/evaluation/benchmarks/polyglot_benchmark/helper/prompts.py b/evaluation/benchmarks/polyglot_benchmark/helper/prompts.py new file mode 100644 index 000000000000..61bc0e54cb11 --- /dev/null +++ b/evaluation/benchmarks/polyglot_benchmark/helper/prompts.py @@ -0,0 +1,28 @@ +"""Prompts used in the polyglot benchmark.""" + +INSTRUCTIONS_ADDENDUM = """ +I've provided the following files that need to be modified: +{file_list} + +Please help me implement the necessary changes to meet the requirements. +You should ONLY modify these files, and NOT create any new files. +""" + +TEST_FAILURES = """ +The tests failed. Please fix the issues and try again. +Remember to only modify the following files: +{file_list} +""" + +# Dictionary mapping agent class names to their specific instruction suffixes +INST_SUFFIXES = { + 'CodeActAgent': ( + 'REMEMBER: All edits must be made directly in the files. 
Do NOT send' + ' the edited file as output to the user.\n' + ) +} + +# Dictionary mapping agent class names to their fake response functions +FAKE_RESPONSES = { + 'CodeActAgent': lambda _: None, # Will be replaced with codeact_user_response from shared.py +} \ No newline at end of file diff --git a/evaluation/benchmarks/polyglot_benchmark/run_infer.py b/evaluation/benchmarks/polyglot_benchmark/run_infer.py new file mode 100644 index 000000000000..45a9ee4f91ac --- /dev/null +++ b/evaluation/benchmarks/polyglot_benchmark/run_infer.py @@ -0,0 +1,487 @@ +import asyncio +import copy +import json +import os +import shutil +import subprocess +import tempfile +from pathlib import Path +from typing import Any, Dict, List, Optional + +import pandas as pd + +from evaluation.benchmarks.polyglot_benchmark.helper.prompts import ( + INSTRUCTIONS_ADDENDUM, + INST_SUFFIXES, + TEST_FAILURES, + FAKE_RESPONSES, +) +from evaluation.utils.shared import ( + EvalMetadata, + EvalOutput, + compatibility_for_eval_history_pairs, + make_metadata, + prepare_dataset, + reset_logger_for_multiprocessing, + run_evaluation, + update_llm_config_for_completions_logging, + codeact_user_response, +) +from openhands.controller.state.state import State +from openhands.core.config import ( + AppConfig, + SandboxConfig, + get_llm_config_arg, + load_from_toml, + parse_arguments, +) +from openhands.core.logger import openhands_logger as logger +from openhands.core.main import create_runtime, run_controller +from openhands.events.action import CmdRunAction, MessageAction +from openhands.events.observation import CmdOutputObservation +from openhands.runtime.base import Runtime +from openhands.utils.async_utils import call_async_from_sync + +# Configure visibility of unit tests to the Agent. +USE_UNIT_TESTS = os.environ.get('USE_UNIT_TESTS', 'true').lower() == 'true' + +# Map of file extensions to test commands +TEST_COMMANDS = { + ".py": ["python3", "-m", "pytest"], + ".rs": ["cargo", "test", "--", "--include-ignored"], + ".go": ["go", "test", "./..."], + ".js": ["npm", "test"], + ".cpp": ["make", "test"], + ".java": ["./gradlew", "test"], +} + +# Update fake responses with the actual function +FAKE_RESPONSES['CodeActAgent'] = codeact_user_response + +def get_config( + instance: pd.Series, + metadata: EvalMetadata, +) -> AppConfig: + config = AppConfig( + default_agent=metadata.agent_class, + run_as_openhands=False, + runtime=os.environ.get('RUNTIME', 'docker'), + max_iterations=metadata.max_iterations, + sandbox=SandboxConfig( + base_container_image='ghcr.io/opendevin/eval-polyglot:v1.0.0', # TODO: Create this image + enable_auto_lint=True, + use_host_network=False, + timeout=300, # Longer timeout for compilation + api_key=os.environ.get('ALLHANDS_API_KEY', None), + remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'), + keep_runtime_alive=False, + remote_runtime_init_timeout=1800, + remote_runtime_enable_retries=True, + ), + # do not mount workspace + workspace_base=None, + workspace_mount_path=None, + ) + + # Update llm_config to enable completions logging + llm_config = update_llm_config_for_completions_logging( + metadata.llm_config, + metadata.eval_output_dir, + str(instance.instance_id) + ) + # Enable logging of LLM completions + llm_config.log_completions = True + config.set_llm_config(llm_config) + + agent_config = config.get_agent_config(metadata.agent_class) + agent_config.enable_prompt_extensions = False + + # copy 'draft_editor' config if exists + config_copy = copy.deepcopy(config) + 
load_from_toml(config_copy) + if 'draft_editor' in config_copy.llms: + config.set_llm_config(config_copy.llms['draft_editor'], 'draft_editor') + + return config + +def initialize_runtime( + runtime: Runtime, + instance: pd.Series, +): + """Initialize the runtime for the agent.""" + logger.info('-' * 30) + logger.info('BEGIN Runtime Initialization Fn') + logger.info('-' * 30) + obs: CmdOutputObservation + + # Create workspace + action = CmdRunAction(command='mkdir -p /workspace') + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + assert obs.exit_code == 0 + + action = CmdRunAction(command='cd /workspace') + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + assert obs.exit_code == 0 + + # Copy files to workspace + with tempfile.TemporaryDirectory() as tmpdir: + # Copy solution files + for file_path in instance.solution_files: + file_path = Path(file_path) + temp_file = Path(tmpdir) / file_path.name + with open(temp_file, 'w') as f: + f.write(instance.solution_content[file_path.name]) + runtime.copy_to( + str(temp_file), + '/workspace', + ) + + # Copy test files if enabled + if USE_UNIT_TESTS: + for file_path in instance.test_files: + file_path = Path(file_path) + temp_file = Path(tmpdir) / file_path.name + with open(temp_file, 'w') as f: + f.write(instance.test_content[file_path.name]) + runtime.copy_to( + str(temp_file), + '/workspace', + ) + + logger.info('-' * 30) + logger.info('END Runtime Initialization Fn') + logger.info('-' * 30) + +def complete_runtime( + runtime: Runtime, + instance: pd.Series, +) -> Dict[str, Any]: + """Complete the runtime for the agent.""" + logger.info('-' * 30) + logger.info('BEGIN Runtime Completion Fn') + logger.info('-' * 30) + + # Run tests + test_output = "" + exit_code = 1 + + if USE_UNIT_TESTS: + # Get unique file extensions from test files + extensions = {Path(f).suffix for f in instance.test_files} + + # Find matching test command + command = None + for ext in extensions: + if ext in TEST_COMMANDS: + command = TEST_COMMANDS[ext] + break + + if command: + try: + result = subprocess.run( + command, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + timeout=180, # 3 minutes timeout + cwd="/workspace", + encoding="utf-8", + errors="replace", + ) + exit_code = result.returncode + test_output = result.stdout + + # Clean up output + test_output = test_output.replace("/workspace", "workspace") + + # Log test output to history file + with open("/workspace/.aider.chat.history.md", "a") as fh: + fh.write(f"```\n{test_output}\n```") + + except subprocess.TimeoutExpired: + test_output = "Tests timed out!" 
+ exit_code = 1 + + logger.info('-' * 30) + logger.info('END Runtime Completion Fn') + logger.info('-' * 30) + + runtime.close() + + return { + 'test_output': test_output, + 'exit_code': exit_code, + } + +def process_instance( + instance: pd.Series, + metadata: EvalMetadata, + reset_logger: bool = True, +) -> EvalOutput: + config = get_config(instance, metadata) + + # Setup the logger properly, so you can run multi-processing to parallelize the evaluation + if reset_logger: + log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs') + reset_logger_for_multiprocessing(logger, str(instance.instance_id), log_dir) + else: + logger.info( + f'\nStarting evaluation for instance {str(instance.instance_id)}.\n' + ) + + # ============================================= + # build instruction + # ============================================= + + # Prepare instruction + logger.info(instance) + instruction = instance.instruction + + # Add file list to instruction + file_list = " ".join(instance.solution_files) + instruction += INSTRUCTIONS_ADDENDUM.format(file_list=file_list) + + if USE_UNIT_TESTS: + test_files = " ".join(instance.test_files) + logger.info(f'\nTest files: {test_files}\n') + instruction += ( + f'Use the appropriate test command to run the tests and verify your solution. ' + 'DO NOT EDIT the test files.\n\n' + ) + + instruction += ( + 'IMPORTANT: You should ONLY interact with the environment provided ' + 'to you AND NEVER ASK FOR HUMAN HELP.\n' + ) + + # Add agent-specific instruction suffix + if metadata.agent_class in INST_SUFFIXES: + instruction += INST_SUFFIXES[metadata.agent_class] + + # ============================================= + # create sandbox and run the agent + # ============================================= + + runtime: Runtime = create_runtime(config) + call_async_from_sync(runtime.connect) + + initialize_runtime(runtime, instance=instance) + + # Here's how you can run the agent (similar to the `main` function) and get the final task state + state: State | None = asyncio.run( + run_controller( + config=config, + initial_user_action=MessageAction(content=instruction), + runtime=runtime, + fake_user_response_fn=FAKE_RESPONSES[metadata.agent_class], + ) + ) + if state is None: + raise ValueError('State should not be None.') + + # ============================================= + # result evaluation + # ============================================= + + return_val = complete_runtime(runtime, instance) + exit_code = return_val['exit_code'] + test_output = return_val['test_output'] + + errors = [] + test_cases = None + if test_output: + if 'SyntaxError' in test_output: + errors.append('SyntaxError') + elif 'IndentationError' in test_output: + errors.append('IndentationError') + else: + test_cases = test_output + + test_result = { + 'exit_code': exit_code, + 'test_cases': test_cases, + 'errors': errors, + } + + # history is now available as a stream of events, rather than list of pairs of (Action, Observation) + # for compatibility with the existing output format, we can remake the pairs here + histories = compatibility_for_eval_history_pairs(state.history) + metrics = state.metrics.get() if state.metrics else None + + # Save the output + output = EvalOutput( + instance_id=str(instance.instance_id), + instance=instance.to_dict(), + instruction=instruction, + metadata=metadata, + history=histories, + metrics=metrics, + error=state.last_error if state and state.last_error else None, + test_result=test_result, + ) + return output + +def load_polyglot_dataset(): + """Load the 
polyglot benchmark dataset from the repository.""" + import glob + import json + import os + + # Path to the polyglot-benchmark repository + repo_path = os.environ.get('POLYGLOT_BENCHMARK_PATH', '/workspace/polyglot-benchmark') + + all_tests = [] + instance_id = 0 + + # Process each language directory + for lang_dir in ['python', 'javascript', 'rust', 'go', 'cpp', 'java']: + lang_path = os.path.join(repo_path, lang_dir, 'exercises', 'practice') + if not os.path.exists(lang_path): + logger.warning(f"Language directory not found: {lang_path}") + continue + + # Process each exercise directory + for exercise_dir in os.listdir(lang_path): + exercise_path = os.path.join(lang_path, exercise_dir) + if not os.path.isdir(exercise_path): + continue + + # Check for config.json + config_file = os.path.join(exercise_path, '.meta', 'config.json') + if not os.path.exists(config_file): + logger.warning(f"Config file not found: {config_file}") + continue + + # Load config + with open(config_file, 'r') as f: + config = json.load(f) + + # Get solution and test files + solution_files = config.get('files', {}).get('solution', []) + test_files = config.get('files', {}).get('test', []) + + if not solution_files or not test_files: + logger.warning(f"Missing solution or test files in {exercise_path}") + continue + + # Load instructions + instruction = "" + intro_file = os.path.join(exercise_path, '.docs', 'introduction.md') + if os.path.exists(intro_file): + with open(intro_file, 'r') as f: + instruction += f.read() + "\n\n" + + instructions_file = os.path.join(exercise_path, '.docs', 'instructions.md') + if os.path.exists(instructions_file): + with open(instructions_file, 'r') as f: + instruction += f.read() + "\n\n" + + if not instruction: + logger.warning(f"No instructions found for {exercise_path}") + continue + + # Load solution and test content + solution_content = {} + for file_path in solution_files: + full_path = os.path.join(exercise_path, file_path) + if os.path.exists(full_path): + with open(full_path, 'r') as f: + solution_content[os.path.basename(file_path)] = f.read() + + test_content = {} + for file_path in test_files: + full_path = os.path.join(exercise_path, file_path) + if os.path.exists(full_path): + with open(full_path, 'r') as f: + test_content[os.path.basename(file_path)] = f.read() + + # Create test instance + test_instance = { + 'instance_id': instance_id, + 'instance_name': exercise_dir, + 'language': lang_dir, + 'instruction': instruction, + 'solution_files': [os.path.basename(f) for f in solution_files], + 'test_files': [os.path.basename(f) for f in test_files], + 'solution_content': solution_content, + 'test_content': test_content, + } + + all_tests.append(test_instance) + instance_id += 1 + + return pd.DataFrame(all_tests) + +def add_arguments(parser): + """Add polyglot benchmark specific arguments to the parser.""" + parser.add_argument( + '--eval-languages', + type=str, + help='Comma-separated list of languages to test (e.g., "python,javascript,rust")', + ) + return parser + +if __name__ == '__main__': + # Add custom arguments + parser = parse_arguments.__self__ + add_arguments(parser) + args = parser.parse_args() + + # Load the polyglot benchmark dataset + polyglot_tests = load_polyglot_dataset() + + if polyglot_tests.empty: + logger.error("Failed to load polyglot benchmark dataset") + exit(1) + + logger.info(f"Loaded {len(polyglot_tests)} test instances from polyglot benchmark") + + llm_config = None + if args.llm_config: + llm_config = get_llm_config_arg(args.llm_config) + # 
modify_params must be False for evaluation purpose, for reproducibility and accuracy of results + llm_config.modify_params = False + # Enable logging of LLM completions + llm_config.log_completions = True + + if llm_config is None: + raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}') + + metadata = make_metadata( + llm_config, + 'PolyglotBenchmark', + args.agent_cls, + args.max_iterations, + args.eval_note, + args.eval_output_dir, + ) + output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl') + + # Parse dataset IDs if provided + eval_ids = None + if args.eval_ids: + eval_ids = str(args.eval_ids).split(',') + logger.info(f'\nUsing specific dataset IDs: {eval_ids}\n') + + # Filter by language if specified + if hasattr(args, 'eval_languages') and args.eval_languages: + languages = [lang.strip().lower() for lang in args.eval_languages.split(',')] + polyglot_tests = polyglot_tests[polyglot_tests['language'].str.lower().isin(languages)] + logger.info(f'\nFiltered to languages: {languages}, {len(polyglot_tests)} instances remaining\n') + + instances = prepare_dataset( + polyglot_tests, + output_file, + args.eval_n_limit, + eval_ids=eval_ids, + ) + + run_evaluation( + instances, + metadata, + output_file, + args.eval_num_workers, + process_instance, + ) \ No newline at end of file diff --git a/evaluation/benchmarks/polyglot_benchmark/scripts/build_docker.sh b/evaluation/benchmarks/polyglot_benchmark/scripts/build_docker.sh new file mode 100755 index 000000000000..1c6a2dfff7a1 --- /dev/null +++ b/evaluation/benchmarks/polyglot_benchmark/scripts/build_docker.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +set -e + +# Get the directory of this script +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +BENCHMARK_DIR="$( cd "${SCRIPT_DIR}/.." 
&& pwd )" + +# Build the Docker image +docker build -t ghcr.io/opendevin/eval-polyglot:v1.0.0 -f "${BENCHMARK_DIR}/Dockerfile" "${BENCHMARK_DIR}" + +echo "Docker image built successfully: ghcr.io/opendevin/eval-polyglot:v1.0.0" \ No newline at end of file diff --git a/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh new file mode 100755 index 000000000000..ce998a112330 --- /dev/null +++ b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +set -e + +# Default values +MODEL_CONFIG=${1:-"eval_gpt4_1106_preview"} +GIT_VERSION=${2:-"HEAD"} +AGENT=${3:-"CodeActAgent"} +EVAL_LIMIT=${4:-"-1"} +EVAL_NUM_WORKERS=${5:-"1"} +EVAL_IDS=${6:-""} +EVAL_LANGUAGES=${7:-""} + +# Set environment variables +export POLYGLOT_BENCHMARK_PATH=${POLYGLOT_BENCHMARK_PATH:-"/workspace/polyglot-benchmark"} +export USE_UNIT_TESTS=${USE_UNIT_TESTS:-"true"} + +# Add additional arguments based on provided parameters +ARGS="--agent-cls ${AGENT} --llm-config ${MODEL_CONFIG} --max-iterations 30 --eval-num-workers ${EVAL_NUM_WORKERS}" + +if [ "${EVAL_LIMIT}" != "-1" ]; then + ARGS="${ARGS} --eval-n-limit ${EVAL_LIMIT}" +fi + +if [ -n "${EVAL_IDS}" ]; then + ARGS="${ARGS} --eval-ids ${EVAL_IDS}" +fi + +if [ -n "${EVAL_LANGUAGES}" ]; then + ARGS="${ARGS} --eval-languages ${EVAL_LANGUAGES}" +fi + +# Run the evaluation +cd "$(git rev-parse --show-toplevel)" +poetry run python -m evaluation.benchmarks.polyglot_benchmark.run_infer ${ARGS} \ No newline at end of file diff --git a/evaluation/benchmarks/polyglot_benchmark/scripts/summarize_results.py b/evaluation/benchmarks/polyglot_benchmark/scripts/summarize_results.py new file mode 100755 index 000000000000..988f3a618bff --- /dev/null +++ b/evaluation/benchmarks/polyglot_benchmark/scripts/summarize_results.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 + +import argparse +import json +import os +from collections import defaultdict + +def load_jsonl(file_path): + """Load data from a jsonl file.""" + data = [] + with open(file_path, 'r') as f: + for line in f: + data.append(json.loads(line)) + return data + +def summarize_results(output_file): + """Summarize the results of the polyglot benchmark evaluation.""" + if not os.path.exists(output_file): + print(f"Error: Output file {output_file} does not exist.") + return + + results = load_jsonl(output_file) + + # Count total instances + total_instances = len(results) + print(f"Total instances: {total_instances}") + + # Count by language + language_counts = defaultdict(int) + language_passed = defaultdict(int) + + # Count passed and failed instances + passed_instances = [] + failed_instances = [] + + for result in results: + instance = result.get('instance', {}) + language = instance.get('language', 'unknown') + instance_name = instance.get('instance_name', 'unknown') + instance_id = result.get('instance_id', 'unknown') + + language_counts[language] += 1 + + # Check if all tests passed + test_result = result.get('test_result', {}) + exit_code = test_result.get('exit_code', 1) + + if exit_code == 0: + passed_instances.append((instance_id, language, instance_name)) + language_passed[language] += 1 + else: + failed_instances.append((instance_id, language, instance_name)) + + # Print summary + print("\nResults by language:") + print("--------------------") + for language, count in sorted(language_counts.items()): + passed = language_passed[language] + percentage = (passed / count) * 100 if count > 0 else 0 + print(f"{language}: 
{passed}/{count} ({percentage:.1f}%)") + + # Overall pass rate + total_passed = len(passed_instances) + overall_percentage = (total_passed / total_instances) * 100 if total_instances > 0 else 0 + print(f"\nOverall pass rate: {total_passed}/{total_instances} ({overall_percentage:.1f}%)") + + # Print passed instances + print("\nPassed instances:") + print("----------------") + for instance_id, language, instance_name in sorted(passed_instances): + print(f"ID: {instance_id}, Language: {language}, Name: {instance_name}") + + # Print failed instances + print("\nFailed instances:") + print("----------------") + for instance_id, language, instance_name in sorted(failed_instances): + print(f"ID: {instance_id}, Language: {language}, Name: {instance_name}") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Summarize polyglot benchmark results") + parser.add_argument("output_file", help="Path to the output.jsonl file") + args = parser.parse_args() + + summarize_results(args.output_file) \ No newline at end of file diff --git a/evaluation/benchmarks/polyglot_benchmark/test_load_dataset.py b/evaluation/benchmarks/polyglot_benchmark/test_load_dataset.py new file mode 100755 index 000000000000..708259732b02 --- /dev/null +++ b/evaluation/benchmarks/polyglot_benchmark/test_load_dataset.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 + +import os +import sys +from pathlib import Path + +# Add the parent directory to the Python path +sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent)) + +from evaluation.benchmarks.polyglot_benchmark.run_infer import load_polyglot_dataset + +def main(): + # Set the environment variable for the polyglot benchmark path + os.environ['POLYGLOT_BENCHMARK_PATH'] = '/workspace/polyglot-benchmark' + + # Load the dataset + dataset = load_polyglot_dataset() + + # Print summary + print(f"Loaded {len(dataset)} test instances") + + # Print language distribution + language_counts = dataset['language'].value_counts() + print("\nLanguage distribution:") + for language, count in language_counts.items(): + print(f"{language}: {count}") + + # Print a sample instance + if not dataset.empty: + print("\nSample instance:") + sample = dataset.iloc[0] + print(f"ID: {sample.instance_id}") + print(f"Name: {sample.instance_name}") + print(f"Language: {sample.language}") + print(f"Solution files: {sample.solution_files}") + print(f"Test files: {sample.test_files}") + print(f"Instruction (first 100 chars): {sample.instruction[:100]}...") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/evaluation/benchmarks/polyglot_benchmark/test_run.py b/evaluation/benchmarks/polyglot_benchmark/test_run.py new file mode 100755 index 000000000000..a8671b0646f1 --- /dev/null +++ b/evaluation/benchmarks/polyglot_benchmark/test_run.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python3 + +import os +import sys +import argparse +from pathlib import Path + +# Add the parent directory to the Python path +sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent)) + +from evaluation.benchmarks.polyglot_benchmark.run_infer import ( + load_polyglot_dataset, + process_instance, + make_metadata, + get_llm_config_arg, +) +from openhands.core.logger import openhands_logger as logger + +def main(): + parser = argparse.ArgumentParser(description="Test the polyglot benchmark with a single instance") + parser.add_argument("--model", default="eval_gpt35_turbo", help="Model configuration name") + parser.add_argument("--agent", default="CodeActAgent", help="Agent class 
name") + parser.add_argument("--instance-id", type=int, default=0, help="Instance ID to test") + parser.add_argument("--language", help="Filter by language") + args = parser.parse_args() + + # Set the environment variable for the polyglot benchmark path + os.environ['POLYGLOT_BENCHMARK_PATH'] = '/workspace/polyglot-benchmark' + + # Load the dataset + dataset = load_polyglot_dataset() + + if args.language: + dataset = dataset[dataset['language'].str.lower() == args.language.lower()] + if dataset.empty: + print(f"No instances found for language: {args.language}") + return + + # Get the instance to test + if args.instance_id >= len(dataset): + print(f"Instance ID {args.instance_id} is out of range. Max ID: {len(dataset) - 1}") + return + + instance = dataset.iloc[args.instance_id] + print(f"Testing instance {instance.instance_id}: {instance.instance_name} ({instance.language})") + + # Get LLM config + llm_config = get_llm_config_arg(args.model) + if llm_config is None: + print(f"Could not find LLM config: {args.model}") + return + + # Create metadata + metadata = make_metadata( + llm_config, + 'PolyglotBenchmark', + args.agent, + 30, # max_iterations + "test", + "evaluation/evaluation_outputs/test", + ) + + # Process the instance + try: + output = process_instance(instance, metadata, reset_logger=False) + print("\nTest completed successfully!") + print(f"Exit code: {output.test_result['exit_code']}") + print(f"Passed: {output.test_result['exit_code'] == 0}") + except Exception as e: + print(f"Error processing instance: {e}") + +if __name__ == "__main__": + main() \ No newline at end of file From 37ba6965aaf5f5216f2a77ca191fde1ef12aef2f Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 26 Feb 2025 06:26:06 +0000 Subject: [PATCH 03/22] Fix argument parser in polyglot benchmark --- evaluation/benchmarks/polyglot_benchmark/run_infer.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/evaluation/benchmarks/polyglot_benchmark/run_infer.py b/evaluation/benchmarks/polyglot_benchmark/run_infer.py index 45a9ee4f91ac..6fce76d9dbdf 100644 --- a/evaluation/benchmarks/polyglot_benchmark/run_infer.py +++ b/evaluation/benchmarks/polyglot_benchmark/run_infer.py @@ -424,10 +424,13 @@ def add_arguments(parser): return parser if __name__ == '__main__': - # Add custom arguments - parser = parse_arguments.__self__ + # Get the argument parser and add custom arguments + import argparse + from openhands.core.config import get_parser + + parser = get_parser() add_arguments(parser) - args = parser.parse_args() + args = parse_arguments() # Load the polyglot benchmark dataset polyglot_tests = load_polyglot_dataset() From 890377d28352f9742c92e0c336ab4ec9d1e3171f Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 26 Feb 2025 06:27:21 +0000 Subject: [PATCH 04/22] Improve polyglot benchmark path handling and fix logging error --- .../polyglot_benchmark/run_infer.py | 26 ++++++++++++-- .../polyglot_benchmark/scripts/run_infer.sh | 35 ++++++++++++++++++- 2 files changed, 58 insertions(+), 3 deletions(-) diff --git a/evaluation/benchmarks/polyglot_benchmark/run_infer.py b/evaluation/benchmarks/polyglot_benchmark/run_infer.py index 6fce76d9dbdf..c5adbc64c572 100644 --- a/evaluation/benchmarks/polyglot_benchmark/run_infer.py +++ b/evaluation/benchmarks/polyglot_benchmark/run_infer.py @@ -328,9 +328,31 @@ def load_polyglot_dataset(): import glob import json import os + from pathlib import Path - # Path to the polyglot-benchmark repository - repo_path = os.environ.get('POLYGLOT_BENCHMARK_PATH', 
'/workspace/polyglot-benchmark') + # Try to find the polyglot-benchmark repository + # First check the environment variable + repo_path = os.environ.get('POLYGLOT_BENCHMARK_PATH') + + # If not set, try common locations + if not repo_path or not os.path.exists(repo_path): + possible_paths = [ + '/workspace/polyglot-benchmark', + str(Path.home() / 'polyglot-benchmark'), + str(Path.home() / 'thereal' / 'polyglot-benchmark'), + str(Path(__file__).parent.parent.parent.parent.parent / 'polyglot-benchmark'), + str(Path.cwd() / 'polyglot-benchmark'), + ] + + for path in possible_paths: + if os.path.exists(path): + repo_path = path + logger.info(f"Found polyglot-benchmark repository at: {repo_path}") + break + + if not repo_path or not os.path.exists(repo_path): + logger.error("Could not find polyglot-benchmark repository. Please set POLYGLOT_BENCHMARK_PATH environment variable.") + return pd.DataFrame() all_tests = [] instance_id = 0 diff --git a/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh index ce998a112330..206716c57958 100755 --- a/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh +++ b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh @@ -12,9 +12,42 @@ EVAL_IDS=${6:-""} EVAL_LANGUAGES=${7:-""} # Set environment variables -export POLYGLOT_BENCHMARK_PATH=${POLYGLOT_BENCHMARK_PATH:-"/workspace/polyglot-benchmark"} export USE_UNIT_TESTS=${USE_UNIT_TESTS:-"true"} +# Try to find the polyglot-benchmark repository +if [ -z "$POLYGLOT_BENCHMARK_PATH" ]; then + # Check common locations + POSSIBLE_PATHS=( + "/workspace/polyglot-benchmark" + "$HOME/polyglot-benchmark" + "$HOME/thereal/polyglot-benchmark" + "$(git rev-parse --show-toplevel)/polyglot-benchmark" + "$(pwd)/polyglot-benchmark" + ) + + for path in "${POSSIBLE_PATHS[@]}"; do + if [ -d "$path" ]; then + export POLYGLOT_BENCHMARK_PATH="$path" + echo "Found polyglot-benchmark repository at: $POLYGLOT_BENCHMARK_PATH" + break + fi + done +fi + +# If still not found, try to clone it +if [ -z "$POLYGLOT_BENCHMARK_PATH" ] || [ ! -d "$POLYGLOT_BENCHMARK_PATH" ]; then + echo "Polyglot benchmark repository not found. Attempting to clone it..." + CLONE_DIR="$(git rev-parse --show-toplevel)/polyglot-benchmark" + git clone https://github.com/Aider-AI/polyglot-benchmark.git "$CLONE_DIR" + if [ $? -eq 0 ]; then + export POLYGLOT_BENCHMARK_PATH="$CLONE_DIR" + echo "Successfully cloned polyglot-benchmark to $POLYGLOT_BENCHMARK_PATH" + else + echo "Failed to clone polyglot-benchmark. Please set POLYGLOT_BENCHMARK_PATH manually." 
+ exit 1 + fi +fi + # Add additional arguments based on provided parameters ARGS="--agent-cls ${AGENT} --llm-config ${MODEL_CONFIG} --max-iterations 30 --eval-num-workers ${EVAL_NUM_WORKERS}" From 8af6f1111baf53831f1a2ca3edcd5a4d6851d70f Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 26 Feb 2025 06:31:00 +0000 Subject: [PATCH 05/22] Add Docker configuration options and troubleshooting guide --- .../benchmarks/polyglot_benchmark/README.md | 31 +++++++++++++++++++ .../polyglot_benchmark/run_infer.py | 12 +++++-- .../polyglot_benchmark/scripts/run_infer.sh | 2 ++ 3 files changed, 43 insertions(+), 2 deletions(-) diff --git a/evaluation/benchmarks/polyglot_benchmark/README.md b/evaluation/benchmarks/polyglot_benchmark/README.md index d92251acb9f7..46f79dfeb9c5 100644 --- a/evaluation/benchmarks/polyglot_benchmark/README.md +++ b/evaluation/benchmarks/polyglot_benchmark/README.md @@ -51,8 +51,39 @@ You can also set the following environment variables: ```bash export POLYGLOT_BENCHMARK_PATH="/path/to/polyglot-benchmark" # Path to the polyglot-benchmark repository export USE_UNIT_TESTS="true" # Whether to run unit tests (default: true) +export NO_DOCKER="true" # Skip Docker container creation and use local runtime (default: false) +export POLYGLOT_DOCKER_IMAGE="image:tag" # Custom Docker image to use (default: ghcr.io/opendevin/eval-polyglot:v1.0.0) ``` +### Troubleshooting + +#### Docker Issues + +If you encounter Docker-related errors like: + +``` +Command 'docker buildx build ...' returned non-zero exit status 1 +``` + +You can try the following solutions: + +1. Run with `NO_DOCKER=true` to use the local runtime instead: + ```bash + NO_DOCKER=true ./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh eval_gpt4_1106_preview HEAD CodeActAgent 1 1 + ``` + +2. Make sure Docker is installed and running: + ```bash + docker --version + docker ps + ``` + +3. 
Check if you have permission to use Docker: + ```bash + sudo usermod -aG docker $USER + # Then log out and log back in + ``` + ### Example ```bash diff --git a/evaluation/benchmarks/polyglot_benchmark/run_infer.py b/evaluation/benchmarks/polyglot_benchmark/run_infer.py index c5adbc64c572..4be3b75ae26a 100644 --- a/evaluation/benchmarks/polyglot_benchmark/run_infer.py +++ b/evaluation/benchmarks/polyglot_benchmark/run_infer.py @@ -62,13 +62,21 @@ def get_config( instance: pd.Series, metadata: EvalMetadata, ) -> AppConfig: + # Determine runtime type based on environment variable + runtime_type = os.environ.get('RUNTIME', 'docker') + + # Check if NO_DOCKER is set to skip Docker container creation + if os.environ.get('NO_DOCKER', 'false').lower() == 'true': + runtime_type = 'local' + logger.info("Using local runtime instead of Docker due to NO_DOCKER=true") + config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, - runtime=os.environ.get('RUNTIME', 'docker'), + runtime=runtime_type, max_iterations=metadata.max_iterations, sandbox=SandboxConfig( - base_container_image='ghcr.io/opendevin/eval-polyglot:v1.0.0', # TODO: Create this image + base_container_image=os.environ.get('POLYGLOT_DOCKER_IMAGE', 'ghcr.io/opendevin/eval-polyglot:v1.0.0'), enable_auto_lint=True, use_host_network=False, timeout=300, # Longer timeout for compilation diff --git a/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh index 206716c57958..7c7a3726be5f 100755 --- a/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh +++ b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh @@ -13,6 +13,8 @@ EVAL_LANGUAGES=${7:-""} # Set environment variables export USE_UNIT_TESTS=${USE_UNIT_TESTS:-"true"} +export NO_DOCKER=${NO_DOCKER:-"false"} +export POLYGLOT_DOCKER_IMAGE=${POLYGLOT_DOCKER_IMAGE:-"ghcr.io/opendevin/eval-polyglot:v1.0.0"} # Try to find the polyglot-benchmark repository if [ -z "$POLYGLOT_BENCHMARK_PATH" ]; then From 32335ffcb3862817cc85a3f44ce590353609c38a Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 26 Feb 2025 06:32:00 +0000 Subject: [PATCH 06/22] Add local Docker image build support for polyglot benchmark --- .../benchmarks/polyglot_benchmark/README.md | 39 +++++++- .../scripts/build_local_docker.sh | 94 +++++++++++++++++++ .../polyglot_benchmark/scripts/run_infer.sh | 23 ++++- 3 files changed, 152 insertions(+), 4 deletions(-) create mode 100755 evaluation/benchmarks/polyglot_benchmark/scripts/build_local_docker.sh diff --git a/evaluation/benchmarks/polyglot_benchmark/README.md b/evaluation/benchmarks/polyglot_benchmark/README.md index 46f79dfeb9c5..9fa8bfb1dfb3 100644 --- a/evaluation/benchmarks/polyglot_benchmark/README.md +++ b/evaluation/benchmarks/polyglot_benchmark/README.md @@ -53,6 +53,37 @@ export POLYGLOT_BENCHMARK_PATH="/path/to/polyglot-benchmark" # Path to the poly export USE_UNIT_TESTS="true" # Whether to run unit tests (default: true) export NO_DOCKER="true" # Skip Docker container creation and use local runtime (default: false) export POLYGLOT_DOCKER_IMAGE="image:tag" # Custom Docker image to use (default: ghcr.io/opendevin/eval-polyglot:v1.0.0) +export BUILD_LOCAL_DOCKER="true" # Build a local Docker image if one doesn't exist (default: false) +``` + +### Docker Support + +The benchmark uses Docker to create isolated environments for running code in different programming languages. 
There are two ways to use Docker with this benchmark: + +#### Option 1: Build a Local Docker Image + +You can build a local Docker image that contains all the necessary tools for the benchmark: + +```bash +# Build the Docker image +./evaluation/benchmarks/polyglot_benchmark/scripts/build_local_docker.sh + +# Run the benchmark with the local image +./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh eval_gpt4_1106_preview HEAD CodeActAgent 1 1 +``` + +Alternatively, you can set the `BUILD_LOCAL_DOCKER` environment variable: + +```bash +BUILD_LOCAL_DOCKER=true ./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh eval_gpt4_1106_preview HEAD CodeActAgent 1 1 +``` + +#### Option 2: Use a Pre-built Docker Image + +You can specify a custom Docker image to use: + +```bash +POLYGLOT_DOCKER_IMAGE="your-custom-image:tag" ./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh eval_gpt4_1106_preview HEAD CodeActAgent 1 1 ``` ### Troubleshooting @@ -67,18 +98,20 @@ Command 'docker buildx build ...' returned non-zero exit status 1 You can try the following solutions: -1. Run with `NO_DOCKER=true` to use the local runtime instead: +1. Build a local Docker image as described above. + +2. Run with `NO_DOCKER=true` to use the local runtime instead: ```bash NO_DOCKER=true ./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh eval_gpt4_1106_preview HEAD CodeActAgent 1 1 ``` -2. Make sure Docker is installed and running: +3. Make sure Docker is installed and running: ```bash docker --version docker ps ``` -3. Check if you have permission to use Docker: +4. Check if you have permission to use Docker: ```bash sudo usermod -aG docker $USER # Then log out and log back in diff --git a/evaluation/benchmarks/polyglot_benchmark/scripts/build_local_docker.sh b/evaluation/benchmarks/polyglot_benchmark/scripts/build_local_docker.sh new file mode 100755 index 000000000000..d129c5676ec1 --- /dev/null +++ b/evaluation/benchmarks/polyglot_benchmark/scripts/build_local_docker.sh @@ -0,0 +1,94 @@ +#!/bin/bash + +set -e + +# Get the directory of this script +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +BENCHMARK_DIR="$( cd "${SCRIPT_DIR}/.." && pwd )" +REPO_ROOT="$( cd "${BENCHMARK_DIR}/../../.." 
&& pwd )" + +# Create a temporary directory for the Docker build +BUILD_DIR=$(mktemp -d) +trap "rm -rf $BUILD_DIR" EXIT + +echo "Creating Docker build context in $BUILD_DIR" + +# Create a simple Dockerfile that includes all the necessary tools +cat > "$BUILD_DIR/Dockerfile" << 'EOF' +FROM ubuntu:22.04 + +# Avoid prompts from apt +ENV DEBIAN_FRONTEND=noninteractive + +# Install common dependencies +RUN apt-get update && apt-get install -y \ + build-essential \ + curl \ + git \ + python3 \ + python3-pip \ + python3-dev \ + python3-venv \ + wget \ + software-properties-common \ + apt-transport-https \ + ca-certificates \ + gnupg \ + lsb-release \ + libboost-all-dev \ + cmake \ + && rm -rf /var/lib/apt/lists/* + +# Install Python packages +RUN pip3 install --no-cache-dir pytest pytest-timeout + +# Install Node.js and npm +RUN curl -fsSL https://deb.nodesource.com/setup_18.x | bash - \ + && apt-get install -y nodejs \ + && rm -rf /var/lib/apt/lists/* + +# Install Rust +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y +ENV PATH="/root/.cargo/bin:${PATH}" + +# Install Go +RUN wget https://go.dev/dl/go1.20.5.linux-amd64.tar.gz \ + && tar -C /usr/local -xzf go1.20.5.linux-amd64.tar.gz \ + && rm go1.20.5.linux-amd64.tar.gz +ENV PATH="/usr/local/go/bin:${PATH}" + +# Install Java +RUN apt-get update && apt-get install -y openjdk-17-jdk \ + && rm -rf /var/lib/apt/lists/* +ENV JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64 + +# Install Gradle +RUN wget https://services.gradle.org/distributions/gradle-7.6-bin.zip \ + && mkdir /opt/gradle \ + && unzip -d /opt/gradle gradle-7.6-bin.zip \ + && rm gradle-7.6-bin.zip +ENV PATH="/opt/gradle/gradle-7.6/bin:${PATH}" + +# Create workspace directory +RUN mkdir -p /workspace +WORKDIR /workspace + +# Set environment variables +ENV PYTHONUNBUFFERED=1 +ENV PYTHONIOENCODING=UTF-8 + +CMD ["/bin/bash"] +EOF + +# Build the Docker image +IMAGE_NAME="polyglot-benchmark:local" +echo "Building Docker image: $IMAGE_NAME" +docker build -t "$IMAGE_NAME" "$BUILD_DIR" + +# Export the image name as an environment variable +echo "export POLYGLOT_DOCKER_IMAGE=$IMAGE_NAME" > "$BENCHMARK_DIR/docker_image.env" + +echo "Docker image built successfully: $IMAGE_NAME" +echo "To use this image, run:" +echo "source $BENCHMARK_DIR/docker_image.env" +echo "Then run the benchmark as usual." \ No newline at end of file diff --git a/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh index 7c7a3726be5f..a044219c27e1 100755 --- a/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh +++ b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh @@ -14,7 +14,28 @@ EVAL_LANGUAGES=${7:-""} # Set environment variables export USE_UNIT_TESTS=${USE_UNIT_TESTS:-"true"} export NO_DOCKER=${NO_DOCKER:-"false"} -export POLYGLOT_DOCKER_IMAGE=${POLYGLOT_DOCKER_IMAGE:-"ghcr.io/opendevin/eval-polyglot:v1.0.0"} + +# Check if we have a local Docker image env file +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +BENCHMARK_DIR="$( cd "${SCRIPT_DIR}/.." 
&& pwd )" +DOCKER_ENV_FILE="${BENCHMARK_DIR}/docker_image.env" + +if [ -f "$DOCKER_ENV_FILE" ]; then + echo "Loading Docker image configuration from $DOCKER_ENV_FILE" + source "$DOCKER_ENV_FILE" +else + # If no local image is available, use the default + export POLYGLOT_DOCKER_IMAGE=${POLYGLOT_DOCKER_IMAGE:-"ghcr.io/opendevin/eval-polyglot:v1.0.0"} + + # Check if we need to build a local Docker image + if [ "$BUILD_LOCAL_DOCKER" = "true" ]; then + echo "Building local Docker image..." + "${SCRIPT_DIR}/build_local_docker.sh" + source "$DOCKER_ENV_FILE" + fi +fi + +echo "Using Docker image: $POLYGLOT_DOCKER_IMAGE" # Try to find the polyglot-benchmark repository if [ -z "$POLYGLOT_BENCHMARK_PATH" ]; then From 561001019a5d060acbfad9f3c5c171ed862bb658 Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 26 Feb 2025 06:33:36 +0000 Subject: [PATCH 07/22] Set Docker image to build automatically by default --- .../benchmarks/polyglot_benchmark/README.md | 29 ++++++++++++++----- .../polyglot_benchmark/scripts/run_infer.sh | 26 +++++++++++++---- 2 files changed, 43 insertions(+), 12 deletions(-) diff --git a/evaluation/benchmarks/polyglot_benchmark/README.md b/evaluation/benchmarks/polyglot_benchmark/README.md index 9fa8bfb1dfb3..603b3a787fba 100644 --- a/evaluation/benchmarks/polyglot_benchmark/README.md +++ b/evaluation/benchmarks/polyglot_benchmark/README.md @@ -53,16 +53,29 @@ export POLYGLOT_BENCHMARK_PATH="/path/to/polyglot-benchmark" # Path to the poly export USE_UNIT_TESTS="true" # Whether to run unit tests (default: true) export NO_DOCKER="true" # Skip Docker container creation and use local runtime (default: false) export POLYGLOT_DOCKER_IMAGE="image:tag" # Custom Docker image to use (default: ghcr.io/opendevin/eval-polyglot:v1.0.0) -export BUILD_LOCAL_DOCKER="true" # Build a local Docker image if one doesn't exist (default: false) +export BUILD_LOCAL_DOCKER="false" # Build a local Docker image if one doesn't exist (default: true) ``` ### Docker Support -The benchmark uses Docker to create isolated environments for running code in different programming languages. There are two ways to use Docker with this benchmark: +The benchmark uses Docker to create isolated environments for running code in different programming languages. By default, the script will: -#### Option 1: Build a Local Docker Image +1. Try to pull the specified Docker image from the registry +2. 
If the pull fails, automatically build a local Docker image -You can build a local Docker image that contains all the necessary tools for the benchmark: +You have several options for customizing this behavior: + +#### Option 1: Use the Default Behavior (Recommended) + +Simply run the benchmark script, and it will handle the Docker image automatically: + +```bash +./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh eval_gpt4_1106_preview HEAD CodeActAgent 1 1 +``` + +#### Option 2: Manually Build a Local Docker Image + +You can explicitly build a local Docker image before running the benchmark: ```bash # Build the Docker image @@ -72,13 +85,15 @@ You can build a local Docker image that contains all the necessary tools for the ./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh eval_gpt4_1106_preview HEAD CodeActAgent 1 1 ``` -Alternatively, you can set the `BUILD_LOCAL_DOCKER` environment variable: +#### Option 3: Disable Automatic Docker Image Building + +If you want to disable the automatic building of a Docker image: ```bash -BUILD_LOCAL_DOCKER=true ./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh eval_gpt4_1106_preview HEAD CodeActAgent 1 1 +BUILD_LOCAL_DOCKER=false ./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh eval_gpt4_1106_preview HEAD CodeActAgent 1 1 ``` -#### Option 2: Use a Pre-built Docker Image +#### Option 4: Use a Custom Docker Image You can specify a custom Docker image to use: diff --git a/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh index a044219c27e1..ebb3fc2d4a52 100755 --- a/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh +++ b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh @@ -20,6 +20,9 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" BENCHMARK_DIR="$( cd "${SCRIPT_DIR}/.." && pwd )" DOCKER_ENV_FILE="${BENCHMARK_DIR}/docker_image.env" +# Set BUILD_LOCAL_DOCKER to true by default if not specified +export BUILD_LOCAL_DOCKER=${BUILD_LOCAL_DOCKER:-"true"} + if [ -f "$DOCKER_ENV_FILE" ]; then echo "Loading Docker image configuration from $DOCKER_ENV_FILE" source "$DOCKER_ENV_FILE" @@ -27,11 +30,24 @@ else # If no local image is available, use the default export POLYGLOT_DOCKER_IMAGE=${POLYGLOT_DOCKER_IMAGE:-"ghcr.io/opendevin/eval-polyglot:v1.0.0"} - # Check if we need to build a local Docker image - if [ "$BUILD_LOCAL_DOCKER" = "true" ]; then - echo "Building local Docker image..." - "${SCRIPT_DIR}/build_local_docker.sh" - source "$DOCKER_ENV_FILE" + # Try to pull the image first + echo "Trying to pull Docker image: $POLYGLOT_DOCKER_IMAGE" + if ! docker pull "$POLYGLOT_DOCKER_IMAGE" 2>/dev/null; then + echo "Failed to pull Docker image: $POLYGLOT_DOCKER_IMAGE" + + # Build a local Docker image if pulling fails and BUILD_LOCAL_DOCKER is true + if [ "$BUILD_LOCAL_DOCKER" = "true" ]; then + echo "Building local Docker image..." + "${SCRIPT_DIR}/build_local_docker.sh" + source "$DOCKER_ENV_FILE" + else + echo "WARNING: Docker image not found and BUILD_LOCAL_DOCKER is not set to true." + echo "You can build a local Docker image by running:" + echo " ${SCRIPT_DIR}/build_local_docker.sh" + echo "Or set BUILD_LOCAL_DOCKER=true to build it automatically." 
+ fi + else + echo "Successfully pulled Docker image: $POLYGLOT_DOCKER_IMAGE" fi fi From c9e232e76412bbe7ec540f59696c851dbdf7dd73 Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 26 Feb 2025 06:40:24 +0000 Subject: [PATCH 08/22] Fix Docker build issues by adding unzip and simplifying Gradle installation --- .../polyglot_benchmark/scripts/build_local_docker.sh | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/evaluation/benchmarks/polyglot_benchmark/scripts/build_local_docker.sh b/evaluation/benchmarks/polyglot_benchmark/scripts/build_local_docker.sh index d129c5676ec1..0f93c82164a0 100755 --- a/evaluation/benchmarks/polyglot_benchmark/scripts/build_local_docker.sh +++ b/evaluation/benchmarks/polyglot_benchmark/scripts/build_local_docker.sh @@ -30,6 +30,8 @@ RUN apt-get update && apt-get install -y \ python3-dev \ python3-venv \ wget \ + unzip \ + zip \ software-properties-common \ apt-transport-https \ ca-certificates \ @@ -63,11 +65,8 @@ RUN apt-get update && apt-get install -y openjdk-17-jdk \ ENV JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64 # Install Gradle -RUN wget https://services.gradle.org/distributions/gradle-7.6-bin.zip \ - && mkdir /opt/gradle \ - && unzip -d /opt/gradle gradle-7.6-bin.zip \ - && rm gradle-7.6-bin.zip -ENV PATH="/opt/gradle/gradle-7.6/bin:${PATH}" +RUN apt-get update && apt-get install -y gradle \ + && rm -rf /var/lib/apt/lists/* # Create workspace directory RUN mkdir -p /workspace From 97e7ca7f3bb6168e2978bd46bde9e9bff65d2ef5 Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 26 Feb 2025 06:51:59 +0000 Subject: [PATCH 09/22] Restrict polyglot benchmark to use only the same tools as SWE-Bench (execute_bash, finish, str_replace_editor) --- evaluation/benchmarks/polyglot_benchmark/README.md | 7 +++++++ evaluation/benchmarks/polyglot_benchmark/run_infer.py | 10 ++++++++++ 2 files changed, 17 insertions(+) diff --git a/evaluation/benchmarks/polyglot_benchmark/README.md b/evaluation/benchmarks/polyglot_benchmark/README.md index 603b3a787fba..deb02b1969bb 100644 --- a/evaluation/benchmarks/polyglot_benchmark/README.md +++ b/evaluation/benchmarks/polyglot_benchmark/README.md @@ -2,6 +2,13 @@ This benchmark is based on the [Aider Polyglot Benchmark](https://github.com/Aider-AI/polyglot-benchmark), which evaluates how effectively an agent can translate natural language coding requests into executable code that passes unit tests across multiple programming languages. +> **Note**: This benchmark has been modified to use only the same tools as SWE-Bench: +> - execute_bash +> - finish +> - str_replace_editor +> +> This restriction ensures consistent tool usage across benchmarks for more accurate comparisons. 
+ ## Features - Supports multiple programming languages (Python, JavaScript, Rust, Go, C++, Java) diff --git a/evaluation/benchmarks/polyglot_benchmark/run_infer.py b/evaluation/benchmarks/polyglot_benchmark/run_infer.py index 4be3b75ae26a..d79fc2a707aa 100644 --- a/evaluation/benchmarks/polyglot_benchmark/run_infer.py +++ b/evaluation/benchmarks/polyglot_benchmark/run_infer.py @@ -8,6 +8,11 @@ from pathlib import Path from typing import Any, Dict, List, Optional +# NOTE: This benchmark has been modified to use only the same tools as SWE-Bench: +# - execute_bash +# - finish +# - str_replace_editor + import pandas as pd from evaluation.benchmarks.polyglot_benchmark.helper.prompts import ( @@ -103,6 +108,11 @@ def get_config( agent_config = config.get_agent_config(metadata.agent_class) agent_config.enable_prompt_extensions = False + + # Restrict tools to match SWE-Bench (only execute_bash, finish, and str_replace_editor) + agent_config.codeact_enable_jupyter = False + agent_config.codeact_enable_browsing = False + agent_config.codeact_enable_llm_editor = False # copy 'draft_editor' config if exists config_copy = copy.deepcopy(config) From 44bcb39b66a7578172809fe26174d11c53964155 Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 26 Feb 2025 06:57:55 +0000 Subject: [PATCH 10/22] Fix runtime completion to use Docker runtime for running tests --- .../polyglot_benchmark/run_infer.py | 44 ++++++++++++------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/evaluation/benchmarks/polyglot_benchmark/run_infer.py b/evaluation/benchmarks/polyglot_benchmark/run_infer.py index d79fc2a707aa..6b8a841562ca 100644 --- a/evaluation/benchmarks/polyglot_benchmark/run_infer.py +++ b/evaluation/benchmarks/polyglot_benchmark/run_infer.py @@ -198,28 +198,40 @@ def complete_runtime( if command: try: - result = subprocess.run( - command, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, - timeout=180, # 3 minutes timeout - cwd="/workspace", - encoding="utf-8", - errors="replace", - ) - exit_code = result.returncode - test_output = result.stdout + # Use the runtime to run the command inside the Docker container + cmd_str = " ".join(command) + logger.info(f"Running test command: {cmd_str}") + + action = CmdRunAction(command=cmd_str) + logger.info(action, extra={'msg_type': 'ACTION'}) + + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + + if isinstance(obs, CmdOutputObservation): + exit_code = obs.exit_code + test_output = obs.content + else: + logger.error(f"Unexpected observation type: {type(obs)}") + exit_code = 1 + test_output = f"Error: Unexpected observation type: {type(obs)}" # Clean up output test_output = test_output.replace("/workspace", "workspace") # Log test output to history file - with open("/workspace/.aider.chat.history.md", "a") as fh: - fh.write(f"```\n{test_output}\n```") + with tempfile.TemporaryDirectory() as tmpdir: + history_path = os.path.join(tmpdir, ".aider.chat.history.md") + with open(history_path, 'w') as f: + f.write(f"```\n{test_output}\n```") + runtime.copy_to( + history_path, + '/workspace', + ) - except subprocess.TimeoutExpired: - test_output = "Tests timed out!" 
+ except Exception as e: + logger.error(f"Error running tests: {e}") + test_output = f"Tests failed with error: {e}" exit_code = 1 logger.info('-' * 30) From 601da458cdd666efe112e5e202fad674a1cac95c Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 26 Feb 2025 07:07:47 +0000 Subject: [PATCH 11/22] Add script to test one instance per language in polyglot benchmark --- .../polyglot_benchmark/test_all_languages.py | 100 ++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100755 evaluation/benchmarks/polyglot_benchmark/test_all_languages.py diff --git a/evaluation/benchmarks/polyglot_benchmark/test_all_languages.py b/evaluation/benchmarks/polyglot_benchmark/test_all_languages.py new file mode 100755 index 000000000000..89e15b6720f1 --- /dev/null +++ b/evaluation/benchmarks/polyglot_benchmark/test_all_languages.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python3 + +import os +import sys +import argparse +from pathlib import Path + +# Add the parent directory to the Python path +sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent)) + +from evaluation.benchmarks.polyglot_benchmark.run_infer import ( + load_polyglot_dataset, + process_instance, + make_metadata, + get_llm_config_arg, +) +from openhands.core.logger import openhands_logger as logger + +def test_language(language, model, agent): + """Test the first instance of a specific language.""" + print(f"\n{'=' * 50}") + print(f"Testing language: {language}") + print(f"{'=' * 50}\n") + + # Set the environment variable for the polyglot benchmark path + os.environ['POLYGLOT_BENCHMARK_PATH'] = '/workspace/polyglot-benchmark' + + # Load the dataset + dataset = load_polyglot_dataset() + + # Filter by language + dataset = dataset[dataset['language'].str.lower() == language.lower()] + if dataset.empty: + print(f"No instances found for language: {language}") + return False + + # Get the first instance + instance = dataset.iloc[0] + print(f"Testing instance {instance.instance_id}: {instance.instance_name} ({instance.language})") + + # Get LLM config + llm_config = get_llm_config_arg(model) + if llm_config is None: + print(f"Could not find LLM config: {model}") + return False + + # Create metadata + metadata = make_metadata( + llm_config, + 'PolyglotBenchmark', + agent, + 30, # max_iterations + f"test_{language}", + f"evaluation/evaluation_outputs/test_{language}", + ) + + # Process the instance + try: + output = process_instance(instance, metadata, reset_logger=False) + print("\nTest completed successfully!") + print(f"Exit code: {output.test_result['exit_code']}") + print(f"Passed: {output.test_result['exit_code'] == 0}") + return output.test_result['exit_code'] == 0 + except Exception as e: + print(f"Error processing instance: {e}") + return False + +def main(): + parser = argparse.ArgumentParser(description="Test the polyglot benchmark with one instance per language") + parser.add_argument("--model", default="eval_gpt35_turbo", help="Model configuration name") + parser.add_argument("--agent", default="CodeActAgent", help="Agent class name") + parser.add_argument("--languages", default="python,rust,go,javascript,cpp,java", + help="Comma-separated list of languages to test") + args = parser.parse_args() + + languages = args.languages.split(',') + results = {} + + for language in languages: + language = language.strip() + if not language: + continue + + success = test_language(language, args.model, args.agent) + results[language] = "PASSED" if success else "FAILED" + + # Print summary + print("\n" + "=" * 50) + print("SUMMARY OF 
RESULTS") + print("=" * 50) + + for language, result in results.items(): + print(f"{language.ljust(12)}: {result}") + + # Check if all tests passed + all_passed = all(result == "PASSED" for result in results.values()) + print("\nOverall result:", "PASSED" if all_passed else "FAILED") + +if __name__ == "__main__": + main() \ No newline at end of file From 84293fd031abb846bda22a19974ccfc33758c307 Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 26 Feb 2025 07:10:24 +0000 Subject: [PATCH 12/22] Add one-per-language testing mode to polyglot benchmark run_infer.sh --- .../polyglot_benchmark/scripts/run_infer.sh | 135 ++++++++++++++++-- 1 file changed, 126 insertions(+), 9 deletions(-) diff --git a/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh index ebb3fc2d4a52..e2b5044a00bf 100755 --- a/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh +++ b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh @@ -2,14 +2,80 @@ set -e -# Default values -MODEL_CONFIG=${1:-"eval_gpt4_1106_preview"} +# Display usage information +function show_usage { + echo "Usage: $0 [options]" + echo "" + echo "Options:" + echo " --help Show this help message" + echo " --model MODEL Model configuration (default: eval_gpt4_1106_preview)" + echo " --agent AGENT Agent class (default: CodeActAgent)" + echo " --limit LIMIT Evaluation limit (default: -1 for all)" + echo " --workers WORKERS Number of workers (default: 1)" + echo " --ids IDS Comma-separated list of instance IDs" + echo " --languages LANGUAGES Comma-separated list of languages" + echo " --one-per-language Test one instance per language" + echo "" + echo "Legacy positional arguments are still supported:" + echo " $0 MODEL_CONFIG GIT_VERSION AGENT EVAL_LIMIT EVAL_NUM_WORKERS EVAL_IDS EVAL_LANGUAGES" + exit 0 +} + +# Parse named arguments +ONE_PER_LANGUAGE=false +POSITIONAL_ARGS=() + +while [[ $# -gt 0 ]]; do + case $1 in + --help) + show_usage + ;; + --model) + MODEL_CONFIG="$2" + shift 2 + ;; + --agent) + AGENT="$2" + shift 2 + ;; + --limit) + EVAL_LIMIT="$2" + shift 2 + ;; + --workers) + EVAL_NUM_WORKERS="$2" + shift 2 + ;; + --ids) + EVAL_IDS="$2" + shift 2 + ;; + --languages) + EVAL_LANGUAGES="$2" + shift 2 + ;; + --one-per-language) + ONE_PER_LANGUAGE=true + shift + ;; + *) + POSITIONAL_ARGS+=("$1") + shift + ;; + esac +done + +# Restore positional parameters +set -- "${POSITIONAL_ARGS[@]}" + +# Default values (if not set by named arguments) +MODEL_CONFIG=${MODEL_CONFIG:-${1:-"eval_gpt4_1106_preview"}} GIT_VERSION=${2:-"HEAD"} -AGENT=${3:-"CodeActAgent"} -EVAL_LIMIT=${4:-"-1"} -EVAL_NUM_WORKERS=${5:-"1"} -EVAL_IDS=${6:-""} -EVAL_LANGUAGES=${7:-""} +AGENT=${AGENT:-${3:-"CodeActAgent"}} +EVAL_LIMIT=${EVAL_LIMIT:-${4:-"-1"}} +EVAL_NUM_WORKERS=${EVAL_NUM_WORKERS:-${5:-"1"}} +EVAL_IDS=${EVAL_IDS:-${6:-""}} +EVAL_LANGUAGES=${EVAL_LANGUAGES:-${7:-""}} # Set environment variables export USE_UNIT_TESTS=${USE_UNIT_TESTS:-"true"} @@ -102,6 +168,57 @@ if [ -n "${EVAL_LANGUAGES}" ]; then ARGS="${ARGS} --eval-languages ${EVAL_LANGUAGES}" fi -# Run the evaluation +# Change to the repository root directory cd "$(git rev-parse --show-toplevel)" -poetry run python -m evaluation.benchmarks.polyglot_benchmark.run_infer ${ARGS} \ No newline at end of file + +# If one-per-language mode is enabled +if [ "$ONE_PER_LANGUAGE" = true ]; then + echo "Running one instance per language mode..." 
+ + # Define the languages to test + LANGUAGES=("python" "javascript" "rust" "go" "cpp" "java") + + # Create a temporary directory for results + RESULTS_DIR="evaluation/evaluation_outputs/one_per_language_test" + mkdir -p "$RESULTS_DIR" + + # Summary file + SUMMARY_FILE="$RESULTS_DIR/summary.txt" + echo "POLYGLOT BENCHMARK - ONE INSTANCE PER LANGUAGE TEST" > "$SUMMARY_FILE" + echo "=================================================" >> "$SUMMARY_FILE" + echo "Model: $MODEL_CONFIG" >> "$SUMMARY_FILE" + echo "Agent: $AGENT" >> "$SUMMARY_FILE" + echo "Date: $(date)" >> "$SUMMARY_FILE" + echo "=================================================" >> "$SUMMARY_FILE" + echo "" >> "$SUMMARY_FILE" + + # Test each language + for LANG in "${LANGUAGES[@]}"; do + echo "" + echo "===== Testing language: $LANG =====" + echo "" + + # Run with one instance for this language + LANG_ARGS="--agent-cls ${AGENT} --llm-config ${MODEL_CONFIG} --max-iterations 30 --eval-num-workers 1 --eval-n-limit 1 --eval-languages ${LANG} --eval-note one_per_language_${LANG}" + + # Run the evaluation for this language + if poetry run python -m evaluation.benchmarks.polyglot_benchmark.run_infer ${LANG_ARGS}; then + RESULT="PASSED" + else + RESULT="FAILED" + fi + + # Add to summary + echo "${LANG}: ${RESULT}" >> "$SUMMARY_FILE" + done + + # Display summary + echo "" + echo "===== TEST SUMMARY =====" + cat "$SUMMARY_FILE" + echo "" + echo "Detailed results available in: $RESULTS_DIR" +else + # Run the normal evaluation + poetry run python -m evaluation.benchmarks.polyglot_benchmark.run_infer ${ARGS} +fi \ No newline at end of file From 87d9e15491913fe4ba8989dc4bb7e49b287aa845 Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 26 Feb 2025 07:10:54 +0000 Subject: [PATCH 13/22] Update README with one-per-language testing instructions and command-line options --- .../benchmarks/polyglot_benchmark/README.md | 25 ++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/evaluation/benchmarks/polyglot_benchmark/README.md b/evaluation/benchmarks/polyglot_benchmark/README.md index deb02b1969bb..f7ee5e0112fb 100644 --- a/evaluation/benchmarks/polyglot_benchmark/README.md +++ b/evaluation/benchmarks/polyglot_benchmark/README.md @@ -36,11 +36,34 @@ This benchmark is based on the [Aider Polyglot Benchmark](https://github.com/Aid pip install -e .[dev] ``` -2. Run the benchmark: +2. To test one instance per language (quick verification): ```bash + ./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh --one-per-language --model eval_gpt35_turbo + ``` + + This will run one test for each supported language (Python, Rust, Go, JavaScript, C++, and Java) and provide a summary of results. + +3. Run the full benchmark: + ```bash + # Using named arguments (recommended) + ./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh --model eval_gpt35_turbo --agent CodeActAgent --limit 10 --workers 4 --languages python,javascript + + # Or using positional arguments (legacy) ./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh ``` +4. 
Available command-line options: + ``` + --help Show help message + --model MODEL Model configuration (default: eval_gpt4_1106_preview) + --agent AGENT Agent class (default: CodeActAgent) + --limit LIMIT Evaluation limit (default: -1 for all) + --workers WORKERS Number of workers (default: 1) + --ids IDS Comma-separated list of instance IDs + --languages LANGUAGES Comma-separated list of languages + --one-per-language Test one instance per language + ``` + ### Command Line Arguments - `model_config`: The LLM configuration to use (e.g., `eval_gpt4_1106_preview`) From 8a5dc594e5438b1ebf26085cf4a9a18fdbccb5a3 Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 26 Feb 2025 07:17:53 +0000 Subject: [PATCH 14/22] Enable LLM completions logging in aider_bench run_infer.py --- evaluation/benchmarks/aider_bench/run_infer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/evaluation/benchmarks/aider_bench/run_infer.py b/evaluation/benchmarks/aider_bench/run_infer.py index 1ee68c21c2f0..93dd5102359b 100644 --- a/evaluation/benchmarks/aider_bench/run_infer.py +++ b/evaluation/benchmarks/aider_bench/run_infer.py @@ -75,6 +75,8 @@ def get_config( metadata.eval_output_dir, str(instance.instance_id) ) + # Enable logging of LLM completions + llm_config.log_completions = True config.set_llm_config(llm_config) agent_config = config.get_agent_config(metadata.agent_class) agent_config.enable_prompt_extensions = False From 8ffe33e88e6512540247efe1d955696ddd809cb6 Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 26 Feb 2025 07:51:33 +0000 Subject: [PATCH 15/22] Include tools information in evaluation output directory names --- .../benchmarks/aider_bench/run_infer.py | 10 ++++++ .../polyglot_benchmark/run_infer.py | 10 ++++++ .../polyglot_benchmark/test_all_languages.py | 10 ++++++ .../benchmarks/polyglot_benchmark/test_run.py | 10 ++++++ evaluation/benchmarks/swe_bench/run_infer.py | 9 ++++- evaluation/utils/shared.py | 36 +++++++++++++++++-- 6 files changed, 82 insertions(+), 3 deletions(-) diff --git a/evaluation/benchmarks/aider_bench/run_infer.py b/evaluation/benchmarks/aider_bench/run_infer.py index 93dd5102359b..dc1cea9f5de3 100644 --- a/evaluation/benchmarks/aider_bench/run_infer.py +++ b/evaluation/benchmarks/aider_bench/run_infer.py @@ -295,6 +295,15 @@ def process_instance( if llm_config is None: raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}') + # Create details dictionary with agent configuration + agent_details = { + "agent_config": { + "codeact_enable_jupyter": False, + "codeact_enable_browsing": False, + "codeact_enable_llm_editor": False, + } + } + metadata = make_metadata( llm_config, 'AiderBench', @@ -302,6 +311,7 @@ def process_instance( args.max_iterations, args.eval_note, args.eval_output_dir, + details=agent_details, ) output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl') diff --git a/evaluation/benchmarks/polyglot_benchmark/run_infer.py b/evaluation/benchmarks/polyglot_benchmark/run_infer.py index 6b8a841562ca..12d870bd3b1e 100644 --- a/evaluation/benchmarks/polyglot_benchmark/run_infer.py +++ b/evaluation/benchmarks/polyglot_benchmark/run_infer.py @@ -504,6 +504,15 @@ def add_arguments(parser): if llm_config is None: raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}') + # Create details dictionary with agent configuration + agent_details = { + "agent_config": { + "codeact_enable_jupyter": False, + "codeact_enable_browsing": False, + "codeact_enable_llm_editor": False, + } + } + metadata = make_metadata( 
llm_config, 'PolyglotBenchmark', @@ -511,6 +520,7 @@ def add_arguments(parser): args.max_iterations, args.eval_note, args.eval_output_dir, + details=agent_details, ) output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl') diff --git a/evaluation/benchmarks/polyglot_benchmark/test_all_languages.py b/evaluation/benchmarks/polyglot_benchmark/test_all_languages.py index 89e15b6720f1..f196651b890d 100755 --- a/evaluation/benchmarks/polyglot_benchmark/test_all_languages.py +++ b/evaluation/benchmarks/polyglot_benchmark/test_all_languages.py @@ -44,6 +44,15 @@ def test_language(language, model, agent): print(f"Could not find LLM config: {model}") return False + # Create details dictionary with agent configuration + agent_details = { + "agent_config": { + "codeact_enable_jupyter": False, + "codeact_enable_browsing": False, + "codeact_enable_llm_editor": False, + } + } + # Create metadata metadata = make_metadata( llm_config, @@ -52,6 +61,7 @@ def test_language(language, model, agent): 30, # max_iterations f"test_{language}", f"evaluation/evaluation_outputs/test_{language}", + details=agent_details, ) # Process the instance diff --git a/evaluation/benchmarks/polyglot_benchmark/test_run.py b/evaluation/benchmarks/polyglot_benchmark/test_run.py index a8671b0646f1..c946356e90d6 100755 --- a/evaluation/benchmarks/polyglot_benchmark/test_run.py +++ b/evaluation/benchmarks/polyglot_benchmark/test_run.py @@ -50,6 +50,15 @@ def main(): print(f"Could not find LLM config: {args.model}") return + # Create details dictionary with agent configuration + agent_details = { + "agent_config": { + "codeact_enable_jupyter": False, + "codeact_enable_browsing": False, + "codeact_enable_llm_editor": False, + } + } + # Create metadata metadata = make_metadata( llm_config, @@ -58,6 +67,7 @@ def main(): 30, # max_iterations "test", "evaluation/evaluation_outputs/test", + details=agent_details, ) # Process the instance diff --git a/evaluation/benchmarks/swe_bench/run_infer.py b/evaluation/benchmarks/swe_bench/run_infer.py index 5e3f0e6a5bd7..71d37764ccb4 100644 --- a/evaluation/benchmarks/swe_bench/run_infer.py +++ b/evaluation/benchmarks/swe_bench/run_infer.py @@ -531,7 +531,14 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame: if llm_config is None: raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}') - details = {} + # Create details dictionary with agent configuration + details = { + "agent_config": { + "codeact_enable_jupyter": False, + "codeact_enable_browsing": RUN_WITH_BROWSING, + "codeact_enable_llm_editor": False, + } + } _agent_cls = openhands.agenthub.Agent.get_cls(args.agent_cls) dataset_descrption = ( diff --git a/evaluation/utils/shared.py b/evaluation/utils/shared.py index 0f8ac8fa8332..0e49da8ae971 100644 --- a/evaluation/utils/shared.py +++ b/evaluation/utils/shared.py @@ -158,6 +158,35 @@ def cleanup(): process.join() +def get_tools_string(agent_class: str, details: dict[str, Any] | None = None) -> str: + """Generate a string representation of the tools used by the agent. + + Args: + agent_class: The agent class name. + details: Additional details that might contain tool configuration. + + Returns: + A string representation of the tools used, e.g., "bash+finish+str_replace". 
+ """ + # Default tools for CodeActAgent + if agent_class == "CodeActAgent": + tools = ["bash", "finish", "str_replace"] + + # Check if additional tools are enabled + if details and "agent_config" in details: + agent_config = details.get("agent_config", {}) + if agent_config.get("codeact_enable_browsing", False): + tools.extend(["web_read", "browser"]) + if agent_config.get("codeact_enable_jupyter", False): + tools.append("ipython") + if agent_config.get("codeact_enable_llm_editor", False): + tools[-1] = "llm_editor" # Replace str_replace with llm_editor + + return "+".join(tools) + + # For other agents, return a default string + return "default_tools" + def make_metadata( llm_config: LLMConfig, dataset_name: str, @@ -172,12 +201,15 @@ def make_metadata( model_name = llm_config.model.split('/')[-1] model_path = model_name.replace(':', '_').replace('@', '-') eval_note = f'_N_{eval_note}' if eval_note else '' - + + # Get tools string + tools_string = get_tools_string(agent_class, details) + eval_output_path = os.path.join( eval_output_dir, dataset_name, agent_class, - f'{model_path}_maxiter_{max_iterations}{eval_note}', + f'{model_path}_maxiter_{max_iterations}_tools_{tools_string}{eval_note}', ) pathlib.Path(eval_output_path).mkdir(parents=True, exist_ok=True) From d45b98dd1c800e8383480ab4c3e0481a601c1cbc Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 26 Feb 2025 08:00:02 +0000 Subject: [PATCH 16/22] Add evaluation parameter to run_infer.sh scripts for aider_bench and polyglot_benchmark --- .../aider_bench/scripts/run_infer.sh | 30 +++++++++ .../polyglot_benchmark/scripts/run_infer.sh | 65 +++++++++++++++++++ 2 files changed, 95 insertions(+) diff --git a/evaluation/benchmarks/aider_bench/scripts/run_infer.sh b/evaluation/benchmarks/aider_bench/scripts/run_infer.sh index 34249e94c527..3173b3d196f4 100755 --- a/evaluation/benchmarks/aider_bench/scripts/run_infer.sh +++ b/evaluation/benchmarks/aider_bench/scripts/run_infer.sh @@ -9,6 +9,7 @@ AGENT=$3 EVAL_LIMIT=$4 NUM_WORKERS=$5 EVAL_IDS=$6 +RUN_EVALUATION=$7 # New parameter to run evaluation after benchmark if [ -z "$NUM_WORKERS" ]; then NUM_WORKERS=1 @@ -58,3 +59,32 @@ fi # Run the command eval $COMMAND + +# Get the output directory +OUTPUT_DIR=$(find evaluation/evaluation_outputs/AiderBench/$AGENT -type d -name "*$EVAL_NOTE*" | sort -r | head -n 1) +OUTPUT_FILE="$OUTPUT_DIR/output.jsonl" + +# Run evaluation if requested +if [ "$RUN_EVALUATION" = "eval" ]; then + echo "" + echo "======================================" + echo "Running evaluation on results..." + echo "======================================" + echo "" + + if [ -f "$OUTPUT_FILE" ]; then + echo "Evaluating results in: $OUTPUT_FILE" + poetry run python evaluation/benchmarks/aider_bench/scripts/summarize_results.py "$OUTPUT_FILE" + + # Save the evaluation results + EVAL_RESULTS_FILE="$OUTPUT_DIR/evaluation_results.txt" + echo "Saving evaluation results to: $EVAL_RESULTS_FILE" + poetry run python evaluation/benchmarks/aider_bench/scripts/summarize_results.py "$OUTPUT_FILE" > "$EVAL_RESULTS_FILE" + + echo "" + echo "Evaluation complete. Results saved to: $EVAL_RESULTS_FILE" + else + echo "Error: Output file not found: $OUTPUT_FILE" + echo "Cannot run evaluation." 
+ fi +fi diff --git a/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh index e2b5044a00bf..a70df608b454 100755 --- a/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh +++ b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh @@ -15,6 +15,7 @@ function show_usage { echo " --ids IDS Comma-separated list of instance IDs" echo " --languages LANGUAGES Comma-separated list of languages" echo " --one-per-language Test one instance per language" + echo " --eval Run evaluation after benchmark" echo "" echo "Legacy positional arguments are still supported:" echo " $0 MODEL_CONFIG GIT_VERSION AGENT EVAL_LIMIT EVAL_NUM_WORKERS EVAL_IDS EVAL_LANGUAGES" @@ -23,6 +24,7 @@ function show_usage { # Parse named arguments ONE_PER_LANGUAGE=false +RUN_EVALUATION=false POSITIONAL_ARGS=() while [[ $# -gt 0 ]]; do @@ -58,6 +60,10 @@ while [[ $# -gt 0 ]]; do ONE_PER_LANGUAGE=true shift ;; + --eval) + RUN_EVALUATION=true + shift + ;; *) POSITIONAL_ARGS+=("$1") shift @@ -218,7 +224,66 @@ if [ "$ONE_PER_LANGUAGE" = true ]; then cat "$SUMMARY_FILE" echo "" echo "Detailed results available in: $RESULTS_DIR" + + # Run evaluation if requested + if [ "$RUN_EVALUATION" = true ]; then + echo "" + echo "======================================" + echo "Running detailed evaluation on results..." + echo "======================================" + echo "" + + # Evaluate each language's results + for LANG in "${LANGUAGES[@]}"; do + LANG_OUTPUT_DIR="evaluation/evaluation_outputs/one_per_language_${LANG}" + LANG_OUTPUT_FILE="${LANG_OUTPUT_DIR}/output.jsonl" + + if [ -f "$LANG_OUTPUT_FILE" ]; then + echo "" + echo "===== Evaluating $LANG results =====" + echo "" + echo "Evaluating results in: $LANG_OUTPUT_FILE" + + # Save the evaluation results + EVAL_RESULTS_FILE="${LANG_OUTPUT_DIR}/evaluation_results.txt" + echo "Saving evaluation results to: $EVAL_RESULTS_FILE" + poetry run python evaluation/benchmarks/polyglot_benchmark/scripts/summarize_results.py "$LANG_OUTPUT_FILE" > "$EVAL_RESULTS_FILE" + fi + done + + echo "" + echo "Detailed evaluation complete." + fi else # Run the normal evaluation poetry run python -m evaluation.benchmarks.polyglot_benchmark.run_infer ${ARGS} + + # Run evaluation if requested + if [ "$RUN_EVALUATION" = true ]; then + echo "" + echo "======================================" + echo "Running evaluation on results..." + echo "======================================" + echo "" + + # Get the output directory + OUTPUT_DIR=$(find evaluation/evaluation_outputs/PolyglotBenchmark/$AGENT -type d -name "*tools_bash+finish+str_replace*" | sort -r | head -n 1) + OUTPUT_FILE="$OUTPUT_DIR/output.jsonl" + + if [ -f "$OUTPUT_FILE" ]; then + echo "Evaluating results in: $OUTPUT_FILE" + poetry run python evaluation/benchmarks/polyglot_benchmark/scripts/summarize_results.py "$OUTPUT_FILE" + + # Save the evaluation results + EVAL_RESULTS_FILE="$OUTPUT_DIR/evaluation_results.txt" + echo "Saving evaluation results to: $EVAL_RESULTS_FILE" + poetry run python evaluation/benchmarks/polyglot_benchmark/scripts/summarize_results.py "$OUTPUT_FILE" > "$EVAL_RESULTS_FILE" + + echo "" + echo "Evaluation complete. Results saved to: $EVAL_RESULTS_FILE" + else + echo "Error: Output file not found: $OUTPUT_FILE" + echo "Cannot run evaluation." 
+ fi + fi fi \ No newline at end of file From 62d2632c62eaa8760d2223792bda189e7b4c02b4 Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 26 Feb 2025 08:00:55 +0000 Subject: [PATCH 17/22] Update README files with documentation for the new evaluation parameter --- evaluation/benchmarks/aider_bench/README.md | 7 ++++++- evaluation/benchmarks/polyglot_benchmark/README.md | 8 ++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/evaluation/benchmarks/aider_bench/README.md b/evaluation/benchmarks/aider_bench/README.md index 086cfe58160a..a011e6ec9d5c 100644 --- a/evaluation/benchmarks/aider_bench/README.md +++ b/evaluation/benchmarks/aider_bench/README.md @@ -16,7 +16,7 @@ development environment and LLM. ## Start the evaluation ```bash -./evaluation/benchmarks/aider_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids] +./evaluation/benchmarks/aider_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids] [run_evaluation] ``` - `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for @@ -31,6 +31,7 @@ development environment and LLM. - `eval-num-workers`: the number of workers to use for evaluation. Default: `1`. - `eval_ids`, e.g. `"1,3,10"`, limits the evaluation to instances with the given IDs (comma separated). +- `run_evaluation`: set to `eval` to automatically run evaluation after the benchmark completes. There are also following optional environment variables you can set: @@ -53,7 +54,11 @@ You can update the arguments in the script - `--eval-ids`: the IDs of the examples to evaluate (comma separated). For example, `"1,3,10"`. ```bash +# Run benchmark without evaluation ./evaluation/benchmarks/aider_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 100 1 "1,3,10" + +# Run benchmark with automatic evaluation +./evaluation/benchmarks/aider_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 100 1 "1,3,10" eval ``` ### Run Inference on `RemoteRuntime` (experimental) diff --git a/evaluation/benchmarks/polyglot_benchmark/README.md b/evaluation/benchmarks/polyglot_benchmark/README.md index f7ee5e0112fb..f5e8ee6a2903 100644 --- a/evaluation/benchmarks/polyglot_benchmark/README.md +++ b/evaluation/benchmarks/polyglot_benchmark/README.md @@ -38,7 +38,11 @@ This benchmark is based on the [Aider Polyglot Benchmark](https://github.com/Aid 2. To test one instance per language (quick verification): ```bash + # Without evaluation ./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh --one-per-language --model eval_gpt35_turbo + + # With automatic evaluation + ./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh --one-per-language --model eval_gpt35_turbo --eval ``` This will run one test for each supported language (Python, Rust, Go, JavaScript, C++, and Java) and provide a summary of results. 
@@ -48,6 +52,9 @@ This benchmark is based on the [Aider Polyglot Benchmark](https://github.com/Aid # Using named arguments (recommended) ./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh --model eval_gpt35_turbo --agent CodeActAgent --limit 10 --workers 4 --languages python,javascript + # With automatic evaluation + ./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh --model eval_gpt35_turbo --agent CodeActAgent --limit 10 --workers 4 --languages python,javascript --eval + # Or using positional arguments (legacy) ./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh ``` @@ -62,6 +69,7 @@ This benchmark is based on the [Aider Polyglot Benchmark](https://github.com/Aid --ids IDS Comma-separated list of instance IDs --languages LANGUAGES Comma-separated list of languages --one-per-language Test one instance per language + --eval Run evaluation after benchmark completes ``` ### Command Line Arguments From c8dab2c421e4eb8340b6b66bd27fb124d908f302 Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 26 Feb 2025 08:07:50 +0000 Subject: [PATCH 18/22] Fix output directory detection in evaluation scripts --- .../aider_bench/scripts/run_infer.sh | 20 +++++++++++-- .../polyglot_benchmark/scripts/run_infer.sh | 28 ++++++++++++++++--- 2 files changed, 41 insertions(+), 7 deletions(-) diff --git a/evaluation/benchmarks/aider_bench/scripts/run_infer.sh b/evaluation/benchmarks/aider_bench/scripts/run_infer.sh index 3173b3d196f4..3526381de5ab 100755 --- a/evaluation/benchmarks/aider_bench/scripts/run_infer.sh +++ b/evaluation/benchmarks/aider_bench/scripts/run_infer.sh @@ -60,9 +60,23 @@ fi # Run the command eval $COMMAND -# Get the output directory -OUTPUT_DIR=$(find evaluation/evaluation_outputs/AiderBench/$AGENT -type d -name "*$EVAL_NOTE*" | sort -r | head -n 1) -OUTPUT_FILE="$OUTPUT_DIR/output.jsonl" +# Get the output directory - first try the default location +OUTPUT_DIR=$(find evaluation/evaluation_outputs/AiderBench/$AGENT -type d -name "*$EVAL_NOTE*" 2>/dev/null | sort -r | head -n 1) + +# If not found, try to find it anywhere under evaluation_outputs +if [ -z "$OUTPUT_DIR" ]; then + OUTPUT_DIR=$(find . -path "*/evaluation_outputs/*" -type d -name "*$EVAL_NOTE*" 2>/dev/null | sort -r | head -n 1) +fi + +# If still not found, try to find any output.jsonl file +if [ -z "$OUTPUT_DIR" ]; then + OUTPUT_FILE=$(find . -name "output.jsonl" 2>/dev/null | sort -r | head -n 1) + if [ -n "$OUTPUT_FILE" ]; then + OUTPUT_DIR=$(dirname "$OUTPUT_FILE") + fi +else + OUTPUT_FILE="$OUTPUT_DIR/output.jsonl" +fi # Run evaluation if requested if [ "$RUN_EVALUATION" = "eval" ]; then diff --git a/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh index a70df608b454..112028eb7079 100755 --- a/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh +++ b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh @@ -235,7 +235,13 @@ if [ "$ONE_PER_LANGUAGE" = true ]; then # Evaluate each language's results for LANG in "${LANGUAGES[@]}"; do - LANG_OUTPUT_DIR="evaluation/evaluation_outputs/one_per_language_${LANG}" + # Try to find the output directory for this language + LANG_OUTPUT_DIR=$(find . 
-path "*/evaluation_outputs/*" -type d -name "*one_per_language_${LANG}*" 2>/dev/null | sort -r | head -n 1) + + if [ -z "$LANG_OUTPUT_DIR" ]; then + LANG_OUTPUT_DIR="evaluation/evaluation_outputs/one_per_language_${LANG}" + fi + LANG_OUTPUT_FILE="${LANG_OUTPUT_DIR}/output.jsonl" if [ -f "$LANG_OUTPUT_FILE" ]; then @@ -266,9 +272,23 @@ else echo "======================================" echo "" - # Get the output directory - OUTPUT_DIR=$(find evaluation/evaluation_outputs/PolyglotBenchmark/$AGENT -type d -name "*tools_bash+finish+str_replace*" | sort -r | head -n 1) - OUTPUT_FILE="$OUTPUT_DIR/output.jsonl" + # Get the output directory - first try the default location + OUTPUT_DIR=$(find evaluation/evaluation_outputs/PolyglotBenchmark/$AGENT -type d -name "*tools_bash+finish+str_replace*" 2>/dev/null | sort -r | head -n 1) + + # If not found, try to find it anywhere under evaluation_outputs + if [ -z "$OUTPUT_DIR" ]; then + OUTPUT_DIR=$(find . -path "*/evaluation_outputs/*" -type d -name "*tools_bash+finish+str_replace*" 2>/dev/null | sort -r | head -n 1) + fi + + # If still not found, try to find any output.jsonl file + if [ -z "$OUTPUT_DIR" ]; then + OUTPUT_FILE=$(find . -name "output.jsonl" 2>/dev/null | sort -r | head -n 1) + if [ -n "$OUTPUT_FILE" ]; then + OUTPUT_DIR=$(dirname "$OUTPUT_FILE") + fi + else + OUTPUT_FILE="$OUTPUT_DIR/output.jsonl" + fi if [ -f "$OUTPUT_FILE" ]; then echo "Evaluating results in: $OUTPUT_FILE" From fa9a0f8b6bc682ebf89319bbf10873f1392faff1 Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 26 Feb 2025 08:10:52 +0000 Subject: [PATCH 19/22] Fix LLM completions logging to ensure it's enabled in all benchmarks --- evaluation/benchmarks/aider_bench/run_infer.py | 2 -- .../benchmarks/polyglot_benchmark/run_infer.py | 4 ---- evaluation/utils/shared.py | 17 +++++++++-------- 3 files changed, 9 insertions(+), 14 deletions(-) diff --git a/evaluation/benchmarks/aider_bench/run_infer.py b/evaluation/benchmarks/aider_bench/run_infer.py index dc1cea9f5de3..fb035c5a4c1d 100644 --- a/evaluation/benchmarks/aider_bench/run_infer.py +++ b/evaluation/benchmarks/aider_bench/run_infer.py @@ -75,8 +75,6 @@ def get_config( metadata.eval_output_dir, str(instance.instance_id) ) - # Enable logging of LLM completions - llm_config.log_completions = True config.set_llm_config(llm_config) agent_config = config.get_agent_config(metadata.agent_class) agent_config.enable_prompt_extensions = False diff --git a/evaluation/benchmarks/polyglot_benchmark/run_infer.py b/evaluation/benchmarks/polyglot_benchmark/run_infer.py index 12d870bd3b1e..334a0a769bcc 100644 --- a/evaluation/benchmarks/polyglot_benchmark/run_infer.py +++ b/evaluation/benchmarks/polyglot_benchmark/run_infer.py @@ -102,8 +102,6 @@ def get_config( metadata.eval_output_dir, str(instance.instance_id) ) - # Enable logging of LLM completions - llm_config.log_completions = True config.set_llm_config(llm_config) agent_config = config.get_agent_config(metadata.agent_class) @@ -498,8 +496,6 @@ def add_arguments(parser): llm_config = get_llm_config_arg(args.llm_config) # modify_params must be False for evaluation purpose, for reproducibility and accuracy of results llm_config.modify_params = False - # Enable logging of LLM completions - llm_config.log_completions = True if llm_config is None: raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}') diff --git a/evaluation/utils/shared.py b/evaluation/utils/shared.py index 0e49da8ae971..124d2682fcf4 100644 --- a/evaluation/utils/shared.py +++ 
b/evaluation/utils/shared.py @@ -512,14 +512,15 @@ def update_llm_config_for_completions_logging( instance_id: str, ) -> LLMConfig: """Update the LLM config for logging completions.""" - if llm_config.log_completions: - llm_config.log_completions_folder = os.path.join( - eval_output_dir, 'llm_completions', instance_id - ) - logger.info( - f'Logging LLM completions for instance {instance_id} to ' - f'{llm_config.log_completions_folder}' - ) + # Always enable completions logging + llm_config.log_completions = True + llm_config.log_completions_folder = os.path.join( + eval_output_dir, 'llm_completions', instance_id + ) + logger.info( + f'Logging LLM completions for instance {instance_id} to ' + f'{llm_config.log_completions_folder}' + ) return llm_config From 8a4ca1e48c329f895682967aca70b824922570cc Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 26 Feb 2025 08:15:04 +0000 Subject: [PATCH 20/22] Improve output directory detection in evaluation scripts with better path matching and debugging output --- .../aider_bench/scripts/run_infer.sh | 9 ++++++-- .../polyglot_benchmark/scripts/run_infer.sh | 21 ++++++++++++++++--- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/evaluation/benchmarks/aider_bench/scripts/run_infer.sh b/evaluation/benchmarks/aider_bench/scripts/run_infer.sh index 3526381de5ab..737b004121c7 100755 --- a/evaluation/benchmarks/aider_bench/scripts/run_infer.sh +++ b/evaluation/benchmarks/aider_bench/scripts/run_infer.sh @@ -61,11 +61,11 @@ fi eval $COMMAND # Get the output directory - first try the default location -OUTPUT_DIR=$(find evaluation/evaluation_outputs/AiderBench/$AGENT -type d -name "*$EVAL_NOTE*" 2>/dev/null | sort -r | head -n 1) +OUTPUT_DIR=$(find evaluation/evaluation_outputs -path "*/AiderBench/$AGENT/*" -type d -name "*$EVAL_NOTE*" 2>/dev/null | sort -r | head -n 1) # If not found, try to find it anywhere under evaluation_outputs if [ -z "$OUTPUT_DIR" ]; then - OUTPUT_DIR=$(find . -path "*/evaluation_outputs/*" -type d -name "*$EVAL_NOTE*" 2>/dev/null | sort -r | head -n 1) + OUTPUT_DIR=$(find . -path "*/evaluation_outputs/*" -path "*/AiderBench/$AGENT/*" -type d -name "*$EVAL_NOTE*" 2>/dev/null | sort -r | head -n 1) fi # If still not found, try to find any output.jsonl file @@ -78,6 +78,11 @@ else OUTPUT_FILE="$OUTPUT_DIR/output.jsonl" fi +# Print the output directory and file for debugging +echo "" +echo "Output directory: $OUTPUT_DIR" +echo "Output file: $OUTPUT_FILE" + # Run evaluation if requested if [ "$RUN_EVALUATION" = "eval" ]; then echo "" diff --git a/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh index 112028eb7079..34bd41287dcf 100755 --- a/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh +++ b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh @@ -236,7 +236,11 @@ if [ "$ONE_PER_LANGUAGE" = true ]; then # Evaluate each language's results for LANG in "${LANGUAGES[@]}"; do # Try to find the output directory for this language - LANG_OUTPUT_DIR=$(find . -path "*/evaluation_outputs/*" -type d -name "*one_per_language_${LANG}*" 2>/dev/null | sort -r | head -n 1) + LANG_OUTPUT_DIR=$(find evaluation/evaluation_outputs -type d -name "*one_per_language_${LANG}*" 2>/dev/null | sort -r | head -n 1) + + if [ -z "$LANG_OUTPUT_DIR" ]; then + LANG_OUTPUT_DIR=$(find . 
-path "*/evaluation_outputs/*" -type d -name "*one_per_language_${LANG}*" 2>/dev/null | sort -r | head -n 1) + fi if [ -z "$LANG_OUTPUT_DIR" ]; then LANG_OUTPUT_DIR="evaluation/evaluation_outputs/one_per_language_${LANG}" @@ -244,6 +248,12 @@ if [ "$ONE_PER_LANGUAGE" = true ]; then LANG_OUTPUT_FILE="${LANG_OUTPUT_DIR}/output.jsonl" + # Print the language output directory and file for debugging + echo "" + echo "Language: $LANG" + echo "Output directory: $LANG_OUTPUT_DIR" + echo "Output file: $LANG_OUTPUT_FILE" + if [ -f "$LANG_OUTPUT_FILE" ]; then echo "" echo "===== Evaluating $LANG results =====" @@ -273,11 +283,11 @@ else echo "" # Get the output directory - first try the default location - OUTPUT_DIR=$(find evaluation/evaluation_outputs/PolyglotBenchmark/$AGENT -type d -name "*tools_bash+finish+str_replace*" 2>/dev/null | sort -r | head -n 1) + OUTPUT_DIR=$(find evaluation/evaluation_outputs -path "*/PolyglotBenchmark/$AGENT/*" -type d -name "*tools_bash+finish+str_replace*" 2>/dev/null | sort -r | head -n 1) # If not found, try to find it anywhere under evaluation_outputs if [ -z "$OUTPUT_DIR" ]; then - OUTPUT_DIR=$(find . -path "*/evaluation_outputs/*" -type d -name "*tools_bash+finish+str_replace*" 2>/dev/null | sort -r | head -n 1) + OUTPUT_DIR=$(find . -path "*/evaluation_outputs/*" -path "*/PolyglotBenchmark/$AGENT/*" -type d -name "*tools_bash+finish+str_replace*" 2>/dev/null | sort -r | head -n 1) fi # If still not found, try to find any output.jsonl file @@ -290,6 +300,11 @@ else OUTPUT_FILE="$OUTPUT_DIR/output.jsonl" fi + # Print the output directory and file for debugging + echo "" + echo "Output directory: $OUTPUT_DIR" + echo "Output file: $OUTPUT_FILE" + if [ -f "$OUTPUT_FILE" ]; then echo "Evaluating results in: $OUTPUT_FILE" poetry run python evaluation/benchmarks/polyglot_benchmark/scripts/summarize_results.py "$OUTPUT_FILE" From a2d7e631c68bdf4f5175e105dfebe64ae3329fc5 Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 26 Feb 2025 08:17:30 +0000 Subject: [PATCH 21/22] Fix handling of 'eval' parameter to prevent it from being treated as an instance ID --- .../benchmarks/aider_bench/scripts/run_infer.sh | 17 ++++++++++++++++- .../polyglot_benchmark/scripts/run_infer.sh | 8 +++++++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/evaluation/benchmarks/aider_bench/scripts/run_infer.sh b/evaluation/benchmarks/aider_bench/scripts/run_infer.sh index 737b004121c7..102f5d0158b6 100755 --- a/evaluation/benchmarks/aider_bench/scripts/run_infer.sh +++ b/evaluation/benchmarks/aider_bench/scripts/run_infer.sh @@ -11,6 +11,20 @@ NUM_WORKERS=$5 EVAL_IDS=$6 RUN_EVALUATION=$7 # New parameter to run evaluation after benchmark +# Special case: if the 7th parameter is "eval", set RUN_EVALUATION to "eval" +if [ "$RUN_EVALUATION" = "eval" ]; then + echo "Evaluation mode enabled" +fi + +# Special case: if any parameter is "eval", set RUN_EVALUATION to "eval" +for param in "$@"; do + if [ "$param" = "eval" ]; then + RUN_EVALUATION="eval" + echo "Evaluation mode enabled" + break + fi +done + if [ -z "$NUM_WORKERS" ]; then NUM_WORKERS=1 echo "Number of workers not specified, use default $NUM_WORKERS" @@ -52,7 +66,8 @@ if [ -n "$EVAL_LIMIT" ]; then COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT" fi -if [ -n "$EVAL_IDS" ]; then +# Only pass eval-ids if it's not "eval" (which is a special parameter for evaluation mode) +if [ -n "$EVAL_IDS" ] && [ "$EVAL_IDS" != "eval" ]; then echo "EVAL_IDS: $EVAL_IDS" COMMAND="$COMMAND --eval-ids $EVAL_IDS" fi diff --git 
a/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh index 34bd41287dcf..757cee5ac3bb 100755 --- a/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh +++ b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh @@ -64,6 +64,11 @@ while [[ $# -gt 0 ]]; do RUN_EVALUATION=true shift ;; + eval) + # Special case for the 'eval' parameter in the positional arguments + RUN_EVALUATION=true + shift + ;; *) POSITIONAL_ARGS+=("$1") shift @@ -166,7 +171,8 @@ if [ "${EVAL_LIMIT}" != "-1" ]; then ARGS="${ARGS} --eval-n-limit ${EVAL_LIMIT}" fi -if [ -n "${EVAL_IDS}" ]; then +# Only pass eval-ids if it's not "eval" (which is a special parameter for evaluation mode) +if [ -n "${EVAL_IDS}" ] && [ "${EVAL_IDS}" != "eval" ]; then ARGS="${ARGS} --eval-ids ${EVAL_IDS}" fi From 880bc10c3c10b8fa59e274c2e9a0f9f5a366a091 Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 26 Feb 2025 08:56:28 +0000 Subject: [PATCH 22/22] Add benchmark runner script with retry functionality --- run_benchmarks.sh | 108 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100755 run_benchmarks.sh diff --git a/run_benchmarks.sh b/run_benchmarks.sh new file mode 100755 index 000000000000..fdf764bd00fa --- /dev/null +++ b/run_benchmarks.sh @@ -0,0 +1,108 @@ +#!/bin/bash + +# Script to run OpenHands benchmarks with retry functionality +# This script will run the polyglot_benchmark and aider_bench benchmarks +# and retry them until they succeed or reach the maximum number of attempts. + +# Configuration +MAX_ATTEMPTS=10 +RETRY_DELAY=30 # seconds +MODEL_CONFIG="togetherDeepseek" +GIT_VERSION="HEAD" +AGENT="CodeActAgent" +EVAL_LIMIT=1000 +NUM_WORKERS=30 + +# Check if Docker is available +check_docker() { + if ! command -v docker &> /dev/null; then + echo "WARNING: Docker is not available in this environment." + echo "The benchmarks require Docker to run properly." + echo "Continuing anyway, but expect failures if Docker is required." + fi +} + +# Function to run a command and retry until it succeeds +run_with_retry() { + local cmd="$1" + local benchmark_name="$2" + local attempt=1 + local exit_code=1 + + echo "$(date '+%Y-%m-%d %H:%M:%S') - Running $benchmark_name benchmark" + echo "Command: $cmd" + + while [[ $exit_code -ne 0 && $attempt -le $MAX_ATTEMPTS ]]; do + echo "$(date '+%Y-%m-%d %H:%M:%S') - Attempt $attempt of $MAX_ATTEMPTS..." + + # Run the command + eval "$cmd" + exit_code=$? + + if [[ $exit_code -ne 0 ]]; then + echo "$(date '+%Y-%m-%d %H:%M:%S') - Command failed with exit code $exit_code." + + if [[ $attempt -lt $MAX_ATTEMPTS ]]; then + echo "Retrying in $RETRY_DELAY seconds..." + sleep $RETRY_DELAY + ((attempt++)) + fi + fi + done + + if [[ $exit_code -ne 0 ]]; then + echo "$(date '+%Y-%m-%d %H:%M:%S') - $benchmark_name benchmark failed after $MAX_ATTEMPTS attempts." + return 1 + else + echo "$(date '+%Y-%m-%d %H:%M:%S') - $benchmark_name benchmark succeeded on attempt $attempt." 
+ return 0 + fi +} + +# Main execution +echo "=====================================================================" +echo "OpenHands Benchmark Runner" +echo "Started at: $(date '+%Y-%m-%d %H:%M:%S')" +echo "=====================================================================" +echo "Model config: $MODEL_CONFIG" +echo "Git version: $GIT_VERSION" +echo "Agent: $AGENT" +echo "Eval limit: $EVAL_LIMIT" +echo "Number of workers: $NUM_WORKERS" +echo "Maximum retry attempts: $MAX_ATTEMPTS" +echo "Retry delay: $RETRY_DELAY seconds" +echo "=====================================================================" + +# Check for Docker +check_docker + +# Run polyglot_benchmark +echo "=====================================================================" +echo "Running polyglot_benchmark" +echo "=====================================================================" +run_with_retry "./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh $MODEL_CONFIG $GIT_VERSION $AGENT $EVAL_LIMIT $NUM_WORKERS eval" "polyglot_benchmark" +POLYGLOT_RESULT=$? + +# Run aider_bench +echo "=====================================================================" +echo "Running aider_bench" +echo "=====================================================================" +run_with_retry "./evaluation/benchmarks/aider_bench/scripts/run_infer.sh $MODEL_CONFIG $GIT_VERSION $AGENT $EVAL_LIMIT $NUM_WORKERS \"\" eval" "aider_bench" +AIDER_RESULT=$? + +# Summary +echo "=====================================================================" +echo "Benchmark Run Summary - Completed at: $(date '+%Y-%m-%d %H:%M:%S')" +echo "=====================================================================" +echo "polyglot_benchmark: $([ $POLYGLOT_RESULT -eq 0 ] && echo 'SUCCESS' || echo 'FAILED')" +echo "aider_bench: $([ $AIDER_RESULT -eq 0 ] && echo 'SUCCESS' || echo 'FAILED')" +echo "=====================================================================" + +# Exit with success only if both benchmarks succeeded +if [[ $POLYGLOT_RESULT -eq 0 && $AIDER_RESULT -eq 0 ]]; then + echo "All benchmarks completed successfully." + exit 0 +else + echo "One or more benchmarks failed." + exit 1 +fi \ No newline at end of file
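
A brief usage sketch for the `run_benchmarks.sh` runner added in the final patch above: a minimal invocation assuming the script is run from the repository root, Docker is available, and the hardcoded `togetherDeepseek` LLM config exists in `config.toml`. The surrounding `if` is purely illustrative.

```bash
# Run both benchmarks with the retry settings hardcoded at the top of
# run_benchmarks.sh; the script exits 0 only if polyglot_benchmark and
# aider_bench both succeed within MAX_ATTEMPTS retries.
if ./run_benchmarks.sh; then
    echo "All benchmarks passed"
else
    echo "At least one benchmark failed"
fi
```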