From 92e98f65239677a2bd241abae9a15749eca4fa66 Mon Sep 17 00:00:00 2001
From: openhands
Date: Tue, 25 Feb 2025 04:35:27 +0000
Subject: [PATCH 1/6] feat: Enable llm_completions logging in aider_bench

- Added update_llm_config_for_completions_logging to imports
- Modified get_config to accept instance parameter
- Updated llm_config to enable completions logging
- Updated process_instance to pass instance to get_config

This change makes aider_bench save llm_completions in the same way as
swe_bench, with completions being saved in
{eval_output_dir}/llm_completions/{instance_id}/
---
 evaluation/benchmarks/aider_bench/run_infer.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/evaluation/benchmarks/aider_bench/run_infer.py b/evaluation/benchmarks/aider_bench/run_infer.py
index 8045f948d3f9..1ee68c21c2f0 100644
--- a/evaluation/benchmarks/aider_bench/run_infer.py
+++ b/evaluation/benchmarks/aider_bench/run_infer.py
@@ -20,6 +20,7 @@
     prepare_dataset,
     reset_logger_for_multiprocessing,
     run_evaluation,
+    update_llm_config_for_completions_logging,
 )
 from openhands.controller.state.state import State
 from openhands.core.config import (
@@ -45,6 +46,7 @@


 def get_config(
+    instance: pd.Series,
     metadata: EvalMetadata,
 ) -> AppConfig:
     config = AppConfig(
@@ -67,7 +69,13 @@ def get_config(
         workspace_base=None,
         workspace_mount_path=None,
     )
-    config.set_llm_config(metadata.llm_config)
+    # Update llm_config to enable completions logging
+    llm_config = update_llm_config_for_completions_logging(
+        metadata.llm_config,
+        metadata.eval_output_dir,
+        str(instance.instance_id)
+    )
+    config.set_llm_config(llm_config)

     agent_config = config.get_agent_config(metadata.agent_class)
     agent_config.enable_prompt_extensions = False
@@ -170,7 +178,7 @@ def process_instance(
     metadata: EvalMetadata,
     reset_logger: bool = True,
 ) -> EvalOutput:
-    config = get_config(metadata)
+    config = get_config(instance, metadata)

     # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
     if reset_logger:

From c24d8baeb86b98995a87692968e21020e19e6fa4 Mon Sep 17 00:00:00 2001
From: openhands
Date: Tue, 25 Feb 2025 10:00:35 +0000
Subject: [PATCH 2/6] feat: Add polyglot aider benchmark

Added a new benchmark based on Aider's polyglot benchmark that supports:
- Multiple programming languages (Python, JS, Rust, Go, C++, Java)
- End-to-end evaluation of code editing capabilities
- Automated test execution and validation
- Parallel evaluation with multiple workers
- Detailed metrics and logging

Key components:
- run_infer.py: Main benchmark implementation
- Dockerfile: Multi-language development environment
- Scripts for running benchmarks and building Docker image
- Helper modules for prompts and utilities
---
 .../polyglot_aider_bench/Dockerfile           |  47 +++
 .../benchmarks/polyglot_aider_bench/README.md |  73 ++++
 .../polyglot_aider_bench/helper/prompts.py    |  15 +
 .../polyglot_aider_bench/run_infer.py         | 382 ++++++++++++++++++
 .../scripts/build_docker.sh                   |   8 +
 .../polyglot_aider_bench/scripts/run_infer.sh |  68 ++++
 6 files changed, 593 insertions(+)
 create mode 100644 evaluation/benchmarks/polyglot_aider_bench/Dockerfile
 create mode 100644 evaluation/benchmarks/polyglot_aider_bench/README.md
 create mode 100644 evaluation/benchmarks/polyglot_aider_bench/helper/prompts.py
 create mode 100644 evaluation/benchmarks/polyglot_aider_bench/run_infer.py
 create mode 100755 evaluation/benchmarks/polyglot_aider_bench/scripts/build_docker.sh
 create mode 100755 evaluation/benchmarks/polyglot_aider_bench/scripts/run_infer.sh

diff --git a/evaluation/benchmarks/polyglot_aider_bench/Dockerfile b/evaluation/benchmarks/polyglot_aider_bench/Dockerfile
new file mode 100644
index 000000000000..5ba82d25dcaa
--- /dev/null
+++ b/evaluation/benchmarks/polyglot_aider_bench/Dockerfile
@@ -0,0 +1,47 @@
+FROM ubuntu:22.04
+
+# Prevent interactive prompts during package installation
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install common dependencies
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    git \
+    python3 \
+    python3-pip \
+    python3-venv \
+    wget \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Python dependencies
+RUN python3 -m pip install --no-cache-dir pytest
+
+# Install Node.js and npm
+RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - \
+    && apt-get install -y nodejs \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Rust
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+ENV PATH="/root/.cargo/bin:${PATH}"
+
+# Install Go
+RUN wget https://go.dev/dl/go1.21.6.linux-amd64.tar.gz \
+    && tar -C /usr/local -xzf go1.21.6.linux-amd64.tar.gz \
+    && rm go1.21.6.linux-amd64.tar.gz
+ENV PATH="/usr/local/go/bin:${PATH}"
+
+# Install Java and Gradle
+RUN apt-get update && apt-get install -y \
+    openjdk-17-jdk \
+    gradle \
+    && rm -rf /var/lib/apt/lists/*
+
+# Set environment variables
+ENV PYTHONUNBUFFERED=1
+ENV AIDER_DOCKER=1
+
+# Create workspace directory
+RUN mkdir -p /workspace
+WORKDIR /workspace
\ No newline at end of file
diff --git a/evaluation/benchmarks/polyglot_aider_bench/README.md b/evaluation/benchmarks/polyglot_aider_bench/README.md
new file mode 100644
index 000000000000..727866f097b6
--- /dev/null
+++ b/evaluation/benchmarks/polyglot_aider_bench/README.md
@@ -0,0 +1,73 @@
+# Polyglot Aider Benchmark
+
+This benchmark is based on the [Aider Polyglot Benchmark](https://github.com/Aider-AI/aider/tree/main/benchmark), which evaluates how effectively an agent can translate natural language coding requests into executable code that passes unit tests across multiple programming languages.
+
+## Features
+
+- Supports multiple programming languages (Python, JavaScript, Rust, Go, C++, Java)
+- End-to-end evaluation of code editing capabilities
+- Automated test execution and validation
+- Parallel evaluation with multiple workers
+- Detailed metrics and logging
+
+## Usage
+
+1. Make sure you have the required dependencies installed:
+   ```bash
+   pip install -e .[dev]
+   ```
+
+2. Run the benchmark:
+   ```bash
+   ./scripts/run_infer.sh \
+     --agent-cls CodeActAgent \
+     --llm-config configs/llm/gpt-4.yaml \
+     --eval-output-dir eval_output \
+     --eval-num-workers 10
+   ```
+
+### Command Line Arguments
+
+- `--agent-cls`: The agent class to use (default: CodeActAgent)
+- `--llm-config`: Path to the LLM configuration file (required)
+- `--eval-output-dir`: Directory to store evaluation outputs (default: eval_output)
+- `--eval-num-workers`: Number of parallel workers (default: 1)
+- `--eval-n-limit`: Limit the number of test cases to run (-1 for all)
+- `--eval-ids`: Comma-separated list of specific test IDs to run
+- `--eval-note`: Optional note to append to the output directory name
+
+## Output Format
+
+The benchmark saves its results in the following structure:
+```
+eval_output/
+├── PolyglotAiderBench/
+│   ├── CodeActAgent/
+│   │   ├── gpt-4_maxiter_10/
+│   │   │   ├── infer_logs/
+│   │   │   │   └── instance_*.log
+│   │   │   ├── llm_completions/
+│   │   │   │   └── instance_*/
+│   │   │   └── output.jsonl
+│   │   └── metadata.json
+```
+
+Each instance's results include:
+- Test execution results
+- LLM completions and costs
+- Error tracking (syntax errors, timeouts, etc.)
+- Full interaction history
+
+## Supported Languages
+
+The benchmark supports the following languages and test frameworks:
+- Python: pytest
+- JavaScript: npm test
+- Rust: cargo test
+- Go: go test
+- C++: make test
+- Java: Gradle test
+
+## Docker Support
+
+The benchmark runs in a Docker container to safely execute untrusted code. The container image includes all necessary language toolchains and test frameworks.
\ No newline at end of file
diff --git a/evaluation/benchmarks/polyglot_aider_bench/helper/prompts.py b/evaluation/benchmarks/polyglot_aider_bench/helper/prompts.py
new file mode 100644
index 000000000000..f74101755a37
--- /dev/null
+++ b/evaluation/benchmarks/polyglot_aider_bench/helper/prompts.py
@@ -0,0 +1,15 @@
+"""Prompts used in the polyglot aider benchmark."""
+
+INSTRUCTIONS_ADDENDUM = """
+I've provided the following files that need to be modified:
+{file_list}
+
+Please help me implement the necessary changes to meet the requirements.
+You should ONLY modify these files, and NOT create any new files.
+"""
+
+TEST_FAILURES = """
+The tests failed. Please fix the issues and try again.
+Remember to only modify the following files:
+{file_list}
+"""
\ No newline at end of file
diff --git a/evaluation/benchmarks/polyglot_aider_bench/run_infer.py b/evaluation/benchmarks/polyglot_aider_bench/run_infer.py
new file mode 100644
index 000000000000..96399902c837
--- /dev/null
+++ b/evaluation/benchmarks/polyglot_aider_bench/run_infer.py
@@ -0,0 +1,382 @@
+import asyncio
+import copy
+import json
+import os
+import shutil
+import subprocess
+import tempfile
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+import pandas as pd
+from datasets import load_dataset
+
+from evaluation.benchmarks.polyglot_aider_bench.helper.prompts import (
+    INSTRUCTIONS_ADDENDUM,
+    TEST_FAILURES,
+)
+from evaluation.utils.shared import (
+    EvalMetadata,
+    EvalOutput,
+    compatibility_for_eval_history_pairs,
+    make_metadata,
+    prepare_dataset,
+    reset_logger_for_multiprocessing,
+    run_evaluation,
+    update_llm_config_for_completions_logging,
+)
+from openhands.controller.state.state import State
+from openhands.core.config import (
+    AppConfig,
+    SandboxConfig,
+    get_llm_config_arg,
+    parse_arguments,
+)
+from openhands.core.logger import openhands_logger as logger
+from openhands.core.main import create_runtime, run_controller
+from openhands.events.action import CmdRunAction, MessageAction
+from openhands.events.observation import CmdOutputObservation
+from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync
+
+# Configure visibility of unit tests to the Agent.
+USE_UNIT_TESTS = os.environ.get('USE_UNIT_TESTS', 'true').lower() == 'true'
+
+# Map of file extensions to test commands
+TEST_COMMANDS = {
+    ".py": ["python3", "-m", "pytest"],
+    ".rs": ["cargo", "test", "--", "--include-ignored"],
+    ".go": ["go", "test", "./..."],
+    ".js": ["npm", "test"],
+    ".cpp": ["make", "test"],
+    ".java": ["./gradlew", "test"],
+}
+
+def get_config(
+    instance: pd.Series,
+    metadata: EvalMetadata,
+) -> AppConfig:
+    config = AppConfig(
+        default_agent=metadata.agent_class,
+        run_as_openhands=False,
+        runtime=os.environ.get('RUNTIME', 'docker'),
+        max_iterations=metadata.max_iterations,
+        sandbox=SandboxConfig(
+            base_container_image='ghcr.io/opendevin/eval-polyglot:v1.0.0',  # TODO: Create this image
+            enable_auto_lint=True,
+            use_host_network=False,
+            timeout=300,  # Longer timeout for compilation
+            api_key=os.environ.get('ALLHANDS_API_KEY', None),
+            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
+            keep_runtime_alive=False,
+            remote_runtime_init_timeout=1800,
+            remote_runtime_enable_retries=True,
+        ),
+        # do not mount workspace
+        workspace_base=None,
+        workspace_mount_path=None,
+    )
+
+    # Update llm_config to enable completions logging
+    llm_config = update_llm_config_for_completions_logging(
+        metadata.llm_config,
+        metadata.eval_output_dir,
+        str(instance.instance_id)
+    )
+    # Enable logging of LLM completions
+    llm_config.log_completions = True
+    config.set_llm_config(llm_config)
+
+    agent_config = config.get_agent_config(metadata.agent_class)
+    agent_config.enable_prompt_extensions = False
+
+    return config
+
+def initialize_runtime(
+    runtime: Runtime,
+    instance: pd.Series,
+):
+    """Initialize the runtime for the agent."""
+    logger.info('-' * 30)
+    logger.info('BEGIN Runtime Initialization Fn')
+    logger.info('-' * 30)
+    obs: CmdOutputObservation
+
+    # Create workspace
+    action = CmdRunAction(command='mkdir -p /workspace')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    assert obs.exit_code == 0
+
+    action = CmdRunAction(command='cd /workspace')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    assert obs.exit_code == 0
+
+    # Copy files to workspace
+    with tempfile.TemporaryDirectory() as tmpdir:
+        # Copy solution files
+        for file_path in instance.solution_files:
+            file_path = Path(file_path)
+            temp_file = Path(tmpdir) / file_path.name
+            with open(temp_file, 'w') as f:
+                f.write(instance.solution_content[file_path.name])
+            runtime.copy_to(
+                str(temp_file),
+                '/workspace',
+            )
+
+        # Copy test files if enabled
+        if USE_UNIT_TESTS:
+            for file_path in instance.test_files:
+                file_path = Path(file_path)
+                temp_file = Path(tmpdir) / file_path.name
+                with open(temp_file, 'w') as f:
+                    f.write(instance.test_content[file_path.name])
+                runtime.copy_to(
+                    str(temp_file),
+                    '/workspace',
+                )
+
+    logger.info('-' * 30)
+    logger.info('END Runtime Initialization Fn')
+    logger.info('-' * 30)
+
+def run_unit_tests(
+    testdir: Path,
+    test_files: List[str],
+    history_fname: Path,
+) -> Optional[str]:
+    """Run unit tests and return error output if any."""
+    timeout = 180  # 3 minutes timeout
+
+    # Get unique file extensions from test files
+    extensions = {Path(f).suffix for f in test_files}
+
+    # Find matching test command
+    command = None
+    for ext in extensions:
+        if ext in TEST_COMMANDS:
+            command = TEST_COMMANDS[ext]
+            break
+
+    if not command:
+        raise ValueError(f"No test command found for files with extensions: {extensions}")
+
+    # Run tests
+    try:
+        result = subprocess.run(
+            command,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            text=True,
+            timeout=timeout,
+            cwd=testdir,
+            encoding="utf-8",
+            errors="replace",
+        )
+    except subprocess.TimeoutExpired:
+        error = "Tests timed out!"
+        with history_fname.open("a") as fh:
+            fh.write(f"```\n{error}\n```")
+        return error
+
+    success = result.returncode == 0
+    output = result.stdout
+
+    # Clean up output
+    output = output.replace(str(testdir), str(testdir.name))
+    output = output.strip()
+
+    with history_fname.open("a") as fh:
+        fh.write(f"```\n{output}\n```")
+
+    if not success:
+        logger.info(f"Tests failed: {testdir}")
+        return output
+
+    return None
+
+def complete_runtime(
+    runtime: Runtime,
+    instance: pd.Series,
+) -> Dict[str, Any]:
+    """Complete the runtime for the agent."""
+    logger.info('-' * 30)
+    logger.info('BEGIN Runtime Completion Fn')
+    logger.info('-' * 30)
+
+    # Run tests
+    if USE_UNIT_TESTS:
+        test_output = run_unit_tests(
+            Path('/workspace'),
+            instance.test_files,
+            Path('/workspace/.aider.chat.history.md'),
+        )
+        exit_code = 1 if test_output else 0
+    else:
+        test_output = ""
+        exit_code = 0
+
+    logger.info('-' * 30)
+    logger.info('END Runtime Completion Fn')
+    logger.info('-' * 30)
+
+    runtime.close()
+
+    return {
+        'test_output': test_output,
+        'exit_code': exit_code,
+    }
+
+def process_instance(
+    instance: pd.Series,
+    metadata: EvalMetadata,
+    reset_logger: bool = True,
+) -> EvalOutput:
+    config = get_config(instance, metadata)
+
+    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
+    if reset_logger:
+        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
+        reset_logger_for_multiprocessing(logger, str(instance.instance_id), log_dir)
+    else:
+        logger.info(
+            f'\nStarting evaluation for instance {str(instance.instance_id)}.\n'
+        )
+
+    # =============================================
+    # build instruction
+    # =============================================
+
+    # Prepare instruction
+    logger.info(instance)
+    instruction = instance.instruction
+
+    # Add file list to instruction
+    file_list = " ".join(instance.solution_files)
+    instruction += INSTRUCTIONS_ADDENDUM.format(file_list=file_list)
+
+    if USE_UNIT_TESTS:
+        test_files = " ".join(instance.test_files)
+        logger.info(f'\nTest files: {test_files}\n')
+        instruction += (
+            f'Use the appropriate test command to run the tests and verify your solution. '
+            'DO NOT EDIT the test files.\n\n'
+        )
+
+    instruction += (
+        'IMPORTANT: You should ONLY interact with the environment provided '
+        'to you AND NEVER ASK FOR HUMAN HELP.\n'
+    )
+
+    # =============================================
+    # create sandbox and run the agent
+    # =============================================
+
+    runtime: Runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
+
+    initialize_runtime(runtime, instance=instance)
+
+    # Here's how you can run the agent (similar to the `main` function) and get the final task state
+    state: State | None = asyncio.run(
+        run_controller(
+            config=config,
+            initial_user_action=MessageAction(content=instruction),
+            runtime=runtime,
+        )
+    )
+    if state is None:
+        raise ValueError('State should not be None.')
+
+    # =============================================
+    # result evaluation
+    # =============================================
+
+    return_val = complete_runtime(runtime, instance)
+    exit_code = return_val['exit_code']
+    test_output = return_val['test_output']
+
+    errors = []
+    test_cases = None
+    if test_output:
+        if 'SyntaxError' in test_output:
+            errors.append('SyntaxError')
+        elif 'IndentationError' in test_output:
+            errors.append('IndentationError')
+        else:
+            test_cases = test_output
+
+    test_result = {
+        'exit_code': exit_code,
+        'test_cases': test_cases,
+        'errors': errors,
+    }
+
+    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
+    # for compatibility with the existing output format, we can remake the pairs here
+    histories = compatibility_for_eval_history_pairs(state.history)
+    metrics = state.metrics.get() if state.metrics else None
+
+    # Save the output
+    output = EvalOutput(
+        instance_id=str(instance.instance_id),
+        instance=instance.to_dict(),
+        instruction=instruction,
+        metadata=metadata,
+        history=histories,
+        metrics=metrics,
+        error=state.last_error if state and state.last_error else None,
+        test_result=test_result,
+    )
+    return output
+
+if __name__ == '__main__':
+    args = parse_arguments()
+
+    # Load the polyglot benchmark dataset
+    dataset = load_dataset('Aider-AI/polyglot-benchmark')
+    polyglot_tests = dataset['train'].to_pandas()
+
+    llm_config = None
+    if args.llm_config:
+        llm_config = get_llm_config_arg(args.llm_config)
+        # modify_params must be False for evaluation purpose, for reproducibility and accuracy of results
+        llm_config.modify_params = False
+        # Enable logging of LLM completions
+        llm_config.log_completions = True
+
+    if llm_config is None:
+        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
+
+    metadata = make_metadata(
+        llm_config,
+        'PolyglotAiderBench',
+        args.agent_cls,
+        args.max_iterations,
+        args.eval_note,
+        args.eval_output_dir,
+    )
+    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
+
+    # Parse dataset IDs if provided
+    eval_ids = None
+    if args.eval_ids:
+        eval_ids = str(args.eval_ids).split(',')
+        logger.info(f'\nUsing specific dataset IDs: {eval_ids}\n')
+
+    instances = prepare_dataset(
+        polyglot_tests,
+        output_file,
+        args.eval_n_limit,
+        eval_ids=eval_ids,
+    )
+
+    run_evaluation(
+        instances,
+        metadata,
+        output_file,
+        args.eval_num_workers,
+        process_instance,
+    )
\ No newline at end of file
diff --git a/evaluation/benchmarks/polyglot_aider_bench/scripts/build_docker.sh b/evaluation/benchmarks/polyglot_aider_bench/scripts/build_docker.sh
new file mode 100755
index 000000000000..7719fe28c0d8
--- /dev/null
+++ b/evaluation/benchmarks/polyglot_aider_bench/scripts/build_docker.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+# Get the directory where the script is located
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
+cd "$SCRIPT_DIR/.." || exit 1
+
+# Build the Docker image
+docker build -t ghcr.io/opendevin/eval-polyglot:v1.0.0 .
\ No newline at end of file
diff --git a/evaluation/benchmarks/polyglot_aider_bench/scripts/run_infer.sh b/evaluation/benchmarks/polyglot_aider_bench/scripts/run_infer.sh
new file mode 100755
index 000000000000..51fe90c87bcc
--- /dev/null
+++ b/evaluation/benchmarks/polyglot_aider_bench/scripts/run_infer.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+
+# Get the directory where the script is located
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
+cd "$SCRIPT_DIR/.." || exit 1
+
+# Default values
+AGENT_CLS="CodeActAgent"
+EVAL_NOTE=""
+EVAL_OUTPUT_DIR="eval_output"
+EVAL_NUM_WORKERS=1
+EVAL_N_LIMIT=-1
+LLM_CONFIG=""
+EVAL_IDS=""
+
+# Parse command line arguments
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --agent-cls)
+            AGENT_CLS="$2"
+            shift 2
+            ;;
+        --eval-note)
+            EVAL_NOTE="$2"
+            shift 2
+            ;;
+        --eval-output-dir)
+            EVAL_OUTPUT_DIR="$2"
+            shift 2
+            ;;
+        --eval-num-workers)
+            EVAL_NUM_WORKERS="$2"
+            shift 2
+            ;;
+        --eval-n-limit)
+            EVAL_N_LIMIT="$2"
+            shift 2
+            ;;
+        --llm-config)
+            LLM_CONFIG="$2"
+            shift 2
+            ;;
+        --eval-ids)
+            EVAL_IDS="$2"
+            shift 2
+            ;;
+        *)
+            echo "Unknown argument: $1"
+            exit 1
+            ;;
+    esac
+done
+
+# Check required arguments
+if [ -z "$LLM_CONFIG" ]; then
+    echo "Error: --llm-config is required"
+    exit 1
+fi
+
+# Run the evaluation
+python3 run_infer.py \
+    --agent-cls "$AGENT_CLS" \
+    --eval-note "$EVAL_NOTE" \
+    --eval-output-dir "$EVAL_OUTPUT_DIR" \
+    --eval-num-workers "$EVAL_NUM_WORKERS" \
+    --eval-n-limit "$EVAL_N_LIMIT" \
+    --llm-config "$LLM_CONFIG" \
+    ${EVAL_IDS:+--eval-ids "$EVAL_IDS"}
\ No newline at end of file

From a386b423d44880a2f67921dc3f39a8b9519464ff Mon Sep 17 00:00:00 2001
From: openhands
Date: Tue, 25 Feb 2025 10:06:10 +0000
Subject: [PATCH 3/6] feat: Support old-style positional arguments in polyglot
 aider benchmark

Modified run_infer.sh to support both argument styles:
- Old style: <model> <commit> <agent> <max_iters> <num_workers>
- New style: --llm-config <config> --agent-cls <agent> [other options]

Updated README to document both usage styles with examples.
This maintains backward compatibility with existing scripts.
---
 .../benchmarks/polyglot_aider_bench/README.md | 21 +++-
 .../polyglot_aider_bench/scripts/run_infer.sh | 97 +++++++++++--------
 2 files changed, 78 insertions(+), 40 deletions(-)

diff --git a/evaluation/benchmarks/polyglot_aider_bench/README.md b/evaluation/benchmarks/polyglot_aider_bench/README.md
index 727866f097b6..e3f07537ae56 100644
--- a/evaluation/benchmarks/polyglot_aider_bench/README.md
+++ b/evaluation/benchmarks/polyglot_aider_bench/README.md
@@ -17,7 +17,18 @@ This benchmark is based on the [Aider Polyglot Benchmark](https://github.com/Aid
    pip install -e .[dev]
    ```

-2. Run the benchmark:
+2. Run the benchmark using either style:
+
+   **Old Style (Positional Arguments)**:
+   ```bash
+   ./scripts/run_infer.sh <model> <commit> <agent> <max_iters> <num_workers>
+   ```
+   Example:
+   ```bash
+   ./scripts/run_infer.sh 4ominiSky HEAD CodeActAgent 1000 1
+   ```
+
+   **New Style (Named Arguments)**:
    ```bash
    ./scripts/run_infer.sh \
      --agent-cls CodeActAgent \
      --llm-config configs/llm/gpt-4.yaml \
      --eval-output-dir eval_output \
      --eval-num-workers 10
    ```

@@ -28,6 +39,14 @@

 ### Command Line Arguments

+**Old Style (Positional)**:
+1. `model`: Model name (will look for configs/llm/{model}.yaml)
+2. `commit`: Git commit or note to append to output directory
+3. `agent`: Agent class name
+4. `max_iters`: Maximum iterations per test
+5. `num_workers`: Number of parallel workers
+
+**New Style (Named)**:
 - `--agent-cls`: The agent class to use (default: CodeActAgent)
 - `--llm-config`: Path to the LLM configuration file (required)
 - `--eval-output-dir`: Directory to store evaluation outputs (default: eval_output)
diff --git a/evaluation/benchmarks/polyglot_aider_bench/scripts/run_infer.sh b/evaluation/benchmarks/polyglot_aider_bench/scripts/run_infer.sh
index 51fe90c87bcc..13fe63ff1ca1 100755
--- a/evaluation/benchmarks/polyglot_aider_bench/scripts/run_infer.sh
+++ b/evaluation/benchmarks/polyglot_aider_bench/scripts/run_infer.sh
@@ -13,47 +13,65 @@ EVAL_N_LIMIT=-1
 LLM_CONFIG=""
 EVAL_IDS=""

-# Parse command line arguments
-while [[ $# -gt 0 ]]; do
-    case $1 in
-        --agent-cls)
-            AGENT_CLS="$2"
-            shift 2
-            ;;
-        --eval-note)
-            EVAL_NOTE="$2"
-            shift 2
-            ;;
-        --eval-output-dir)
-            EVAL_OUTPUT_DIR="$2"
-            shift 2
-            ;;
-        --eval-num-workers)
-            EVAL_NUM_WORKERS="$2"
-            shift 2
-            ;;
-        --eval-n-limit)
-            EVAL_N_LIMIT="$2"
-            shift 2
-            ;;
-        --llm-config)
-            LLM_CONFIG="$2"
-            shift 2
-            ;;
-        --eval-ids)
-            EVAL_IDS="$2"
-            shift 2
-            ;;
-        *)
-            echo "Unknown argument: $1"
-            exit 1
-            ;;
-    esac
-done
+# Check if using positional arguments (old style)
+if [[ $# -ge 5 && "$1" != "--"* ]]; then
+    # Old style: <model> <commit> <agent> <max_iters> <num_workers>
+    MODEL="$1"
+    COMMIT="$2"
+    AGENT_CLS="$3"
+    MAX_ITERS="$4"
+    EVAL_NUM_WORKERS="$5"
+
+    # Convert to new style arguments
+    LLM_CONFIG="configs/llm/${MODEL}.yaml"
+    EVAL_NOTE="${COMMIT}"
+    MAX_ITERATIONS="--max-iterations ${MAX_ITERS}"
+else
+    # Parse named arguments (new style)
+    while [[ $# -gt 0 ]]; do
+        case $1 in
+            --agent-cls)
+                AGENT_CLS="$2"
+                shift 2
+                ;;
+            --eval-note)
+                EVAL_NOTE="$2"
+                shift 2
+                ;;
+            --eval-output-dir)
+                EVAL_OUTPUT_DIR="$2"
+                shift 2
+                ;;
+            --eval-num-workers)
+                EVAL_NUM_WORKERS="$2"
+                shift 2
+                ;;
+            --eval-n-limit)
+                EVAL_N_LIMIT="$2"
+                shift 2
+                ;;
+            --llm-config)
+                LLM_CONFIG="$2"
+                shift 2
+                ;;
+            --eval-ids)
+                EVAL_IDS="$2"
+                shift 2
+                ;;
+            *)
+                echo "Unknown argument: $1"
+                exit 1
+                ;;
+        esac
+    done
+fi

 # Check required arguments
 if [ -z "$LLM_CONFIG" ]; then
-    echo "Error: --llm-config is required"
+    echo "Error: LLM config is required"
+    echo "Usage:"
+    echo "  Old style: $0 <model> <commit> <agent> <max_iters> <num_workers>"
+    echo "  New style: $0 --llm-config <config> --agent-cls <agent> [other options]"
     exit 1
 fi

@@ -65,4 +83,5 @@ python3 run_infer.py \
     --eval-num-workers "$EVAL_NUM_WORKERS" \
     --eval-n-limit "$EVAL_N_LIMIT" \
     --llm-config "$LLM_CONFIG" \
-    ${EVAL_IDS:+--eval-ids "$EVAL_IDS"}
\ No newline at end of file
+    ${EVAL_IDS:+--eval-ids "$EVAL_IDS"} \
+    ${MAX_ITERATIONS:-}
\ No newline at end of file

From 0121e5711b5962c5baa824f034a92ff4f26ac57c Mon Sep 17 00:00:00 2001
From: openhands
Date: Tue, 25 Feb 2025 10:10:14 +0000
Subject: [PATCH 4/6] fix: Use relative imports in polyglot aider benchmark

- Changed imports to use relative paths
- Added __init__.py to helper directory
- This fixes ModuleNotFoundError when running the benchmark
---
 evaluation/benchmarks/polyglot_aider_bench/helper/__init__.py | 1 +
 evaluation/benchmarks/polyglot_aider_bench/run_infer.py       | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)
 create mode 100644 evaluation/benchmarks/polyglot_aider_bench/helper/__init__.py

diff --git a/evaluation/benchmarks/polyglot_aider_bench/helper/__init__.py b/evaluation/benchmarks/polyglot_aider_bench/helper/__init__.py
new file mode 100644
index 000000000000..f5f6062fa9a4
--- /dev/null
+++ b/evaluation/benchmarks/polyglot_aider_bench/helper/__init__.py
@@ -0,0 +1 @@
+"""Helper modules for the polyglot aider benchmark."""
\ No newline at end of file
diff --git a/evaluation/benchmarks/polyglot_aider_bench/run_infer.py b/evaluation/benchmarks/polyglot_aider_bench/run_infer.py
index 96399902c837..27ebe077b543 100644
--- a/evaluation/benchmarks/polyglot_aider_bench/run_infer.py
+++ b/evaluation/benchmarks/polyglot_aider_bench/run_infer.py
@@ -11,11 +11,11 @@
 import pandas as pd
 from datasets import load_dataset

-from evaluation.benchmarks.polyglot_aider_bench.helper.prompts import (
+from .helper.prompts import (
     INSTRUCTIONS_ADDENDUM,
     TEST_FAILURES,
 )
-from evaluation.utils.shared import (
+from ....utils.shared import (
     EvalMetadata,
     EvalOutput,
     compatibility_for_eval_history_pairs,

From 3a2b167309a0ad9bf6b6922327aebba1496b1d84 Mon Sep 17 00:00:00 2001
From: openhands
Date: Tue, 25 Feb 2025 10:11:01 +0000
Subject: [PATCH 5/6] fix: Fix Python package imports in polyglot aider
 benchmark

- Added OpenHands root to PYTHONPATH in run_infer.sh
- Changed back to absolute imports in run_infer.py
- This fixes the 'no known parent package' error
---
 evaluation/benchmarks/polyglot_aider_bench/run_infer.py  | 4 ++--
 .../polyglot_aider_bench/scripts/run_infer.sh            | 8 +++++++-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/evaluation/benchmarks/polyglot_aider_bench/run_infer.py b/evaluation/benchmarks/polyglot_aider_bench/run_infer.py
index 27ebe077b543..96399902c837 100644
--- a/evaluation/benchmarks/polyglot_aider_bench/run_infer.py
+++ b/evaluation/benchmarks/polyglot_aider_bench/run_infer.py
@@ -11,11 +11,11 @@
 import pandas as pd
 from datasets import load_dataset

-from .helper.prompts import (
+from evaluation.benchmarks.polyglot_aider_bench.helper.prompts import (
     INSTRUCTIONS_ADDENDUM,
     TEST_FAILURES,
 )
-from ....utils.shared import (
+from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
     compatibility_for_eval_history_pairs,
diff --git a/evaluation/benchmarks/polyglot_aider_bench/scripts/run_infer.sh b/evaluation/benchmarks/polyglot_aider_bench/scripts/run_infer.sh
index 13fe63ff1ca1..d802a454e605 100755
--- a/evaluation/benchmarks/polyglot_aider_bench/scripts/run_infer.sh
+++ b/evaluation/benchmarks/polyglot_aider_bench/scripts/run_infer.sh
@@ -2,7 +2,13 @@

 # Get the directory where the script is located
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
-cd "$SCRIPT_DIR/.." || exit 1
+BENCH_DIR="$( cd "$SCRIPT_DIR/.." &> /dev/null && pwd )"
+ROOT_DIR="$( cd "$BENCH_DIR/../../.." &> /dev/null && pwd )"
+
+# Add OpenHands root to PYTHONPATH
+export PYTHONPATH="${ROOT_DIR}:${PYTHONPATH:-}"
+
+cd "$BENCH_DIR" || exit 1

 # Default values
 AGENT_CLS="CodeActAgent"

From afbf10f8569264e337fd811d2b02c67fa4499d0f Mon Sep 17 00:00:00 2001
From: AlexCuadron
Date: Wed, 26 Feb 2025 06:03:07 +0000
Subject: [PATCH 6/6] Add polyglot benchmark implementation

---
 .../benchmarks/polyglot_benchmark/Dockerfile  |  63 +++
 .../benchmarks/polyglot_benchmark/README.md   |  90 ++++
 .../polyglot_benchmark/helper/__init__.py     |   0
 .../polyglot_benchmark/helper/prompts.py      |  28 +
 .../polyglot_benchmark/run_infer.py           | 487 ++++++++++++++++++
 .../scripts/build_docker.sh                   |  12 +
 .../polyglot_benchmark/scripts/run_infer.sh   |  35 ++
 .../scripts/summarize_results.py              |  84 +++
 .../polyglot_benchmark/test_load_dataset.py   |  40 ++
 .../benchmarks/polyglot_benchmark/test_run.py |  73 +++
 10 files changed, 912 insertions(+)
 create mode 100644 evaluation/benchmarks/polyglot_benchmark/Dockerfile
 create mode 100644 evaluation/benchmarks/polyglot_benchmark/README.md
 create mode 100644 evaluation/benchmarks/polyglot_benchmark/helper/__init__.py
 create mode 100644 evaluation/benchmarks/polyglot_benchmark/helper/prompts.py
 create mode 100644 evaluation/benchmarks/polyglot_benchmark/run_infer.py
 create mode 100755 evaluation/benchmarks/polyglot_benchmark/scripts/build_docker.sh
 create mode 100755 evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh
 create mode 100755 evaluation/benchmarks/polyglot_benchmark/scripts/summarize_results.py
 create mode 100755 evaluation/benchmarks/polyglot_benchmark/test_load_dataset.py
 create mode 100755 evaluation/benchmarks/polyglot_benchmark/test_run.py

diff --git a/evaluation/benchmarks/polyglot_benchmark/Dockerfile b/evaluation/benchmarks/polyglot_benchmark/Dockerfile
new file mode 100644
index 000000000000..ed789e6d8000
--- /dev/null
+++ b/evaluation/benchmarks/polyglot_benchmark/Dockerfile
@@ -0,0 +1,63 @@
+FROM ubuntu:22.04
+
+# Avoid prompts from apt
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install common dependencies
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    git \
+    python3 \
+    python3-pip \
+    python3-dev \
+    python3-venv \
+    wget \
+    software-properties-common \
+    apt-transport-https \
+    ca-certificates \
+    gnupg \
+    lsb-release \
+    libboost-all-dev \
+    cmake \
+    unzip \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Python packages
+RUN pip3 install --no-cache-dir pytest pytest-timeout
+
+# Install Node.js and npm
+RUN curl -fsSL https://deb.nodesource.com/setup_18.x | bash - \
+    && apt-get install -y nodejs \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Rust
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+ENV PATH="/root/.cargo/bin:${PATH}"
+
+# Install Go
+RUN wget https://go.dev/dl/go1.20.5.linux-amd64.tar.gz \
+    && tar -C /usr/local -xzf go1.20.5.linux-amd64.tar.gz \
+    && rm go1.20.5.linux-amd64.tar.gz
+ENV PATH="/usr/local/go/bin:${PATH}"
+
+# Install Java
+RUN apt-get update && apt-get install -y openjdk-17-jdk \
+    && rm -rf /var/lib/apt/lists/*
+ENV JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64
+
+# Install Gradle (unzip added to the apt packages above so this step can extract it)
+RUN wget https://services.gradle.org/distributions/gradle-7.6-bin.zip \
+    && mkdir /opt/gradle \
+    && unzip -d /opt/gradle gradle-7.6-bin.zip \
+    && rm gradle-7.6-bin.zip
+ENV PATH="/opt/gradle/gradle-7.6/bin:${PATH}"
+
+# Create workspace directory
+RUN mkdir -p /workspace
+WORKDIR /workspace
+
+# Set environment variables
+ENV PYTHONUNBUFFERED=1
+ENV PYTHONIOENCODING=UTF-8
+
+CMD ["/bin/bash"]
\ No newline at end of file
diff --git a/evaluation/benchmarks/polyglot_benchmark/README.md b/evaluation/benchmarks/polyglot_benchmark/README.md
new file mode 100644
index 000000000000..d92251acb9f7
--- /dev/null
+++ b/evaluation/benchmarks/polyglot_benchmark/README.md
@@ -0,0 +1,90 @@
+# Polyglot Benchmark
+
+This benchmark is based on the [Aider Polyglot Benchmark](https://github.com/Aider-AI/polyglot-benchmark), which evaluates how effectively an agent can translate natural language coding requests into executable code that passes unit tests across multiple programming languages.
+
+## Features
+
+- Supports multiple programming languages (Python, JavaScript, Rust, Go, C++, Java)
+- End-to-end evaluation of code editing capabilities
+- Automated test execution and validation
+- Parallel evaluation with multiple workers
+- Detailed metrics and logging
+
+## Setup
+
+1. Clone the polyglot-benchmark repository:
+   ```bash
+   git clone https://github.com/Aider-AI/polyglot-benchmark.git /workspace/polyglot-benchmark
+   ```
+
+2. Build the Docker image for the benchmark:
+   ```bash
+   ./evaluation/benchmarks/polyglot_benchmark/scripts/build_docker.sh
+   ```
+
+## Usage
+
+1. Make sure you have the required dependencies installed:
+   ```bash
+   pip install -e .[dev]
+   ```
+
+2. Run the benchmark:
+   ```bash
+   ./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh <model_config> <git-version> <agent> <eval_limit> <eval-num-workers> <eval_ids> <eval_languages>
+   ```
+
+### Command Line Arguments
+
+- `model_config`: The LLM configuration to use (e.g., `eval_gpt4_1106_preview`)
+- `git-version`: Git commit or note to append to output directory (e.g., `HEAD`)
+- `agent`: Agent class name (e.g., `CodeActAgent`)
+- `eval_limit`: Limit the number of examples to evaluate (default: `-1` for all)
+- `eval-num-workers`: Number of parallel workers (default: `1`)
+- `eval_ids`: Comma-separated list of specific test IDs to run (e.g., `"1,3,10"`)
+- `eval_languages`: Comma-separated list of languages to test (e.g., `"python,javascript,rust"`)
+
+### Environment Variables
+
+You can also set the following environment variables:
+
+```bash
+export POLYGLOT_BENCHMARK_PATH="/path/to/polyglot-benchmark"  # Path to the polyglot-benchmark repository
+export USE_UNIT_TESTS="true"  # Whether to run unit tests (default: true)
+```
+
+### Example
+
+```bash
+# Run evaluation on CodeActAgent for all Python instances with 2 workers
+export POLYGLOT_BENCHMARK_PATH="/workspace/polyglot-benchmark"
+./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh eval_gpt4_1106_preview HEAD CodeActAgent -1 2 "" "python"
+```
+
+## Summarize Results
+
+After running the benchmark, you can summarize the results:
+
+```bash
+poetry run python ./evaluation/benchmarks/polyglot_benchmark/scripts/summarize_results.py <path_to_output.jsonl>
+```
+
+Example:
+
+```bash
+poetry run python ./evaluation/benchmarks/polyglot_benchmark/scripts/summarize_results.py evaluation/evaluation_outputs/outputs/PolyglotBenchmark/CodeActAgent/gpt-4-1106-preview_maxiter_30/output.jsonl
+```
+
+## Supported Languages
+
+The benchmark supports the following languages and test frameworks:
+- Python: pytest
+- JavaScript: npm test
+- Rust: cargo test
+- Go: go test
+- C++: make test
+- Java: Gradle test
+
+## Docker Support
+
+The benchmark runs in a Docker container to safely execute untrusted code. The container image includes all necessary language toolchains and test frameworks.
\ No newline at end of file
diff --git a/evaluation/benchmarks/polyglot_benchmark/helper/__init__.py b/evaluation/benchmarks/polyglot_benchmark/helper/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/evaluation/benchmarks/polyglot_benchmark/helper/prompts.py b/evaluation/benchmarks/polyglot_benchmark/helper/prompts.py
new file mode 100644
index 000000000000..61bc0e54cb11
--- /dev/null
+++ b/evaluation/benchmarks/polyglot_benchmark/helper/prompts.py
@@ -0,0 +1,28 @@
+"""Prompts used in the polyglot benchmark."""
+
+INSTRUCTIONS_ADDENDUM = """
+I've provided the following files that need to be modified:
+{file_list}
+
+Please help me implement the necessary changes to meet the requirements.
+You should ONLY modify these files, and NOT create any new files.
+"""
+
+TEST_FAILURES = """
+The tests failed. Please fix the issues and try again.
+Remember to only modify the following files:
+{file_list}
+"""
+
+# Dictionary mapping agent class names to their specific instruction suffixes
+INST_SUFFIXES = {
+    'CodeActAgent': (
+        'REMEMBER: All edits must be made directly in the files. Do NOT send'
+        ' the edited file as output to the user.\n'
+    )
+}
+
+# Dictionary mapping agent class names to their fake response functions
+FAKE_RESPONSES = {
+    'CodeActAgent': lambda _: None,  # Will be replaced with codeact_user_response from shared.py
+}
\ No newline at end of file
diff --git a/evaluation/benchmarks/polyglot_benchmark/run_infer.py b/evaluation/benchmarks/polyglot_benchmark/run_infer.py
new file mode 100644
index 000000000000..45a9ee4f91ac
--- /dev/null
+++ b/evaluation/benchmarks/polyglot_benchmark/run_infer.py
@@ -0,0 +1,487 @@
+import asyncio
+import copy
+import json
+import os
+import shutil
+import subprocess
+import tempfile
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+import pandas as pd
+
+from evaluation.benchmarks.polyglot_benchmark.helper.prompts import (
+    INSTRUCTIONS_ADDENDUM,
+    INST_SUFFIXES,
+    TEST_FAILURES,
+    FAKE_RESPONSES,
+)
+from evaluation.utils.shared import (
+    EvalMetadata,
+    EvalOutput,
+    compatibility_for_eval_history_pairs,
+    make_metadata,
+    prepare_dataset,
+    reset_logger_for_multiprocessing,
+    run_evaluation,
+    update_llm_config_for_completions_logging,
+    codeact_user_response,
+)
+from openhands.controller.state.state import State
+from openhands.core.config import (
+    AppConfig,
+    SandboxConfig,
+    get_llm_config_arg,
+    load_from_toml,
+    parse_arguments,
+)
+from openhands.core.logger import openhands_logger as logger
+from openhands.core.main import create_runtime, run_controller
+from openhands.events.action import CmdRunAction, MessageAction
+from openhands.events.observation import CmdOutputObservation
+from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync
+
+# Configure visibility of unit tests to the Agent.
+USE_UNIT_TESTS = os.environ.get('USE_UNIT_TESTS', 'true').lower() == 'true'
+
+# Map of file extensions to test commands
+TEST_COMMANDS = {
+    ".py": ["python3", "-m", "pytest"],
+    ".rs": ["cargo", "test", "--", "--include-ignored"],
+    ".go": ["go", "test", "./..."],
+    ".js": ["npm", "test"],
+    ".cpp": ["make", "test"],
+    ".java": ["./gradlew", "test"],
+}
+
+# Update fake responses with the actual function
+FAKE_RESPONSES['CodeActAgent'] = codeact_user_response
+
+def get_config(
+    instance: pd.Series,
+    metadata: EvalMetadata,
+) -> AppConfig:
+    config = AppConfig(
+        default_agent=metadata.agent_class,
+        run_as_openhands=False,
+        runtime=os.environ.get('RUNTIME', 'docker'),
+        max_iterations=metadata.max_iterations,
+        sandbox=SandboxConfig(
+            base_container_image='ghcr.io/opendevin/eval-polyglot:v1.0.0',  # TODO: Create this image
+            enable_auto_lint=True,
+            use_host_network=False,
+            timeout=300,  # Longer timeout for compilation
+            api_key=os.environ.get('ALLHANDS_API_KEY', None),
+            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
+            keep_runtime_alive=False,
+            remote_runtime_init_timeout=1800,
+            remote_runtime_enable_retries=True,
+        ),
+        # do not mount workspace
+        workspace_base=None,
+        workspace_mount_path=None,
+    )
+
+    # Update llm_config to enable completions logging
+    llm_config = update_llm_config_for_completions_logging(
+        metadata.llm_config,
+        metadata.eval_output_dir,
+        str(instance.instance_id)
+    )
+    # Enable logging of LLM completions
+    llm_config.log_completions = True
+    config.set_llm_config(llm_config)
+
+    agent_config = config.get_agent_config(metadata.agent_class)
+    agent_config.enable_prompt_extensions = False
+
+    # copy 'draft_editor' config if exists
+    config_copy = copy.deepcopy(config)
+    load_from_toml(config_copy)
+    if 'draft_editor' in config_copy.llms:
+        config.set_llm_config(config_copy.llms['draft_editor'], 'draft_editor')
+
+    return config
+
+def initialize_runtime(
+    runtime: Runtime,
+    instance: pd.Series,
+):
+    """Initialize the runtime for the agent."""
+    logger.info('-' * 30)
+    logger.info('BEGIN Runtime Initialization Fn')
+    logger.info('-' * 30)
+    obs: CmdOutputObservation
+
+    # Create workspace
+    action = CmdRunAction(command='mkdir -p /workspace')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    assert obs.exit_code == 0
+
+    action = CmdRunAction(command='cd /workspace')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    assert obs.exit_code == 0
+
+    # Copy files to workspace
+    with tempfile.TemporaryDirectory() as tmpdir:
+        # Copy solution files
+        for file_path in instance.solution_files:
+            file_path = Path(file_path)
+            temp_file = Path(tmpdir) / file_path.name
+            with open(temp_file, 'w') as f:
+                f.write(instance.solution_content[file_path.name])
+            runtime.copy_to(
+                str(temp_file),
+                '/workspace',
+            )
+
+        # Copy test files if enabled
+        if USE_UNIT_TESTS:
+            for file_path in instance.test_files:
+                file_path = Path(file_path)
+                temp_file = Path(tmpdir) / file_path.name
+                with open(temp_file, 'w') as f:
+                    f.write(instance.test_content[file_path.name])
+                runtime.copy_to(
+                    str(temp_file),
+                    '/workspace',
+                )
+
+    logger.info('-' * 30)
+    logger.info('END Runtime Initialization Fn')
+    logger.info('-' * 30)
+
+def complete_runtime(
+    runtime: Runtime,
+    instance: pd.Series,
+) -> Dict[str, Any]:
+    """Complete the runtime for the agent."""
+    logger.info('-' * 30)
+    logger.info('BEGIN Runtime Completion Fn')
+    logger.info('-' * 30)
+
+    # Run tests
+    test_output = ""
"" + exit_code = 1 + + if USE_UNIT_TESTS: + # Get unique file extensions from test files + extensions = {Path(f).suffix for f in instance.test_files} + + # Find matching test command + command = None + for ext in extensions: + if ext in TEST_COMMANDS: + command = TEST_COMMANDS[ext] + break + + if command: + try: + result = subprocess.run( + command, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + timeout=180, # 3 minutes timeout + cwd="/workspace", + encoding="utf-8", + errors="replace", + ) + exit_code = result.returncode + test_output = result.stdout + + # Clean up output + test_output = test_output.replace("/workspace", "workspace") + + # Log test output to history file + with open("/workspace/.aider.chat.history.md", "a") as fh: + fh.write(f"```\n{test_output}\n```") + + except subprocess.TimeoutExpired: + test_output = "Tests timed out!" + exit_code = 1 + + logger.info('-' * 30) + logger.info('END Runtime Completion Fn') + logger.info('-' * 30) + + runtime.close() + + return { + 'test_output': test_output, + 'exit_code': exit_code, + } + +def process_instance( + instance: pd.Series, + metadata: EvalMetadata, + reset_logger: bool = True, +) -> EvalOutput: + config = get_config(instance, metadata) + + # Setup the logger properly, so you can run multi-processing to parallelize the evaluation + if reset_logger: + log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs') + reset_logger_for_multiprocessing(logger, str(instance.instance_id), log_dir) + else: + logger.info( + f'\nStarting evaluation for instance {str(instance.instance_id)}.\n' + ) + + # ============================================= + # build instruction + # ============================================= + + # Prepare instruction + logger.info(instance) + instruction = instance.instruction + + # Add file list to instruction + file_list = " ".join(instance.solution_files) + instruction += INSTRUCTIONS_ADDENDUM.format(file_list=file_list) + + if USE_UNIT_TESTS: + test_files = " ".join(instance.test_files) + logger.info(f'\nTest files: {test_files}\n') + instruction += ( + f'Use the appropriate test command to run the tests and verify your solution. 
' + 'DO NOT EDIT the test files.\n\n' + ) + + instruction += ( + 'IMPORTANT: You should ONLY interact with the environment provided ' + 'to you AND NEVER ASK FOR HUMAN HELP.\n' + ) + + # Add agent-specific instruction suffix + if metadata.agent_class in INST_SUFFIXES: + instruction += INST_SUFFIXES[metadata.agent_class] + + # ============================================= + # create sandbox and run the agent + # ============================================= + + runtime: Runtime = create_runtime(config) + call_async_from_sync(runtime.connect) + + initialize_runtime(runtime, instance=instance) + + # Here's how you can run the agent (similar to the `main` function) and get the final task state + state: State | None = asyncio.run( + run_controller( + config=config, + initial_user_action=MessageAction(content=instruction), + runtime=runtime, + fake_user_response_fn=FAKE_RESPONSES[metadata.agent_class], + ) + ) + if state is None: + raise ValueError('State should not be None.') + + # ============================================= + # result evaluation + # ============================================= + + return_val = complete_runtime(runtime, instance) + exit_code = return_val['exit_code'] + test_output = return_val['test_output'] + + errors = [] + test_cases = None + if test_output: + if 'SyntaxError' in test_output: + errors.append('SyntaxError') + elif 'IndentationError' in test_output: + errors.append('IndentationError') + else: + test_cases = test_output + + test_result = { + 'exit_code': exit_code, + 'test_cases': test_cases, + 'errors': errors, + } + + # history is now available as a stream of events, rather than list of pairs of (Action, Observation) + # for compatibility with the existing output format, we can remake the pairs here + histories = compatibility_for_eval_history_pairs(state.history) + metrics = state.metrics.get() if state.metrics else None + + # Save the output + output = EvalOutput( + instance_id=str(instance.instance_id), + instance=instance.to_dict(), + instruction=instruction, + metadata=metadata, + history=histories, + metrics=metrics, + error=state.last_error if state and state.last_error else None, + test_result=test_result, + ) + return output + +def load_polyglot_dataset(): + """Load the polyglot benchmark dataset from the repository.""" + import glob + import json + import os + + # Path to the polyglot-benchmark repository + repo_path = os.environ.get('POLYGLOT_BENCHMARK_PATH', '/workspace/polyglot-benchmark') + + all_tests = [] + instance_id = 0 + + # Process each language directory + for lang_dir in ['python', 'javascript', 'rust', 'go', 'cpp', 'java']: + lang_path = os.path.join(repo_path, lang_dir, 'exercises', 'practice') + if not os.path.exists(lang_path): + logger.warning(f"Language directory not found: {lang_path}") + continue + + # Process each exercise directory + for exercise_dir in os.listdir(lang_path): + exercise_path = os.path.join(lang_path, exercise_dir) + if not os.path.isdir(exercise_path): + continue + + # Check for config.json + config_file = os.path.join(exercise_path, '.meta', 'config.json') + if not os.path.exists(config_file): + logger.warning(f"Config file not found: {config_file}") + continue + + # Load config + with open(config_file, 'r') as f: + config = json.load(f) + + # Get solution and test files + solution_files = config.get('files', {}).get('solution', []) + test_files = config.get('files', {}).get('test', []) + + if not solution_files or not test_files: + logger.warning(f"Missing solution or test files in {exercise_path}") + 
continue + + # Load instructions + instruction = "" + intro_file = os.path.join(exercise_path, '.docs', 'introduction.md') + if os.path.exists(intro_file): + with open(intro_file, 'r') as f: + instruction += f.read() + "\n\n" + + instructions_file = os.path.join(exercise_path, '.docs', 'instructions.md') + if os.path.exists(instructions_file): + with open(instructions_file, 'r') as f: + instruction += f.read() + "\n\n" + + if not instruction: + logger.warning(f"No instructions found for {exercise_path}") + continue + + # Load solution and test content + solution_content = {} + for file_path in solution_files: + full_path = os.path.join(exercise_path, file_path) + if os.path.exists(full_path): + with open(full_path, 'r') as f: + solution_content[os.path.basename(file_path)] = f.read() + + test_content = {} + for file_path in test_files: + full_path = os.path.join(exercise_path, file_path) + if os.path.exists(full_path): + with open(full_path, 'r') as f: + test_content[os.path.basename(file_path)] = f.read() + + # Create test instance + test_instance = { + 'instance_id': instance_id, + 'instance_name': exercise_dir, + 'language': lang_dir, + 'instruction': instruction, + 'solution_files': [os.path.basename(f) for f in solution_files], + 'test_files': [os.path.basename(f) for f in test_files], + 'solution_content': solution_content, + 'test_content': test_content, + } + + all_tests.append(test_instance) + instance_id += 1 + + return pd.DataFrame(all_tests) + +def add_arguments(parser): + """Add polyglot benchmark specific arguments to the parser.""" + parser.add_argument( + '--eval-languages', + type=str, + help='Comma-separated list of languages to test (e.g., "python,javascript,rust")', + ) + return parser + +if __name__ == '__main__': + # Add custom arguments + parser = parse_arguments.__self__ + add_arguments(parser) + args = parser.parse_args() + + # Load the polyglot benchmark dataset + polyglot_tests = load_polyglot_dataset() + + if polyglot_tests.empty: + logger.error("Failed to load polyglot benchmark dataset") + exit(1) + + logger.info(f"Loaded {len(polyglot_tests)} test instances from polyglot benchmark") + + llm_config = None + if args.llm_config: + llm_config = get_llm_config_arg(args.llm_config) + # modify_params must be False for evaluation purpose, for reproducibility and accuracy of results + llm_config.modify_params = False + # Enable logging of LLM completions + llm_config.log_completions = True + + if llm_config is None: + raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}') + + metadata = make_metadata( + llm_config, + 'PolyglotBenchmark', + args.agent_cls, + args.max_iterations, + args.eval_note, + args.eval_output_dir, + ) + output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl') + + # Parse dataset IDs if provided + eval_ids = None + if args.eval_ids: + eval_ids = str(args.eval_ids).split(',') + logger.info(f'\nUsing specific dataset IDs: {eval_ids}\n') + + # Filter by language if specified + if hasattr(args, 'eval_languages') and args.eval_languages: + languages = [lang.strip().lower() for lang in args.eval_languages.split(',')] + polyglot_tests = polyglot_tests[polyglot_tests['language'].str.lower().isin(languages)] + logger.info(f'\nFiltered to languages: {languages}, {len(polyglot_tests)} instances remaining\n') + + instances = prepare_dataset( + polyglot_tests, + output_file, + args.eval_n_limit, + eval_ids=eval_ids, + ) + + run_evaluation( + instances, + metadata, + output_file, + args.eval_num_workers, + process_instance, + 
\ No newline at end of file
diff --git a/evaluation/benchmarks/polyglot_benchmark/scripts/build_docker.sh b/evaluation/benchmarks/polyglot_benchmark/scripts/build_docker.sh
new file mode 100755
index 000000000000..1c6a2dfff7a1
--- /dev/null
+++ b/evaluation/benchmarks/polyglot_benchmark/scripts/build_docker.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+set -e
+
+# Get the directory of this script
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+BENCHMARK_DIR="$( cd "${SCRIPT_DIR}/.." && pwd )"
+
+# Build the Docker image
+docker build -t ghcr.io/opendevin/eval-polyglot:v1.0.0 -f "${BENCHMARK_DIR}/Dockerfile" "${BENCHMARK_DIR}"
+
+echo "Docker image built successfully: ghcr.io/opendevin/eval-polyglot:v1.0.0"
\ No newline at end of file
diff --git a/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh
new file mode 100755
index 000000000000..ce998a112330
--- /dev/null
+++ b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+set -e
+
+# Default values
+MODEL_CONFIG=${1:-"eval_gpt4_1106_preview"}
+GIT_VERSION=${2:-"HEAD"}
+AGENT=${3:-"CodeActAgent"}
+EVAL_LIMIT=${4:-"-1"}
+EVAL_NUM_WORKERS=${5:-"1"}
+EVAL_IDS=${6:-""}
+EVAL_LANGUAGES=${7:-""}
+
+# Set environment variables
+export POLYGLOT_BENCHMARK_PATH=${POLYGLOT_BENCHMARK_PATH:-"/workspace/polyglot-benchmark"}
+export USE_UNIT_TESTS=${USE_UNIT_TESTS:-"true"}
+
+# Add additional arguments based on provided parameters
+ARGS="--agent-cls ${AGENT} --llm-config ${MODEL_CONFIG} --max-iterations 30 --eval-num-workers ${EVAL_NUM_WORKERS}"
+
+if [ "${EVAL_LIMIT}" != "-1" ]; then
+    ARGS="${ARGS} --eval-n-limit ${EVAL_LIMIT}"
+fi
+
+if [ -n "${EVAL_IDS}" ]; then
+    ARGS="${ARGS} --eval-ids ${EVAL_IDS}"
+fi
+
+if [ -n "${EVAL_LANGUAGES}" ]; then
+    ARGS="${ARGS} --eval-languages ${EVAL_LANGUAGES}"
+fi
+
+# Run the evaluation
+cd "$(git rev-parse --show-toplevel)"
+poetry run python -m evaluation.benchmarks.polyglot_benchmark.run_infer ${ARGS}
\ No newline at end of file
diff --git a/evaluation/benchmarks/polyglot_benchmark/scripts/summarize_results.py b/evaluation/benchmarks/polyglot_benchmark/scripts/summarize_results.py
new file mode 100755
index 000000000000..988f3a618bff
--- /dev/null
+++ b/evaluation/benchmarks/polyglot_benchmark/scripts/summarize_results.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python3
+
+import argparse
+import json
+import os
+from collections import defaultdict
+
+def load_jsonl(file_path):
+    """Load data from a jsonl file."""
+    data = []
+    with open(file_path, 'r') as f:
+        for line in f:
+            data.append(json.loads(line))
+    return data
+
+def summarize_results(output_file):
+    """Summarize the results of the polyglot benchmark evaluation."""
+    if not os.path.exists(output_file):
+        print(f"Error: Output file {output_file} does not exist.")
+        return
+
+    results = load_jsonl(output_file)
+
+    # Count total instances
+    total_instances = len(results)
+    print(f"Total instances: {total_instances}")
+
+    # Count by language
+    language_counts = defaultdict(int)
+    language_passed = defaultdict(int)
+
+    # Count passed and failed instances
+    passed_instances = []
+    failed_instances = []
+
+    for result in results:
+        instance = result.get('instance', {})
+        language = instance.get('language', 'unknown')
+        instance_name = instance.get('instance_name', 'unknown')
+        instance_id = result.get('instance_id', 'unknown')
+
+        language_counts[language] += 1
+
+        # Check if all tests passed
+        test_result = result.get('test_result', {})
+        exit_code = test_result.get('exit_code', 1)
+
+        if exit_code == 0:
+            passed_instances.append((instance_id, language, instance_name))
+            language_passed[language] += 1
+        else:
+            failed_instances.append((instance_id, language, instance_name))
+
+    # Print summary
+    print("\nResults by language:")
+    print("--------------------")
+    for language, count in sorted(language_counts.items()):
+        passed = language_passed[language]
+        percentage = (passed / count) * 100 if count > 0 else 0
+        print(f"{language}: {passed}/{count} ({percentage:.1f}%)")
+
+    # Overall pass rate
+    total_passed = len(passed_instances)
+    overall_percentage = (total_passed / total_instances) * 100 if total_instances > 0 else 0
+    print(f"\nOverall pass rate: {total_passed}/{total_instances} ({overall_percentage:.1f}%)")
+
+    # Print passed instances
+    print("\nPassed instances:")
+    print("----------------")
+    for instance_id, language, instance_name in sorted(passed_instances):
+        print(f"ID: {instance_id}, Language: {language}, Name: {instance_name}")
+
+    # Print failed instances
+    print("\nFailed instances:")
+    print("----------------")
+    for instance_id, language, instance_name in sorted(failed_instances):
+        print(f"ID: {instance_id}, Language: {language}, Name: {instance_name}")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Summarize polyglot benchmark results")
+    parser.add_argument("output_file", help="Path to the output.jsonl file")
+    args = parser.parse_args()
+
+    summarize_results(args.output_file)
\ No newline at end of file
diff --git a/evaluation/benchmarks/polyglot_benchmark/test_load_dataset.py b/evaluation/benchmarks/polyglot_benchmark/test_load_dataset.py
new file mode 100755
index 000000000000..708259732b02
--- /dev/null
+++ b/evaluation/benchmarks/polyglot_benchmark/test_load_dataset.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+from pathlib import Path
+
+# Add the parent directory to the Python path
+sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent))
+
+from evaluation.benchmarks.polyglot_benchmark.run_infer import load_polyglot_dataset
+
+def main():
+    # Set the environment variable for the polyglot benchmark path
+    os.environ['POLYGLOT_BENCHMARK_PATH'] = '/workspace/polyglot-benchmark'
+
+    # Load the dataset
+    dataset = load_polyglot_dataset()
+
+    # Print summary
+    print(f"Loaded {len(dataset)} test instances")
+
+    # Print language distribution
+    language_counts = dataset['language'].value_counts()
+    print("\nLanguage distribution:")
+    for language, count in language_counts.items():
+        print(f"{language}: {count}")
+
+    # Print a sample instance
+    if not dataset.empty:
+        print("\nSample instance:")
+        sample = dataset.iloc[0]
+        print(f"ID: {sample.instance_id}")
+        print(f"Name: {sample.instance_name}")
+        print(f"Language: {sample.language}")
+        print(f"Solution files: {sample.solution_files}")
+        print(f"Test files: {sample.test_files}")
+        print(f"Instruction (first 100 chars): {sample.instruction[:100]}...")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/evaluation/benchmarks/polyglot_benchmark/test_run.py b/evaluation/benchmarks/polyglot_benchmark/test_run.py
new file mode 100755
index 000000000000..a8671b0646f1
--- /dev/null
+++ b/evaluation/benchmarks/polyglot_benchmark/test_run.py
@@ -0,0 +1,73 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+import argparse
+from pathlib import Path
+
+# Add the parent directory to the Python path
+sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent))
+
+from evaluation.benchmarks.polyglot_benchmark.run_infer import (
+    load_polyglot_dataset,
+    process_instance,
+    make_metadata,
+    get_llm_config_arg,
+)
+from openhands.core.logger import openhands_logger as logger
+
+def main():
+    parser = argparse.ArgumentParser(description="Test the polyglot benchmark with a single instance")
+    parser.add_argument("--model", default="eval_gpt35_turbo", help="Model configuration name")
+    parser.add_argument("--agent", default="CodeActAgent", help="Agent class name")
+    parser.add_argument("--instance-id", type=int, default=0, help="Instance ID to test")
+    parser.add_argument("--language", help="Filter by language")
+    args = parser.parse_args()
+
+    # Set the environment variable for the polyglot benchmark path
+    os.environ['POLYGLOT_BENCHMARK_PATH'] = '/workspace/polyglot-benchmark'
+
+    # Load the dataset
+    dataset = load_polyglot_dataset()
+
+    if args.language:
+        dataset = dataset[dataset['language'].str.lower() == args.language.lower()]
+        if dataset.empty:
+            print(f"No instances found for language: {args.language}")
+            return
+
+    # Get the instance to test
+    if args.instance_id >= len(dataset):
+        print(f"Instance ID {args.instance_id} is out of range. Max ID: {len(dataset) - 1}")
+        return
+
+    instance = dataset.iloc[args.instance_id]
+    print(f"Testing instance {instance.instance_id}: {instance.instance_name} ({instance.language})")
+
+    # Get LLM config
+    llm_config = get_llm_config_arg(args.model)
+    if llm_config is None:
+        print(f"Could not find LLM config: {args.model}")
+        return
+
+    # Create metadata
+    metadata = make_metadata(
+        llm_config,
+        'PolyglotBenchmark',
+        args.agent,
+        30,  # max_iterations
+        "test",
+        "evaluation/evaluation_outputs/test",
+    )
+
+    # Process the instance
+    try:
+        output = process_instance(instance, metadata, reset_logger=False)
+        print("\nTest completed successfully!")
+        print(f"Exit code: {output.test_result['exit_code']}")
+        print(f"Passed: {output.test_result['exit_code'] == 0}")
+    except Exception as e:
+        print(f"Error processing instance: {e}")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file