diff --git a/evaluation/benchmarks/aider_bench/README.md b/evaluation/benchmarks/aider_bench/README.md index 086cfe58160a..a011e6ec9d5c 100644 --- a/evaluation/benchmarks/aider_bench/README.md +++ b/evaluation/benchmarks/aider_bench/README.md @@ -16,7 +16,7 @@ development environment and LLM. ## Start the evaluation ```bash -./evaluation/benchmarks/aider_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids] +./evaluation/benchmarks/aider_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids] [run_evaluation] ``` - `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for @@ -31,6 +31,7 @@ development environment and LLM. - `eval-num-workers`: the number of workers to use for evaluation. Default: `1`. - `eval_ids`, e.g. `"1,3,10"`, limits the evaluation to instances with the given IDs (comma separated). +- `run_evaluation`: set to `eval` to automatically run evaluation after the benchmark completes. There are also following optional environment variables you can set: @@ -53,7 +54,11 @@ You can update the arguments in the script - `--eval-ids`: the IDs of the examples to evaluate (comma separated). For example, `"1,3,10"`. 
```bash +# Run benchmark without evaluation ./evaluation/benchmarks/aider_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 100 1 "1,3,10" + +# Run benchmark with automatic evaluation +./evaluation/benchmarks/aider_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 100 1 "1,3,10" eval ``` ### Run Inference on `RemoteRuntime` (experimental) diff --git a/evaluation/benchmarks/aider_bench/run_infer.py b/evaluation/benchmarks/aider_bench/run_infer.py index ae5faadc098b..4247db909b0a 100644 --- a/evaluation/benchmarks/aider_bench/run_infer.py +++ b/evaluation/benchmarks/aider_bench/run_infer.py @@ -21,6 +21,7 @@ prepare_dataset, reset_logger_for_multiprocessing, run_evaluation, + update_llm_config_for_completions_logging, ) from openhands.controller.state.state import State from openhands.core.config import ( @@ -45,6 +46,7 @@ def get_config( + instance: pd.Series, metadata: EvalMetadata, ) -> AppConfig: sandbox_config = get_default_sandbox_config_for_eval() @@ -59,7 +61,13 @@ def get_config( workspace_base=None, workspace_mount_path=None, ) - config.set_llm_config(metadata.llm_config) + # Update llm_config to enable completions logging + llm_config = update_llm_config_for_completions_logging( + metadata.llm_config, + metadata.eval_output_dir, + str(instance.instance_id) + ) + config.set_llm_config(llm_config) agent_config = config.get_agent_config(metadata.agent_class) agent_config.enable_prompt_extensions = False @@ -162,7 +170,7 @@ def process_instance( metadata: EvalMetadata, reset_logger: bool = True, ) -> EvalOutput: - config = get_config(metadata) + config = get_config(instance, metadata) # Setup the logger properly, so you can run multi-processing to parallelize the evaluation if reset_logger: @@ -277,6 +285,15 @@ def process_instance( if llm_config is None: raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}') + # Create details dictionary with agent configuration + agent_details = { + "agent_config": { + 
"codeact_enable_jupyter": False, + "codeact_enable_browsing": False, + "codeact_enable_llm_editor": False, + } + } + metadata = make_metadata( llm_config, 'AiderBench', @@ -284,6 +301,7 @@ def process_instance( args.max_iterations, args.eval_note, args.eval_output_dir, + details=agent_details, ) output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl') diff --git a/evaluation/benchmarks/aider_bench/scripts/run_infer.sh b/evaluation/benchmarks/aider_bench/scripts/run_infer.sh index 59d53cfb1980..8160a1ea40f9 100755 --- a/evaluation/benchmarks/aider_bench/scripts/run_infer.sh +++ b/evaluation/benchmarks/aider_bench/scripts/run_infer.sh @@ -9,6 +9,21 @@ AGENT=$3 EVAL_LIMIT=$4 NUM_WORKERS=$5 EVAL_IDS=$6 +RUN_EVALUATION=$7 # New parameter to run evaluation after benchmark + +# Special case: if the 7th parameter is "eval", set RUN_EVALUATION to "eval" +if [ "$RUN_EVALUATION" = "eval" ]; then + echo "Evaluation mode enabled" +fi + +# Special case: if any parameter is "eval", set RUN_EVALUATION to "eval" +for param in "$@"; do + if [ "$param" = "eval" ]; then + RUN_EVALUATION="eval" + echo "Evaluation mode enabled" + break + fi +done if [ -z "$NUM_WORKERS" ]; then NUM_WORKERS=1 @@ -51,10 +66,59 @@ if [ -n "$EVAL_LIMIT" ]; then COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT" fi -if [ -n "$EVAL_IDS" ]; then +# Only pass eval-ids if it's not "eval" (which is a special parameter for evaluation mode) +if [ -n "$EVAL_IDS" ] && [ "$EVAL_IDS" != "eval" ]; then echo "EVAL_IDS: $EVAL_IDS" COMMAND="$COMMAND --eval-ids $EVAL_IDS" fi # Run the command eval $COMMAND + +# Get the output directory - first try the default location +OUTPUT_DIR=$(find evaluation/evaluation_outputs -path "*/AiderBench/$AGENT/*" -type d -name "*$EVAL_NOTE*" 2>/dev/null | sort -r | head -n 1) + +# If not found, try to find it anywhere under evaluation_outputs +if [ -z "$OUTPUT_DIR" ]; then + OUTPUT_DIR=$(find . 
-path "*/evaluation_outputs/*" -path "*/AiderBench/$AGENT/*" -type d -name "*$EVAL_NOTE*" 2>/dev/null | sort -r | head -n 1) +fi + +# If still not found, try to find any output.jsonl file +if [ -z "$OUTPUT_DIR" ]; then + OUTPUT_FILE=$(find . -name "output.jsonl" 2>/dev/null | sort -r | head -n 1) + if [ -n "$OUTPUT_FILE" ]; then + OUTPUT_DIR=$(dirname "$OUTPUT_FILE") + fi +else + OUTPUT_FILE="$OUTPUT_DIR/output.jsonl" +fi + +# Print the output directory and file for debugging +echo "" +echo "Output directory: $OUTPUT_DIR" +echo "Output file: $OUTPUT_FILE" + +# Run evaluation if requested +if [ "$RUN_EVALUATION" = "eval" ]; then + echo "" + echo "======================================" + echo "Running evaluation on results..." + echo "======================================" + echo "" + + if [ -f "$OUTPUT_FILE" ]; then + echo "Evaluating results in: $OUTPUT_FILE" + poetry run python evaluation/benchmarks/aider_bench/scripts/summarize_results.py "$OUTPUT_FILE" + + # Save the evaluation results + EVAL_RESULTS_FILE="$OUTPUT_DIR/evaluation_results.txt" + echo "Saving evaluation results to: $EVAL_RESULTS_FILE" + poetry run python evaluation/benchmarks/aider_bench/scripts/summarize_results.py "$OUTPUT_FILE" > "$EVAL_RESULTS_FILE" + + echo "" + echo "Evaluation complete. Results saved to: $EVAL_RESULTS_FILE" + else + echo "Error: Output file not found: $OUTPUT_FILE" + echo "Cannot run evaluation." 
# Sandbox image for the polyglot benchmark.
# Bundles the toolchains used by the benchmark's test commands:
# Python (pytest), Node.js (npm), Rust (cargo), Go, Java 17 + Gradle,
# and C++ (build-essential, cmake, Boost).
FROM ubuntu:22.04

# Avoid prompts from apt
ENV DEBIAN_FRONTEND=noninteractive

# Install common dependencies.
# unzip is required by the Gradle installation step further down; it is not
# part of the ubuntu:22.04 base image, so the build fails without it.
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    git \
    python3 \
    python3-pip \
    python3-dev \
    python3-venv \
    wget \
    unzip \
    zip \
    software-properties-common \
    apt-transport-https \
    ca-certificates \
    gnupg \
    lsb-release \
    libboost-all-dev \
    cmake \
    && rm -rf /var/lib/apt/lists/*

# Install Python packages
RUN pip3 install --no-cache-dir pytest pytest-timeout

# Install Node.js and npm
RUN curl -fsSL https://deb.nodesource.com/setup_18.x | bash - \
    && apt-get install -y nodejs \
    && rm -rf /var/lib/apt/lists/*

# Install Rust
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"

# Install Go
RUN wget https://go.dev/dl/go1.20.5.linux-amd64.tar.gz \
    && tar -C /usr/local -xzf go1.20.5.linux-amd64.tar.gz \
    && rm go1.20.5.linux-amd64.tar.gz
ENV PATH="/usr/local/go/bin:${PATH}"

# Install Java
RUN apt-get update && apt-get install -y openjdk-17-jdk \
    && rm -rf /var/lib/apt/lists/*
ENV JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64

# Install Gradle (uses unzip installed above)
RUN wget https://services.gradle.org/distributions/gradle-7.6-bin.zip \
    && mkdir -p /opt/gradle \
    && unzip -d /opt/gradle gradle-7.6-bin.zip \
    && rm gradle-7.6-bin.zip
ENV PATH="/opt/gradle/gradle-7.6/bin:${PATH}"

# Create workspace directory
RUN mkdir -p /workspace
WORKDIR /workspace

# Set environment variables
ENV PYTHONUNBUFFERED=1
ENV PYTHONIOENCODING=UTF-8

CMD ["/bin/bash"]
b/evaluation/benchmarks/polyglot_benchmark/README.md new file mode 100644 index 000000000000..f5e8ee6a2903 --- /dev/null +++ b/evaluation/benchmarks/polyglot_benchmark/README.md @@ -0,0 +1,207 @@ +# Polyglot Benchmark + +This benchmark is based on the [Aider Polyglot Benchmark](https://github.com/Aider-AI/polyglot-benchmark), which evaluates how effectively an agent can translate natural language coding requests into executable code that passes unit tests across multiple programming languages. + +> **Note**: This benchmark has been modified to use only the same tools as SWE-Bench: +> - execute_bash +> - finish +> - str_replace_editor +> +> This restriction ensures consistent tool usage across benchmarks for more accurate comparisons. + +## Features + +- Supports multiple programming languages (Python, JavaScript, Rust, Go, C++, Java) +- End-to-end evaluation of code editing capabilities +- Automated test execution and validation +- Parallel evaluation with multiple workers +- Detailed metrics and logging + +## Setup + +1. Clone the polyglot-benchmark repository: + ```bash + git clone https://github.com/Aider-AI/polyglot-benchmark.git /workspace/polyglot-benchmark + ``` + +2. Build the Docker image for the benchmark: + ```bash + ./evaluation/benchmarks/polyglot_benchmark/scripts/build_docker.sh + ``` + +## Usage + +1. Make sure you have the required dependencies installed: + ```bash + pip install -e .[dev] + ``` + +2. To test one instance per language (quick verification): + ```bash + # Without evaluation + ./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh --one-per-language --model eval_gpt35_turbo + + # With automatic evaluation + ./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh --one-per-language --model eval_gpt35_turbo --eval + ``` + + This will run one test for each supported language (Python, Rust, Go, JavaScript, C++, and Java) and provide a summary of results. + +3. 
Run the full benchmark: + ```bash + # Using named arguments (recommended) + ./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh --model eval_gpt35_turbo --agent CodeActAgent --limit 10 --workers 4 --languages python,javascript + + # With automatic evaluation + ./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh --model eval_gpt35_turbo --agent CodeActAgent --limit 10 --workers 4 --languages python,javascript --eval + + # Or using positional arguments (legacy) + ./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids] [eval_languages] + ``` + +4. Available command-line options: + ``` + --help Show help message + --model MODEL Model configuration (default: eval_gpt4_1106_preview) + --agent AGENT Agent class (default: CodeActAgent) + --limit LIMIT Evaluation limit (default: -1 for all) + --workers WORKERS Number of workers (default: 1) + --ids IDS Comma-separated list of instance IDs + --languages LANGUAGES Comma-separated list of languages + --one-per-language Test one instance per language + --eval Run evaluation after benchmark completes + ``` + +### Command Line Arguments + +- `model_config`: The LLM configuration to use (e.g., `eval_gpt4_1106_preview`) +- `git-version`: Git commit or note to append to output directory (e.g., `HEAD`) +- `agent`: Agent class name (e.g., `CodeActAgent`) +- `eval_limit`: Limit the number of examples to evaluate (default: `-1` for all) +- `eval-num-workers`: Number of parallel workers (default: `1`) +- `eval_ids`: Comma-separated list of specific test IDs to run (e.g., `"1,3,10"`) +- `eval_languages`: Comma-separated list of languages to test (e.g., `"python,javascript,rust"`) + +### Environment Variables + +You can also set the following environment variables: + +```bash +export POLYGLOT_BENCHMARK_PATH="/path/to/polyglot-benchmark" # Path to the polyglot-benchmark repository +export USE_UNIT_TESTS="true" # Whether to run unit tests (default: true) +export NO_DOCKER="true" # Skip Docker container creation and use local 
runtime (default: false) +export POLYGLOT_DOCKER_IMAGE="image:tag" # Custom Docker image to use (default: ghcr.io/opendevin/eval-polyglot:v1.0.0) +export BUILD_LOCAL_DOCKER="false" # Build a local Docker image if one doesn't exist (default: true) +``` + +### Docker Support + +The benchmark uses Docker to create isolated environments for running code in different programming languages. By default, the script will: + +1. Try to pull the specified Docker image from the registry +2. If the pull fails, automatically build a local Docker image + +You have several options for customizing this behavior: + +#### Option 1: Use the Default Behavior (Recommended) + +Simply run the benchmark script, and it will handle the Docker image automatically: + +```bash +./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh eval_gpt4_1106_preview HEAD CodeActAgent 1 1 +``` + +#### Option 2: Manually Build a Local Docker Image + +You can explicitly build a local Docker image before running the benchmark: + +```bash +# Build the Docker image +./evaluation/benchmarks/polyglot_benchmark/scripts/build_local_docker.sh + +# Run the benchmark with the local image +./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh eval_gpt4_1106_preview HEAD CodeActAgent 1 1 +``` + +#### Option 3: Disable Automatic Docker Image Building + +If you want to disable the automatic building of a Docker image: + +```bash +BUILD_LOCAL_DOCKER=false ./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh eval_gpt4_1106_preview HEAD CodeActAgent 1 1 +``` + +#### Option 4: Use a Custom Docker Image + +You can specify a custom Docker image to use: + +```bash +POLYGLOT_DOCKER_IMAGE="your-custom-image:tag" ./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh eval_gpt4_1106_preview HEAD CodeActAgent 1 1 +``` + +### Troubleshooting + +#### Docker Issues + +If you encounter Docker-related errors like: + +``` +Command 'docker buildx build ...' 
returned non-zero exit status 1 +``` + +You can try the following solutions: + +1. Build a local Docker image as described above. + +2. Run with `NO_DOCKER=true` to use the local runtime instead: + ```bash + NO_DOCKER=true ./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh eval_gpt4_1106_preview HEAD CodeActAgent 1 1 + ``` + +3. Make sure Docker is installed and running: + ```bash + docker --version + docker ps + ``` + +4. Check if you have permission to use Docker: + ```bash + sudo usermod -aG docker $USER + # Then log out and log back in + ``` + +### Example + +```bash +# Run evaluation on CodeActAgent for all Python instances with 2 workers +export POLYGLOT_BENCHMARK_PATH="/workspace/polyglot-benchmark" +./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh eval_gpt4_1106_preview HEAD CodeActAgent -1 2 "" "python" +``` + +## Summarize Results + +After running the benchmark, you can summarize the results by passing the path to the generated `output.jsonl` file: + +```bash +poetry run python ./evaluation/benchmarks/polyglot_benchmark/scripts/summarize_results.py [path_to_output_jsonl_file] +``` + +Example: + +```bash +poetry run python ./evaluation/benchmarks/polyglot_benchmark/scripts/summarize_results.py evaluation/evaluation_outputs/outputs/PolyglotBenchmark/CodeActAgent/gpt-4-1106-preview_maxiter_30/output.jsonl +``` + +## Supported Languages + +The benchmark supports the following languages and test frameworks: +- Python: pytest +- JavaScript: npm test +- Rust: cargo test +- Go: go test +- C++: make test +- Java: Gradle test + +## Docker Support + +The benchmark runs in a Docker container to safely execute untrusted code. The container image includes all necessary language toolchains and test frameworks. 
\ No newline at end of file diff --git a/evaluation/benchmarks/polyglot_benchmark/helper/__init__.py b/evaluation/benchmarks/polyglot_benchmark/helper/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/evaluation/benchmarks/polyglot_benchmark/helper/prompts.py b/evaluation/benchmarks/polyglot_benchmark/helper/prompts.py new file mode 100644 index 000000000000..61bc0e54cb11 --- /dev/null +++ b/evaluation/benchmarks/polyglot_benchmark/helper/prompts.py @@ -0,0 +1,28 @@ +"""Prompts used in the polyglot benchmark.""" + +INSTRUCTIONS_ADDENDUM = """ +I've provided the following files that need to be modified: +{file_list} + +Please help me implement the necessary changes to meet the requirements. +You should ONLY modify these files, and NOT create any new files. +""" + +TEST_FAILURES = """ +The tests failed. Please fix the issues and try again. +Remember to only modify the following files: +{file_list} +""" + +# Dictionary mapping agent class names to their specific instruction suffixes +INST_SUFFIXES = { + 'CodeActAgent': ( + 'REMEMBER: All edits must be made directly in the files. 
Do NOT send' + ' the edited file as output to the user.\n' + ) +} + +# Dictionary mapping agent class names to their fake response functions +FAKE_RESPONSES = { + 'CodeActAgent': lambda _: None, # Will be replaced with codeact_user_response from shared.py +} \ No newline at end of file diff --git a/evaluation/benchmarks/polyglot_benchmark/run_infer.py b/evaluation/benchmarks/polyglot_benchmark/run_infer.py new file mode 100644 index 000000000000..334a0a769bcc --- /dev/null +++ b/evaluation/benchmarks/polyglot_benchmark/run_infer.py @@ -0,0 +1,548 @@ +import asyncio +import copy +import json +import os +import shutil +import subprocess +import tempfile +from pathlib import Path +from typing import Any, Dict, List, Optional + +# NOTE: This benchmark has been modified to use only the same tools as SWE-Bench: +# - execute_bash +# - finish +# - str_replace_editor + +import pandas as pd + +from evaluation.benchmarks.polyglot_benchmark.helper.prompts import ( + INSTRUCTIONS_ADDENDUM, + INST_SUFFIXES, + TEST_FAILURES, + FAKE_RESPONSES, +) +from evaluation.utils.shared import ( + EvalMetadata, + EvalOutput, + compatibility_for_eval_history_pairs, + make_metadata, + prepare_dataset, + reset_logger_for_multiprocessing, + run_evaluation, + update_llm_config_for_completions_logging, + codeact_user_response, +) +from openhands.controller.state.state import State +from openhands.core.config import ( + AppConfig, + SandboxConfig, + get_llm_config_arg, + load_from_toml, + parse_arguments, +) +from openhands.core.logger import openhands_logger as logger +from openhands.core.main import create_runtime, run_controller +from openhands.events.action import CmdRunAction, MessageAction +from openhands.events.observation import CmdOutputObservation +from openhands.runtime.base import Runtime +from openhands.utils.async_utils import call_async_from_sync + +# Configure visibility of unit tests to the Agent. 
+USE_UNIT_TESTS = os.environ.get('USE_UNIT_TESTS', 'true').lower() == 'true' + +# Map of file extensions to test commands +TEST_COMMANDS = { + ".py": ["python3", "-m", "pytest"], + ".rs": ["cargo", "test", "--", "--include-ignored"], + ".go": ["go", "test", "./..."], + ".js": ["npm", "test"], + ".cpp": ["make", "test"], + ".java": ["./gradlew", "test"], +} + +# Update fake responses with the actual function +FAKE_RESPONSES['CodeActAgent'] = codeact_user_response + +def get_config( + instance: pd.Series, + metadata: EvalMetadata, +) -> AppConfig: + # Determine runtime type based on environment variable + runtime_type = os.environ.get('RUNTIME', 'docker') + + # Check if NO_DOCKER is set to skip Docker container creation + if os.environ.get('NO_DOCKER', 'false').lower() == 'true': + runtime_type = 'local' + logger.info("Using local runtime instead of Docker due to NO_DOCKER=true") + + config = AppConfig( + default_agent=metadata.agent_class, + run_as_openhands=False, + runtime=runtime_type, + max_iterations=metadata.max_iterations, + sandbox=SandboxConfig( + base_container_image=os.environ.get('POLYGLOT_DOCKER_IMAGE', 'ghcr.io/opendevin/eval-polyglot:v1.0.0'), + enable_auto_lint=True, + use_host_network=False, + timeout=300, # Longer timeout for compilation + api_key=os.environ.get('ALLHANDS_API_KEY', None), + remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'), + keep_runtime_alive=False, + remote_runtime_init_timeout=1800, + remote_runtime_enable_retries=True, + ), + # do not mount workspace + workspace_base=None, + workspace_mount_path=None, + ) + + # Update llm_config to enable completions logging + llm_config = update_llm_config_for_completions_logging( + metadata.llm_config, + metadata.eval_output_dir, + str(instance.instance_id) + ) + config.set_llm_config(llm_config) + + agent_config = config.get_agent_config(metadata.agent_class) + agent_config.enable_prompt_extensions = False + + # Restrict tools to match SWE-Bench (only execute_bash, 
def _stage_and_copy(
    runtime: Runtime,
    file_names: List[str],
    contents: Dict[str, str],
    staging_dir: str,
) -> None:
    """Write each named file into ``staging_dir`` and copy it to /workspace.

    Args:
        runtime: Sandbox runtime; only its ``copy_to`` method is used.
        file_names: File names (or paths) to copy; only the base name is used.
        contents: Mapping of base file name to the text to write.
        staging_dir: Host directory used to stage files before copying.
    """
    for file_name in file_names:
        base_name = Path(file_name).name
        if base_name not in contents:
            # A listed file may have no stored content (e.g. it did not exist
            # when the dataset was loaded); skip it instead of raising KeyError.
            logger.warning(f'No content available for {base_name}; skipping copy')
            continue
        staged = Path(staging_dir) / base_name
        # Explicit UTF-8 so the result does not depend on the host locale.
        with open(staged, 'w', encoding='utf-8') as f:
            f.write(contents[base_name])
        runtime.copy_to(
            str(staged),
            '/workspace',
        )


def initialize_runtime(
    runtime: Runtime,
    instance: pd.Series,
):
    """Initialize the runtime for the agent.

    Creates /workspace in the sandbox, changes into it, and copies the
    instance's solution files (and, when ``USE_UNIT_TESTS`` is enabled, its
    test files) into that directory.

    Args:
        runtime: Sandbox runtime used to run commands and copy files.
        instance: Dataset row providing ``solution_files``, ``solution_content``,
            ``test_files`` and ``test_content``.

    Raises:
        AssertionError: If a workspace setup command exits with a non-zero code.
    """
    logger.info('-' * 30)
    logger.info('BEGIN Runtime Initialization Fn')
    logger.info('-' * 30)
    obs: CmdOutputObservation

    # Create the workspace, then make it the working directory.
    for command in ('mkdir -p /workspace', 'cd /workspace'):
        action = CmdRunAction(command=command)
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        assert obs.exit_code == 0

    # Copy files to workspace
    with tempfile.TemporaryDirectory() as tmpdir:
        _stage_and_copy(
            runtime, instance.solution_files, instance.solution_content, tmpdir
        )

        # Test files are only exposed to the agent when unit tests are enabled.
        if USE_UNIT_TESTS:
            _stage_and_copy(
                runtime, instance.test_files, instance.test_content, tmpdir
            )

    logger.info('-' * 30)
    logger.info('END Runtime Initialization Fn')
    logger.info('-' * 30)
logger.info('-' * 30) + + # Run tests + test_output = "" + exit_code = 1 + + if USE_UNIT_TESTS: + # Get unique file extensions from test files + extensions = {Path(f).suffix for f in instance.test_files} + + # Find matching test command + command = None + for ext in extensions: + if ext in TEST_COMMANDS: + command = TEST_COMMANDS[ext] + break + + if command: + try: + # Use the runtime to run the command inside the Docker container + cmd_str = " ".join(command) + logger.info(f"Running test command: {cmd_str}") + + action = CmdRunAction(command=cmd_str) + logger.info(action, extra={'msg_type': 'ACTION'}) + + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + + if isinstance(obs, CmdOutputObservation): + exit_code = obs.exit_code + test_output = obs.content + else: + logger.error(f"Unexpected observation type: {type(obs)}") + exit_code = 1 + test_output = f"Error: Unexpected observation type: {type(obs)}" + + # Clean up output + test_output = test_output.replace("/workspace", "workspace") + + # Log test output to history file + with tempfile.TemporaryDirectory() as tmpdir: + history_path = os.path.join(tmpdir, ".aider.chat.history.md") + with open(history_path, 'w') as f: + f.write(f"```\n{test_output}\n```") + runtime.copy_to( + history_path, + '/workspace', + ) + + except Exception as e: + logger.error(f"Error running tests: {e}") + test_output = f"Tests failed with error: {e}" + exit_code = 1 + + logger.info('-' * 30) + logger.info('END Runtime Completion Fn') + logger.info('-' * 30) + + runtime.close() + + return { + 'test_output': test_output, + 'exit_code': exit_code, + } + +def process_instance( + instance: pd.Series, + metadata: EvalMetadata, + reset_logger: bool = True, +) -> EvalOutput: + config = get_config(instance, metadata) + + # Setup the logger properly, so you can run multi-processing to parallelize the evaluation + if reset_logger: + log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs') + 
reset_logger_for_multiprocessing(logger, str(instance.instance_id), log_dir) + else: + logger.info( + f'\nStarting evaluation for instance {str(instance.instance_id)}.\n' + ) + + # ============================================= + # build instruction + # ============================================= + + # Prepare instruction + logger.info(instance) + instruction = instance.instruction + + # Add file list to instruction + file_list = " ".join(instance.solution_files) + instruction += INSTRUCTIONS_ADDENDUM.format(file_list=file_list) + + if USE_UNIT_TESTS: + test_files = " ".join(instance.test_files) + logger.info(f'\nTest files: {test_files}\n') + instruction += ( + f'Use the appropriate test command to run the tests and verify your solution. ' + 'DO NOT EDIT the test files.\n\n' + ) + + instruction += ( + 'IMPORTANT: You should ONLY interact with the environment provided ' + 'to you AND NEVER ASK FOR HUMAN HELP.\n' + ) + + # Add agent-specific instruction suffix + if metadata.agent_class in INST_SUFFIXES: + instruction += INST_SUFFIXES[metadata.agent_class] + + # ============================================= + # create sandbox and run the agent + # ============================================= + + runtime: Runtime = create_runtime(config) + call_async_from_sync(runtime.connect) + + initialize_runtime(runtime, instance=instance) + + # Here's how you can run the agent (similar to the `main` function) and get the final task state + state: State | None = asyncio.run( + run_controller( + config=config, + initial_user_action=MessageAction(content=instruction), + runtime=runtime, + fake_user_response_fn=FAKE_RESPONSES[metadata.agent_class], + ) + ) + if state is None: + raise ValueError('State should not be None.') + + # ============================================= + # result evaluation + # ============================================= + + return_val = complete_runtime(runtime, instance) + exit_code = return_val['exit_code'] + test_output = return_val['test_output'] + + 
# Language directories of the polyglot-benchmark repository, in load order.
_POLYGLOT_LANGUAGES = ('python', 'javascript', 'rust', 'go', 'cpp', 'java')


def _find_polyglot_repo() -> Optional[str]:
    """Return the path of the polyglot-benchmark checkout, or None if not found."""
    # The environment variable takes precedence when it points at a real path.
    repo_path = os.environ.get('POLYGLOT_BENCHMARK_PATH')
    if repo_path and os.path.exists(repo_path):
        return repo_path

    # Otherwise fall back to a list of common locations.
    possible_paths = [
        '/workspace/polyglot-benchmark',
        str(Path.home() / 'polyglot-benchmark'),
        str(Path.home() / 'thereal' / 'polyglot-benchmark'),
        str(Path(__file__).parent.parent.parent.parent.parent / 'polyglot-benchmark'),
        str(Path.cwd() / 'polyglot-benchmark'),
    ]
    for path in possible_paths:
        if os.path.exists(path):
            logger.info(f"Found polyglot-benchmark repository at: {path}")
            return path
    return None


def _read_existing_files(base_dir: str, rel_paths: List[str]) -> Dict[str, str]:
    """Read the files that exist under ``base_dir``, keyed by base file name."""
    contents = {}
    for rel_path in rel_paths:
        full_path = os.path.join(base_dir, rel_path)
        if os.path.exists(full_path):
            with open(full_path, 'r', encoding='utf-8') as f:
                contents[os.path.basename(rel_path)] = f.read()
    return contents


def _load_exercise(exercise_path: str) -> Optional[Dict[str, Any]]:
    """Load one exercise directory; return None (after a warning) if unusable."""
    config_file = os.path.join(exercise_path, '.meta', 'config.json')
    if not os.path.exists(config_file):
        logger.warning(f"Config file not found: {config_file}")
        return None

    with open(config_file, 'r', encoding='utf-8') as f:
        config = json.load(f)

    solution_files = config.get('files', {}).get('solution', [])
    test_files = config.get('files', {}).get('test', [])
    if not solution_files or not test_files:
        logger.warning(f"Missing solution or test files in {exercise_path}")
        return None

    # The instruction is the introduction followed by the instructions,
    # each terminated by a blank line; either document may be absent.
    instruction = ""
    for doc_name in ('introduction.md', 'instructions.md'):
        doc_file = os.path.join(exercise_path, '.docs', doc_name)
        if os.path.exists(doc_file):
            with open(doc_file, 'r', encoding='utf-8') as f:
                instruction += f.read() + "\n\n"
    if not instruction:
        logger.warning(f"No instructions found for {exercise_path}")
        return None

    return {
        'instruction': instruction,
        'solution_files': [os.path.basename(f) for f in solution_files],
        'test_files': [os.path.basename(f) for f in test_files],
        'solution_content': _read_existing_files(exercise_path, solution_files),
        'test_content': _read_existing_files(exercise_path, test_files),
    }


def load_polyglot_dataset() -> pd.DataFrame:
    """Load the polyglot benchmark dataset from the repository.

    The repository is located via ``POLYGLOT_BENCHMARK_PATH`` or a list of
    common fallback locations. Every usable exercise under
    ``<repo>/<language>/exercises/practice`` becomes one row.

    Exercises are visited in sorted order so that the sequential
    ``instance_id`` values are reproducible; ``os.listdir`` alone returns
    entries in arbitrary order.

    Returns:
        A DataFrame with the columns ``instance_id``, ``instance_name``,
        ``language``, ``instruction``, ``solution_files``, ``test_files``,
        ``solution_content`` and ``test_content``. An empty DataFrame is
        returned when the repository cannot be found.
    """
    repo_path = _find_polyglot_repo()
    if repo_path is None:
        logger.error("Could not find polyglot-benchmark repository. Please set POLYGLOT_BENCHMARK_PATH environment variable.")
        return pd.DataFrame()

    all_tests = []
    instance_id = 0

    for lang_dir in _POLYGLOT_LANGUAGES:
        lang_path = os.path.join(repo_path, lang_dir, 'exercises', 'practice')
        if not os.path.exists(lang_path):
            logger.warning(f"Language directory not found: {lang_path}")
            continue

        for exercise_dir in sorted(os.listdir(lang_path)):
            exercise_path = os.path.join(lang_path, exercise_dir)
            if not os.path.isdir(exercise_path):
                continue

            exercise = _load_exercise(exercise_path)
            if exercise is None:
                continue

            all_tests.append(
                {
                    'instance_id': instance_id,
                    'instance_name': exercise_dir,
                    'language': lang_dir,
                    **exercise,
                }
            )
            instance_id += 1

    return pd.DataFrame(all_tests)


def add_arguments(parser):
    """Add polyglot benchmark specific arguments to the parser.

    Args:
        parser: The ``argparse`` parser to extend in place.

    Returns:
        The same parser, with ``--eval-languages`` registered.
    """
    parser.add_argument(
        '--eval-languages',
        type=str,
        help='Comma-separated list of languages to test (e.g., "python,javascript,rust")',
    )
    return parser


if __name__ == '__main__':
    from openhands.core.config import get_parser

    # Parse with the parser that carries --eval-languages. A separately
    # constructed parser would not know the option and would reject it.
    parser = get_parser()
    add_arguments(parser)
    args = parser.parse_args()

    # Load the polyglot benchmark dataset
    polyglot_tests = load_polyglot_dataset()

    if polyglot_tests.empty:
        logger.error("Failed to load polyglot benchmark dataset")
        raise SystemExit(1)

    logger.info(f"Loaded {len(polyglot_tests)} test instances from polyglot benchmark")

    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else None
    if llm_config is None:
        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
    # modify_params must be False for evaluation purpose, for reproducibility and accuracy of results
    llm_config.modify_params = False

    # Create details dictionary with agent configuration
    agent_details = {
        "agent_config": {
            "codeact_enable_jupyter": False,
            "codeact_enable_browsing": False,
            "codeact_enable_llm_editor": False,
        }
    }

    metadata = make_metadata(
        llm_config,
        'PolyglotBenchmark',
        args.agent_cls,
        args.max_iterations,
        args.eval_note,
        args.eval_output_dir,
        details=agent_details,
    )
    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')

    # Parse dataset IDs if provided
    eval_ids = None
    if args.eval_ids:
        eval_ids = str(args.eval_ids).split(',')
        logger.info(f'\nUsing specific dataset IDs: {eval_ids}\n')

    # Filter by language if specified
    if args.eval_languages:
        languages = [lang.strip().lower() for lang in args.eval_languages.split(',')]
        polyglot_tests = polyglot_tests[polyglot_tests['language'].str.lower().isin(languages)]
        logger.info(f'\nFiltered to languages: {languages}, {len(polyglot_tests)} instances remaining\n')

    instances = prepare_dataset(
        polyglot_tests,
        output_file,
        args.eval_n_limit,
        eval_ids=eval_ids,
    )

    run_evaluation(
        instances,
        metadata,
        output_file,
        args.eval_num_workers,
        process_instance,
    )
&& pwd )" + +# Build the Docker image +docker build -t ghcr.io/opendevin/eval-polyglot:v1.0.0 -f "${BENCHMARK_DIR}/Dockerfile" "${BENCHMARK_DIR}" + +echo "Docker image built successfully: ghcr.io/opendevin/eval-polyglot:v1.0.0" \ No newline at end of file diff --git a/evaluation/benchmarks/polyglot_benchmark/scripts/build_local_docker.sh b/evaluation/benchmarks/polyglot_benchmark/scripts/build_local_docker.sh new file mode 100755 index 000000000000..0f93c82164a0 --- /dev/null +++ b/evaluation/benchmarks/polyglot_benchmark/scripts/build_local_docker.sh @@ -0,0 +1,93 @@ +#!/bin/bash + +set -e + +# Get the directory of this script +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +BENCHMARK_DIR="$( cd "${SCRIPT_DIR}/.." && pwd )" +REPO_ROOT="$( cd "${BENCHMARK_DIR}/../../.." && pwd )" + +# Create a temporary directory for the Docker build +BUILD_DIR=$(mktemp -d) +trap "rm -rf $BUILD_DIR" EXIT + +echo "Creating Docker build context in $BUILD_DIR" + +# Create a simple Dockerfile that includes all the necessary tools +cat > "$BUILD_DIR/Dockerfile" << 'EOF' +FROM ubuntu:22.04 + +# Avoid prompts from apt +ENV DEBIAN_FRONTEND=noninteractive + +# Install common dependencies +RUN apt-get update && apt-get install -y \ + build-essential \ + curl \ + git \ + python3 \ + python3-pip \ + python3-dev \ + python3-venv \ + wget \ + unzip \ + zip \ + software-properties-common \ + apt-transport-https \ + ca-certificates \ + gnupg \ + lsb-release \ + libboost-all-dev \ + cmake \ + && rm -rf /var/lib/apt/lists/* + +# Install Python packages +RUN pip3 install --no-cache-dir pytest pytest-timeout + +# Install Node.js and npm +RUN curl -fsSL https://deb.nodesource.com/setup_18.x | bash - \ + && apt-get install -y nodejs \ + && rm -rf /var/lib/apt/lists/* + +# Install Rust +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y +ENV PATH="/root/.cargo/bin:${PATH}" + +# Install Go +RUN wget https://go.dev/dl/go1.20.5.linux-amd64.tar.gz \ + && tar -C 
/usr/local -xzf go1.20.5.linux-amd64.tar.gz \ + && rm go1.20.5.linux-amd64.tar.gz +ENV PATH="/usr/local/go/bin:${PATH}" + +# Install Java +RUN apt-get update && apt-get install -y openjdk-17-jdk \ + && rm -rf /var/lib/apt/lists/* +ENV JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64 + +# Install Gradle +RUN apt-get update && apt-get install -y gradle \ + && rm -rf /var/lib/apt/lists/* + +# Create workspace directory +RUN mkdir -p /workspace +WORKDIR /workspace + +# Set environment variables +ENV PYTHONUNBUFFERED=1 +ENV PYTHONIOENCODING=UTF-8 + +CMD ["/bin/bash"] +EOF + +# Build the Docker image +IMAGE_NAME="polyglot-benchmark:local" +echo "Building Docker image: $IMAGE_NAME" +docker build -t "$IMAGE_NAME" "$BUILD_DIR" + +# Export the image name as an environment variable +echo "export POLYGLOT_DOCKER_IMAGE=$IMAGE_NAME" > "$BENCHMARK_DIR/docker_image.env" + +echo "Docker image built successfully: $IMAGE_NAME" +echo "To use this image, run:" +echo "source $BENCHMARK_DIR/docker_image.env" +echo "Then run the benchmark as usual." 
\ No newline at end of file diff --git a/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh new file mode 100755 index 000000000000..757cee5ac3bb --- /dev/null +++ b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh @@ -0,0 +1,330 @@ +#!/bin/bash + +set -e + +# Display usage information +function show_usage { + echo "Usage: $0 [options]" + echo "" + echo "Options:" + echo " --help Show this help message" + echo " --model MODEL Model configuration (default: eval_gpt4_1106_preview)" + echo " --agent AGENT Agent class (default: CodeActAgent)" + echo " --limit LIMIT Evaluation limit (default: -1 for all)" + echo " --workers WORKERS Number of workers (default: 1)" + echo " --ids IDS Comma-separated list of instance IDs" + echo " --languages LANGUAGES Comma-separated list of languages" + echo " --one-per-language Test one instance per language" + echo " --eval Run evaluation after benchmark" + echo "" + echo "Legacy positional arguments are still supported:" + echo " $0 MODEL_CONFIG GIT_VERSION AGENT EVAL_LIMIT EVAL_NUM_WORKERS EVAL_IDS EVAL_LANGUAGES" + exit 0 +} + +# Parse named arguments +ONE_PER_LANGUAGE=false +RUN_EVALUATION=false +POSITIONAL_ARGS=() + +while [[ $# -gt 0 ]]; do + case $1 in + --help) + show_usage + ;; + --model) + MODEL_CONFIG="$2" + shift 2 + ;; + --agent) + AGENT="$2" + shift 2 + ;; + --limit) + EVAL_LIMIT="$2" + shift 2 + ;; + --workers) + EVAL_NUM_WORKERS="$2" + shift 2 + ;; + --ids) + EVAL_IDS="$2" + shift 2 + ;; + --languages) + EVAL_LANGUAGES="$2" + shift 2 + ;; + --one-per-language) + ONE_PER_LANGUAGE=true + shift + ;; + --eval) + RUN_EVALUATION=true + shift + ;; + eval) + # Special case for the 'eval' parameter in the positional arguments + RUN_EVALUATION=true + shift + ;; + *) + POSITIONAL_ARGS+=("$1") + shift + ;; + esac +done + +# Restore positional parameters +set -- "${POSITIONAL_ARGS[@]}" + +# Default values (if not set by named arguments) 
+MODEL_CONFIG=${MODEL_CONFIG:-${1:-"eval_gpt4_1106_preview"}} +GIT_VERSION=${2:-"HEAD"} +AGENT=${AGENT:-${3:-"CodeActAgent"}} +EVAL_LIMIT=${EVAL_LIMIT:-${4:-"-1"}} +EVAL_NUM_WORKERS=${EVAL_NUM_WORKERS:-${5:-"1"}} +EVAL_IDS=${EVAL_IDS:-${6:-""}} +EVAL_LANGUAGES=${EVAL_LANGUAGES:-${7:-""}} + +# Set environment variables +export USE_UNIT_TESTS=${USE_UNIT_TESTS:-"true"} +export NO_DOCKER=${NO_DOCKER:-"false"} + +# Check if we have a local Docker image env file +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +BENCHMARK_DIR="$( cd "${SCRIPT_DIR}/.." && pwd )" +DOCKER_ENV_FILE="${BENCHMARK_DIR}/docker_image.env" + +# Set BUILD_LOCAL_DOCKER to true by default if not specified +export BUILD_LOCAL_DOCKER=${BUILD_LOCAL_DOCKER:-"true"} + +if [ -f "$DOCKER_ENV_FILE" ]; then + echo "Loading Docker image configuration from $DOCKER_ENV_FILE" + source "$DOCKER_ENV_FILE" +else + # If no local image is available, use the default + export POLYGLOT_DOCKER_IMAGE=${POLYGLOT_DOCKER_IMAGE:-"ghcr.io/opendevin/eval-polyglot:v1.0.0"} + + # Try to pull the image first + echo "Trying to pull Docker image: $POLYGLOT_DOCKER_IMAGE" + if ! docker pull "$POLYGLOT_DOCKER_IMAGE" 2>/dev/null; then + echo "Failed to pull Docker image: $POLYGLOT_DOCKER_IMAGE" + + # Build a local Docker image if pulling fails and BUILD_LOCAL_DOCKER is true + if [ "$BUILD_LOCAL_DOCKER" = "true" ]; then + echo "Building local Docker image..." + "${SCRIPT_DIR}/build_local_docker.sh" + source "$DOCKER_ENV_FILE" + else + echo "WARNING: Docker image not found and BUILD_LOCAL_DOCKER is not set to true." + echo "You can build a local Docker image by running:" + echo " ${SCRIPT_DIR}/build_local_docker.sh" + echo "Or set BUILD_LOCAL_DOCKER=true to build it automatically." 
+ fi + else + echo "Successfully pulled Docker image: $POLYGLOT_DOCKER_IMAGE" + fi +fi + +echo "Using Docker image: $POLYGLOT_DOCKER_IMAGE" + +# Try to find the polyglot-benchmark repository +if [ -z "$POLYGLOT_BENCHMARK_PATH" ]; then + # Check common locations + POSSIBLE_PATHS=( + "/workspace/polyglot-benchmark" + "$HOME/polyglot-benchmark" + "$HOME/thereal/polyglot-benchmark" + "$(git rev-parse --show-toplevel)/polyglot-benchmark" + "$(pwd)/polyglot-benchmark" + ) + + for path in "${POSSIBLE_PATHS[@]}"; do + if [ -d "$path" ]; then + export POLYGLOT_BENCHMARK_PATH="$path" + echo "Found polyglot-benchmark repository at: $POLYGLOT_BENCHMARK_PATH" + break + fi + done +fi + +# If still not found, try to clone it +if [ -z "$POLYGLOT_BENCHMARK_PATH" ] || [ ! -d "$POLYGLOT_BENCHMARK_PATH" ]; then + echo "Polyglot benchmark repository not found. Attempting to clone it..." + CLONE_DIR="$(git rev-parse --show-toplevel)/polyglot-benchmark" + # Test the clone inside the 'if' itself: under 'set -e' a failing bare command aborts the script before $? can be checked + if git clone https://github.com/Aider-AI/polyglot-benchmark.git "$CLONE_DIR"; then + export POLYGLOT_BENCHMARK_PATH="$CLONE_DIR" + echo "Successfully cloned polyglot-benchmark to $POLYGLOT_BENCHMARK_PATH" + else + echo "Failed to clone polyglot-benchmark. Please set POLYGLOT_BENCHMARK_PATH manually."
+ exit 1 + fi +fi + +# Add additional arguments based on provided parameters +ARGS="--agent-cls ${AGENT} --llm-config ${MODEL_CONFIG} --max-iterations 30 --eval-num-workers ${EVAL_NUM_WORKERS}" + +if [ "${EVAL_LIMIT}" != "-1" ]; then + ARGS="${ARGS} --eval-n-limit ${EVAL_LIMIT}" +fi + +# Only pass eval-ids if it's not "eval" (which is a special parameter for evaluation mode) +if [ -n "${EVAL_IDS}" ] && [ "${EVAL_IDS}" != "eval" ]; then + ARGS="${ARGS} --eval-ids ${EVAL_IDS}" +fi + +if [ -n "${EVAL_LANGUAGES}" ]; then + ARGS="${ARGS} --eval-languages ${EVAL_LANGUAGES}" +fi + +# Change to the repository root directory +cd "$(git rev-parse --show-toplevel)" + +# If one-per-language mode is enabled +if [ "$ONE_PER_LANGUAGE" = true ]; then + echo "Running one instance per language mode..." + + # Define the languages to test + LANGUAGES=("python" "javascript" "rust" "go" "cpp" "java") + + # Create a temporary directory for results + RESULTS_DIR="evaluation/evaluation_outputs/one_per_language_test" + mkdir -p "$RESULTS_DIR" + + # Summary file + SUMMARY_FILE="$RESULTS_DIR/summary.txt" + echo "POLYGLOT BENCHMARK - ONE INSTANCE PER LANGUAGE TEST" > "$SUMMARY_FILE" + echo "=================================================" >> "$SUMMARY_FILE" + echo "Model: $MODEL_CONFIG" >> "$SUMMARY_FILE" + echo "Agent: $AGENT" >> "$SUMMARY_FILE" + echo "Date: $(date)" >> "$SUMMARY_FILE" + echo "=================================================" >> "$SUMMARY_FILE" + echo "" >> "$SUMMARY_FILE" + + # Test each language + for LANG in "${LANGUAGES[@]}"; do + echo "" + echo "===== Testing language: $LANG =====" + echo "" + + # Run with one instance for this language + LANG_ARGS="--agent-cls ${AGENT} --llm-config ${MODEL_CONFIG} --max-iterations 30 --eval-num-workers 1 --eval-n-limit 1 --eval-languages ${LANG} --eval-note one_per_language_${LANG}" + + # Run the evaluation for this language + if poetry run python -m evaluation.benchmarks.polyglot_benchmark.run_infer ${LANG_ARGS}; then + 
RESULT="PASSED" + else + RESULT="FAILED" + fi + + # Add to summary + echo "${LANG}: ${RESULT}" >> "$SUMMARY_FILE" + done + + # Display summary + echo "" + echo "===== TEST SUMMARY =====" + cat "$SUMMARY_FILE" + echo "" + echo "Detailed results available in: $RESULTS_DIR" + + # Run evaluation if requested + if [ "$RUN_EVALUATION" = true ]; then + echo "" + echo "======================================" + echo "Running detailed evaluation on results..." + echo "======================================" + echo "" + + # Evaluate each language's results + for LANG in "${LANGUAGES[@]}"; do + # Try to find the output directory for this language + LANG_OUTPUT_DIR=$(find evaluation/evaluation_outputs -type d -name "*one_per_language_${LANG}*" 2>/dev/null | sort -r | head -n 1) + + if [ -z "$LANG_OUTPUT_DIR" ]; then + LANG_OUTPUT_DIR=$(find . -path "*/evaluation_outputs/*" -type d -name "*one_per_language_${LANG}*" 2>/dev/null | sort -r | head -n 1) + fi + + if [ -z "$LANG_OUTPUT_DIR" ]; then + LANG_OUTPUT_DIR="evaluation/evaluation_outputs/one_per_language_${LANG}" + fi + + LANG_OUTPUT_FILE="${LANG_OUTPUT_DIR}/output.jsonl" + + # Print the language output directory and file for debugging + echo "" + echo "Language: $LANG" + echo "Output directory: $LANG_OUTPUT_DIR" + echo "Output file: $LANG_OUTPUT_FILE" + + if [ -f "$LANG_OUTPUT_FILE" ]; then + echo "" + echo "===== Evaluating $LANG results =====" + echo "" + echo "Evaluating results in: $LANG_OUTPUT_FILE" + + # Save the evaluation results + EVAL_RESULTS_FILE="${LANG_OUTPUT_DIR}/evaluation_results.txt" + echo "Saving evaluation results to: $EVAL_RESULTS_FILE" + poetry run python evaluation/benchmarks/polyglot_benchmark/scripts/summarize_results.py "$LANG_OUTPUT_FILE" > "$EVAL_RESULTS_FILE" + fi + done + + echo "" + echo "Detailed evaluation complete." 
+ fi +else + # Run the normal evaluation + poetry run python -m evaluation.benchmarks.polyglot_benchmark.run_infer ${ARGS} + + # Run evaluation if requested + if [ "$RUN_EVALUATION" = true ]; then + echo "" + echo "======================================" + echo "Running evaluation on results..." + echo "======================================" + echo "" + + # Get the output directory - first try the default location + OUTPUT_DIR=$(find evaluation/evaluation_outputs -path "*/PolyglotBenchmark/$AGENT/*" -type d -name "*tools_bash+finish+str_replace*" 2>/dev/null | sort -r | head -n 1) + + # If not found, try to find it anywhere under evaluation_outputs + if [ -z "$OUTPUT_DIR" ]; then + OUTPUT_DIR=$(find . -path "*/evaluation_outputs/*" -path "*/PolyglotBenchmark/$AGENT/*" -type d -name "*tools_bash+finish+str_replace*" 2>/dev/null | sort -r | head -n 1) + fi + + # If still not found, try to find any output.jsonl file + if [ -z "$OUTPUT_DIR" ]; then + OUTPUT_FILE=$(find . -name "output.jsonl" 2>/dev/null | sort -r | head -n 1) + if [ -n "$OUTPUT_FILE" ]; then + OUTPUT_DIR=$(dirname "$OUTPUT_FILE") + fi + else + OUTPUT_FILE="$OUTPUT_DIR/output.jsonl" + fi + + # Print the output directory and file for debugging + echo "" + echo "Output directory: $OUTPUT_DIR" + echo "Output file: $OUTPUT_FILE" + + if [ -f "$OUTPUT_FILE" ]; then + echo "Evaluating results in: $OUTPUT_FILE" + poetry run python evaluation/benchmarks/polyglot_benchmark/scripts/summarize_results.py "$OUTPUT_FILE" + + # Save the evaluation results + EVAL_RESULTS_FILE="$OUTPUT_DIR/evaluation_results.txt" + echo "Saving evaluation results to: $EVAL_RESULTS_FILE" + poetry run python evaluation/benchmarks/polyglot_benchmark/scripts/summarize_results.py "$OUTPUT_FILE" > "$EVAL_RESULTS_FILE" + + echo "" + echo "Evaluation complete. Results saved to: $EVAL_RESULTS_FILE" + else + echo "Error: Output file not found: $OUTPUT_FILE" + echo "Cannot run evaluation." 
+ fi + fi +fi \ No newline at end of file diff --git a/evaluation/benchmarks/polyglot_benchmark/scripts/summarize_results.py b/evaluation/benchmarks/polyglot_benchmark/scripts/summarize_results.py new file mode 100755 index 000000000000..988f3a618bff --- /dev/null +++ b/evaluation/benchmarks/polyglot_benchmark/scripts/summarize_results.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 + +import argparse +import json +import os +from collections import defaultdict + +def load_jsonl(file_path): + """Load data from a jsonl file.""" + data = [] + with open(file_path, 'r') as f: + for line in f: + data.append(json.loads(line)) + return data + +def summarize_results(output_file): + """Summarize the results of the polyglot benchmark evaluation.""" + if not os.path.exists(output_file): + print(f"Error: Output file {output_file} does not exist.") + return + + results = load_jsonl(output_file) + + # Count total instances + total_instances = len(results) + print(f"Total instances: {total_instances}") + + # Count by language + language_counts = defaultdict(int) + language_passed = defaultdict(int) + + # Count passed and failed instances + passed_instances = [] + failed_instances = [] + + for result in results: + instance = result.get('instance', {}) + language = instance.get('language', 'unknown') + instance_name = instance.get('instance_name', 'unknown') + instance_id = result.get('instance_id', 'unknown') + + language_counts[language] += 1 + + # Check if all tests passed + test_result = result.get('test_result', {}) + exit_code = test_result.get('exit_code', 1) + + if exit_code == 0: + passed_instances.append((instance_id, language, instance_name)) + language_passed[language] += 1 + else: + failed_instances.append((instance_id, language, instance_name)) + + # Print summary + print("\nResults by language:") + print("--------------------") + for language, count in sorted(language_counts.items()): + passed = language_passed[language] + percentage = (passed / count) * 100 if count > 0 
else 0 + print(f"{language}: {passed}/{count} ({percentage:.1f}%)") + + # Overall pass rate + total_passed = len(passed_instances) + overall_percentage = (total_passed / total_instances) * 100 if total_instances > 0 else 0 + print(f"\nOverall pass rate: {total_passed}/{total_instances} ({overall_percentage:.1f}%)") + + # Print passed instances + print("\nPassed instances:") + print("----------------") + for instance_id, language, instance_name in sorted(passed_instances): + print(f"ID: {instance_id}, Language: {language}, Name: {instance_name}") + + # Print failed instances + print("\nFailed instances:") + print("----------------") + for instance_id, language, instance_name in sorted(failed_instances): + print(f"ID: {instance_id}, Language: {language}, Name: {instance_name}") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Summarize polyglot benchmark results") + parser.add_argument("output_file", help="Path to the output.jsonl file") + args = parser.parse_args() + + summarize_results(args.output_file) \ No newline at end of file diff --git a/evaluation/benchmarks/polyglot_benchmark/test_all_languages.py b/evaluation/benchmarks/polyglot_benchmark/test_all_languages.py new file mode 100755 index 000000000000..f196651b890d --- /dev/null +++ b/evaluation/benchmarks/polyglot_benchmark/test_all_languages.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python3 + +import os +import sys +import argparse +from pathlib import Path + +# Add the parent directory to the Python path +sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent)) + +from evaluation.benchmarks.polyglot_benchmark.run_infer import ( + load_polyglot_dataset, + process_instance, + make_metadata, + get_llm_config_arg, +) +from openhands.core.logger import openhands_logger as logger + +def test_language(language, model, agent): + """Test the first instance of a specific language.""" + print(f"\n{'=' * 50}") + print(f"Testing language: {language}") + print(f"{'=' * 50}\n") + + # 
Set the environment variable for the polyglot benchmark path + os.environ['POLYGLOT_BENCHMARK_PATH'] = '/workspace/polyglot-benchmark' + + # Load the dataset + dataset = load_polyglot_dataset() + + # Filter by language + dataset = dataset[dataset['language'].str.lower() == language.lower()] + if dataset.empty: + print(f"No instances found for language: {language}") + return False + + # Get the first instance + instance = dataset.iloc[0] + print(f"Testing instance {instance.instance_id}: {instance.instance_name} ({instance.language})") + + # Get LLM config + llm_config = get_llm_config_arg(model) + if llm_config is None: + print(f"Could not find LLM config: {model}") + return False + + # Create details dictionary with agent configuration + agent_details = { + "agent_config": { + "codeact_enable_jupyter": False, + "codeact_enable_browsing": False, + "codeact_enable_llm_editor": False, + } + } + + # Create metadata + metadata = make_metadata( + llm_config, + 'PolyglotBenchmark', + agent, + 30, # max_iterations + f"test_{language}", + f"evaluation/evaluation_outputs/test_{language}", + details=agent_details, + ) + + # Process the instance + try: + output = process_instance(instance, metadata, reset_logger=False) + print("\nTest completed successfully!") + print(f"Exit code: {output.test_result['exit_code']}") + print(f"Passed: {output.test_result['exit_code'] == 0}") + return output.test_result['exit_code'] == 0 + except Exception as e: + print(f"Error processing instance: {e}") + return False + +def main(): + parser = argparse.ArgumentParser(description="Test the polyglot benchmark with one instance per language") + parser.add_argument("--model", default="eval_gpt35_turbo", help="Model configuration name") + parser.add_argument("--agent", default="CodeActAgent", help="Agent class name") + parser.add_argument("--languages", default="python,rust,go,javascript,cpp,java", + help="Comma-separated list of languages to test") + args = parser.parse_args() + + languages = 
args.languages.split(',') + results = {} + + for language in languages: + language = language.strip() + if not language: + continue + + success = test_language(language, args.model, args.agent) + results[language] = "PASSED" if success else "FAILED" + + # Print summary + print("\n" + "=" * 50) + print("SUMMARY OF RESULTS") + print("=" * 50) + + for language, result in results.items(): + print(f"{language.ljust(12)}: {result}") + + # Check if all tests passed + all_passed = all(result == "PASSED" for result in results.values()) + print("\nOverall result:", "PASSED" if all_passed else "FAILED") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/evaluation/benchmarks/polyglot_benchmark/test_load_dataset.py b/evaluation/benchmarks/polyglot_benchmark/test_load_dataset.py new file mode 100755 index 000000000000..708259732b02 --- /dev/null +++ b/evaluation/benchmarks/polyglot_benchmark/test_load_dataset.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 + +import os +import sys +from pathlib import Path + +# Add the parent directory to the Python path +sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent)) + +from evaluation.benchmarks.polyglot_benchmark.run_infer import load_polyglot_dataset + +def main(): + # Set the environment variable for the polyglot benchmark path + os.environ['POLYGLOT_BENCHMARK_PATH'] = '/workspace/polyglot-benchmark' + + # Load the dataset + dataset = load_polyglot_dataset() + + # Print summary + print(f"Loaded {len(dataset)} test instances") + + # Print language distribution + language_counts = dataset['language'].value_counts() + print("\nLanguage distribution:") + for language, count in language_counts.items(): + print(f"{language}: {count}") + + # Print a sample instance + if not dataset.empty: + print("\nSample instance:") + sample = dataset.iloc[0] + print(f"ID: {sample.instance_id}") + print(f"Name: {sample.instance_name}") + print(f"Language: {sample.language}") + print(f"Solution files: 
{sample.solution_files}") + print(f"Test files: {sample.test_files}") + print(f"Instruction (first 100 chars): {sample.instruction[:100]}...") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/evaluation/benchmarks/polyglot_benchmark/test_run.py b/evaluation/benchmarks/polyglot_benchmark/test_run.py new file mode 100755 index 000000000000..c946356e90d6 --- /dev/null +++ b/evaluation/benchmarks/polyglot_benchmark/test_run.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 + +import os +import sys +import argparse +from pathlib import Path + +# Add the parent directory to the Python path +sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent)) + +from evaluation.benchmarks.polyglot_benchmark.run_infer import ( + load_polyglot_dataset, + process_instance, + make_metadata, + get_llm_config_arg, +) +from openhands.core.logger import openhands_logger as logger + +def main(): + parser = argparse.ArgumentParser(description="Test the polyglot benchmark with a single instance") + parser.add_argument("--model", default="eval_gpt35_turbo", help="Model configuration name") + parser.add_argument("--agent", default="CodeActAgent", help="Agent class name") + parser.add_argument("--instance-id", type=int, default=0, help="Instance ID to test") + parser.add_argument("--language", help="Filter by language") + args = parser.parse_args() + + # Set the environment variable for the polyglot benchmark path + os.environ['POLYGLOT_BENCHMARK_PATH'] = '/workspace/polyglot-benchmark' + + # Load the dataset + dataset = load_polyglot_dataset() + + if args.language: + dataset = dataset[dataset['language'].str.lower() == args.language.lower()] + if dataset.empty: + print(f"No instances found for language: {args.language}") + return + + # Get the instance to test + if args.instance_id >= len(dataset): + print(f"Instance ID {args.instance_id} is out of range. 
Max ID: {len(dataset) - 1}") + return + + instance = dataset.iloc[args.instance_id] + print(f"Testing instance {instance.instance_id}: {instance.instance_name} ({instance.language})") + + # Get LLM config + llm_config = get_llm_config_arg(args.model) + if llm_config is None: + print(f"Could not find LLM config: {args.model}") + return + + # Create details dictionary with agent configuration + agent_details = { + "agent_config": { + "codeact_enable_jupyter": False, + "codeact_enable_browsing": False, + "codeact_enable_llm_editor": False, + } + } + + # Create metadata + metadata = make_metadata( + llm_config, + 'PolyglotBenchmark', + args.agent, + 30, # max_iterations + "test", + "evaluation/evaluation_outputs/test", + details=agent_details, + ) + + # Process the instance + try: + output = process_instance(instance, metadata, reset_logger=False) + print("\nTest completed successfully!") + print(f"Exit code: {output.test_result['exit_code']}") + print(f"Passed: {output.test_result['exit_code'] == 0}") + except Exception as e: + print(f"Error processing instance: {e}") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/evaluation/benchmarks/swe_bench/run_infer.py b/evaluation/benchmarks/swe_bench/run_infer.py index 89fe618a6c34..56448fe36c3a 100644 --- a/evaluation/benchmarks/swe_bench/run_infer.py +++ b/evaluation/benchmarks/swe_bench/run_infer.py @@ -540,7 +540,14 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame: if llm_config is None: raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}') - details = {} + # Create details dictionary with agent configuration + details = { + "agent_config": { + "codeact_enable_jupyter": False, + "codeact_enable_browsing": RUN_WITH_BROWSING, + "codeact_enable_llm_editor": False, + } + } _agent_cls = openhands.agenthub.Agent.get_cls(args.agent_cls) dataset_descrption = ( diff --git a/evaluation/utils/shared.py b/evaluation/utils/shared.py index 
566fbbd71bb3..55ac1e3dd37c 100644 --- a/evaluation/utils/shared.py +++ b/evaluation/utils/shared.py @@ -160,6 +160,35 @@ def cleanup(): process.join() +def get_tools_string(agent_class: str, details: dict[str, Any] | None = None) -> str: + """Generate a string representation of the tools used by the agent. + + Args: + agent_class: The agent class name. + details: Additional details that might contain tool configuration. + + Returns: + A string representation of the tools used, e.g., "bash+finish+str_replace". + """ + # Default tools for CodeActAgent + if agent_class == "CodeActAgent": + tools = ["bash", "finish", "str_replace"] + + # Check if additional tools are enabled + if details and "agent_config" in details: + agent_config = details.get("agent_config", {}) + if agent_config.get("codeact_enable_browsing", False): + tools.extend(["web_read", "browser"]) + if agent_config.get("codeact_enable_jupyter", False): + tools.append("ipython") + if agent_config.get("codeact_enable_llm_editor", False): + tools[tools.index("str_replace")] = "llm_editor" # Replace str_replace by position: it is no longer last once browsing/jupyter tools were appended + + return "+".join(tools) + + # For other agents, return a default string + return "default_tools" + def make_metadata( llm_config: LLMConfig, dataset_name: str, @@ -175,12 +204,15 @@ def make_metadata( model_name = llm_config.model.split('/')[-1] model_path = model_name.replace(':', '_').replace('@', '-') eval_note = f'_N_{eval_note}' if eval_note else '' - + + # Get tools string + tools_string = get_tools_string(agent_class, details) + eval_output_path = os.path.join( eval_output_dir, dataset_name, agent_class, - f'{model_path}_maxiter_{max_iterations}{eval_note}', + f'{model_path}_maxiter_{max_iterations}_tools_{tools_string}{eval_note}', ) pathlib.Path(eval_output_path).mkdir(parents=True, exist_ok=True) @@ -484,14 +516,15 @@ def update_llm_config_for_completions_logging( instance_id: str, ) -> LLMConfig: """Update the LLM config for logging completions.""" - if llm_config.log_completions: -
llm_config.log_completions_folder = os.path.join( - eval_output_dir, 'llm_completions', instance_id - ) - logger.info( - f'Logging LLM completions for instance {instance_id} to ' - f'{llm_config.log_completions_folder}' - ) + # Always enable completions logging + llm_config.log_completions = True + llm_config.log_completions_folder = os.path.join( + eval_output_dir, 'llm_completions', instance_id + ) + logger.info( + f'Logging LLM completions for instance {instance_id} to ' + f'{llm_config.log_completions_folder}' + ) return llm_config diff --git a/run_benchmarks.sh b/run_benchmarks.sh new file mode 100755 index 000000000000..fdf764bd00fa --- /dev/null +++ b/run_benchmarks.sh @@ -0,0 +1,108 @@ +#!/bin/bash + +# Script to run OpenHands benchmarks with retry functionality +# This script will run the polyglot_benchmark and aider_bench benchmarks +# and retry them until they succeed or reach the maximum number of attempts. + +# Configuration +MAX_ATTEMPTS=10 +RETRY_DELAY=30 # seconds +MODEL_CONFIG="togetherDeepseek" +GIT_VERSION="HEAD" +AGENT="CodeActAgent" +EVAL_LIMIT=1000 +NUM_WORKERS=30 + +# Check if Docker is available +check_docker() { + if ! command -v docker &> /dev/null; then + echo "WARNING: Docker is not available in this environment." + echo "The benchmarks require Docker to run properly." + echo "Continuing anyway, but expect failures if Docker is required." + fi +} + +# Function to run a command and retry until it succeeds +run_with_retry() { + local cmd="$1" + local benchmark_name="$2" + local attempt=1 + local exit_code=1 + + echo "$(date '+%Y-%m-%d %H:%M:%S') - Running $benchmark_name benchmark" + echo "Command: $cmd" + + while [[ $exit_code -ne 0 && $attempt -le $MAX_ATTEMPTS ]]; do + echo "$(date '+%Y-%m-%d %H:%M:%S') - Attempt $attempt of $MAX_ATTEMPTS..." + + # Run the command + eval "$cmd" + exit_code=$? + + if [[ $exit_code -ne 0 ]]; then + echo "$(date '+%Y-%m-%d %H:%M:%S') - Command failed with exit code $exit_code." 
+ + if [[ $attempt -lt $MAX_ATTEMPTS ]]; then + echo "Retrying in $RETRY_DELAY seconds..." + sleep $RETRY_DELAY + fi + ((attempt++)) # count every failed attempt so a failure on the last attempt ends the loop instead of spinning forever + fi + done + + if [[ $exit_code -ne 0 ]]; then + echo "$(date '+%Y-%m-%d %H:%M:%S') - $benchmark_name benchmark failed after $MAX_ATTEMPTS attempts." + return 1 + else + echo "$(date '+%Y-%m-%d %H:%M:%S') - $benchmark_name benchmark succeeded on attempt $attempt." + return 0 + fi +} + +# Main execution +echo "=====================================================================" +echo "OpenHands Benchmark Runner" +echo "Started at: $(date '+%Y-%m-%d %H:%M:%S')" +echo "=====================================================================" +echo "Model config: $MODEL_CONFIG" +echo "Git version: $GIT_VERSION" +echo "Agent: $AGENT" +echo "Eval limit: $EVAL_LIMIT" +echo "Number of workers: $NUM_WORKERS" +echo "Maximum retry attempts: $MAX_ATTEMPTS" +echo "Retry delay: $RETRY_DELAY seconds" +echo "=====================================================================" + +# Check for Docker +check_docker + +# Run polyglot_benchmark +echo "=====================================================================" +echo "Running polyglot_benchmark" +echo "=====================================================================" +run_with_retry "./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh $MODEL_CONFIG $GIT_VERSION $AGENT $EVAL_LIMIT $NUM_WORKERS eval" "polyglot_benchmark" +POLYGLOT_RESULT=$? + +# Run aider_bench +echo "=====================================================================" +echo "Running aider_bench" +echo "=====================================================================" +run_with_retry "./evaluation/benchmarks/aider_bench/scripts/run_infer.sh $MODEL_CONFIG $GIT_VERSION $AGENT $EVAL_LIMIT $NUM_WORKERS \"\" eval" "aider_bench" +AIDER_RESULT=$?
+ +# Summary +echo "=====================================================================" +echo "Benchmark Run Summary - Completed at: $(date '+%Y-%m-%d %H:%M:%S')" +echo "=====================================================================" +echo "polyglot_benchmark: $([ $POLYGLOT_RESULT -eq 0 ] && echo 'SUCCESS' || echo 'FAILED')" +echo "aider_bench: $([ $AIDER_RESULT -eq 0 ] && echo 'SUCCESS' || echo 'FAILED')" +echo "=====================================================================" + +# Exit with success only if both benchmarks succeeded +if [[ $POLYGLOT_RESULT -eq 0 && $AIDER_RESULT -eq 0 ]]; then + echo "All benchmarks completed successfully." + exit 0 +else + echo "One or more benchmarks failed." + exit 1 +fi \ No newline at end of file