From 92e98f65239677a2bd241abae9a15749eca4fa66 Mon Sep 17 00:00:00 2001 From: openhands Date: Tue, 25 Feb 2025 04:35:27 +0000 Subject: [PATCH 01/22] feat: Enable llm_completions logging in aider_bench - Added update_llm_config_for_completions_logging to imports - Modified get_config to accept instance parameter - Updated llm_config to enable completions logging - Updated process_instance to pass instance to get_config This change makes aider_bench save llm_completions in the same way as swe_bench, with completions being saved in {eval_output_dir}/llm_completions/{instance_id}/ --- evaluation/benchmarks/aider_bench/run_infer.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/evaluation/benchmarks/aider_bench/run_infer.py b/evaluation/benchmarks/aider_bench/run_infer.py index 8045f948d3f9..1ee68c21c2f0 100644 --- a/evaluation/benchmarks/aider_bench/run_infer.py +++ b/evaluation/benchmarks/aider_bench/run_infer.py @@ -20,6 +20,7 @@ prepare_dataset, reset_logger_for_multiprocessing, run_evaluation, + update_llm_config_for_completions_logging, ) from openhands.controller.state.state import State from openhands.core.config import ( @@ -45,6 +46,7 @@ def get_config( + instance: pd.Series, metadata: EvalMetadata, ) -> AppConfig: config = AppConfig( @@ -67,7 +69,13 @@ def get_config( workspace_base=None, workspace_mount_path=None, ) - config.set_llm_config(metadata.llm_config) + # Update llm_config to enable completions logging + llm_config = update_llm_config_for_completions_logging( + metadata.llm_config, + metadata.eval_output_dir, + str(instance.instance_id) + ) + config.set_llm_config(llm_config) agent_config = config.get_agent_config(metadata.agent_class) agent_config.enable_prompt_extensions = False @@ -170,7 +178,7 @@ def process_instance( metadata: EvalMetadata, reset_logger: bool = True, ) -> EvalOutput: - config = get_config(metadata) + config = get_config(instance, metadata) # Setup the logger properly, so you can run multi-processing to parallelize the evaluation if reset_logger: From bc8f20d35a6639ee1789832b3d1c4fe830caef3c Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 26 Feb 2025 06:22:02 +0000 Subject: [PATCH 02/22] Add polyglot benchmark implementation --- .../benchmarks/polyglot_benchmark/Dockerfile | 63 +++ .../benchmarks/polyglot_benchmark/README.md | 90 ++++ .../polyglot_benchmark/helper/__init__.py | 0 .../polyglot_benchmark/helper/prompts.py | 28 + .../polyglot_benchmark/run_infer.py | 487 ++++++++++++++++++ .../scripts/build_docker.sh | 12 + .../polyglot_benchmark/scripts/run_infer.sh | 35 ++ .../scripts/summarize_results.py | 84 +++ .../polyglot_benchmark/test_load_dataset.py | 40 ++ .../benchmarks/polyglot_benchmark/test_run.py | 73 +++ 10 files changed, 912 insertions(+) create mode 100644 evaluation/benchmarks/polyglot_benchmark/Dockerfile create mode 100644 evaluation/benchmarks/polyglot_benchmark/README.md create mode 100644 evaluation/benchmarks/polyglot_benchmark/helper/__init__.py create mode 100644 evaluation/benchmarks/polyglot_benchmark/helper/prompts.py create mode 100644 evaluation/benchmarks/polyglot_benchmark/run_infer.py create mode 100755 evaluation/benchmarks/polyglot_benchmark/scripts/build_docker.sh create mode 100755 evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh create mode 100755 evaluation/benchmarks/polyglot_benchmark/scripts/summarize_results.py create mode 100755 evaluation/benchmarks/polyglot_benchmark/test_load_dataset.py create mode 100755 evaluation/benchmarks/polyglot_benchmark/test_run.py 
diff --git a/evaluation/benchmarks/polyglot_benchmark/Dockerfile b/evaluation/benchmarks/polyglot_benchmark/Dockerfile new file mode 100644 index 000000000000..ed789e6d8000 --- /dev/null +++ b/evaluation/benchmarks/polyglot_benchmark/Dockerfile @@ -0,0 +1,63 @@ +FROM ubuntu:22.04 + +# Avoid prompts from apt +ENV DEBIAN_FRONTEND=noninteractive + +# Install common dependencies +RUN apt-get update && apt-get install -y \ + build-essential \ + curl \ + git \ + python3 \ + python3-pip \ + python3-dev \ + python3-venv \ + wget \ + software-properties-common \ + apt-transport-https \ + ca-certificates \ + gnupg \ + lsb-release \ + libboost-all-dev \ + cmake \ + && rm -rf /var/lib/apt/lists/* + +# Install Python packages +RUN pip3 install --no-cache-dir pytest pytest-timeout + +# Install Node.js and npm +RUN curl -fsSL https://deb.nodesource.com/setup_18.x | bash - \ + && apt-get install -y nodejs \ + && rm -rf /var/lib/apt/lists/* + +# Install Rust +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y +ENV PATH="/root/.cargo/bin:${PATH}" + +# Install Go +RUN wget https://go.dev/dl/go1.20.5.linux-amd64.tar.gz \ + && tar -C /usr/local -xzf go1.20.5.linux-amd64.tar.gz \ + && rm go1.20.5.linux-amd64.tar.gz +ENV PATH="/usr/local/go/bin:${PATH}" + +# Install Java +RUN apt-get update && apt-get install -y openjdk-17-jdk \ + && rm -rf /var/lib/apt/lists/* +ENV JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64 + +# Install Gradle +RUN wget https://services.gradle.org/distributions/gradle-7.6-bin.zip \ + && mkdir /opt/gradle \ + && unzip -d /opt/gradle gradle-7.6-bin.zip \ + && rm gradle-7.6-bin.zip +ENV PATH="/opt/gradle/gradle-7.6/bin:${PATH}" + +# Create workspace directory +RUN mkdir -p /workspace +WORKDIR /workspace + +# Set environment variables +ENV PYTHONUNBUFFERED=1 +ENV PYTHONIOENCODING=UTF-8 + +CMD ["/bin/bash"] \ No newline at end of file diff --git a/evaluation/benchmarks/polyglot_benchmark/README.md b/evaluation/benchmarks/polyglot_benchmark/README.md new file mode 100644 index 000000000000..d92251acb9f7 --- /dev/null +++ b/evaluation/benchmarks/polyglot_benchmark/README.md @@ -0,0 +1,90 @@ +# Polyglot Benchmark + +This benchmark is based on the [Aider Polyglot Benchmark](https://github.com/Aider-AI/polyglot-benchmark), which evaluates how effectively an agent can translate natural language coding requests into executable code that passes unit tests across multiple programming languages. + +## Features + +- Supports multiple programming languages (Python, JavaScript, Rust, Go, C++, Java) +- End-to-end evaluation of code editing capabilities +- Automated test execution and validation +- Parallel evaluation with multiple workers +- Detailed metrics and logging + +## Setup + +1. Clone the polyglot-benchmark repository: + ```bash + git clone https://github.com/Aider-AI/polyglot-benchmark.git /workspace/polyglot-benchmark + ``` + +2. Build the Docker image for the benchmark: + ```bash + ./evaluation/benchmarks/polyglot_benchmark/scripts/build_docker.sh + ``` + +## Usage + +1. Make sure you have the required dependencies installed: + ```bash + pip install -e .[dev] + ``` + +2. 
Run the benchmark: + ```bash + ./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh + ``` + +### Command Line Arguments + +- `model_config`: The LLM configuration to use (e.g., `eval_gpt4_1106_preview`) +- `git-version`: Git commit or note to append to output directory (e.g., `HEAD`) +- `agent`: Agent class name (e.g., `CodeActAgent`) +- `eval_limit`: Limit the number of examples to evaluate (default: `-1` for all) +- `eval-num-workers`: Number of parallel workers (default: `1`) +- `eval_ids`: Comma-separated list of specific test IDs to run (e.g., `"1,3,10"`) +- `eval_languages`: Comma-separated list of languages to test (e.g., `"python,javascript,rust"`) + +### Environment Variables + +You can also set the following environment variables: + +```bash +export POLYGLOT_BENCHMARK_PATH="/path/to/polyglot-benchmark" # Path to the polyglot-benchmark repository +export USE_UNIT_TESTS="true" # Whether to run unit tests (default: true) +``` + +### Example + +```bash +# Run evaluation on CodeActAgent for all Python instances with 2 workers +export POLYGLOT_BENCHMARK_PATH="/workspace/polyglot-benchmark" +./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh eval_gpt4_1106_preview HEAD CodeActAgent -1 2 "" "python" +``` + +## Summarize Results + +After running the benchmark, you can summarize the results: + +```bash +poetry run python ./evaluation/benchmarks/polyglot_benchmark/scripts/summarize_results.py +``` + +Example: + +```bash +poetry run python ./evaluation/benchmarks/polyglot_benchmark/scripts/summarize_results.py evaluation/evaluation_outputs/outputs/PolyglotBenchmark/CodeActAgent/gpt-4-1106-preview_maxiter_30/output.jsonl +``` + +## Supported Languages + +The benchmark supports the following languages and test frameworks: +- Python: pytest +- JavaScript: npm test +- Rust: cargo test +- Go: go test +- C++: make test +- Java: Gradle test + +## Docker Support + +The benchmark runs in a Docker container to safely execute untrusted code. The container image includes all necessary language toolchains and test frameworks. \ No newline at end of file diff --git a/evaluation/benchmarks/polyglot_benchmark/helper/__init__.py b/evaluation/benchmarks/polyglot_benchmark/helper/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/evaluation/benchmarks/polyglot_benchmark/helper/prompts.py b/evaluation/benchmarks/polyglot_benchmark/helper/prompts.py new file mode 100644 index 000000000000..61bc0e54cb11 --- /dev/null +++ b/evaluation/benchmarks/polyglot_benchmark/helper/prompts.py @@ -0,0 +1,28 @@ +"""Prompts used in the polyglot benchmark.""" + +INSTRUCTIONS_ADDENDUM = """ +I've provided the following files that need to be modified: +{file_list} + +Please help me implement the necessary changes to meet the requirements. +You should ONLY modify these files, and NOT create any new files. +""" + +TEST_FAILURES = """ +The tests failed. Please fix the issues and try again. +Remember to only modify the following files: +{file_list} +""" + +# Dictionary mapping agent class names to their specific instruction suffixes +INST_SUFFIXES = { + 'CodeActAgent': ( + 'REMEMBER: All edits must be made directly in the files. 
Do NOT send' + ' the edited file as output to the user.\n' + ) +} + +# Dictionary mapping agent class names to their fake response functions +FAKE_RESPONSES = { + 'CodeActAgent': lambda _: None, # Will be replaced with codeact_user_response from shared.py +} \ No newline at end of file diff --git a/evaluation/benchmarks/polyglot_benchmark/run_infer.py b/evaluation/benchmarks/polyglot_benchmark/run_infer.py new file mode 100644 index 000000000000..45a9ee4f91ac --- /dev/null +++ b/evaluation/benchmarks/polyglot_benchmark/run_infer.py @@ -0,0 +1,487 @@ +import asyncio +import copy +import json +import os +import shutil +import subprocess +import tempfile +from pathlib import Path +from typing import Any, Dict, List, Optional + +import pandas as pd + +from evaluation.benchmarks.polyglot_benchmark.helper.prompts import ( + INSTRUCTIONS_ADDENDUM, + INST_SUFFIXES, + TEST_FAILURES, + FAKE_RESPONSES, +) +from evaluation.utils.shared import ( + EvalMetadata, + EvalOutput, + compatibility_for_eval_history_pairs, + make_metadata, + prepare_dataset, + reset_logger_for_multiprocessing, + run_evaluation, + update_llm_config_for_completions_logging, + codeact_user_response, +) +from openhands.controller.state.state import State +from openhands.core.config import ( + AppConfig, + SandboxConfig, + get_llm_config_arg, + load_from_toml, + parse_arguments, +) +from openhands.core.logger import openhands_logger as logger +from openhands.core.main import create_runtime, run_controller +from openhands.events.action import CmdRunAction, MessageAction +from openhands.events.observation import CmdOutputObservation +from openhands.runtime.base import Runtime +from openhands.utils.async_utils import call_async_from_sync + +# Configure visibility of unit tests to the Agent. +USE_UNIT_TESTS = os.environ.get('USE_UNIT_TESTS', 'true').lower() == 'true' + +# Map of file extensions to test commands +TEST_COMMANDS = { + ".py": ["python3", "-m", "pytest"], + ".rs": ["cargo", "test", "--", "--include-ignored"], + ".go": ["go", "test", "./..."], + ".js": ["npm", "test"], + ".cpp": ["make", "test"], + ".java": ["./gradlew", "test"], +} + +# Update fake responses with the actual function +FAKE_RESPONSES['CodeActAgent'] = codeact_user_response + +def get_config( + instance: pd.Series, + metadata: EvalMetadata, +) -> AppConfig: + config = AppConfig( + default_agent=metadata.agent_class, + run_as_openhands=False, + runtime=os.environ.get('RUNTIME', 'docker'), + max_iterations=metadata.max_iterations, + sandbox=SandboxConfig( + base_container_image='ghcr.io/opendevin/eval-polyglot:v1.0.0', # TODO: Create this image + enable_auto_lint=True, + use_host_network=False, + timeout=300, # Longer timeout for compilation + api_key=os.environ.get('ALLHANDS_API_KEY', None), + remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'), + keep_runtime_alive=False, + remote_runtime_init_timeout=1800, + remote_runtime_enable_retries=True, + ), + # do not mount workspace + workspace_base=None, + workspace_mount_path=None, + ) + + # Update llm_config to enable completions logging + llm_config = update_llm_config_for_completions_logging( + metadata.llm_config, + metadata.eval_output_dir, + str(instance.instance_id) + ) + # Enable logging of LLM completions + llm_config.log_completions = True + config.set_llm_config(llm_config) + + agent_config = config.get_agent_config(metadata.agent_class) + agent_config.enable_prompt_extensions = False + + # copy 'draft_editor' config if exists + config_copy = copy.deepcopy(config) + 
load_from_toml(config_copy) + if 'draft_editor' in config_copy.llms: + config.set_llm_config(config_copy.llms['draft_editor'], 'draft_editor') + + return config + +def initialize_runtime( + runtime: Runtime, + instance: pd.Series, +): + """Initialize the runtime for the agent.""" + logger.info('-' * 30) + logger.info('BEGIN Runtime Initialization Fn') + logger.info('-' * 30) + obs: CmdOutputObservation + + # Create workspace + action = CmdRunAction(command='mkdir -p /workspace') + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + assert obs.exit_code == 0 + + action = CmdRunAction(command='cd /workspace') + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + assert obs.exit_code == 0 + + # Copy files to workspace + with tempfile.TemporaryDirectory() as tmpdir: + # Copy solution files + for file_path in instance.solution_files: + file_path = Path(file_path) + temp_file = Path(tmpdir) / file_path.name + with open(temp_file, 'w') as f: + f.write(instance.solution_content[file_path.name]) + runtime.copy_to( + str(temp_file), + '/workspace', + ) + + # Copy test files if enabled + if USE_UNIT_TESTS: + for file_path in instance.test_files: + file_path = Path(file_path) + temp_file = Path(tmpdir) / file_path.name + with open(temp_file, 'w') as f: + f.write(instance.test_content[file_path.name]) + runtime.copy_to( + str(temp_file), + '/workspace', + ) + + logger.info('-' * 30) + logger.info('END Runtime Initialization Fn') + logger.info('-' * 30) + +def complete_runtime( + runtime: Runtime, + instance: pd.Series, +) -> Dict[str, Any]: + """Complete the runtime for the agent.""" + logger.info('-' * 30) + logger.info('BEGIN Runtime Completion Fn') + logger.info('-' * 30) + + # Run tests + test_output = "" + exit_code = 1 + + if USE_UNIT_TESTS: + # Get unique file extensions from test files + extensions = {Path(f).suffix for f in instance.test_files} + + # Find matching test command + command = None + for ext in extensions: + if ext in TEST_COMMANDS: + command = TEST_COMMANDS[ext] + break + + if command: + try: + result = subprocess.run( + command, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + timeout=180, # 3 minutes timeout + cwd="/workspace", + encoding="utf-8", + errors="replace", + ) + exit_code = result.returncode + test_output = result.stdout + + # Clean up output + test_output = test_output.replace("/workspace", "workspace") + + # Log test output to history file + with open("/workspace/.aider.chat.history.md", "a") as fh: + fh.write(f"```\n{test_output}\n```") + + except subprocess.TimeoutExpired: + test_output = "Tests timed out!" 
+ exit_code = 1 + + logger.info('-' * 30) + logger.info('END Runtime Completion Fn') + logger.info('-' * 30) + + runtime.close() + + return { + 'test_output': test_output, + 'exit_code': exit_code, + } + +def process_instance( + instance: pd.Series, + metadata: EvalMetadata, + reset_logger: bool = True, +) -> EvalOutput: + config = get_config(instance, metadata) + + # Setup the logger properly, so you can run multi-processing to parallelize the evaluation + if reset_logger: + log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs') + reset_logger_for_multiprocessing(logger, str(instance.instance_id), log_dir) + else: + logger.info( + f'\nStarting evaluation for instance {str(instance.instance_id)}.\n' + ) + + # ============================================= + # build instruction + # ============================================= + + # Prepare instruction + logger.info(instance) + instruction = instance.instruction + + # Add file list to instruction + file_list = " ".join(instance.solution_files) + instruction += INSTRUCTIONS_ADDENDUM.format(file_list=file_list) + + if USE_UNIT_TESTS: + test_files = " ".join(instance.test_files) + logger.info(f'\nTest files: {test_files}\n') + instruction += ( + f'Use the appropriate test command to run the tests and verify your solution. ' + 'DO NOT EDIT the test files.\n\n' + ) + + instruction += ( + 'IMPORTANT: You should ONLY interact with the environment provided ' + 'to you AND NEVER ASK FOR HUMAN HELP.\n' + ) + + # Add agent-specific instruction suffix + if metadata.agent_class in INST_SUFFIXES: + instruction += INST_SUFFIXES[metadata.agent_class] + + # ============================================= + # create sandbox and run the agent + # ============================================= + + runtime: Runtime = create_runtime(config) + call_async_from_sync(runtime.connect) + + initialize_runtime(runtime, instance=instance) + + # Here's how you can run the agent (similar to the `main` function) and get the final task state + state: State | None = asyncio.run( + run_controller( + config=config, + initial_user_action=MessageAction(content=instruction), + runtime=runtime, + fake_user_response_fn=FAKE_RESPONSES[metadata.agent_class], + ) + ) + if state is None: + raise ValueError('State should not be None.') + + # ============================================= + # result evaluation + # ============================================= + + return_val = complete_runtime(runtime, instance) + exit_code = return_val['exit_code'] + test_output = return_val['test_output'] + + errors = [] + test_cases = None + if test_output: + if 'SyntaxError' in test_output: + errors.append('SyntaxError') + elif 'IndentationError' in test_output: + errors.append('IndentationError') + else: + test_cases = test_output + + test_result = { + 'exit_code': exit_code, + 'test_cases': test_cases, + 'errors': errors, + } + + # history is now available as a stream of events, rather than list of pairs of (Action, Observation) + # for compatibility with the existing output format, we can remake the pairs here + histories = compatibility_for_eval_history_pairs(state.history) + metrics = state.metrics.get() if state.metrics else None + + # Save the output + output = EvalOutput( + instance_id=str(instance.instance_id), + instance=instance.to_dict(), + instruction=instruction, + metadata=metadata, + history=histories, + metrics=metrics, + error=state.last_error if state and state.last_error else None, + test_result=test_result, + ) + return output + +def load_polyglot_dataset(): + """Load the 
polyglot benchmark dataset from the repository.""" + import glob + import json + import os + + # Path to the polyglot-benchmark repository + repo_path = os.environ.get('POLYGLOT_BENCHMARK_PATH', '/workspace/polyglot-benchmark') + + all_tests = [] + instance_id = 0 + + # Process each language directory + for lang_dir in ['python', 'javascript', 'rust', 'go', 'cpp', 'java']: + lang_path = os.path.join(repo_path, lang_dir, 'exercises', 'practice') + if not os.path.exists(lang_path): + logger.warning(f"Language directory not found: {lang_path}") + continue + + # Process each exercise directory + for exercise_dir in os.listdir(lang_path): + exercise_path = os.path.join(lang_path, exercise_dir) + if not os.path.isdir(exercise_path): + continue + + # Check for config.json + config_file = os.path.join(exercise_path, '.meta', 'config.json') + if not os.path.exists(config_file): + logger.warning(f"Config file not found: {config_file}") + continue + + # Load config + with open(config_file, 'r') as f: + config = json.load(f) + + # Get solution and test files + solution_files = config.get('files', {}).get('solution', []) + test_files = config.get('files', {}).get('test', []) + + if not solution_files or not test_files: + logger.warning(f"Missing solution or test files in {exercise_path}") + continue + + # Load instructions + instruction = "" + intro_file = os.path.join(exercise_path, '.docs', 'introduction.md') + if os.path.exists(intro_file): + with open(intro_file, 'r') as f: + instruction += f.read() + "\n\n" + + instructions_file = os.path.join(exercise_path, '.docs', 'instructions.md') + if os.path.exists(instructions_file): + with open(instructions_file, 'r') as f: + instruction += f.read() + "\n\n" + + if not instruction: + logger.warning(f"No instructions found for {exercise_path}") + continue + + # Load solution and test content + solution_content = {} + for file_path in solution_files: + full_path = os.path.join(exercise_path, file_path) + if os.path.exists(full_path): + with open(full_path, 'r') as f: + solution_content[os.path.basename(file_path)] = f.read() + + test_content = {} + for file_path in test_files: + full_path = os.path.join(exercise_path, file_path) + if os.path.exists(full_path): + with open(full_path, 'r') as f: + test_content[os.path.basename(file_path)] = f.read() + + # Create test instance + test_instance = { + 'instance_id': instance_id, + 'instance_name': exercise_dir, + 'language': lang_dir, + 'instruction': instruction, + 'solution_files': [os.path.basename(f) for f in solution_files], + 'test_files': [os.path.basename(f) for f in test_files], + 'solution_content': solution_content, + 'test_content': test_content, + } + + all_tests.append(test_instance) + instance_id += 1 + + return pd.DataFrame(all_tests) + +def add_arguments(parser): + """Add polyglot benchmark specific arguments to the parser.""" + parser.add_argument( + '--eval-languages', + type=str, + help='Comma-separated list of languages to test (e.g., "python,javascript,rust")', + ) + return parser + +if __name__ == '__main__': + # Add custom arguments + parser = parse_arguments.__self__ + add_arguments(parser) + args = parser.parse_args() + + # Load the polyglot benchmark dataset + polyglot_tests = load_polyglot_dataset() + + if polyglot_tests.empty: + logger.error("Failed to load polyglot benchmark dataset") + exit(1) + + logger.info(f"Loaded {len(polyglot_tests)} test instances from polyglot benchmark") + + llm_config = None + if args.llm_config: + llm_config = get_llm_config_arg(args.llm_config) + # 
modify_params must be False for evaluation purpose, for reproducibility and accuracy of results + llm_config.modify_params = False + # Enable logging of LLM completions + llm_config.log_completions = True + + if llm_config is None: + raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}') + + metadata = make_metadata( + llm_config, + 'PolyglotBenchmark', + args.agent_cls, + args.max_iterations, + args.eval_note, + args.eval_output_dir, + ) + output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl') + + # Parse dataset IDs if provided + eval_ids = None + if args.eval_ids: + eval_ids = str(args.eval_ids).split(',') + logger.info(f'\nUsing specific dataset IDs: {eval_ids}\n') + + # Filter by language if specified + if hasattr(args, 'eval_languages') and args.eval_languages: + languages = [lang.strip().lower() for lang in args.eval_languages.split(',')] + polyglot_tests = polyglot_tests[polyglot_tests['language'].str.lower().isin(languages)] + logger.info(f'\nFiltered to languages: {languages}, {len(polyglot_tests)} instances remaining\n') + + instances = prepare_dataset( + polyglot_tests, + output_file, + args.eval_n_limit, + eval_ids=eval_ids, + ) + + run_evaluation( + instances, + metadata, + output_file, + args.eval_num_workers, + process_instance, + ) \ No newline at end of file diff --git a/evaluation/benchmarks/polyglot_benchmark/scripts/build_docker.sh b/evaluation/benchmarks/polyglot_benchmark/scripts/build_docker.sh new file mode 100755 index 000000000000..1c6a2dfff7a1 --- /dev/null +++ b/evaluation/benchmarks/polyglot_benchmark/scripts/build_docker.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +set -e + +# Get the directory of this script +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +BENCHMARK_DIR="$( cd "${SCRIPT_DIR}/.." 
&& pwd )" + +# Build the Docker image +docker build -t ghcr.io/opendevin/eval-polyglot:v1.0.0 -f "${BENCHMARK_DIR}/Dockerfile" "${BENCHMARK_DIR}" + +echo "Docker image built successfully: ghcr.io/opendevin/eval-polyglot:v1.0.0" \ No newline at end of file diff --git a/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh new file mode 100755 index 000000000000..ce998a112330 --- /dev/null +++ b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +set -e + +# Default values +MODEL_CONFIG=${1:-"eval_gpt4_1106_preview"} +GIT_VERSION=${2:-"HEAD"} +AGENT=${3:-"CodeActAgent"} +EVAL_LIMIT=${4:-"-1"} +EVAL_NUM_WORKERS=${5:-"1"} +EVAL_IDS=${6:-""} +EVAL_LANGUAGES=${7:-""} + +# Set environment variables +export POLYGLOT_BENCHMARK_PATH=${POLYGLOT_BENCHMARK_PATH:-"/workspace/polyglot-benchmark"} +export USE_UNIT_TESTS=${USE_UNIT_TESTS:-"true"} + +# Add additional arguments based on provided parameters +ARGS="--agent-cls ${AGENT} --llm-config ${MODEL_CONFIG} --max-iterations 30 --eval-num-workers ${EVAL_NUM_WORKERS}" + +if [ "${EVAL_LIMIT}" != "-1" ]; then + ARGS="${ARGS} --eval-n-limit ${EVAL_LIMIT}" +fi + +if [ -n "${EVAL_IDS}" ]; then + ARGS="${ARGS} --eval-ids ${EVAL_IDS}" +fi + +if [ -n "${EVAL_LANGUAGES}" ]; then + ARGS="${ARGS} --eval-languages ${EVAL_LANGUAGES}" +fi + +# Run the evaluation +cd "$(git rev-parse --show-toplevel)" +poetry run python -m evaluation.benchmarks.polyglot_benchmark.run_infer ${ARGS} \ No newline at end of file diff --git a/evaluation/benchmarks/polyglot_benchmark/scripts/summarize_results.py b/evaluation/benchmarks/polyglot_benchmark/scripts/summarize_results.py new file mode 100755 index 000000000000..988f3a618bff --- /dev/null +++ b/evaluation/benchmarks/polyglot_benchmark/scripts/summarize_results.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 + +import argparse +import json +import os +from collections import defaultdict + +def load_jsonl(file_path): + """Load data from a jsonl file.""" + data = [] + with open(file_path, 'r') as f: + for line in f: + data.append(json.loads(line)) + return data + +def summarize_results(output_file): + """Summarize the results of the polyglot benchmark evaluation.""" + if not os.path.exists(output_file): + print(f"Error: Output file {output_file} does not exist.") + return + + results = load_jsonl(output_file) + + # Count total instances + total_instances = len(results) + print(f"Total instances: {total_instances}") + + # Count by language + language_counts = defaultdict(int) + language_passed = defaultdict(int) + + # Count passed and failed instances + passed_instances = [] + failed_instances = [] + + for result in results: + instance = result.get('instance', {}) + language = instance.get('language', 'unknown') + instance_name = instance.get('instance_name', 'unknown') + instance_id = result.get('instance_id', 'unknown') + + language_counts[language] += 1 + + # Check if all tests passed + test_result = result.get('test_result', {}) + exit_code = test_result.get('exit_code', 1) + + if exit_code == 0: + passed_instances.append((instance_id, language, instance_name)) + language_passed[language] += 1 + else: + failed_instances.append((instance_id, language, instance_name)) + + # Print summary + print("\nResults by language:") + print("--------------------") + for language, count in sorted(language_counts.items()): + passed = language_passed[language] + percentage = (passed / count) * 100 if count > 0 else 0 + print(f"{language}: 
{passed}/{count} ({percentage:.1f}%)") + + # Overall pass rate + total_passed = len(passed_instances) + overall_percentage = (total_passed / total_instances) * 100 if total_instances > 0 else 0 + print(f"\nOverall pass rate: {total_passed}/{total_instances} ({overall_percentage:.1f}%)") + + # Print passed instances + print("\nPassed instances:") + print("----------------") + for instance_id, language, instance_name in sorted(passed_instances): + print(f"ID: {instance_id}, Language: {language}, Name: {instance_name}") + + # Print failed instances + print("\nFailed instances:") + print("----------------") + for instance_id, language, instance_name in sorted(failed_instances): + print(f"ID: {instance_id}, Language: {language}, Name: {instance_name}") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Summarize polyglot benchmark results") + parser.add_argument("output_file", help="Path to the output.jsonl file") + args = parser.parse_args() + + summarize_results(args.output_file) \ No newline at end of file diff --git a/evaluation/benchmarks/polyglot_benchmark/test_load_dataset.py b/evaluation/benchmarks/polyglot_benchmark/test_load_dataset.py new file mode 100755 index 000000000000..708259732b02 --- /dev/null +++ b/evaluation/benchmarks/polyglot_benchmark/test_load_dataset.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 + +import os +import sys +from pathlib import Path + +# Add the parent directory to the Python path +sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent)) + +from evaluation.benchmarks.polyglot_benchmark.run_infer import load_polyglot_dataset + +def main(): + # Set the environment variable for the polyglot benchmark path + os.environ['POLYGLOT_BENCHMARK_PATH'] = '/workspace/polyglot-benchmark' + + # Load the dataset + dataset = load_polyglot_dataset() + + # Print summary + print(f"Loaded {len(dataset)} test instances") + + # Print language distribution + language_counts = dataset['language'].value_counts() + print("\nLanguage distribution:") + for language, count in language_counts.items(): + print(f"{language}: {count}") + + # Print a sample instance + if not dataset.empty: + print("\nSample instance:") + sample = dataset.iloc[0] + print(f"ID: {sample.instance_id}") + print(f"Name: {sample.instance_name}") + print(f"Language: {sample.language}") + print(f"Solution files: {sample.solution_files}") + print(f"Test files: {sample.test_files}") + print(f"Instruction (first 100 chars): {sample.instruction[:100]}...") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/evaluation/benchmarks/polyglot_benchmark/test_run.py b/evaluation/benchmarks/polyglot_benchmark/test_run.py new file mode 100755 index 000000000000..a8671b0646f1 --- /dev/null +++ b/evaluation/benchmarks/polyglot_benchmark/test_run.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python3 + +import os +import sys +import argparse +from pathlib import Path + +# Add the parent directory to the Python path +sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent)) + +from evaluation.benchmarks.polyglot_benchmark.run_infer import ( + load_polyglot_dataset, + process_instance, + make_metadata, + get_llm_config_arg, +) +from openhands.core.logger import openhands_logger as logger + +def main(): + parser = argparse.ArgumentParser(description="Test the polyglot benchmark with a single instance") + parser.add_argument("--model", default="eval_gpt35_turbo", help="Model configuration name") + parser.add_argument("--agent", default="CodeActAgent", help="Agent class 
name") + parser.add_argument("--instance-id", type=int, default=0, help="Instance ID to test") + parser.add_argument("--language", help="Filter by language") + args = parser.parse_args() + + # Set the environment variable for the polyglot benchmark path + os.environ['POLYGLOT_BENCHMARK_PATH'] = '/workspace/polyglot-benchmark' + + # Load the dataset + dataset = load_polyglot_dataset() + + if args.language: + dataset = dataset[dataset['language'].str.lower() == args.language.lower()] + if dataset.empty: + print(f"No instances found for language: {args.language}") + return + + # Get the instance to test + if args.instance_id >= len(dataset): + print(f"Instance ID {args.instance_id} is out of range. Max ID: {len(dataset) - 1}") + return + + instance = dataset.iloc[args.instance_id] + print(f"Testing instance {instance.instance_id}: {instance.instance_name} ({instance.language})") + + # Get LLM config + llm_config = get_llm_config_arg(args.model) + if llm_config is None: + print(f"Could not find LLM config: {args.model}") + return + + # Create metadata + metadata = make_metadata( + llm_config, + 'PolyglotBenchmark', + args.agent, + 30, # max_iterations + "test", + "evaluation/evaluation_outputs/test", + ) + + # Process the instance + try: + output = process_instance(instance, metadata, reset_logger=False) + print("\nTest completed successfully!") + print(f"Exit code: {output.test_result['exit_code']}") + print(f"Passed: {output.test_result['exit_code'] == 0}") + except Exception as e: + print(f"Error processing instance: {e}") + +if __name__ == "__main__": + main() \ No newline at end of file From 37ba6965aaf5f5216f2a77ca191fde1ef12aef2f Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 26 Feb 2025 06:26:06 +0000 Subject: [PATCH 03/22] Fix argument parser in polyglot benchmark --- evaluation/benchmarks/polyglot_benchmark/run_infer.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/evaluation/benchmarks/polyglot_benchmark/run_infer.py b/evaluation/benchmarks/polyglot_benchmark/run_infer.py index 45a9ee4f91ac..6fce76d9dbdf 100644 --- a/evaluation/benchmarks/polyglot_benchmark/run_infer.py +++ b/evaluation/benchmarks/polyglot_benchmark/run_infer.py @@ -424,10 +424,13 @@ def add_arguments(parser): return parser if __name__ == '__main__': - # Add custom arguments - parser = parse_arguments.__self__ + # Get the argument parser and add custom arguments + import argparse + from openhands.core.config import get_parser + + parser = get_parser() add_arguments(parser) - args = parser.parse_args() + args = parse_arguments() # Load the polyglot benchmark dataset polyglot_tests = load_polyglot_dataset() From 890377d28352f9742c92e0c336ab4ec9d1e3171f Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 26 Feb 2025 06:27:21 +0000 Subject: [PATCH 04/22] Improve polyglot benchmark path handling and fix logging error --- .../polyglot_benchmark/run_infer.py | 26 ++++++++++++-- .../polyglot_benchmark/scripts/run_infer.sh | 35 ++++++++++++++++++- 2 files changed, 58 insertions(+), 3 deletions(-) diff --git a/evaluation/benchmarks/polyglot_benchmark/run_infer.py b/evaluation/benchmarks/polyglot_benchmark/run_infer.py index 6fce76d9dbdf..c5adbc64c572 100644 --- a/evaluation/benchmarks/polyglot_benchmark/run_infer.py +++ b/evaluation/benchmarks/polyglot_benchmark/run_infer.py @@ -328,9 +328,31 @@ def load_polyglot_dataset(): import glob import json import os + from pathlib import Path - # Path to the polyglot-benchmark repository - repo_path = os.environ.get('POLYGLOT_BENCHMARK_PATH', 
'/workspace/polyglot-benchmark') + # Try to find the polyglot-benchmark repository + # First check the environment variable + repo_path = os.environ.get('POLYGLOT_BENCHMARK_PATH') + + # If not set, try common locations + if not repo_path or not os.path.exists(repo_path): + possible_paths = [ + '/workspace/polyglot-benchmark', + str(Path.home() / 'polyglot-benchmark'), + str(Path.home() / 'thereal' / 'polyglot-benchmark'), + str(Path(__file__).parent.parent.parent.parent.parent / 'polyglot-benchmark'), + str(Path.cwd() / 'polyglot-benchmark'), + ] + + for path in possible_paths: + if os.path.exists(path): + repo_path = path + logger.info(f"Found polyglot-benchmark repository at: {repo_path}") + break + + if not repo_path or not os.path.exists(repo_path): + logger.error("Could not find polyglot-benchmark repository. Please set POLYGLOT_BENCHMARK_PATH environment variable.") + return pd.DataFrame() all_tests = [] instance_id = 0 diff --git a/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh index ce998a112330..206716c57958 100755 --- a/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh +++ b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh @@ -12,9 +12,42 @@ EVAL_IDS=${6:-""} EVAL_LANGUAGES=${7:-""} # Set environment variables -export POLYGLOT_BENCHMARK_PATH=${POLYGLOT_BENCHMARK_PATH:-"/workspace/polyglot-benchmark"} export USE_UNIT_TESTS=${USE_UNIT_TESTS:-"true"} +# Try to find the polyglot-benchmark repository +if [ -z "$POLYGLOT_BENCHMARK_PATH" ]; then + # Check common locations + POSSIBLE_PATHS=( + "/workspace/polyglot-benchmark" + "$HOME/polyglot-benchmark" + "$HOME/thereal/polyglot-benchmark" + "$(git rev-parse --show-toplevel)/polyglot-benchmark" + "$(pwd)/polyglot-benchmark" + ) + + for path in "${POSSIBLE_PATHS[@]}"; do + if [ -d "$path" ]; then + export POLYGLOT_BENCHMARK_PATH="$path" + echo "Found polyglot-benchmark repository at: $POLYGLOT_BENCHMARK_PATH" + break + fi + done +fi + +# If still not found, try to clone it +if [ -z "$POLYGLOT_BENCHMARK_PATH" ] || [ ! -d "$POLYGLOT_BENCHMARK_PATH" ]; then + echo "Polyglot benchmark repository not found. Attempting to clone it..." + CLONE_DIR="$(git rev-parse --show-toplevel)/polyglot-benchmark" + git clone https://github.com/Aider-AI/polyglot-benchmark.git "$CLONE_DIR" + if [ $? -eq 0 ]; then + export POLYGLOT_BENCHMARK_PATH="$CLONE_DIR" + echo "Successfully cloned polyglot-benchmark to $POLYGLOT_BENCHMARK_PATH" + else + echo "Failed to clone polyglot-benchmark. Please set POLYGLOT_BENCHMARK_PATH manually." 
+ exit 1 + fi +fi + # Add additional arguments based on provided parameters ARGS="--agent-cls ${AGENT} --llm-config ${MODEL_CONFIG} --max-iterations 30 --eval-num-workers ${EVAL_NUM_WORKERS}" From 8af6f1111baf53831f1a2ca3edcd5a4d6851d70f Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 26 Feb 2025 06:31:00 +0000 Subject: [PATCH 05/22] Add Docker configuration options and troubleshooting guide --- .../benchmarks/polyglot_benchmark/README.md | 31 +++++++++++++++++++ .../polyglot_benchmark/run_infer.py | 12 +++++-- .../polyglot_benchmark/scripts/run_infer.sh | 2 ++ 3 files changed, 43 insertions(+), 2 deletions(-) diff --git a/evaluation/benchmarks/polyglot_benchmark/README.md b/evaluation/benchmarks/polyglot_benchmark/README.md index d92251acb9f7..46f79dfeb9c5 100644 --- a/evaluation/benchmarks/polyglot_benchmark/README.md +++ b/evaluation/benchmarks/polyglot_benchmark/README.md @@ -51,8 +51,39 @@ You can also set the following environment variables: ```bash export POLYGLOT_BENCHMARK_PATH="/path/to/polyglot-benchmark" # Path to the polyglot-benchmark repository export USE_UNIT_TESTS="true" # Whether to run unit tests (default: true) +export NO_DOCKER="true" # Skip Docker container creation and use local runtime (default: false) +export POLYGLOT_DOCKER_IMAGE="image:tag" # Custom Docker image to use (default: ghcr.io/opendevin/eval-polyglot:v1.0.0) ``` +### Troubleshooting + +#### Docker Issues + +If you encounter Docker-related errors like: + +``` +Command 'docker buildx build ...' returned non-zero exit status 1 +``` + +You can try the following solutions: + +1. Run with `NO_DOCKER=true` to use the local runtime instead: + ```bash + NO_DOCKER=true ./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh eval_gpt4_1106_preview HEAD CodeActAgent 1 1 + ``` + +2. Make sure Docker is installed and running: + ```bash + docker --version + docker ps + ``` + +3. 
Check if you have permission to use Docker: + ```bash + sudo usermod -aG docker $USER + # Then log out and log back in + ``` + ### Example ```bash diff --git a/evaluation/benchmarks/polyglot_benchmark/run_infer.py b/evaluation/benchmarks/polyglot_benchmark/run_infer.py index c5adbc64c572..4be3b75ae26a 100644 --- a/evaluation/benchmarks/polyglot_benchmark/run_infer.py +++ b/evaluation/benchmarks/polyglot_benchmark/run_infer.py @@ -62,13 +62,21 @@ def get_config( instance: pd.Series, metadata: EvalMetadata, ) -> AppConfig: + # Determine runtime type based on environment variable + runtime_type = os.environ.get('RUNTIME', 'docker') + + # Check if NO_DOCKER is set to skip Docker container creation + if os.environ.get('NO_DOCKER', 'false').lower() == 'true': + runtime_type = 'local' + logger.info("Using local runtime instead of Docker due to NO_DOCKER=true") + config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, - runtime=os.environ.get('RUNTIME', 'docker'), + runtime=runtime_type, max_iterations=metadata.max_iterations, sandbox=SandboxConfig( - base_container_image='ghcr.io/opendevin/eval-polyglot:v1.0.0', # TODO: Create this image + base_container_image=os.environ.get('POLYGLOT_DOCKER_IMAGE', 'ghcr.io/opendevin/eval-polyglot:v1.0.0'), enable_auto_lint=True, use_host_network=False, timeout=300, # Longer timeout for compilation diff --git a/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh index 206716c57958..7c7a3726be5f 100755 --- a/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh +++ b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh @@ -13,6 +13,8 @@ EVAL_LANGUAGES=${7:-""} # Set environment variables export USE_UNIT_TESTS=${USE_UNIT_TESTS:-"true"} +export NO_DOCKER=${NO_DOCKER:-"false"} +export POLYGLOT_DOCKER_IMAGE=${POLYGLOT_DOCKER_IMAGE:-"ghcr.io/opendevin/eval-polyglot:v1.0.0"} # Try to find the polyglot-benchmark repository if [ -z "$POLYGLOT_BENCHMARK_PATH" ]; then From 32335ffcb3862817cc85a3f44ce590353609c38a Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 26 Feb 2025 06:32:00 +0000 Subject: [PATCH 06/22] Add local Docker image build support for polyglot benchmark --- .../benchmarks/polyglot_benchmark/README.md | 39 +++++++- .../scripts/build_local_docker.sh | 94 +++++++++++++++++++ .../polyglot_benchmark/scripts/run_infer.sh | 23 ++++- 3 files changed, 152 insertions(+), 4 deletions(-) create mode 100755 evaluation/benchmarks/polyglot_benchmark/scripts/build_local_docker.sh diff --git a/evaluation/benchmarks/polyglot_benchmark/README.md b/evaluation/benchmarks/polyglot_benchmark/README.md index 46f79dfeb9c5..9fa8bfb1dfb3 100644 --- a/evaluation/benchmarks/polyglot_benchmark/README.md +++ b/evaluation/benchmarks/polyglot_benchmark/README.md @@ -53,6 +53,37 @@ export POLYGLOT_BENCHMARK_PATH="/path/to/polyglot-benchmark" # Path to the poly export USE_UNIT_TESTS="true" # Whether to run unit tests (default: true) export NO_DOCKER="true" # Skip Docker container creation and use local runtime (default: false) export POLYGLOT_DOCKER_IMAGE="image:tag" # Custom Docker image to use (default: ghcr.io/opendevin/eval-polyglot:v1.0.0) +export BUILD_LOCAL_DOCKER="true" # Build a local Docker image if one doesn't exist (default: false) +``` + +### Docker Support + +The benchmark uses Docker to create isolated environments for running code in different programming languages. 
There are two ways to use Docker with this benchmark: + +#### Option 1: Build a Local Docker Image + +You can build a local Docker image that contains all the necessary tools for the benchmark: + +```bash +# Build the Docker image +./evaluation/benchmarks/polyglot_benchmark/scripts/build_local_docker.sh + +# Run the benchmark with the local image +./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh eval_gpt4_1106_preview HEAD CodeActAgent 1 1 +``` + +Alternatively, you can set the `BUILD_LOCAL_DOCKER` environment variable: + +```bash +BUILD_LOCAL_DOCKER=true ./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh eval_gpt4_1106_preview HEAD CodeActAgent 1 1 +``` + +#### Option 2: Use a Pre-built Docker Image + +You can specify a custom Docker image to use: + +```bash +POLYGLOT_DOCKER_IMAGE="your-custom-image:tag" ./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh eval_gpt4_1106_preview HEAD CodeActAgent 1 1 ``` ### Troubleshooting @@ -67,18 +98,20 @@ Command 'docker buildx build ...' returned non-zero exit status 1 You can try the following solutions: -1. Run with `NO_DOCKER=true` to use the local runtime instead: +1. Build a local Docker image as described above. + +2. Run with `NO_DOCKER=true` to use the local runtime instead: ```bash NO_DOCKER=true ./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh eval_gpt4_1106_preview HEAD CodeActAgent 1 1 ``` -2. Make sure Docker is installed and running: +3. Make sure Docker is installed and running: ```bash docker --version docker ps ``` -3. Check if you have permission to use Docker: +4. Check if you have permission to use Docker: ```bash sudo usermod -aG docker $USER # Then log out and log back in diff --git a/evaluation/benchmarks/polyglot_benchmark/scripts/build_local_docker.sh b/evaluation/benchmarks/polyglot_benchmark/scripts/build_local_docker.sh new file mode 100755 index 000000000000..d129c5676ec1 --- /dev/null +++ b/evaluation/benchmarks/polyglot_benchmark/scripts/build_local_docker.sh @@ -0,0 +1,94 @@ +#!/bin/bash + +set -e + +# Get the directory of this script +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +BENCHMARK_DIR="$( cd "${SCRIPT_DIR}/.." && pwd )" +REPO_ROOT="$( cd "${BENCHMARK_DIR}/../../.." 
&& pwd )" + +# Create a temporary directory for the Docker build +BUILD_DIR=$(mktemp -d) +trap "rm -rf $BUILD_DIR" EXIT + +echo "Creating Docker build context in $BUILD_DIR" + +# Create a simple Dockerfile that includes all the necessary tools +cat > "$BUILD_DIR/Dockerfile" << 'EOF' +FROM ubuntu:22.04 + +# Avoid prompts from apt +ENV DEBIAN_FRONTEND=noninteractive + +# Install common dependencies +RUN apt-get update && apt-get install -y \ + build-essential \ + curl \ + git \ + python3 \ + python3-pip \ + python3-dev \ + python3-venv \ + wget \ + software-properties-common \ + apt-transport-https \ + ca-certificates \ + gnupg \ + lsb-release \ + libboost-all-dev \ + cmake \ + && rm -rf /var/lib/apt/lists/* + +# Install Python packages +RUN pip3 install --no-cache-dir pytest pytest-timeout + +# Install Node.js and npm +RUN curl -fsSL https://deb.nodesource.com/setup_18.x | bash - \ + && apt-get install -y nodejs \ + && rm -rf /var/lib/apt/lists/* + +# Install Rust +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y +ENV PATH="/root/.cargo/bin:${PATH}" + +# Install Go +RUN wget https://go.dev/dl/go1.20.5.linux-amd64.tar.gz \ + && tar -C /usr/local -xzf go1.20.5.linux-amd64.tar.gz \ + && rm go1.20.5.linux-amd64.tar.gz +ENV PATH="/usr/local/go/bin:${PATH}" + +# Install Java +RUN apt-get update && apt-get install -y openjdk-17-jdk \ + && rm -rf /var/lib/apt/lists/* +ENV JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64 + +# Install Gradle +RUN wget https://services.gradle.org/distributions/gradle-7.6-bin.zip \ + && mkdir /opt/gradle \ + && unzip -d /opt/gradle gradle-7.6-bin.zip \ + && rm gradle-7.6-bin.zip +ENV PATH="/opt/gradle/gradle-7.6/bin:${PATH}" + +# Create workspace directory +RUN mkdir -p /workspace +WORKDIR /workspace + +# Set environment variables +ENV PYTHONUNBUFFERED=1 +ENV PYTHONIOENCODING=UTF-8 + +CMD ["/bin/bash"] +EOF + +# Build the Docker image +IMAGE_NAME="polyglot-benchmark:local" +echo "Building Docker image: $IMAGE_NAME" +docker build -t "$IMAGE_NAME" "$BUILD_DIR" + +# Export the image name as an environment variable +echo "export POLYGLOT_DOCKER_IMAGE=$IMAGE_NAME" > "$BENCHMARK_DIR/docker_image.env" + +echo "Docker image built successfully: $IMAGE_NAME" +echo "To use this image, run:" +echo "source $BENCHMARK_DIR/docker_image.env" +echo "Then run the benchmark as usual." \ No newline at end of file diff --git a/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh index 7c7a3726be5f..a044219c27e1 100755 --- a/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh +++ b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh @@ -14,7 +14,28 @@ EVAL_LANGUAGES=${7:-""} # Set environment variables export USE_UNIT_TESTS=${USE_UNIT_TESTS:-"true"} export NO_DOCKER=${NO_DOCKER:-"false"} -export POLYGLOT_DOCKER_IMAGE=${POLYGLOT_DOCKER_IMAGE:-"ghcr.io/opendevin/eval-polyglot:v1.0.0"} + +# Check if we have a local Docker image env file +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +BENCHMARK_DIR="$( cd "${SCRIPT_DIR}/.." 
&& pwd )" +DOCKER_ENV_FILE="${BENCHMARK_DIR}/docker_image.env" + +if [ -f "$DOCKER_ENV_FILE" ]; then + echo "Loading Docker image configuration from $DOCKER_ENV_FILE" + source "$DOCKER_ENV_FILE" +else + # If no local image is available, use the default + export POLYGLOT_DOCKER_IMAGE=${POLYGLOT_DOCKER_IMAGE:-"ghcr.io/opendevin/eval-polyglot:v1.0.0"} + + # Check if we need to build a local Docker image + if [ "$BUILD_LOCAL_DOCKER" = "true" ]; then + echo "Building local Docker image..." + "${SCRIPT_DIR}/build_local_docker.sh" + source "$DOCKER_ENV_FILE" + fi +fi + +echo "Using Docker image: $POLYGLOT_DOCKER_IMAGE" # Try to find the polyglot-benchmark repository if [ -z "$POLYGLOT_BENCHMARK_PATH" ]; then From 561001019a5d060acbfad9f3c5c171ed862bb658 Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 26 Feb 2025 06:33:36 +0000 Subject: [PATCH 07/22] Set Docker image to build automatically by default --- .../benchmarks/polyglot_benchmark/README.md | 29 ++++++++++++++----- .../polyglot_benchmark/scripts/run_infer.sh | 26 +++++++++++++---- 2 files changed, 43 insertions(+), 12 deletions(-) diff --git a/evaluation/benchmarks/polyglot_benchmark/README.md b/evaluation/benchmarks/polyglot_benchmark/README.md index 9fa8bfb1dfb3..603b3a787fba 100644 --- a/evaluation/benchmarks/polyglot_benchmark/README.md +++ b/evaluation/benchmarks/polyglot_benchmark/README.md @@ -53,16 +53,29 @@ export POLYGLOT_BENCHMARK_PATH="/path/to/polyglot-benchmark" # Path to the poly export USE_UNIT_TESTS="true" # Whether to run unit tests (default: true) export NO_DOCKER="true" # Skip Docker container creation and use local runtime (default: false) export POLYGLOT_DOCKER_IMAGE="image:tag" # Custom Docker image to use (default: ghcr.io/opendevin/eval-polyglot:v1.0.0) -export BUILD_LOCAL_DOCKER="true" # Build a local Docker image if one doesn't exist (default: false) +export BUILD_LOCAL_DOCKER="false" # Build a local Docker image if one doesn't exist (default: true) ``` ### Docker Support -The benchmark uses Docker to create isolated environments for running code in different programming languages. There are two ways to use Docker with this benchmark: +The benchmark uses Docker to create isolated environments for running code in different programming languages. By default, the script will: -#### Option 1: Build a Local Docker Image +1. Try to pull the specified Docker image from the registry +2. 
If the pull fails, automatically build a local Docker image -You can build a local Docker image that contains all the necessary tools for the benchmark: +You have several options for customizing this behavior: + +#### Option 1: Use the Default Behavior (Recommended) + +Simply run the benchmark script, and it will handle the Docker image automatically: + +```bash +./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh eval_gpt4_1106_preview HEAD CodeActAgent 1 1 +``` + +#### Option 2: Manually Build a Local Docker Image + +You can explicitly build a local Docker image before running the benchmark: ```bash # Build the Docker image @@ -72,13 +85,15 @@ You can build a local Docker image that contains all the necessary tools for the ./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh eval_gpt4_1106_preview HEAD CodeActAgent 1 1 ``` -Alternatively, you can set the `BUILD_LOCAL_DOCKER` environment variable: +#### Option 3: Disable Automatic Docker Image Building + +If you want to disable the automatic building of a Docker image: ```bash -BUILD_LOCAL_DOCKER=true ./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh eval_gpt4_1106_preview HEAD CodeActAgent 1 1 +BUILD_LOCAL_DOCKER=false ./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh eval_gpt4_1106_preview HEAD CodeActAgent 1 1 ``` -#### Option 2: Use a Pre-built Docker Image +#### Option 4: Use a Custom Docker Image You can specify a custom Docker image to use: diff --git a/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh index a044219c27e1..ebb3fc2d4a52 100755 --- a/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh +++ b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh @@ -20,6 +20,9 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" BENCHMARK_DIR="$( cd "${SCRIPT_DIR}/.." && pwd )" DOCKER_ENV_FILE="${BENCHMARK_DIR}/docker_image.env" +# Set BUILD_LOCAL_DOCKER to true by default if not specified +export BUILD_LOCAL_DOCKER=${BUILD_LOCAL_DOCKER:-"true"} + if [ -f "$DOCKER_ENV_FILE" ]; then echo "Loading Docker image configuration from $DOCKER_ENV_FILE" source "$DOCKER_ENV_FILE" @@ -27,11 +30,24 @@ else # If no local image is available, use the default export POLYGLOT_DOCKER_IMAGE=${POLYGLOT_DOCKER_IMAGE:-"ghcr.io/opendevin/eval-polyglot:v1.0.0"} - # Check if we need to build a local Docker image - if [ "$BUILD_LOCAL_DOCKER" = "true" ]; then - echo "Building local Docker image..." - "${SCRIPT_DIR}/build_local_docker.sh" - source "$DOCKER_ENV_FILE" + # Try to pull the image first + echo "Trying to pull Docker image: $POLYGLOT_DOCKER_IMAGE" + if ! docker pull "$POLYGLOT_DOCKER_IMAGE" 2>/dev/null; then + echo "Failed to pull Docker image: $POLYGLOT_DOCKER_IMAGE" + + # Build a local Docker image if pulling fails and BUILD_LOCAL_DOCKER is true + if [ "$BUILD_LOCAL_DOCKER" = "true" ]; then + echo "Building local Docker image..." + "${SCRIPT_DIR}/build_local_docker.sh" + source "$DOCKER_ENV_FILE" + else + echo "WARNING: Docker image not found and BUILD_LOCAL_DOCKER is not set to true." + echo "You can build a local Docker image by running:" + echo " ${SCRIPT_DIR}/build_local_docker.sh" + echo "Or set BUILD_LOCAL_DOCKER=true to build it automatically." 
+ fi + else + echo "Successfully pulled Docker image: $POLYGLOT_DOCKER_IMAGE" fi fi From c9e232e76412bbe7ec540f59696c851dbdf7dd73 Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 26 Feb 2025 06:40:24 +0000 Subject: [PATCH 08/22] Fix Docker build issues by adding unzip and simplifying Gradle installation --- .../polyglot_benchmark/scripts/build_local_docker.sh | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/evaluation/benchmarks/polyglot_benchmark/scripts/build_local_docker.sh b/evaluation/benchmarks/polyglot_benchmark/scripts/build_local_docker.sh index d129c5676ec1..0f93c82164a0 100755 --- a/evaluation/benchmarks/polyglot_benchmark/scripts/build_local_docker.sh +++ b/evaluation/benchmarks/polyglot_benchmark/scripts/build_local_docker.sh @@ -30,6 +30,8 @@ RUN apt-get update && apt-get install -y \ python3-dev \ python3-venv \ wget \ + unzip \ + zip \ software-properties-common \ apt-transport-https \ ca-certificates \ @@ -63,11 +65,8 @@ RUN apt-get update && apt-get install -y openjdk-17-jdk \ ENV JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64 # Install Gradle -RUN wget https://services.gradle.org/distributions/gradle-7.6-bin.zip \ - && mkdir /opt/gradle \ - && unzip -d /opt/gradle gradle-7.6-bin.zip \ - && rm gradle-7.6-bin.zip -ENV PATH="/opt/gradle/gradle-7.6/bin:${PATH}" +RUN apt-get update && apt-get install -y gradle \ + && rm -rf /var/lib/apt/lists/* # Create workspace directory RUN mkdir -p /workspace From 97e7ca7f3bb6168e2978bd46bde9e9bff65d2ef5 Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 26 Feb 2025 06:51:59 +0000 Subject: [PATCH 09/22] Restrict polyglot benchmark to use only the same tools as SWE-Bench (execute_bash, finish, str_replace_editor) --- evaluation/benchmarks/polyglot_benchmark/README.md | 7 +++++++ evaluation/benchmarks/polyglot_benchmark/run_infer.py | 10 ++++++++++ 2 files changed, 17 insertions(+) diff --git a/evaluation/benchmarks/polyglot_benchmark/README.md b/evaluation/benchmarks/polyglot_benchmark/README.md index 603b3a787fba..deb02b1969bb 100644 --- a/evaluation/benchmarks/polyglot_benchmark/README.md +++ b/evaluation/benchmarks/polyglot_benchmark/README.md @@ -2,6 +2,13 @@ This benchmark is based on the [Aider Polyglot Benchmark](https://github.com/Aider-AI/polyglot-benchmark), which evaluates how effectively an agent can translate natural language coding requests into executable code that passes unit tests across multiple programming languages. +> **Note**: This benchmark has been modified to use only the same tools as SWE-Bench: +> - execute_bash +> - finish +> - str_replace_editor +> +> This restriction ensures consistent tool usage across benchmarks for more accurate comparisons. 
+ ## Features - Supports multiple programming languages (Python, JavaScript, Rust, Go, C++, Java) diff --git a/evaluation/benchmarks/polyglot_benchmark/run_infer.py b/evaluation/benchmarks/polyglot_benchmark/run_infer.py index 4be3b75ae26a..d79fc2a707aa 100644 --- a/evaluation/benchmarks/polyglot_benchmark/run_infer.py +++ b/evaluation/benchmarks/polyglot_benchmark/run_infer.py @@ -8,6 +8,11 @@ from pathlib import Path from typing import Any, Dict, List, Optional +# NOTE: This benchmark has been modified to use only the same tools as SWE-Bench: +# - execute_bash +# - finish +# - str_replace_editor + import pandas as pd from evaluation.benchmarks.polyglot_benchmark.helper.prompts import ( @@ -103,6 +108,11 @@ def get_config( agent_config = config.get_agent_config(metadata.agent_class) agent_config.enable_prompt_extensions = False + + # Restrict tools to match SWE-Bench (only execute_bash, finish, and str_replace_editor) + agent_config.codeact_enable_jupyter = False + agent_config.codeact_enable_browsing = False + agent_config.codeact_enable_llm_editor = False # copy 'draft_editor' config if exists config_copy = copy.deepcopy(config) From 44bcb39b66a7578172809fe26174d11c53964155 Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 26 Feb 2025 06:57:55 +0000 Subject: [PATCH 10/22] Fix runtime completion to use Docker runtime for running tests --- .../polyglot_benchmark/run_infer.py | 44 ++++++++++++------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/evaluation/benchmarks/polyglot_benchmark/run_infer.py b/evaluation/benchmarks/polyglot_benchmark/run_infer.py index d79fc2a707aa..6b8a841562ca 100644 --- a/evaluation/benchmarks/polyglot_benchmark/run_infer.py +++ b/evaluation/benchmarks/polyglot_benchmark/run_infer.py @@ -198,28 +198,40 @@ def complete_runtime( if command: try: - result = subprocess.run( - command, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, - timeout=180, # 3 minutes timeout - cwd="/workspace", - encoding="utf-8", - errors="replace", - ) - exit_code = result.returncode - test_output = result.stdout + # Use the runtime to run the command inside the Docker container + cmd_str = " ".join(command) + logger.info(f"Running test command: {cmd_str}") + + action = CmdRunAction(command=cmd_str) + logger.info(action, extra={'msg_type': 'ACTION'}) + + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + + if isinstance(obs, CmdOutputObservation): + exit_code = obs.exit_code + test_output = obs.content + else: + logger.error(f"Unexpected observation type: {type(obs)}") + exit_code = 1 + test_output = f"Error: Unexpected observation type: {type(obs)}" # Clean up output test_output = test_output.replace("/workspace", "workspace") # Log test output to history file - with open("/workspace/.aider.chat.history.md", "a") as fh: - fh.write(f"```\n{test_output}\n```") + with tempfile.TemporaryDirectory() as tmpdir: + history_path = os.path.join(tmpdir, ".aider.chat.history.md") + with open(history_path, 'w') as f: + f.write(f"```\n{test_output}\n```") + runtime.copy_to( + history_path, + '/workspace', + ) - except subprocess.TimeoutExpired: - test_output = "Tests timed out!" 
+ except Exception as e: + logger.error(f"Error running tests: {e}") + test_output = f"Tests failed with error: {e}" exit_code = 1 logger.info('-' * 30) From 601da458cdd666efe112e5e202fad674a1cac95c Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 26 Feb 2025 07:07:47 +0000 Subject: [PATCH 11/22] Add script to test one instance per language in polyglot benchmark --- .../polyglot_benchmark/test_all_languages.py | 100 ++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100755 evaluation/benchmarks/polyglot_benchmark/test_all_languages.py diff --git a/evaluation/benchmarks/polyglot_benchmark/test_all_languages.py b/evaluation/benchmarks/polyglot_benchmark/test_all_languages.py new file mode 100755 index 000000000000..89e15b6720f1 --- /dev/null +++ b/evaluation/benchmarks/polyglot_benchmark/test_all_languages.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python3 + +import os +import sys +import argparse +from pathlib import Path + +# Add the parent directory to the Python path +sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent)) + +from evaluation.benchmarks.polyglot_benchmark.run_infer import ( + load_polyglot_dataset, + process_instance, + make_metadata, + get_llm_config_arg, +) +from openhands.core.logger import openhands_logger as logger + +def test_language(language, model, agent): + """Test the first instance of a specific language.""" + print(f"\n{'=' * 50}") + print(f"Testing language: {language}") + print(f"{'=' * 50}\n") + + # Set the environment variable for the polyglot benchmark path + os.environ['POLYGLOT_BENCHMARK_PATH'] = '/workspace/polyglot-benchmark' + + # Load the dataset + dataset = load_polyglot_dataset() + + # Filter by language + dataset = dataset[dataset['language'].str.lower() == language.lower()] + if dataset.empty: + print(f"No instances found for language: {language}") + return False + + # Get the first instance + instance = dataset.iloc[0] + print(f"Testing instance {instance.instance_id}: {instance.instance_name} ({instance.language})") + + # Get LLM config + llm_config = get_llm_config_arg(model) + if llm_config is None: + print(f"Could not find LLM config: {model}") + return False + + # Create metadata + metadata = make_metadata( + llm_config, + 'PolyglotBenchmark', + agent, + 30, # max_iterations + f"test_{language}", + f"evaluation/evaluation_outputs/test_{language}", + ) + + # Process the instance + try: + output = process_instance(instance, metadata, reset_logger=False) + print("\nTest completed successfully!") + print(f"Exit code: {output.test_result['exit_code']}") + print(f"Passed: {output.test_result['exit_code'] == 0}") + return output.test_result['exit_code'] == 0 + except Exception as e: + print(f"Error processing instance: {e}") + return False + +def main(): + parser = argparse.ArgumentParser(description="Test the polyglot benchmark with one instance per language") + parser.add_argument("--model", default="eval_gpt35_turbo", help="Model configuration name") + parser.add_argument("--agent", default="CodeActAgent", help="Agent class name") + parser.add_argument("--languages", default="python,rust,go,javascript,cpp,java", + help="Comma-separated list of languages to test") + args = parser.parse_args() + + languages = args.languages.split(',') + results = {} + + for language in languages: + language = language.strip() + if not language: + continue + + success = test_language(language, args.model, args.agent) + results[language] = "PASSED" if success else "FAILED" + + # Print summary + print("\n" + "=" * 50) + print("SUMMARY OF 
RESULTS") + print("=" * 50) + + for language, result in results.items(): + print(f"{language.ljust(12)}: {result}") + + # Check if all tests passed + all_passed = all(result == "PASSED" for result in results.values()) + print("\nOverall result:", "PASSED" if all_passed else "FAILED") + +if __name__ == "__main__": + main() \ No newline at end of file From 84293fd031abb846bda22a19974ccfc33758c307 Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 26 Feb 2025 07:10:24 +0000 Subject: [PATCH 12/22] Add one-per-language testing mode to polyglot benchmark run_infer.sh --- .../polyglot_benchmark/scripts/run_infer.sh | 135 ++++++++++++++++-- 1 file changed, 126 insertions(+), 9 deletions(-) diff --git a/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh index ebb3fc2d4a52..e2b5044a00bf 100755 --- a/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh +++ b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh @@ -2,14 +2,80 @@ set -e -# Default values -MODEL_CONFIG=${1:-"eval_gpt4_1106_preview"} +# Display usage information +function show_usage { + echo "Usage: $0 [options]" + echo "" + echo "Options:" + echo " --help Show this help message" + echo " --model MODEL Model configuration (default: eval_gpt4_1106_preview)" + echo " --agent AGENT Agent class (default: CodeActAgent)" + echo " --limit LIMIT Evaluation limit (default: -1 for all)" + echo " --workers WORKERS Number of workers (default: 1)" + echo " --ids IDS Comma-separated list of instance IDs" + echo " --languages LANGUAGES Comma-separated list of languages" + echo " --one-per-language Test one instance per language" + echo "" + echo "Legacy positional arguments are still supported:" + echo " $0 MODEL_CONFIG GIT_VERSION AGENT EVAL_LIMIT EVAL_NUM_WORKERS EVAL_IDS EVAL_LANGUAGES" + exit 0 +} + +# Parse named arguments +ONE_PER_LANGUAGE=false +POSITIONAL_ARGS=() + +while [[ $# -gt 0 ]]; do + case $1 in + --help) + show_usage + ;; + --model) + MODEL_CONFIG="$2" + shift 2 + ;; + --agent) + AGENT="$2" + shift 2 + ;; + --limit) + EVAL_LIMIT="$2" + shift 2 + ;; + --workers) + EVAL_NUM_WORKERS="$2" + shift 2 + ;; + --ids) + EVAL_IDS="$2" + shift 2 + ;; + --languages) + EVAL_LANGUAGES="$2" + shift 2 + ;; + --one-per-language) + ONE_PER_LANGUAGE=true + shift + ;; + *) + POSITIONAL_ARGS+=("$1") + shift + ;; + esac +done + +# Restore positional parameters +set -- "${POSITIONAL_ARGS[@]}" + +# Default values (if not set by named arguments) +MODEL_CONFIG=${MODEL_CONFIG:-${1:-"eval_gpt4_1106_preview"}} GIT_VERSION=${2:-"HEAD"} -AGENT=${3:-"CodeActAgent"} -EVAL_LIMIT=${4:-"-1"} -EVAL_NUM_WORKERS=${5:-"1"} -EVAL_IDS=${6:-""} -EVAL_LANGUAGES=${7:-""} +AGENT=${AGENT:-${3:-"CodeActAgent"}} +EVAL_LIMIT=${EVAL_LIMIT:-${4:-"-1"}} +EVAL_NUM_WORKERS=${EVAL_NUM_WORKERS:-${5:-"1"}} +EVAL_IDS=${EVAL_IDS:-${6:-""}} +EVAL_LANGUAGES=${EVAL_LANGUAGES:-${7:-""}} # Set environment variables export USE_UNIT_TESTS=${USE_UNIT_TESTS:-"true"} @@ -102,6 +168,57 @@ if [ -n "${EVAL_LANGUAGES}" ]; then ARGS="${ARGS} --eval-languages ${EVAL_LANGUAGES}" fi -# Run the evaluation +# Change to the repository root directory cd "$(git rev-parse --show-toplevel)" -poetry run python -m evaluation.benchmarks.polyglot_benchmark.run_infer ${ARGS} \ No newline at end of file + +# If one-per-language mode is enabled +if [ "$ONE_PER_LANGUAGE" = true ]; then + echo "Running one instance per language mode..." 
+ + # Define the languages to test + LANGUAGES=("python" "javascript" "rust" "go" "cpp" "java") + + # Create a temporary directory for results + RESULTS_DIR="evaluation/evaluation_outputs/one_per_language_test" + mkdir -p "$RESULTS_DIR" + + # Summary file + SUMMARY_FILE="$RESULTS_DIR/summary.txt" + echo "POLYGLOT BENCHMARK - ONE INSTANCE PER LANGUAGE TEST" > "$SUMMARY_FILE" + echo "=================================================" >> "$SUMMARY_FILE" + echo "Model: $MODEL_CONFIG" >> "$SUMMARY_FILE" + echo "Agent: $AGENT" >> "$SUMMARY_FILE" + echo "Date: $(date)" >> "$SUMMARY_FILE" + echo "=================================================" >> "$SUMMARY_FILE" + echo "" >> "$SUMMARY_FILE" + + # Test each language + for LANG in "${LANGUAGES[@]}"; do + echo "" + echo "===== Testing language: $LANG =====" + echo "" + + # Run with one instance for this language + LANG_ARGS="--agent-cls ${AGENT} --llm-config ${MODEL_CONFIG} --max-iterations 30 --eval-num-workers 1 --eval-n-limit 1 --eval-languages ${LANG} --eval-note one_per_language_${LANG}" + + # Run the evaluation for this language + if poetry run python -m evaluation.benchmarks.polyglot_benchmark.run_infer ${LANG_ARGS}; then + RESULT="PASSED" + else + RESULT="FAILED" + fi + + # Add to summary + echo "${LANG}: ${RESULT}" >> "$SUMMARY_FILE" + done + + # Display summary + echo "" + echo "===== TEST SUMMARY =====" + cat "$SUMMARY_FILE" + echo "" + echo "Detailed results available in: $RESULTS_DIR" +else + # Run the normal evaluation + poetry run python -m evaluation.benchmarks.polyglot_benchmark.run_infer ${ARGS} +fi \ No newline at end of file From 87d9e15491913fe4ba8989dc4bb7e49b287aa845 Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 26 Feb 2025 07:10:54 +0000 Subject: [PATCH 13/22] Update README with one-per-language testing instructions and command-line options --- .../benchmarks/polyglot_benchmark/README.md | 25 ++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/evaluation/benchmarks/polyglot_benchmark/README.md b/evaluation/benchmarks/polyglot_benchmark/README.md index deb02b1969bb..f7ee5e0112fb 100644 --- a/evaluation/benchmarks/polyglot_benchmark/README.md +++ b/evaluation/benchmarks/polyglot_benchmark/README.md @@ -36,11 +36,34 @@ This benchmark is based on the [Aider Polyglot Benchmark](https://github.com/Aid pip install -e .[dev] ``` -2. Run the benchmark: +2. To test one instance per language (quick verification): ```bash + ./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh --one-per-language --model eval_gpt35_turbo + ``` + + This will run one test for each supported language (Python, Rust, Go, JavaScript, C++, and Java) and provide a summary of results. + +3. Run the full benchmark: + ```bash + # Using named arguments (recommended) + ./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh --model eval_gpt35_turbo --agent CodeActAgent --limit 10 --workers 4 --languages python,javascript + + # Or using positional arguments (legacy) ./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh ``` +4. 
Available command-line options: + ``` + --help Show help message + --model MODEL Model configuration (default: eval_gpt4_1106_preview) + --agent AGENT Agent class (default: CodeActAgent) + --limit LIMIT Evaluation limit (default: -1 for all) + --workers WORKERS Number of workers (default: 1) + --ids IDS Comma-separated list of instance IDs + --languages LANGUAGES Comma-separated list of languages + --one-per-language Test one instance per language + ``` + ### Command Line Arguments - `model_config`: The LLM configuration to use (e.g., `eval_gpt4_1106_preview`) From 8a5dc594e5438b1ebf26085cf4a9a18fdbccb5a3 Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 26 Feb 2025 07:17:53 +0000 Subject: [PATCH 14/22] Enable LLM completions logging in aider_bench run_infer.py --- evaluation/benchmarks/aider_bench/run_infer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/evaluation/benchmarks/aider_bench/run_infer.py b/evaluation/benchmarks/aider_bench/run_infer.py index 1ee68c21c2f0..93dd5102359b 100644 --- a/evaluation/benchmarks/aider_bench/run_infer.py +++ b/evaluation/benchmarks/aider_bench/run_infer.py @@ -75,6 +75,8 @@ def get_config( metadata.eval_output_dir, str(instance.instance_id) ) + # Enable logging of LLM completions + llm_config.log_completions = True config.set_llm_config(llm_config) agent_config = config.get_agent_config(metadata.agent_class) agent_config.enable_prompt_extensions = False From 8ffe33e88e6512540247efe1d955696ddd809cb6 Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 26 Feb 2025 07:51:33 +0000 Subject: [PATCH 15/22] Include tools information in evaluation output directory names --- .../benchmarks/aider_bench/run_infer.py | 10 ++++++ .../polyglot_benchmark/run_infer.py | 10 ++++++ .../polyglot_benchmark/test_all_languages.py | 10 ++++++ .../benchmarks/polyglot_benchmark/test_run.py | 10 ++++++ evaluation/benchmarks/swe_bench/run_infer.py | 9 ++++- evaluation/utils/shared.py | 36 +++++++++++++++++-- 6 files changed, 82 insertions(+), 3 deletions(-) diff --git a/evaluation/benchmarks/aider_bench/run_infer.py b/evaluation/benchmarks/aider_bench/run_infer.py index 93dd5102359b..dc1cea9f5de3 100644 --- a/evaluation/benchmarks/aider_bench/run_infer.py +++ b/evaluation/benchmarks/aider_bench/run_infer.py @@ -295,6 +295,15 @@ def process_instance( if llm_config is None: raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}') + # Create details dictionary with agent configuration + agent_details = { + "agent_config": { + "codeact_enable_jupyter": False, + "codeact_enable_browsing": False, + "codeact_enable_llm_editor": False, + } + } + metadata = make_metadata( llm_config, 'AiderBench', @@ -302,6 +311,7 @@ def process_instance( args.max_iterations, args.eval_note, args.eval_output_dir, + details=agent_details, ) output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl') diff --git a/evaluation/benchmarks/polyglot_benchmark/run_infer.py b/evaluation/benchmarks/polyglot_benchmark/run_infer.py index 6b8a841562ca..12d870bd3b1e 100644 --- a/evaluation/benchmarks/polyglot_benchmark/run_infer.py +++ b/evaluation/benchmarks/polyglot_benchmark/run_infer.py @@ -504,6 +504,15 @@ def add_arguments(parser): if llm_config is None: raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}') + # Create details dictionary with agent configuration + agent_details = { + "agent_config": { + "codeact_enable_jupyter": False, + "codeact_enable_browsing": False, + "codeact_enable_llm_editor": False, + } + } + metadata = make_metadata( 
llm_config, 'PolyglotBenchmark', @@ -511,6 +520,7 @@ def add_arguments(parser): args.max_iterations, args.eval_note, args.eval_output_dir, + details=agent_details, ) output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl') diff --git a/evaluation/benchmarks/polyglot_benchmark/test_all_languages.py b/evaluation/benchmarks/polyglot_benchmark/test_all_languages.py index 89e15b6720f1..f196651b890d 100755 --- a/evaluation/benchmarks/polyglot_benchmark/test_all_languages.py +++ b/evaluation/benchmarks/polyglot_benchmark/test_all_languages.py @@ -44,6 +44,15 @@ def test_language(language, model, agent): print(f"Could not find LLM config: {model}") return False + # Create details dictionary with agent configuration + agent_details = { + "agent_config": { + "codeact_enable_jupyter": False, + "codeact_enable_browsing": False, + "codeact_enable_llm_editor": False, + } + } + # Create metadata metadata = make_metadata( llm_config, @@ -52,6 +61,7 @@ def test_language(language, model, agent): 30, # max_iterations f"test_{language}", f"evaluation/evaluation_outputs/test_{language}", + details=agent_details, ) # Process the instance diff --git a/evaluation/benchmarks/polyglot_benchmark/test_run.py b/evaluation/benchmarks/polyglot_benchmark/test_run.py index a8671b0646f1..c946356e90d6 100755 --- a/evaluation/benchmarks/polyglot_benchmark/test_run.py +++ b/evaluation/benchmarks/polyglot_benchmark/test_run.py @@ -50,6 +50,15 @@ def main(): print(f"Could not find LLM config: {args.model}") return + # Create details dictionary with agent configuration + agent_details = { + "agent_config": { + "codeact_enable_jupyter": False, + "codeact_enable_browsing": False, + "codeact_enable_llm_editor": False, + } + } + # Create metadata metadata = make_metadata( llm_config, @@ -58,6 +67,7 @@ def main(): 30, # max_iterations "test", "evaluation/evaluation_outputs/test", + details=agent_details, ) # Process the instance diff --git a/evaluation/benchmarks/swe_bench/run_infer.py b/evaluation/benchmarks/swe_bench/run_infer.py index 5e3f0e6a5bd7..71d37764ccb4 100644 --- a/evaluation/benchmarks/swe_bench/run_infer.py +++ b/evaluation/benchmarks/swe_bench/run_infer.py @@ -531,7 +531,14 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame: if llm_config is None: raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}') - details = {} + # Create details dictionary with agent configuration + details = { + "agent_config": { + "codeact_enable_jupyter": False, + "codeact_enable_browsing": RUN_WITH_BROWSING, + "codeact_enable_llm_editor": False, + } + } _agent_cls = openhands.agenthub.Agent.get_cls(args.agent_cls) dataset_descrption = ( diff --git a/evaluation/utils/shared.py b/evaluation/utils/shared.py index 0f8ac8fa8332..0e49da8ae971 100644 --- a/evaluation/utils/shared.py +++ b/evaluation/utils/shared.py @@ -158,6 +158,35 @@ def cleanup(): process.join() +def get_tools_string(agent_class: str, details: dict[str, Any] | None = None) -> str: + """Generate a string representation of the tools used by the agent. + + Args: + agent_class: The agent class name. + details: Additional details that might contain tool configuration. + + Returns: + A string representation of the tools used, e.g., "bash+finish+str_replace". 
+ """ + # Default tools for CodeActAgent + if agent_class == "CodeActAgent": + tools = ["bash", "finish", "str_replace"] + + # Check if additional tools are enabled + if details and "agent_config" in details: + agent_config = details.get("agent_config", {}) + if agent_config.get("codeact_enable_browsing", False): + tools.extend(["web_read", "browser"]) + if agent_config.get("codeact_enable_jupyter", False): + tools.append("ipython") + if agent_config.get("codeact_enable_llm_editor", False): + tools[-1] = "llm_editor" # Replace str_replace with llm_editor + + return "+".join(tools) + + # For other agents, return a default string + return "default_tools" + def make_metadata( llm_config: LLMConfig, dataset_name: str, @@ -172,12 +201,15 @@ def make_metadata( model_name = llm_config.model.split('/')[-1] model_path = model_name.replace(':', '_').replace('@', '-') eval_note = f'_N_{eval_note}' if eval_note else '' - + + # Get tools string + tools_string = get_tools_string(agent_class, details) + eval_output_path = os.path.join( eval_output_dir, dataset_name, agent_class, - f'{model_path}_maxiter_{max_iterations}{eval_note}', + f'{model_path}_maxiter_{max_iterations}_tools_{tools_string}{eval_note}', ) pathlib.Path(eval_output_path).mkdir(parents=True, exist_ok=True) From d45b98dd1c800e8383480ab4c3e0481a601c1cbc Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 26 Feb 2025 08:00:02 +0000 Subject: [PATCH 16/22] Add evaluation parameter to run_infer.sh scripts for aider_bench and polyglot_benchmark --- .../aider_bench/scripts/run_infer.sh | 30 +++++++++ .../polyglot_benchmark/scripts/run_infer.sh | 65 +++++++++++++++++++ 2 files changed, 95 insertions(+) diff --git a/evaluation/benchmarks/aider_bench/scripts/run_infer.sh b/evaluation/benchmarks/aider_bench/scripts/run_infer.sh index 34249e94c527..3173b3d196f4 100755 --- a/evaluation/benchmarks/aider_bench/scripts/run_infer.sh +++ b/evaluation/benchmarks/aider_bench/scripts/run_infer.sh @@ -9,6 +9,7 @@ AGENT=$3 EVAL_LIMIT=$4 NUM_WORKERS=$5 EVAL_IDS=$6 +RUN_EVALUATION=$7 # New parameter to run evaluation after benchmark if [ -z "$NUM_WORKERS" ]; then NUM_WORKERS=1 @@ -58,3 +59,32 @@ fi # Run the command eval $COMMAND + +# Get the output directory +OUTPUT_DIR=$(find evaluation/evaluation_outputs/AiderBench/$AGENT -type d -name "*$EVAL_NOTE*" | sort -r | head -n 1) +OUTPUT_FILE="$OUTPUT_DIR/output.jsonl" + +# Run evaluation if requested +if [ "$RUN_EVALUATION" = "eval" ]; then + echo "" + echo "======================================" + echo "Running evaluation on results..." + echo "======================================" + echo "" + + if [ -f "$OUTPUT_FILE" ]; then + echo "Evaluating results in: $OUTPUT_FILE" + poetry run python evaluation/benchmarks/aider_bench/scripts/summarize_results.py "$OUTPUT_FILE" + + # Save the evaluation results + EVAL_RESULTS_FILE="$OUTPUT_DIR/evaluation_results.txt" + echo "Saving evaluation results to: $EVAL_RESULTS_FILE" + poetry run python evaluation/benchmarks/aider_bench/scripts/summarize_results.py "$OUTPUT_FILE" > "$EVAL_RESULTS_FILE" + + echo "" + echo "Evaluation complete. Results saved to: $EVAL_RESULTS_FILE" + else + echo "Error: Output file not found: $OUTPUT_FILE" + echo "Cannot run evaluation." 
+ fi +fi diff --git a/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh index e2b5044a00bf..a70df608b454 100755 --- a/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh +++ b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh @@ -15,6 +15,7 @@ function show_usage { echo " --ids IDS Comma-separated list of instance IDs" echo " --languages LANGUAGES Comma-separated list of languages" echo " --one-per-language Test one instance per language" + echo " --eval Run evaluation after benchmark" echo "" echo "Legacy positional arguments are still supported:" echo " $0 MODEL_CONFIG GIT_VERSION AGENT EVAL_LIMIT EVAL_NUM_WORKERS EVAL_IDS EVAL_LANGUAGES" @@ -23,6 +24,7 @@ function show_usage { # Parse named arguments ONE_PER_LANGUAGE=false +RUN_EVALUATION=false POSITIONAL_ARGS=() while [[ $# -gt 0 ]]; do @@ -58,6 +60,10 @@ while [[ $# -gt 0 ]]; do ONE_PER_LANGUAGE=true shift ;; + --eval) + RUN_EVALUATION=true + shift + ;; *) POSITIONAL_ARGS+=("$1") shift @@ -218,7 +224,66 @@ if [ "$ONE_PER_LANGUAGE" = true ]; then cat "$SUMMARY_FILE" echo "" echo "Detailed results available in: $RESULTS_DIR" + + # Run evaluation if requested + if [ "$RUN_EVALUATION" = true ]; then + echo "" + echo "======================================" + echo "Running detailed evaluation on results..." + echo "======================================" + echo "" + + # Evaluate each language's results + for LANG in "${LANGUAGES[@]}"; do + LANG_OUTPUT_DIR="evaluation/evaluation_outputs/one_per_language_${LANG}" + LANG_OUTPUT_FILE="${LANG_OUTPUT_DIR}/output.jsonl" + + if [ -f "$LANG_OUTPUT_FILE" ]; then + echo "" + echo "===== Evaluating $LANG results =====" + echo "" + echo "Evaluating results in: $LANG_OUTPUT_FILE" + + # Save the evaluation results + EVAL_RESULTS_FILE="${LANG_OUTPUT_DIR}/evaluation_results.txt" + echo "Saving evaluation results to: $EVAL_RESULTS_FILE" + poetry run python evaluation/benchmarks/polyglot_benchmark/scripts/summarize_results.py "$LANG_OUTPUT_FILE" > "$EVAL_RESULTS_FILE" + fi + done + + echo "" + echo "Detailed evaluation complete." + fi else # Run the normal evaluation poetry run python -m evaluation.benchmarks.polyglot_benchmark.run_infer ${ARGS} + + # Run evaluation if requested + if [ "$RUN_EVALUATION" = true ]; then + echo "" + echo "======================================" + echo "Running evaluation on results..." + echo "======================================" + echo "" + + # Get the output directory + OUTPUT_DIR=$(find evaluation/evaluation_outputs/PolyglotBenchmark/$AGENT -type d -name "*tools_bash+finish+str_replace*" | sort -r | head -n 1) + OUTPUT_FILE="$OUTPUT_DIR/output.jsonl" + + if [ -f "$OUTPUT_FILE" ]; then + echo "Evaluating results in: $OUTPUT_FILE" + poetry run python evaluation/benchmarks/polyglot_benchmark/scripts/summarize_results.py "$OUTPUT_FILE" + + # Save the evaluation results + EVAL_RESULTS_FILE="$OUTPUT_DIR/evaluation_results.txt" + echo "Saving evaluation results to: $EVAL_RESULTS_FILE" + poetry run python evaluation/benchmarks/polyglot_benchmark/scripts/summarize_results.py "$OUTPUT_FILE" > "$EVAL_RESULTS_FILE" + + echo "" + echo "Evaluation complete. Results saved to: $EVAL_RESULTS_FILE" + else + echo "Error: Output file not found: $OUTPUT_FILE" + echo "Cannot run evaluation." 
+ fi + fi fi \ No newline at end of file From 62d2632c62eaa8760d2223792bda189e7b4c02b4 Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 26 Feb 2025 08:00:55 +0000 Subject: [PATCH 17/22] Update README files with documentation for the new evaluation parameter --- evaluation/benchmarks/aider_bench/README.md | 7 ++++++- evaluation/benchmarks/polyglot_benchmark/README.md | 8 ++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/evaluation/benchmarks/aider_bench/README.md b/evaluation/benchmarks/aider_bench/README.md index 086cfe58160a..a011e6ec9d5c 100644 --- a/evaluation/benchmarks/aider_bench/README.md +++ b/evaluation/benchmarks/aider_bench/README.md @@ -16,7 +16,7 @@ development environment and LLM. ## Start the evaluation ```bash -./evaluation/benchmarks/aider_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids] +./evaluation/benchmarks/aider_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids] [run_evaluation] ``` - `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for @@ -31,6 +31,7 @@ development environment and LLM. - `eval-num-workers`: the number of workers to use for evaluation. Default: `1`. - `eval_ids`, e.g. `"1,3,10"`, limits the evaluation to instances with the given IDs (comma separated). +- `run_evaluation`: set to `eval` to automatically run evaluation after the benchmark completes. There are also following optional environment variables you can set: @@ -53,7 +54,11 @@ You can update the arguments in the script - `--eval-ids`: the IDs of the examples to evaluate (comma separated). For example, `"1,3,10"`. ```bash +# Run benchmark without evaluation ./evaluation/benchmarks/aider_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 100 1 "1,3,10" + +# Run benchmark with automatic evaluation +./evaluation/benchmarks/aider_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 100 1 "1,3,10" eval ``` ### Run Inference on `RemoteRuntime` (experimental) diff --git a/evaluation/benchmarks/polyglot_benchmark/README.md b/evaluation/benchmarks/polyglot_benchmark/README.md index f7ee5e0112fb..f5e8ee6a2903 100644 --- a/evaluation/benchmarks/polyglot_benchmark/README.md +++ b/evaluation/benchmarks/polyglot_benchmark/README.md @@ -38,7 +38,11 @@ This benchmark is based on the [Aider Polyglot Benchmark](https://github.com/Aid 2. To test one instance per language (quick verification): ```bash + # Without evaluation ./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh --one-per-language --model eval_gpt35_turbo + + # With automatic evaluation + ./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh --one-per-language --model eval_gpt35_turbo --eval ``` This will run one test for each supported language (Python, Rust, Go, JavaScript, C++, and Java) and provide a summary of results. 
@@ -48,6 +52,9 @@ This benchmark is based on the [Aider Polyglot Benchmark](https://github.com/Aid # Using named arguments (recommended) ./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh --model eval_gpt35_turbo --agent CodeActAgent --limit 10 --workers 4 --languages python,javascript + # With automatic evaluation + ./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh --model eval_gpt35_turbo --agent CodeActAgent --limit 10 --workers 4 --languages python,javascript --eval + # Or using positional arguments (legacy) ./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh ``` @@ -62,6 +69,7 @@ This benchmark is based on the [Aider Polyglot Benchmark](https://github.com/Aid --ids IDS Comma-separated list of instance IDs --languages LANGUAGES Comma-separated list of languages --one-per-language Test one instance per language + --eval Run evaluation after benchmark completes ``` ### Command Line Arguments From c8dab2c421e4eb8340b6b66bd27fb124d908f302 Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 26 Feb 2025 08:07:50 +0000 Subject: [PATCH 18/22] Fix output directory detection in evaluation scripts --- .../aider_bench/scripts/run_infer.sh | 20 +++++++++++-- .../polyglot_benchmark/scripts/run_infer.sh | 28 ++++++++++++++++--- 2 files changed, 41 insertions(+), 7 deletions(-) diff --git a/evaluation/benchmarks/aider_bench/scripts/run_infer.sh b/evaluation/benchmarks/aider_bench/scripts/run_infer.sh index 3173b3d196f4..3526381de5ab 100755 --- a/evaluation/benchmarks/aider_bench/scripts/run_infer.sh +++ b/evaluation/benchmarks/aider_bench/scripts/run_infer.sh @@ -60,9 +60,23 @@ fi # Run the command eval $COMMAND -# Get the output directory -OUTPUT_DIR=$(find evaluation/evaluation_outputs/AiderBench/$AGENT -type d -name "*$EVAL_NOTE*" | sort -r | head -n 1) -OUTPUT_FILE="$OUTPUT_DIR/output.jsonl" +# Get the output directory - first try the default location +OUTPUT_DIR=$(find evaluation/evaluation_outputs/AiderBench/$AGENT -type d -name "*$EVAL_NOTE*" 2>/dev/null | sort -r | head -n 1) + +# If not found, try to find it anywhere under evaluation_outputs +if [ -z "$OUTPUT_DIR" ]; then + OUTPUT_DIR=$(find . -path "*/evaluation_outputs/*" -type d -name "*$EVAL_NOTE*" 2>/dev/null | sort -r | head -n 1) +fi + +# If still not found, try to find any output.jsonl file +if [ -z "$OUTPUT_DIR" ]; then + OUTPUT_FILE=$(find . -name "output.jsonl" 2>/dev/null | sort -r | head -n 1) + if [ -n "$OUTPUT_FILE" ]; then + OUTPUT_DIR=$(dirname "$OUTPUT_FILE") + fi +else + OUTPUT_FILE="$OUTPUT_DIR/output.jsonl" +fi # Run evaluation if requested if [ "$RUN_EVALUATION" = "eval" ]; then diff --git a/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh index a70df608b454..112028eb7079 100755 --- a/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh +++ b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh @@ -235,7 +235,13 @@ if [ "$ONE_PER_LANGUAGE" = true ]; then # Evaluate each language's results for LANG in "${LANGUAGES[@]}"; do - LANG_OUTPUT_DIR="evaluation/evaluation_outputs/one_per_language_${LANG}" + # Try to find the output directory for this language + LANG_OUTPUT_DIR=$(find . 
-path "*/evaluation_outputs/*" -type d -name "*one_per_language_${LANG}*" 2>/dev/null | sort -r | head -n 1) + + if [ -z "$LANG_OUTPUT_DIR" ]; then + LANG_OUTPUT_DIR="evaluation/evaluation_outputs/one_per_language_${LANG}" + fi + LANG_OUTPUT_FILE="${LANG_OUTPUT_DIR}/output.jsonl" if [ -f "$LANG_OUTPUT_FILE" ]; then @@ -266,9 +272,23 @@ else echo "======================================" echo "" - # Get the output directory - OUTPUT_DIR=$(find evaluation/evaluation_outputs/PolyglotBenchmark/$AGENT -type d -name "*tools_bash+finish+str_replace*" | sort -r | head -n 1) - OUTPUT_FILE="$OUTPUT_DIR/output.jsonl" + # Get the output directory - first try the default location + OUTPUT_DIR=$(find evaluation/evaluation_outputs/PolyglotBenchmark/$AGENT -type d -name "*tools_bash+finish+str_replace*" 2>/dev/null | sort -r | head -n 1) + + # If not found, try to find it anywhere under evaluation_outputs + if [ -z "$OUTPUT_DIR" ]; then + OUTPUT_DIR=$(find . -path "*/evaluation_outputs/*" -type d -name "*tools_bash+finish+str_replace*" 2>/dev/null | sort -r | head -n 1) + fi + + # If still not found, try to find any output.jsonl file + if [ -z "$OUTPUT_DIR" ]; then + OUTPUT_FILE=$(find . -name "output.jsonl" 2>/dev/null | sort -r | head -n 1) + if [ -n "$OUTPUT_FILE" ]; then + OUTPUT_DIR=$(dirname "$OUTPUT_FILE") + fi + else + OUTPUT_FILE="$OUTPUT_DIR/output.jsonl" + fi if [ -f "$OUTPUT_FILE" ]; then echo "Evaluating results in: $OUTPUT_FILE" From fa9a0f8b6bc682ebf89319bbf10873f1392faff1 Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 26 Feb 2025 08:10:52 +0000 Subject: [PATCH 19/22] Fix LLM completions logging to ensure it's enabled in all benchmarks --- evaluation/benchmarks/aider_bench/run_infer.py | 2 -- .../benchmarks/polyglot_benchmark/run_infer.py | 4 ---- evaluation/utils/shared.py | 17 +++++++++-------- 3 files changed, 9 insertions(+), 14 deletions(-) diff --git a/evaluation/benchmarks/aider_bench/run_infer.py b/evaluation/benchmarks/aider_bench/run_infer.py index dc1cea9f5de3..fb035c5a4c1d 100644 --- a/evaluation/benchmarks/aider_bench/run_infer.py +++ b/evaluation/benchmarks/aider_bench/run_infer.py @@ -75,8 +75,6 @@ def get_config( metadata.eval_output_dir, str(instance.instance_id) ) - # Enable logging of LLM completions - llm_config.log_completions = True config.set_llm_config(llm_config) agent_config = config.get_agent_config(metadata.agent_class) agent_config.enable_prompt_extensions = False diff --git a/evaluation/benchmarks/polyglot_benchmark/run_infer.py b/evaluation/benchmarks/polyglot_benchmark/run_infer.py index 12d870bd3b1e..334a0a769bcc 100644 --- a/evaluation/benchmarks/polyglot_benchmark/run_infer.py +++ b/evaluation/benchmarks/polyglot_benchmark/run_infer.py @@ -102,8 +102,6 @@ def get_config( metadata.eval_output_dir, str(instance.instance_id) ) - # Enable logging of LLM completions - llm_config.log_completions = True config.set_llm_config(llm_config) agent_config = config.get_agent_config(metadata.agent_class) @@ -498,8 +496,6 @@ def add_arguments(parser): llm_config = get_llm_config_arg(args.llm_config) # modify_params must be False for evaluation purpose, for reproducibility and accuracy of results llm_config.modify_params = False - # Enable logging of LLM completions - llm_config.log_completions = True if llm_config is None: raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}') diff --git a/evaluation/utils/shared.py b/evaluation/utils/shared.py index 0e49da8ae971..124d2682fcf4 100644 --- a/evaluation/utils/shared.py +++ 
b/evaluation/utils/shared.py @@ -512,14 +512,15 @@ def update_llm_config_for_completions_logging( instance_id: str, ) -> LLMConfig: """Update the LLM config for logging completions.""" - if llm_config.log_completions: - llm_config.log_completions_folder = os.path.join( - eval_output_dir, 'llm_completions', instance_id - ) - logger.info( - f'Logging LLM completions for instance {instance_id} to ' - f'{llm_config.log_completions_folder}' - ) + # Always enable completions logging + llm_config.log_completions = True + llm_config.log_completions_folder = os.path.join( + eval_output_dir, 'llm_completions', instance_id + ) + logger.info( + f'Logging LLM completions for instance {instance_id} to ' + f'{llm_config.log_completions_folder}' + ) return llm_config From 8a4ca1e48c329f895682967aca70b824922570cc Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 26 Feb 2025 08:15:04 +0000 Subject: [PATCH 20/22] Improve output directory detection in evaluation scripts with better path matching and debugging output --- .../aider_bench/scripts/run_infer.sh | 9 ++++++-- .../polyglot_benchmark/scripts/run_infer.sh | 21 ++++++++++++++++--- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/evaluation/benchmarks/aider_bench/scripts/run_infer.sh b/evaluation/benchmarks/aider_bench/scripts/run_infer.sh index 3526381de5ab..737b004121c7 100755 --- a/evaluation/benchmarks/aider_bench/scripts/run_infer.sh +++ b/evaluation/benchmarks/aider_bench/scripts/run_infer.sh @@ -61,11 +61,11 @@ fi eval $COMMAND # Get the output directory - first try the default location -OUTPUT_DIR=$(find evaluation/evaluation_outputs/AiderBench/$AGENT -type d -name "*$EVAL_NOTE*" 2>/dev/null | sort -r | head -n 1) +OUTPUT_DIR=$(find evaluation/evaluation_outputs -path "*/AiderBench/$AGENT/*" -type d -name "*$EVAL_NOTE*" 2>/dev/null | sort -r | head -n 1) # If not found, try to find it anywhere under evaluation_outputs if [ -z "$OUTPUT_DIR" ]; then - OUTPUT_DIR=$(find . -path "*/evaluation_outputs/*" -type d -name "*$EVAL_NOTE*" 2>/dev/null | sort -r | head -n 1) + OUTPUT_DIR=$(find . -path "*/evaluation_outputs/*" -path "*/AiderBench/$AGENT/*" -type d -name "*$EVAL_NOTE*" 2>/dev/null | sort -r | head -n 1) fi # If still not found, try to find any output.jsonl file @@ -78,6 +78,11 @@ else OUTPUT_FILE="$OUTPUT_DIR/output.jsonl" fi +# Print the output directory and file for debugging +echo "" +echo "Output directory: $OUTPUT_DIR" +echo "Output file: $OUTPUT_FILE" + # Run evaluation if requested if [ "$RUN_EVALUATION" = "eval" ]; then echo "" diff --git a/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh index 112028eb7079..34bd41287dcf 100755 --- a/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh +++ b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh @@ -236,7 +236,11 @@ if [ "$ONE_PER_LANGUAGE" = true ]; then # Evaluate each language's results for LANG in "${LANGUAGES[@]}"; do # Try to find the output directory for this language - LANG_OUTPUT_DIR=$(find . -path "*/evaluation_outputs/*" -type d -name "*one_per_language_${LANG}*" 2>/dev/null | sort -r | head -n 1) + LANG_OUTPUT_DIR=$(find evaluation/evaluation_outputs -type d -name "*one_per_language_${LANG}*" 2>/dev/null | sort -r | head -n 1) + + if [ -z "$LANG_OUTPUT_DIR" ]; then + LANG_OUTPUT_DIR=$(find . 
-path "*/evaluation_outputs/*" -type d -name "*one_per_language_${LANG}*" 2>/dev/null | sort -r | head -n 1) + fi if [ -z "$LANG_OUTPUT_DIR" ]; then LANG_OUTPUT_DIR="evaluation/evaluation_outputs/one_per_language_${LANG}" @@ -244,6 +248,12 @@ if [ "$ONE_PER_LANGUAGE" = true ]; then LANG_OUTPUT_FILE="${LANG_OUTPUT_DIR}/output.jsonl" + # Print the language output directory and file for debugging + echo "" + echo "Language: $LANG" + echo "Output directory: $LANG_OUTPUT_DIR" + echo "Output file: $LANG_OUTPUT_FILE" + if [ -f "$LANG_OUTPUT_FILE" ]; then echo "" echo "===== Evaluating $LANG results =====" @@ -273,11 +283,11 @@ else echo "" # Get the output directory - first try the default location - OUTPUT_DIR=$(find evaluation/evaluation_outputs/PolyglotBenchmark/$AGENT -type d -name "*tools_bash+finish+str_replace*" 2>/dev/null | sort -r | head -n 1) + OUTPUT_DIR=$(find evaluation/evaluation_outputs -path "*/PolyglotBenchmark/$AGENT/*" -type d -name "*tools_bash+finish+str_replace*" 2>/dev/null | sort -r | head -n 1) # If not found, try to find it anywhere under evaluation_outputs if [ -z "$OUTPUT_DIR" ]; then - OUTPUT_DIR=$(find . -path "*/evaluation_outputs/*" -type d -name "*tools_bash+finish+str_replace*" 2>/dev/null | sort -r | head -n 1) + OUTPUT_DIR=$(find . -path "*/evaluation_outputs/*" -path "*/PolyglotBenchmark/$AGENT/*" -type d -name "*tools_bash+finish+str_replace*" 2>/dev/null | sort -r | head -n 1) fi # If still not found, try to find any output.jsonl file @@ -290,6 +300,11 @@ else OUTPUT_FILE="$OUTPUT_DIR/output.jsonl" fi + # Print the output directory and file for debugging + echo "" + echo "Output directory: $OUTPUT_DIR" + echo "Output file: $OUTPUT_FILE" + if [ -f "$OUTPUT_FILE" ]; then echo "Evaluating results in: $OUTPUT_FILE" poetry run python evaluation/benchmarks/polyglot_benchmark/scripts/summarize_results.py "$OUTPUT_FILE" From a2d7e631c68bdf4f5175e105dfebe64ae3329fc5 Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 26 Feb 2025 08:17:30 +0000 Subject: [PATCH 21/22] Fix handling of 'eval' parameter to prevent it from being treated as an instance ID --- .../benchmarks/aider_bench/scripts/run_infer.sh | 17 ++++++++++++++++- .../polyglot_benchmark/scripts/run_infer.sh | 8 +++++++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/evaluation/benchmarks/aider_bench/scripts/run_infer.sh b/evaluation/benchmarks/aider_bench/scripts/run_infer.sh index 737b004121c7..102f5d0158b6 100755 --- a/evaluation/benchmarks/aider_bench/scripts/run_infer.sh +++ b/evaluation/benchmarks/aider_bench/scripts/run_infer.sh @@ -11,6 +11,20 @@ NUM_WORKERS=$5 EVAL_IDS=$6 RUN_EVALUATION=$7 # New parameter to run evaluation after benchmark +# Special case: if the 7th parameter is "eval", set RUN_EVALUATION to "eval" +if [ "$RUN_EVALUATION" = "eval" ]; then + echo "Evaluation mode enabled" +fi + +# Special case: if any parameter is "eval", set RUN_EVALUATION to "eval" +for param in "$@"; do + if [ "$param" = "eval" ]; then + RUN_EVALUATION="eval" + echo "Evaluation mode enabled" + break + fi +done + if [ -z "$NUM_WORKERS" ]; then NUM_WORKERS=1 echo "Number of workers not specified, use default $NUM_WORKERS" @@ -52,7 +66,8 @@ if [ -n "$EVAL_LIMIT" ]; then COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT" fi -if [ -n "$EVAL_IDS" ]; then +# Only pass eval-ids if it's not "eval" (which is a special parameter for evaluation mode) +if [ -n "$EVAL_IDS" ] && [ "$EVAL_IDS" != "eval" ]; then echo "EVAL_IDS: $EVAL_IDS" COMMAND="$COMMAND --eval-ids $EVAL_IDS" fi diff --git 
a/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh index 34bd41287dcf..757cee5ac3bb 100755 --- a/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh +++ b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh @@ -64,6 +64,11 @@ while [[ $# -gt 0 ]]; do RUN_EVALUATION=true shift ;; + eval) + # Special case for the 'eval' parameter in the positional arguments + RUN_EVALUATION=true + shift + ;; *) POSITIONAL_ARGS+=("$1") shift @@ -166,7 +171,8 @@ if [ "${EVAL_LIMIT}" != "-1" ]; then ARGS="${ARGS} --eval-n-limit ${EVAL_LIMIT}" fi -if [ -n "${EVAL_IDS}" ]; then +# Only pass eval-ids if it's not "eval" (which is a special parameter for evaluation mode) +if [ -n "${EVAL_IDS}" ] && [ "${EVAL_IDS}" != "eval" ]; then ARGS="${ARGS} --eval-ids ${EVAL_IDS}" fi From 880bc10c3c10b8fa59e274c2e9a0f9f5a366a091 Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 26 Feb 2025 08:56:28 +0000 Subject: [PATCH 22/22] Add benchmark runner script with retry functionality --- run_benchmarks.sh | 108 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100755 run_benchmarks.sh diff --git a/run_benchmarks.sh b/run_benchmarks.sh new file mode 100755 index 000000000000..fdf764bd00fa --- /dev/null +++ b/run_benchmarks.sh @@ -0,0 +1,108 @@ +#!/bin/bash + +# Script to run OpenHands benchmarks with retry functionality +# This script will run the polyglot_benchmark and aider_bench benchmarks +# and retry them until they succeed or reach the maximum number of attempts. + +# Configuration +MAX_ATTEMPTS=10 +RETRY_DELAY=30 # seconds +MODEL_CONFIG="togetherDeepseek" +GIT_VERSION="HEAD" +AGENT="CodeActAgent" +EVAL_LIMIT=1000 +NUM_WORKERS=30 + +# Check if Docker is available +check_docker() { + if ! command -v docker &> /dev/null; then + echo "WARNING: Docker is not available in this environment." + echo "The benchmarks require Docker to run properly." + echo "Continuing anyway, but expect failures if Docker is required." + fi +} + +# Function to run a command and retry until it succeeds +run_with_retry() { + local cmd="$1" + local benchmark_name="$2" + local attempt=1 + local exit_code=1 + + echo "$(date '+%Y-%m-%d %H:%M:%S') - Running $benchmark_name benchmark" + echo "Command: $cmd" + + while [[ $exit_code -ne 0 && $attempt -le $MAX_ATTEMPTS ]]; do + echo "$(date '+%Y-%m-%d %H:%M:%S') - Attempt $attempt of $MAX_ATTEMPTS..." + + # Run the command + eval "$cmd" + exit_code=$? + + if [[ $exit_code -ne 0 ]]; then + echo "$(date '+%Y-%m-%d %H:%M:%S') - Command failed with exit code $exit_code." + + if [[ $attempt -lt $MAX_ATTEMPTS ]]; then + echo "Retrying in $RETRY_DELAY seconds..." + sleep $RETRY_DELAY + ((attempt++)) + fi + fi + done + + if [[ $exit_code -ne 0 ]]; then + echo "$(date '+%Y-%m-%d %H:%M:%S') - $benchmark_name benchmark failed after $MAX_ATTEMPTS attempts." + return 1 + else + echo "$(date '+%Y-%m-%d %H:%M:%S') - $benchmark_name benchmark succeeded on attempt $attempt." 
+ return 0 + fi +} + +# Main execution +echo "=====================================================================" +echo "OpenHands Benchmark Runner" +echo "Started at: $(date '+%Y-%m-%d %H:%M:%S')" +echo "=====================================================================" +echo "Model config: $MODEL_CONFIG" +echo "Git version: $GIT_VERSION" +echo "Agent: $AGENT" +echo "Eval limit: $EVAL_LIMIT" +echo "Number of workers: $NUM_WORKERS" +echo "Maximum retry attempts: $MAX_ATTEMPTS" +echo "Retry delay: $RETRY_DELAY seconds" +echo "=====================================================================" + +# Check for Docker +check_docker + +# Run polyglot_benchmark +echo "=====================================================================" +echo "Running polyglot_benchmark" +echo "=====================================================================" +run_with_retry "./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh $MODEL_CONFIG $GIT_VERSION $AGENT $EVAL_LIMIT $NUM_WORKERS eval" "polyglot_benchmark" +POLYGLOT_RESULT=$? + +# Run aider_bench +echo "=====================================================================" +echo "Running aider_bench" +echo "=====================================================================" +run_with_retry "./evaluation/benchmarks/aider_bench/scripts/run_infer.sh $MODEL_CONFIG $GIT_VERSION $AGENT $EVAL_LIMIT $NUM_WORKERS \"\" eval" "aider_bench" +AIDER_RESULT=$? + +# Summary +echo "=====================================================================" +echo "Benchmark Run Summary - Completed at: $(date '+%Y-%m-%d %H:%M:%S')" +echo "=====================================================================" +echo "polyglot_benchmark: $([ $POLYGLOT_RESULT -eq 0 ] && echo 'SUCCESS' || echo 'FAILED')" +echo "aider_bench: $([ $AIDER_RESULT -eq 0 ] && echo 'SUCCESS' || echo 'FAILED')" +echo "=====================================================================" + +# Exit with success only if both benchmarks succeeded +if [[ $POLYGLOT_RESULT -eq 0 && $AIDER_RESULT -eq 0 ]]; then + echo "All benchmarks completed successfully." + exit 0 +else + echo "One or more benchmarks failed." + exit 1 +fi \ No newline at end of file
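
A brief usage sketch for the `run_benchmarks.sh` runner added in the final patch above: a minimal invocation assuming the script is run from the repository root, Docker is available, and the hardcoded `togetherDeepseek` LLM config exists in `config.toml`. The surrounding `if` is purely illustrative.

```bash
# Run both benchmarks with the retry settings hardcoded at the top of
# run_benchmarks.sh; the script exits 0 only if polyglot_benchmark and
# aider_bench both succeed within MAX_ATTEMPTS retries.
if ./run_benchmarks.sh; then
    echo "All benchmarks passed"
else
    echo "At least one benchmark failed"
fi
```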