From 92e98f65239677a2bd241abae9a15749eca4fa66 Mon Sep 17 00:00:00 2001
From: openhands
Date: Tue, 25 Feb 2025 04:35:27 +0000
Subject: [PATCH 1/6] feat: Enable llm_completions logging in aider_bench

- Added update_llm_config_for_completions_logging to imports
- Modified get_config to accept instance parameter
- Updated llm_config to enable completions logging
- Updated process_instance to pass instance to get_config

This change makes aider_bench save llm_completions in the same way as
swe_bench, with completions being saved in
{eval_output_dir}/llm_completions/{instance_id}/
---
 evaluation/benchmarks/aider_bench/run_infer.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/evaluation/benchmarks/aider_bench/run_infer.py b/evaluation/benchmarks/aider_bench/run_infer.py
index 8045f948d3f9..1ee68c21c2f0 100644
--- a/evaluation/benchmarks/aider_bench/run_infer.py
+++ b/evaluation/benchmarks/aider_bench/run_infer.py
@@ -20,6 +20,7 @@
     prepare_dataset,
     reset_logger_for_multiprocessing,
     run_evaluation,
+    update_llm_config_for_completions_logging,
 )
 from openhands.controller.state.state import State
 from openhands.core.config import (
@@ -45,6 +46,7 @@


 def get_config(
+    instance: pd.Series,
     metadata: EvalMetadata,
 ) -> AppConfig:
     config = AppConfig(
@@ -67,7 +69,13 @@ def get_config(
         workspace_base=None,
         workspace_mount_path=None,
     )
-    config.set_llm_config(metadata.llm_config)
+    # Update llm_config to enable completions logging
+    llm_config = update_llm_config_for_completions_logging(
+        metadata.llm_config,
+        metadata.eval_output_dir,
+        str(instance.instance_id)
+    )
+    config.set_llm_config(llm_config)

     agent_config = config.get_agent_config(metadata.agent_class)
     agent_config.enable_prompt_extensions = False
@@ -170,7 +178,7 @@ def process_instance(
     metadata: EvalMetadata,
     reset_logger: bool = True,
 ) -> EvalOutput:
-    config = get_config(metadata)
+    config = get_config(instance, metadata)

     # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
     if reset_logger:

From c24d8baeb86b98995a87692968e21020e19e6fa4 Mon Sep 17 00:00:00 2001
From: openhands
Date: Tue, 25 Feb 2025 10:00:35 +0000
Subject: [PATCH 2/6] feat: Add polyglot aider benchmark

Added a new benchmark based on Aider's polyglot benchmark that supports:
- Multiple programming languages (Python, JS, Rust, Go, C++, Java)
- End-to-end evaluation of code editing capabilities
- Automated test execution and validation
- Parallel evaluation with multiple workers
- Detailed metrics and logging

Key components:
- run_infer.py: Main benchmark implementation
- Dockerfile: Multi-language development environment
- Scripts for running benchmarks and building Docker image
- Helper modules for prompts and utilities
---
 .../polyglot_aider_bench/Dockerfile           |  47 +++
 .../benchmarks/polyglot_aider_bench/README.md |  73 ++++
 .../polyglot_aider_bench/helper/prompts.py    |  15 +
 .../polyglot_aider_bench/run_infer.py         | 382 ++++++++++++++++++
 .../scripts/build_docker.sh                   |   8 +
 .../polyglot_aider_bench/scripts/run_infer.sh |  68 ++++
 6 files changed, 593 insertions(+)
 create mode 100644 evaluation/benchmarks/polyglot_aider_bench/Dockerfile
 create mode 100644 evaluation/benchmarks/polyglot_aider_bench/README.md
 create mode 100644 evaluation/benchmarks/polyglot_aider_bench/helper/prompts.py
 create mode 100644 evaluation/benchmarks/polyglot_aider_bench/run_infer.py
 create mode 100755 evaluation/benchmarks/polyglot_aider_bench/scripts/build_docker.sh
 create mode 100755 evaluation/benchmarks/polyglot_aider_bench/scripts/run_infer.sh

diff --git a/evaluation/benchmarks/polyglot_aider_bench/Dockerfile b/evaluation/benchmarks/polyglot_aider_bench/Dockerfile
new file mode 100644
index 000000000000..5ba82d25dcaa
--- /dev/null
+++ b/evaluation/benchmarks/polyglot_aider_bench/Dockerfile
@@ -0,0 +1,47 @@
+FROM ubuntu:22.04
+
+# Prevent interactive prompts during package installation
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install common dependencies
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    git \
+    python3 \
+    python3-pip \
+    python3-venv \
+    wget \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Python dependencies
+RUN python3 -m pip install --no-cache-dir pytest
+
+# Install Node.js and npm
+RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - \
+    && apt-get install -y nodejs \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Rust
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+ENV PATH="/root/.cargo/bin:${PATH}"
+
+# Install Go
+RUN wget https://go.dev/dl/go1.21.6.linux-amd64.tar.gz \
+    && tar -C /usr/local -xzf go1.21.6.linux-amd64.tar.gz \
+    && rm go1.21.6.linux-amd64.tar.gz
+ENV PATH="/usr/local/go/bin:${PATH}"
+
+# Install Java and Gradle
+RUN apt-get update && apt-get install -y \
+    openjdk-17-jdk \
+    gradle \
+    && rm -rf /var/lib/apt/lists/*
+
+# Set environment variables
+ENV PYTHONUNBUFFERED=1
+ENV AIDER_DOCKER=1
+
+# Create workspace directory
+RUN mkdir -p /workspace
+WORKDIR /workspace
\ No newline at end of file
diff --git a/evaluation/benchmarks/polyglot_aider_bench/README.md b/evaluation/benchmarks/polyglot_aider_bench/README.md
new file mode 100644
index 000000000000..727866f097b6
--- /dev/null
+++ b/evaluation/benchmarks/polyglot_aider_bench/README.md
@@ -0,0 +1,73 @@
+# Polyglot Aider Benchmark
+
+This benchmark is based on the [Aider Polyglot Benchmark](https://github.com/Aider-AI/aider/tree/main/benchmark), which evaluates how effectively an agent can translate natural language coding requests into executable code that passes unit tests across multiple programming languages.
+
+## Features
+
+- Supports multiple programming languages (Python, JavaScript, Rust, Go, C++, Java)
+- End-to-end evaluation of code editing capabilities
+- Automated test execution and validation
+- Parallel evaluation with multiple workers
+- Detailed metrics and logging
+
+## Usage
+
+1. Make sure you have the required dependencies installed:
+   ```bash
+   pip install -e .[dev]
+   ```
+
+2. Run the benchmark:
+   ```bash
+   ./scripts/run_infer.sh \
+     --agent-cls CodeActAgent \
+     --llm-config configs/llm/gpt-4.yaml \
+     --eval-output-dir eval_output \
+     --eval-num-workers 10
+   ```
+
+### Command Line Arguments
+
+- `--agent-cls`: The agent class to use (default: CodeActAgent)
+- `--llm-config`: Path to the LLM configuration file (required)
+- `--eval-output-dir`: Directory to store evaluation outputs (default: eval_output)
+- `--eval-num-workers`: Number of parallel workers (default: 1)
+- `--eval-n-limit`: Limit the number of test cases to run (-1 for all)
+- `--eval-ids`: Comma-separated list of specific test IDs to run
+- `--eval-note`: Optional note to append to the output directory name
+
+## Output Format
+
+The benchmark saves its results in the following structure:
+```
+eval_output/
+├── PolyglotAiderBench/
+│   ├── CodeActAgent/
+│   │   ├── gpt-4_maxiter_10/
+│   │   │   ├── infer_logs/
+│   │   │   │   └── instance_*.log
+│   │   │   ├── llm_completions/
+│   │   │   │   └── instance_*/
+│   │   │   └── output.jsonl
+│   │   └── metadata.json
+```
+
+Each instance's results include:
+- Test execution results
+- LLM completions and costs
+- Error tracking (syntax errors, timeouts, etc.)
+- Full interaction history
+
+## Supported Languages
+
+The benchmark supports the following languages and test frameworks:
+- Python: pytest
+- JavaScript: npm test
+- Rust: cargo test
+- Go: go test
+- C++: make test
+- Java: Gradle test
+
+## Docker Support
+
+The benchmark runs in a Docker container to safely execute untrusted code. The container image includes all necessary language toolchains and test frameworks.
\ No newline at end of file
diff --git a/evaluation/benchmarks/polyglot_aider_bench/helper/prompts.py b/evaluation/benchmarks/polyglot_aider_bench/helper/prompts.py
new file mode 100644
index 000000000000..f74101755a37
--- /dev/null
+++ b/evaluation/benchmarks/polyglot_aider_bench/helper/prompts.py
@@ -0,0 +1,15 @@
+"""Prompts used in the polyglot aider benchmark."""
+
+INSTRUCTIONS_ADDENDUM = """
+I've provided the following files that need to be modified:
+{file_list}
+
+Please help me implement the necessary changes to meet the requirements.
+You should ONLY modify these files, and NOT create any new files.
+"""
+
+TEST_FAILURES = """
+The tests failed. Please fix the issues and try again.
+Remember to only modify the following files:
+{file_list}
+"""
\ No newline at end of file
diff --git a/evaluation/benchmarks/polyglot_aider_bench/run_infer.py b/evaluation/benchmarks/polyglot_aider_bench/run_infer.py
new file mode 100644
index 000000000000..96399902c837
--- /dev/null
+++ b/evaluation/benchmarks/polyglot_aider_bench/run_infer.py
@@ -0,0 +1,382 @@
+import asyncio
+import copy
+import json
+import os
+import shutil
+import subprocess
+import tempfile
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+import pandas as pd
+from datasets import load_dataset
+
+from evaluation.benchmarks.polyglot_aider_bench.helper.prompts import (
+    INSTRUCTIONS_ADDENDUM,
+    TEST_FAILURES,
+)
+from evaluation.utils.shared import (
+    EvalMetadata,
+    EvalOutput,
+    compatibility_for_eval_history_pairs,
+    make_metadata,
+    prepare_dataset,
+    reset_logger_for_multiprocessing,
+    run_evaluation,
+    update_llm_config_for_completions_logging,
+)
+from openhands.controller.state.state import State
+from openhands.core.config import (
+    AppConfig,
+    SandboxConfig,
+    get_llm_config_arg,
+    parse_arguments,
+)
+from openhands.core.logger import openhands_logger as logger
+from openhands.core.main import create_runtime, run_controller
+from openhands.events.action import CmdRunAction, MessageAction
+from openhands.events.observation import CmdOutputObservation
+from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync
+
+# Configure visibility of unit tests to the Agent.
+USE_UNIT_TESTS = os.environ.get('USE_UNIT_TESTS', 'true').lower() == 'true'
+
+# Map of file extensions to test commands
+TEST_COMMANDS = {
+    ".py": ["python3", "-m", "pytest"],
+    ".rs": ["cargo", "test", "--", "--include-ignored"],
+    ".go": ["go", "test", "./..."],
+    ".js": ["npm", "test"],
+    ".cpp": ["make", "test"],
+    ".java": ["./gradlew", "test"],
+}
+
+def get_config(
+    instance: pd.Series,
+    metadata: EvalMetadata,
+) -> AppConfig:
+    config = AppConfig(
+        default_agent=metadata.agent_class,
+        run_as_openhands=False,
+        runtime=os.environ.get('RUNTIME', 'docker'),
+        max_iterations=metadata.max_iterations,
+        sandbox=SandboxConfig(
+            base_container_image='ghcr.io/opendevin/eval-polyglot:v1.0.0',  # TODO: Create this image
+            enable_auto_lint=True,
+            use_host_network=False,
+            timeout=300,  # Longer timeout for compilation
+            api_key=os.environ.get('ALLHANDS_API_KEY', None),
+            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
+            keep_runtime_alive=False,
+            remote_runtime_init_timeout=1800,
+            remote_runtime_enable_retries=True,
+        ),
+        # do not mount workspace
+        workspace_base=None,
+        workspace_mount_path=None,
+    )
+
+    # Update llm_config to enable completions logging
+    llm_config = update_llm_config_for_completions_logging(
+        metadata.llm_config,
+        metadata.eval_output_dir,
+        str(instance.instance_id)
+    )
+    # Enable logging of LLM completions
+    llm_config.log_completions = True
+    config.set_llm_config(llm_config)
+
+    agent_config = config.get_agent_config(metadata.agent_class)
+    agent_config.enable_prompt_extensions = False
+
+    return config
+
+def initialize_runtime(
+    runtime: Runtime,
+    instance: pd.Series,
+):
+    """Initialize the runtime for the agent."""
+    logger.info('-' * 30)
+    logger.info('BEGIN Runtime Initialization Fn')
+    logger.info('-' * 30)
+    obs: CmdOutputObservation
+
+    # Create workspace
+    action = CmdRunAction(command='mkdir -p /workspace')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    assert obs.exit_code == 0
+
+    action = CmdRunAction(command='cd /workspace')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    assert obs.exit_code == 0
+
+    # Copy files to workspace
+    with tempfile.TemporaryDirectory() as tmpdir:
+        # Copy solution files
+        for file_path in instance.solution_files:
+            file_path = Path(file_path)
+            temp_file = Path(tmpdir) / file_path.name
+            with open(temp_file, 'w') as f:
+                f.write(instance.solution_content[file_path.name])
+            runtime.copy_to(
+                str(temp_file),
+                '/workspace',
+            )
+
+        # Copy test files if enabled
+        if USE_UNIT_TESTS:
+            for file_path in instance.test_files:
+                file_path = Path(file_path)
+                temp_file = Path(tmpdir) / file_path.name
+                with open(temp_file, 'w') as f:
+                    f.write(instance.test_content[file_path.name])
+                runtime.copy_to(
+                    str(temp_file),
+                    '/workspace',
+                )
+
+    logger.info('-' * 30)
+    logger.info('END Runtime Initialization Fn')
+    logger.info('-' * 30)
+
+def run_unit_tests(
+    testdir: Path,
+    test_files: List[str],
+    history_fname: Path,
+) -> Optional[str]:
+    """Run unit tests and return error output if any."""
+    timeout = 180  # 3 minutes timeout
+
+    # Get unique file extensions from test files
+    extensions = {Path(f).suffix for f in test_files}
+
+    # Find matching test command
+    command = None
+    for ext in extensions:
+        if ext in TEST_COMMANDS:
+            command = TEST_COMMANDS[ext]
+            break
+
+    if not command:
+        raise ValueError(f"No test command found for files with extensions: {extensions}")
+
+    # Run tests
+    try:
+        result = subprocess.run(
+            command,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            text=True,
+            timeout=timeout,
+            cwd=testdir,
+            encoding="utf-8",
+            errors="replace",
+        )
+    except subprocess.TimeoutExpired:
+        error = "Tests timed out!"
+        with history_fname.open("a") as fh:
+            fh.write(f"```\n{error}\n```")
+        return error
+
+    success = result.returncode == 0
+    output = result.stdout
+
+    # Clean up output
+    output = output.replace(str(testdir), str(testdir.name))
+    output = output.strip()
+
+    with history_fname.open("a") as fh:
+        fh.write(f"```\n{output}\n```")
+
+    if not success:
+        logger.info(f"Tests failed: {testdir}")
+        return output
+
+    return None
+
+def complete_runtime(
+    runtime: Runtime,
+    instance: pd.Series,
+) -> Dict[str, Any]:
+    """Complete the runtime for the agent."""
+    logger.info('-' * 30)
+    logger.info('BEGIN Runtime Completion Fn')
+    logger.info('-' * 30)
+
+    # Run tests
+    if USE_UNIT_TESTS:
+        test_output = run_unit_tests(
+            Path('/workspace'),
+            instance.test_files,
+            Path('/workspace/.aider.chat.history.md'),
+        )
+        exit_code = 1 if test_output else 0
+    else:
+        test_output = ""
+        exit_code = 0
+
+    logger.info('-' * 30)
+    logger.info('END Runtime Completion Fn')
+    logger.info('-' * 30)
+
+    runtime.close()
+
+    return {
+        'test_output': test_output,
+        'exit_code': exit_code,
+    }
+
+def process_instance(
+    instance: pd.Series,
+    metadata: EvalMetadata,
+    reset_logger: bool = True,
+) -> EvalOutput:
+    config = get_config(instance, metadata)
+
+    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
+    if reset_logger:
+        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
+        reset_logger_for_multiprocessing(logger, str(instance.instance_id), log_dir)
+    else:
+        logger.info(
+            f'\nStarting evaluation for instance {str(instance.instance_id)}.\n'
+        )
+
+    # =============================================
+    # build instruction
+    # =============================================
+
+    # Prepare instruction
+    logger.info(instance)
+    instruction = instance.instruction
+
+    # Add file list to instruction
+    file_list = " ".join(instance.solution_files)
+    instruction += INSTRUCTIONS_ADDENDUM.format(file_list=file_list)
+
+    if USE_UNIT_TESTS:
+        test_files = " ".join(instance.test_files)
+        logger.info(f'\nTest files: {test_files}\n')
+        instruction += (
+            f'Use the appropriate test command to run the tests and verify your solution. '
+            'DO NOT EDIT the test files.\n\n'
+        )
+
+    instruction += (
+        'IMPORTANT: You should ONLY interact with the environment provided '
+        'to you AND NEVER ASK FOR HUMAN HELP.\n'
+    )
+
+    # =============================================
+    # create sandbox and run the agent
+    # =============================================
+
+    runtime: Runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
+
+    initialize_runtime(runtime, instance=instance)
+
+    # Here's how you can run the agent (similar to the `main` function) and get the final task state
+    state: State | None = asyncio.run(
+        run_controller(
+            config=config,
+            initial_user_action=MessageAction(content=instruction),
+            runtime=runtime,
+        )
+    )
+    if state is None:
+        raise ValueError('State should not be None.')
+
+    # =============================================
+    # result evaluation
+    # =============================================
+
+    return_val = complete_runtime(runtime, instance)
+    exit_code = return_val['exit_code']
+    test_output = return_val['test_output']
+
+    errors = []
+    test_cases = None
+    if test_output:
+        if 'SyntaxError' in test_output:
+            errors.append('SyntaxError')
+        elif 'IndentationError' in test_output:
+            errors.append('IndentationError')
+        else:
+            test_cases = test_output
+
+    test_result = {
+        'exit_code': exit_code,
+        'test_cases': test_cases,
+        'errors': errors,
+    }
+
+    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
+    # for compatibility with the existing output format, we can remake the pairs here
+    histories = compatibility_for_eval_history_pairs(state.history)
+    metrics = state.metrics.get() if state.metrics else None
+
+    # Save the output
+    output = EvalOutput(
+        instance_id=str(instance.instance_id),
+        instance=instance.to_dict(),
+        instruction=instruction,
+        metadata=metadata,
+        history=histories,
+        metrics=metrics,
+        error=state.last_error if state and state.last_error else None,
+        test_result=test_result,
+    )
+    return output
+
+if __name__ == '__main__':
+    args = parse_arguments()
+
+    # Load the polyglot benchmark dataset
+    dataset = load_dataset('Aider-AI/polyglot-benchmark')
+    polyglot_tests = dataset['train'].to_pandas()
+
+    llm_config = None
+    if args.llm_config:
+        llm_config = get_llm_config_arg(args.llm_config)
+        # modify_params must be False for evaluation purpose, for reproducibility and accuracy of results
+        llm_config.modify_params = False
+        # Enable logging of LLM completions
+        llm_config.log_completions = True
+
+    if llm_config is None:
+        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
+
+    metadata = make_metadata(
+        llm_config,
+        'PolyglotAiderBench',
+        args.agent_cls,
+        args.max_iterations,
+        args.eval_note,
+        args.eval_output_dir,
+    )
+    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
+
+    # Parse dataset IDs if provided
+    eval_ids = None
+    if args.eval_ids:
+        eval_ids = str(args.eval_ids).split(',')
+        logger.info(f'\nUsing specific dataset IDs: {eval_ids}\n')
+
+    instances = prepare_dataset(
+        polyglot_tests,
+        output_file,
+        args.eval_n_limit,
+        eval_ids=eval_ids,
+    )
+
+    run_evaluation(
+        instances,
+        metadata,
+        output_file,
+        args.eval_num_workers,
+        process_instance,
+    )
\ No newline at end of file
diff --git a/evaluation/benchmarks/polyglot_aider_bench/scripts/build_docker.sh b/evaluation/benchmarks/polyglot_aider_bench/scripts/build_docker.sh
new file mode 100755
index 000000000000..7719fe28c0d8
--- /dev/null
+++ b/evaluation/benchmarks/polyglot_aider_bench/scripts/build_docker.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+# Get the directory where the script is located
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
+cd "$SCRIPT_DIR/.." || exit 1
+
+# Build the Docker image
+docker build -t ghcr.io/opendevin/eval-polyglot:v1.0.0 .
\ No newline at end of file
diff --git a/evaluation/benchmarks/polyglot_aider_bench/scripts/run_infer.sh b/evaluation/benchmarks/polyglot_aider_bench/scripts/run_infer.sh
new file mode 100755
index 000000000000..51fe90c87bcc
--- /dev/null
+++ b/evaluation/benchmarks/polyglot_aider_bench/scripts/run_infer.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+
+# Get the directory where the script is located
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
+cd "$SCRIPT_DIR/.." || exit 1
+
+# Default values
+AGENT_CLS="CodeActAgent"
+EVAL_NOTE=""
+EVAL_OUTPUT_DIR="eval_output"
+EVAL_NUM_WORKERS=1
+EVAL_N_LIMIT=-1
+LLM_CONFIG=""
+EVAL_IDS=""
+
+# Parse command line arguments
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --agent-cls)
+            AGENT_CLS="$2"
+            shift 2
+            ;;
+        --eval-note)
+            EVAL_NOTE="$2"
+            shift 2
+            ;;
+        --eval-output-dir)
+            EVAL_OUTPUT_DIR="$2"
+            shift 2
+            ;;
+        --eval-num-workers)
+            EVAL_NUM_WORKERS="$2"
+            shift 2
+            ;;
+        --eval-n-limit)
+            EVAL_N_LIMIT="$2"
+            shift 2
+            ;;
+        --llm-config)
+            LLM_CONFIG="$2"
+            shift 2
+            ;;
+        --eval-ids)
+            EVAL_IDS="$2"
+            shift 2
+            ;;
+        *)
+            echo "Unknown argument: $1"
+            exit 1
+            ;;
+    esac
+done
+
+# Check required arguments
+if [ -z "$LLM_CONFIG" ]; then
+    echo "Error: --llm-config is required"
+    exit 1
+fi
+
+# Run the evaluation
+python3 run_infer.py \
+    --agent-cls "$AGENT_CLS" \
+    --eval-note "$EVAL_NOTE" \
+    --eval-output-dir "$EVAL_OUTPUT_DIR" \
+    --eval-num-workers "$EVAL_NUM_WORKERS" \
+    --eval-n-limit "$EVAL_N_LIMIT" \
+    --llm-config "$LLM_CONFIG" \
+    ${EVAL_IDS:+--eval-ids "$EVAL_IDS"}
\ No newline at end of file

From a386b423d44880a2f67921dc3f39a8b9519464ff Mon Sep 17 00:00:00 2001
From: openhands
Date: Tue, 25 Feb 2025 10:06:10 +0000
Subject: [PATCH 3/6] feat: Support old-style positional arguments in polyglot
 aider benchmark

Modified run_infer.sh to support both argument styles:
- Old style: <model> <commit> <agent> <max_iters> <num_workers>
- New style: --llm-config <config> --agent-cls <agent> [other options]

Updated README to document both usage styles with examples.
This maintains backward compatibility with existing scripts.
---
 .../benchmarks/polyglot_aider_bench/README.md | 21 +++-
 .../polyglot_aider_bench/scripts/run_infer.sh | 97 +++++++++++--------
 2 files changed, 78 insertions(+), 40 deletions(-)

diff --git a/evaluation/benchmarks/polyglot_aider_bench/README.md b/evaluation/benchmarks/polyglot_aider_bench/README.md
index 727866f097b6..e3f07537ae56 100644
--- a/evaluation/benchmarks/polyglot_aider_bench/README.md
+++ b/evaluation/benchmarks/polyglot_aider_bench/README.md
@@ -17,7 +17,18 @@ This benchmark is based on the [Aider Polyglot Benchmark](https://github.com/Aid
    pip install -e .[dev]
    ```

-2. Run the benchmark:
+2. Run the benchmark using either style:
+
+   **Old Style (Positional Arguments)**:
+   ```bash
+   ./scripts/run_infer.sh <model> <commit> <agent> <max_iters> <num_workers>
+   ```
+   Example:
+   ```bash
+   ./scripts/run_infer.sh 4ominiSky HEAD CodeActAgent 1000 1
+   ```
+
+   **New Style (Named Arguments)**:
    ```bash
    ./scripts/run_infer.sh \
      --agent-cls CodeActAgent \
      --llm-config configs/llm/gpt-4.yaml \
      --eval-output-dir eval_output \
      --eval-num-workers 10
    ```

@@ -28,6 +39,14 @@

 ### Command Line Arguments

+**Old Style (Positional)**:
+1. `model`: Model name (will look for configs/llm/{model}.yaml)
+2. `commit`: Git commit or note to append to output directory
+3. `agent`: Agent class name
+4. `max_iters`: Maximum iterations per test
+5. `num_workers`: Number of parallel workers
+
+**New Style (Named)**:
 - `--agent-cls`: The agent class to use (default: CodeActAgent)
 - `--llm-config`: Path to the LLM configuration file (required)
 - `--eval-output-dir`: Directory to store evaluation outputs (default: eval_output)
diff --git a/evaluation/benchmarks/polyglot_aider_bench/scripts/run_infer.sh b/evaluation/benchmarks/polyglot_aider_bench/scripts/run_infer.sh
index 51fe90c87bcc..13fe63ff1ca1 100755
--- a/evaluation/benchmarks/polyglot_aider_bench/scripts/run_infer.sh
+++ b/evaluation/benchmarks/polyglot_aider_bench/scripts/run_infer.sh
@@ -13,47 +13,65 @@ EVAL_N_LIMIT=-1
 LLM_CONFIG=""
 EVAL_IDS=""

-# Parse command line arguments
-while [[ $# -gt 0 ]]; do
-    case $1 in
-        --agent-cls)
-            AGENT_CLS="$2"
-            shift 2
-            ;;
-        --eval-note)
-            EVAL_NOTE="$2"
-            shift 2
-            ;;
-        --eval-output-dir)
-            EVAL_OUTPUT_DIR="$2"
-            shift 2
-            ;;
-        --eval-num-workers)
-            EVAL_NUM_WORKERS="$2"
-            shift 2
-            ;;
-        --eval-n-limit)
-            EVAL_N_LIMIT="$2"
-            shift 2
-            ;;
-        --llm-config)
-            LLM_CONFIG="$2"
-            shift 2
-            ;;
-        --eval-ids)
-            EVAL_IDS="$2"
-            shift 2
-            ;;
-        *)
-            echo "Unknown argument: $1"
-            exit 1
-            ;;
-    esac
-done
+# Check if using positional arguments (old style)
+if [[ $# -ge 5 && "$1" != "--"* ]]; then
+    # Old style: <model> <commit> <agent> <max_iters> <num_workers>
+    MODEL="$1"
+    COMMIT="$2"
+    AGENT_CLS="$3"
+    MAX_ITERS="$4"
+    EVAL_NUM_WORKERS="$5"
+
+    # Convert to new style arguments
+    LLM_CONFIG="configs/llm/${MODEL}.yaml"
+    EVAL_NOTE="${COMMIT}"
+    MAX_ITERATIONS="--max-iterations ${MAX_ITERS}"
+else
+    # Parse named arguments (new style)
+    while [[ $# -gt 0 ]]; do
+        case $1 in
+            --agent-cls)
+                AGENT_CLS="$2"
+                shift 2
+                ;;
+            --eval-note)
+                EVAL_NOTE="$2"
+                shift 2
+                ;;
+            --eval-output-dir)
+                EVAL_OUTPUT_DIR="$2"
+                shift 2
+                ;;
+            --eval-num-workers)
+                EVAL_NUM_WORKERS="$2"
+                shift 2
+                ;;
+            --eval-n-limit)
+                EVAL_N_LIMIT="$2"
+                shift 2
+                ;;
+            --llm-config)
+                LLM_CONFIG="$2"
+                shift 2
+                ;;
+            --eval-ids)
+                EVAL_IDS="$2"
+                shift 2
+                ;;
+            *)
+                echo "Unknown argument: $1"
+                exit 1
+                ;;
+        esac
+    done
+fi

 # Check required arguments
 if [ -z "$LLM_CONFIG" ]; then
-    echo "Error: --llm-config is required"
+    echo "Error: LLM config is required"
+    echo "Usage:"
+    echo "  Old style: $0 <model> <commit> <agent> <max_iters> <num_workers>"
+    echo "  New style: $0 --llm-config <config> --agent-cls <agent> [other options]"
     exit 1
 fi

@@ -65,4 +83,5 @@ python3 run_infer.py \
     --eval-num-workers "$EVAL_NUM_WORKERS" \
     --eval-n-limit "$EVAL_N_LIMIT" \
     --llm-config "$LLM_CONFIG" \
-    ${EVAL_IDS:+--eval-ids "$EVAL_IDS"}
\ No newline at end of file
+    ${EVAL_IDS:+--eval-ids "$EVAL_IDS"} \
+    ${MAX_ITERATIONS:-}
\ No newline at end of file

From 0121e5711b5962c5baa824f034a92ff4f26ac57c Mon Sep 17 00:00:00 2001
From: openhands
Date: Tue, 25 Feb 2025 10:10:14 +0000
Subject: [PATCH 4/6] fix: Use relative imports in polyglot aider benchmark

- Changed imports to use relative paths
- Added __init__.py to helper directory
- This fixes ModuleNotFoundError when running the benchmark
---
 evaluation/benchmarks/polyglot_aider_bench/helper/__init__.py | 1 +
 evaluation/benchmarks/polyglot_aider_bench/run_infer.py       | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)
 create mode 100644 evaluation/benchmarks/polyglot_aider_bench/helper/__init__.py

diff --git a/evaluation/benchmarks/polyglot_aider_bench/helper/__init__.py b/evaluation/benchmarks/polyglot_aider_bench/helper/__init__.py
new file mode 100644
index 000000000000..f5f6062fa9a4
--- /dev/null
+++ b/evaluation/benchmarks/polyglot_aider_bench/helper/__init__.py
@@ -0,0 +1 @@
+"""Helper modules for the polyglot aider benchmark."""
\ No newline at end of file
diff --git a/evaluation/benchmarks/polyglot_aider_bench/run_infer.py b/evaluation/benchmarks/polyglot_aider_bench/run_infer.py
index 96399902c837..27ebe077b543 100644
--- a/evaluation/benchmarks/polyglot_aider_bench/run_infer.py
+++ b/evaluation/benchmarks/polyglot_aider_bench/run_infer.py
@@ -11,11 +11,11 @@
 import pandas as pd
 from datasets import load_dataset

-from evaluation.benchmarks.polyglot_aider_bench.helper.prompts import (
+from .helper.prompts import (
     INSTRUCTIONS_ADDENDUM,
     TEST_FAILURES,
 )
-from evaluation.utils.shared import (
+from ....utils.shared import (
     EvalMetadata,
     EvalOutput,
     compatibility_for_eval_history_pairs,

From 3a2b167309a0ad9bf6b6922327aebba1496b1d84 Mon Sep 17 00:00:00 2001
From: openhands
Date: Tue, 25 Feb 2025 10:11:01 +0000
Subject: [PATCH 5/6] fix: Fix Python package imports in polyglot aider
 benchmark

- Added OpenHands root to PYTHONPATH in run_infer.sh
- Changed back to absolute imports in run_infer.py
- This fixes the 'no known parent package' error
---
 evaluation/benchmarks/polyglot_aider_bench/run_infer.py  | 4 ++--
 .../polyglot_aider_bench/scripts/run_infer.sh            | 8 +++++++-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/evaluation/benchmarks/polyglot_aider_bench/run_infer.py b/evaluation/benchmarks/polyglot_aider_bench/run_infer.py
index 27ebe077b543..96399902c837 100644
--- a/evaluation/benchmarks/polyglot_aider_bench/run_infer.py
+++ b/evaluation/benchmarks/polyglot_aider_bench/run_infer.py
@@ -11,11 +11,11 @@
 import pandas as pd
 from datasets import load_dataset

-from .helper.prompts import (
+from evaluation.benchmarks.polyglot_aider_bench.helper.prompts import (
     INSTRUCTIONS_ADDENDUM,
     TEST_FAILURES,
 )
-from ....utils.shared import (
+from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
     compatibility_for_eval_history_pairs,
diff --git a/evaluation/benchmarks/polyglot_aider_bench/scripts/run_infer.sh b/evaluation/benchmarks/polyglot_aider_bench/scripts/run_infer.sh
index 13fe63ff1ca1..d802a454e605 100755
--- a/evaluation/benchmarks/polyglot_aider_bench/scripts/run_infer.sh
+++ b/evaluation/benchmarks/polyglot_aider_bench/scripts/run_infer.sh
@@ -2,7 +2,13 @@

 # Get the directory where the script is located
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
-cd "$SCRIPT_DIR/.." || exit 1
+BENCH_DIR="$( cd "$SCRIPT_DIR/.." &> /dev/null && pwd )"
+ROOT_DIR="$( cd "$BENCH_DIR/../../.." &> /dev/null && pwd )"
+
+# Add OpenHands root to PYTHONPATH
+export PYTHONPATH="${ROOT_DIR}:${PYTHONPATH:-}"
+
+cd "$BENCH_DIR" || exit 1

 # Default values
 AGENT_CLS="CodeActAgent"

From afbf10f8569264e337fd811d2b02c67fa4499d0f Mon Sep 17 00:00:00 2001
From: AlexCuadron
Date: Wed, 26 Feb 2025 06:03:07 +0000
Subject: [PATCH 6/6] Add polyglot benchmark implementation

---
 .../benchmarks/polyglot_benchmark/Dockerfile  |  63 +++
 .../benchmarks/polyglot_benchmark/README.md   |  90 ++++
 .../polyglot_benchmark/helper/__init__.py     |   0
 .../polyglot_benchmark/helper/prompts.py      |  28 +
 .../polyglot_benchmark/run_infer.py           | 487 ++++++++++++++++++
 .../scripts/build_docker.sh                   |  12 +
 .../polyglot_benchmark/scripts/run_infer.sh   |  35 ++
 .../scripts/summarize_results.py              |  84 +++
 .../polyglot_benchmark/test_load_dataset.py   |  40 ++
 .../benchmarks/polyglot_benchmark/test_run.py |  73 +++
 10 files changed, 912 insertions(+)
 create mode 100644 evaluation/benchmarks/polyglot_benchmark/Dockerfile
 create mode 100644 evaluation/benchmarks/polyglot_benchmark/README.md
 create mode 100644 evaluation/benchmarks/polyglot_benchmark/helper/__init__.py
 create mode 100644 evaluation/benchmarks/polyglot_benchmark/helper/prompts.py
 create mode 100644 evaluation/benchmarks/polyglot_benchmark/run_infer.py
 create mode 100755 evaluation/benchmarks/polyglot_benchmark/scripts/build_docker.sh
 create mode 100755 evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh
 create mode 100755 evaluation/benchmarks/polyglot_benchmark/scripts/summarize_results.py
 create mode 100755 evaluation/benchmarks/polyglot_benchmark/test_load_dataset.py
 create mode 100755 evaluation/benchmarks/polyglot_benchmark/test_run.py

diff --git a/evaluation/benchmarks/polyglot_benchmark/Dockerfile b/evaluation/benchmarks/polyglot_benchmark/Dockerfile
new file mode 100644
index 000000000000..ed789e6d8000
--- /dev/null
+++ b/evaluation/benchmarks/polyglot_benchmark/Dockerfile
@@ -0,0 +1,63 @@
+FROM ubuntu:22.04
+
+# Avoid prompts from apt
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install common dependencies
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    git \
+    python3 \
+    python3-pip \
+    python3-dev \
+    python3-venv \
+    wget \
+    software-properties-common \
+    apt-transport-https \
+    ca-certificates \
+    gnupg \
+    lsb-release \
+    libboost-all-dev \
+    cmake \
+    unzip \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Python packages
+RUN pip3 install --no-cache-dir pytest pytest-timeout
+
+# Install Node.js and npm
+RUN curl -fsSL https://deb.nodesource.com/setup_18.x | bash - \
+    && apt-get install -y nodejs \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Rust
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+ENV PATH="/root/.cargo/bin:${PATH}"
+
+# Install Go
+RUN wget https://go.dev/dl/go1.20.5.linux-amd64.tar.gz \
+    && tar -C /usr/local -xzf go1.20.5.linux-amd64.tar.gz \
+    && rm go1.20.5.linux-amd64.tar.gz
+ENV PATH="/usr/local/go/bin:${PATH}"
+
+# Install Java
+RUN apt-get update && apt-get install -y openjdk-17-jdk \
+    && rm -rf /var/lib/apt/lists/*
+ENV JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64
+
+# Install Gradle (unzip added to the apt packages above so this step can extract it)
+RUN wget https://services.gradle.org/distributions/gradle-7.6-bin.zip \
+    && mkdir /opt/gradle \
+    && unzip -d /opt/gradle gradle-7.6-bin.zip \
+    && rm gradle-7.6-bin.zip
+ENV PATH="/opt/gradle/gradle-7.6/bin:${PATH}"
+
+# Create workspace directory
+RUN mkdir -p /workspace
+WORKDIR /workspace
+
+# Set environment variables
+ENV PYTHONUNBUFFERED=1
+ENV PYTHONIOENCODING=UTF-8
+
+CMD ["/bin/bash"]
\ No newline at end of file
diff --git a/evaluation/benchmarks/polyglot_benchmark/README.md b/evaluation/benchmarks/polyglot_benchmark/README.md
new file mode 100644
index 000000000000..d92251acb9f7
--- /dev/null
+++ b/evaluation/benchmarks/polyglot_benchmark/README.md
@@ -0,0 +1,90 @@
+# Polyglot Benchmark
+
+This benchmark is based on the [Aider Polyglot Benchmark](https://github.com/Aider-AI/polyglot-benchmark), which evaluates how effectively an agent can translate natural language coding requests into executable code that passes unit tests across multiple programming languages.
+
+## Features
+
+- Supports multiple programming languages (Python, JavaScript, Rust, Go, C++, Java)
+- End-to-end evaluation of code editing capabilities
+- Automated test execution and validation
+- Parallel evaluation with multiple workers
+- Detailed metrics and logging
+
+## Setup
+
+1. Clone the polyglot-benchmark repository:
+   ```bash
+   git clone https://github.com/Aider-AI/polyglot-benchmark.git /workspace/polyglot-benchmark
+   ```
+
+2. Build the Docker image for the benchmark:
+   ```bash
+   ./evaluation/benchmarks/polyglot_benchmark/scripts/build_docker.sh
+   ```
+
+## Usage
+
+1. Make sure you have the required dependencies installed:
+   ```bash
+   pip install -e .[dev]
+   ```
+
+2. Run the benchmark:
+   ```bash
+   ./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh <model_config> <git-version> <agent> <eval_limit> <eval-num-workers> <eval_ids> <eval_languages>
+   ```
+
+### Command Line Arguments
+
+- `model_config`: The LLM configuration to use (e.g., `eval_gpt4_1106_preview`)
+- `git-version`: Git commit or note to append to output directory (e.g., `HEAD`)
+- `agent`: Agent class name (e.g., `CodeActAgent`)
+- `eval_limit`: Limit the number of examples to evaluate (default: `-1` for all)
+- `eval-num-workers`: Number of parallel workers (default: `1`)
+- `eval_ids`: Comma-separated list of specific test IDs to run (e.g., `"1,3,10"`)
+- `eval_languages`: Comma-separated list of languages to test (e.g., `"python,javascript,rust"`)
+
+### Environment Variables
+
+You can also set the following environment variables:
+
+```bash
+export POLYGLOT_BENCHMARK_PATH="/path/to/polyglot-benchmark"  # Path to the polyglot-benchmark repository
+export USE_UNIT_TESTS="true"  # Whether to run unit tests (default: true)
+```
+
+### Example
+
+```bash
+# Run evaluation on CodeActAgent for all Python instances with 2 workers
+export POLYGLOT_BENCHMARK_PATH="/workspace/polyglot-benchmark"
+./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh eval_gpt4_1106_preview HEAD CodeActAgent -1 2 "" "python"
+```
+
+## Summarize Results
+
+After running the benchmark, you can summarize the results:
+
+```bash
+poetry run python ./evaluation/benchmarks/polyglot_benchmark/scripts/summarize_results.py <path_to_output.jsonl>
+```
+
+Example:
+
+```bash
+poetry run python ./evaluation/benchmarks/polyglot_benchmark/scripts/summarize_results.py evaluation/evaluation_outputs/outputs/PolyglotBenchmark/CodeActAgent/gpt-4-1106-preview_maxiter_30/output.jsonl
+```
+
+## Supported Languages
+
+The benchmark supports the following languages and test frameworks:
+- Python: pytest
+- JavaScript: npm test
+- Rust: cargo test
+- Go: go test
+- C++: make test
+- Java: Gradle test
+
+## Docker Support
+
+The benchmark runs in a Docker container to safely execute untrusted code. The container image includes all necessary language toolchains and test frameworks.
\ No newline at end of file
diff --git a/evaluation/benchmarks/polyglot_benchmark/helper/__init__.py b/evaluation/benchmarks/polyglot_benchmark/helper/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/evaluation/benchmarks/polyglot_benchmark/helper/prompts.py b/evaluation/benchmarks/polyglot_benchmark/helper/prompts.py
new file mode 100644
index 000000000000..61bc0e54cb11
--- /dev/null
+++ b/evaluation/benchmarks/polyglot_benchmark/helper/prompts.py
@@ -0,0 +1,28 @@
+"""Prompts used in the polyglot benchmark."""
+
+INSTRUCTIONS_ADDENDUM = """
+I've provided the following files that need to be modified:
+{file_list}
+
+Please help me implement the necessary changes to meet the requirements.
+You should ONLY modify these files, and NOT create any new files.
+"""
+
+TEST_FAILURES = """
+The tests failed. Please fix the issues and try again.
+Remember to only modify the following files:
+{file_list}
+"""
+
+# Dictionary mapping agent class names to their specific instruction suffixes
+INST_SUFFIXES = {
+    'CodeActAgent': (
+        'REMEMBER: All edits must be made directly in the files. Do NOT send'
+        ' the edited file as output to the user.\n'
+    )
+}
+
+# Dictionary mapping agent class names to their fake response functions
+FAKE_RESPONSES = {
+    'CodeActAgent': lambda _: None,  # Will be replaced with codeact_user_response from shared.py
+}
\ No newline at end of file
diff --git a/evaluation/benchmarks/polyglot_benchmark/run_infer.py b/evaluation/benchmarks/polyglot_benchmark/run_infer.py
new file mode 100644
index 000000000000..45a9ee4f91ac
--- /dev/null
+++ b/evaluation/benchmarks/polyglot_benchmark/run_infer.py
@@ -0,0 +1,487 @@
+import asyncio
+import copy
+import json
+import os
+import shutil
+import subprocess
+import tempfile
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+import pandas as pd
+
+from evaluation.benchmarks.polyglot_benchmark.helper.prompts import (
+    INSTRUCTIONS_ADDENDUM,
+    INST_SUFFIXES,
+    TEST_FAILURES,
+    FAKE_RESPONSES,
+)
+from evaluation.utils.shared import (
+    EvalMetadata,
+    EvalOutput,
+    compatibility_for_eval_history_pairs,
+    make_metadata,
+    prepare_dataset,
+    reset_logger_for_multiprocessing,
+    run_evaluation,
+    update_llm_config_for_completions_logging,
+    codeact_user_response,
+)
+from openhands.controller.state.state import State
+from openhands.core.config import (
+    AppConfig,
+    SandboxConfig,
+    get_llm_config_arg,
+    load_from_toml,
+    parse_arguments,
+)
+from openhands.core.logger import openhands_logger as logger
+from openhands.core.main import create_runtime, run_controller
+from openhands.events.action import CmdRunAction, MessageAction
+from openhands.events.observation import CmdOutputObservation
+from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync
+
+# Configure visibility of unit tests to the Agent.
+USE_UNIT_TESTS = os.environ.get('USE_UNIT_TESTS', 'true').lower() == 'true'
+
+# Map of file extensions to test commands
+TEST_COMMANDS = {
+    ".py": ["python3", "-m", "pytest"],
+    ".rs": ["cargo", "test", "--", "--include-ignored"],
+    ".go": ["go", "test", "./..."],
+    ".js": ["npm", "test"],
+    ".cpp": ["make", "test"],
+    ".java": ["./gradlew", "test"],
+}
+
+# Update fake responses with the actual function
+FAKE_RESPONSES['CodeActAgent'] = codeact_user_response
+
+def get_config(
+    instance: pd.Series,
+    metadata: EvalMetadata,
+) -> AppConfig:
+    config = AppConfig(
+        default_agent=metadata.agent_class,
+        run_as_openhands=False,
+        runtime=os.environ.get('RUNTIME', 'docker'),
+        max_iterations=metadata.max_iterations,
+        sandbox=SandboxConfig(
+            base_container_image='ghcr.io/opendevin/eval-polyglot:v1.0.0',  # TODO: Create this image
+            enable_auto_lint=True,
+            use_host_network=False,
+            timeout=300,  # Longer timeout for compilation
+            api_key=os.environ.get('ALLHANDS_API_KEY', None),
+            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
+            keep_runtime_alive=False,
+            remote_runtime_init_timeout=1800,
+            remote_runtime_enable_retries=True,
+        ),
+        # do not mount workspace
+        workspace_base=None,
+        workspace_mount_path=None,
+    )
+
+    # Update llm_config to enable completions logging
+    llm_config = update_llm_config_for_completions_logging(
+        metadata.llm_config,
+        metadata.eval_output_dir,
+        str(instance.instance_id)
+    )
+    # Enable logging of LLM completions
+    llm_config.log_completions = True
+    config.set_llm_config(llm_config)
+
+    agent_config = config.get_agent_config(metadata.agent_class)
+    agent_config.enable_prompt_extensions = False
+
+    # copy 'draft_editor' config if exists
+    config_copy = copy.deepcopy(config)
+    load_from_toml(config_copy)
+    if 'draft_editor' in config_copy.llms:
+        config.set_llm_config(config_copy.llms['draft_editor'], 'draft_editor')
+
+    return config
+
+def initialize_runtime(
+    runtime: Runtime,
+    instance: pd.Series,
+):
+    """Initialize the runtime for the agent."""
+    logger.info('-' * 30)
+    logger.info('BEGIN Runtime Initialization Fn')
+    logger.info('-' * 30)
+    obs: CmdOutputObservation
+
+    # Create workspace
+    action = CmdRunAction(command='mkdir -p /workspace')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    assert obs.exit_code == 0
+
+    action = CmdRunAction(command='cd /workspace')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    assert obs.exit_code == 0
+
+    # Copy files to workspace
+    with tempfile.TemporaryDirectory() as tmpdir:
+        # Copy solution files
+        for file_path in instance.solution_files:
+            file_path = Path(file_path)
+            temp_file = Path(tmpdir) / file_path.name
+            with open(temp_file, 'w') as f:
+                f.write(instance.solution_content[file_path.name])
+            runtime.copy_to(
+                str(temp_file),
+                '/workspace',
+            )
+
+        # Copy test files if enabled
+        if USE_UNIT_TESTS:
+            for file_path in instance.test_files:
+                file_path = Path(file_path)
+                temp_file = Path(tmpdir) / file_path.name
+                with open(temp_file, 'w') as f:
+                    f.write(instance.test_content[file_path.name])
+                runtime.copy_to(
+                    str(temp_file),
+                    '/workspace',
+                )
+
+    logger.info('-' * 30)
+    logger.info('END Runtime Initialization Fn')
+    logger.info('-' * 30)
+
+def complete_runtime(
+    runtime: Runtime,
+    instance: pd.Series,
+) -> Dict[str, Any]:
+    """Complete the runtime for the agent."""
+    logger.info('-' * 30)
+    logger.info('BEGIN Runtime Completion Fn')
+    logger.info('-' * 30)
+
+    # Run tests
+    test_output = ""
"" + exit_code = 1 + + if USE_UNIT_TESTS: + # Get unique file extensions from test files + extensions = {Path(f).suffix for f in instance.test_files} + + # Find matching test command + command = None + for ext in extensions: + if ext in TEST_COMMANDS: + command = TEST_COMMANDS[ext] + break + + if command: + try: + result = subprocess.run( + command, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + timeout=180, # 3 minutes timeout + cwd="/workspace", + encoding="utf-8", + errors="replace", + ) + exit_code = result.returncode + test_output = result.stdout + + # Clean up output + test_output = test_output.replace("/workspace", "workspace") + + # Log test output to history file + with open("/workspace/.aider.chat.history.md", "a") as fh: + fh.write(f"```\n{test_output}\n```") + + except subprocess.TimeoutExpired: + test_output = "Tests timed out!" + exit_code = 1 + + logger.info('-' * 30) + logger.info('END Runtime Completion Fn') + logger.info('-' * 30) + + runtime.close() + + return { + 'test_output': test_output, + 'exit_code': exit_code, + } + +def process_instance( + instance: pd.Series, + metadata: EvalMetadata, + reset_logger: bool = True, +) -> EvalOutput: + config = get_config(instance, metadata) + + # Setup the logger properly, so you can run multi-processing to parallelize the evaluation + if reset_logger: + log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs') + reset_logger_for_multiprocessing(logger, str(instance.instance_id), log_dir) + else: + logger.info( + f'\nStarting evaluation for instance {str(instance.instance_id)}.\n' + ) + + # ============================================= + # build instruction + # ============================================= + + # Prepare instruction + logger.info(instance) + instruction = instance.instruction + + # Add file list to instruction + file_list = " ".join(instance.solution_files) + instruction += INSTRUCTIONS_ADDENDUM.format(file_list=file_list) + + if USE_UNIT_TESTS: + test_files = " ".join(instance.test_files) + logger.info(f'\nTest files: {test_files}\n') + instruction += ( + f'Use the appropriate test command to run the tests and verify your solution. 
' + 'DO NOT EDIT the test files.\n\n' + ) + + instruction += ( + 'IMPORTANT: You should ONLY interact with the environment provided ' + 'to you AND NEVER ASK FOR HUMAN HELP.\n' + ) + + # Add agent-specific instruction suffix + if metadata.agent_class in INST_SUFFIXES: + instruction += INST_SUFFIXES[metadata.agent_class] + + # ============================================= + # create sandbox and run the agent + # ============================================= + + runtime: Runtime = create_runtime(config) + call_async_from_sync(runtime.connect) + + initialize_runtime(runtime, instance=instance) + + # Here's how you can run the agent (similar to the `main` function) and get the final task state + state: State | None = asyncio.run( + run_controller( + config=config, + initial_user_action=MessageAction(content=instruction), + runtime=runtime, + fake_user_response_fn=FAKE_RESPONSES[metadata.agent_class], + ) + ) + if state is None: + raise ValueError('State should not be None.') + + # ============================================= + # result evaluation + # ============================================= + + return_val = complete_runtime(runtime, instance) + exit_code = return_val['exit_code'] + test_output = return_val['test_output'] + + errors = [] + test_cases = None + if test_output: + if 'SyntaxError' in test_output: + errors.append('SyntaxError') + elif 'IndentationError' in test_output: + errors.append('IndentationError') + else: + test_cases = test_output + + test_result = { + 'exit_code': exit_code, + 'test_cases': test_cases, + 'errors': errors, + } + + # history is now available as a stream of events, rather than list of pairs of (Action, Observation) + # for compatibility with the existing output format, we can remake the pairs here + histories = compatibility_for_eval_history_pairs(state.history) + metrics = state.metrics.get() if state.metrics else None + + # Save the output + output = EvalOutput( + instance_id=str(instance.instance_id), + instance=instance.to_dict(), + instruction=instruction, + metadata=metadata, + history=histories, + metrics=metrics, + error=state.last_error if state and state.last_error else None, + test_result=test_result, + ) + return output + +def load_polyglot_dataset(): + """Load the polyglot benchmark dataset from the repository.""" + import glob + import json + import os + + # Path to the polyglot-benchmark repository + repo_path = os.environ.get('POLYGLOT_BENCHMARK_PATH', '/workspace/polyglot-benchmark') + + all_tests = [] + instance_id = 0 + + # Process each language directory + for lang_dir in ['python', 'javascript', 'rust', 'go', 'cpp', 'java']: + lang_path = os.path.join(repo_path, lang_dir, 'exercises', 'practice') + if not os.path.exists(lang_path): + logger.warning(f"Language directory not found: {lang_path}") + continue + + # Process each exercise directory + for exercise_dir in os.listdir(lang_path): + exercise_path = os.path.join(lang_path, exercise_dir) + if not os.path.isdir(exercise_path): + continue + + # Check for config.json + config_file = os.path.join(exercise_path, '.meta', 'config.json') + if not os.path.exists(config_file): + logger.warning(f"Config file not found: {config_file}") + continue + + # Load config + with open(config_file, 'r') as f: + config = json.load(f) + + # Get solution and test files + solution_files = config.get('files', {}).get('solution', []) + test_files = config.get('files', {}).get('test', []) + + if not solution_files or not test_files: + logger.warning(f"Missing solution or test files in {exercise_path}") + 
continue + + # Load instructions + instruction = "" + intro_file = os.path.join(exercise_path, '.docs', 'introduction.md') + if os.path.exists(intro_file): + with open(intro_file, 'r') as f: + instruction += f.read() + "\n\n" + + instructions_file = os.path.join(exercise_path, '.docs', 'instructions.md') + if os.path.exists(instructions_file): + with open(instructions_file, 'r') as f: + instruction += f.read() + "\n\n" + + if not instruction: + logger.warning(f"No instructions found for {exercise_path}") + continue + + # Load solution and test content + solution_content = {} + for file_path in solution_files: + full_path = os.path.join(exercise_path, file_path) + if os.path.exists(full_path): + with open(full_path, 'r') as f: + solution_content[os.path.basename(file_path)] = f.read() + + test_content = {} + for file_path in test_files: + full_path = os.path.join(exercise_path, file_path) + if os.path.exists(full_path): + with open(full_path, 'r') as f: + test_content[os.path.basename(file_path)] = f.read() + + # Create test instance + test_instance = { + 'instance_id': instance_id, + 'instance_name': exercise_dir, + 'language': lang_dir, + 'instruction': instruction, + 'solution_files': [os.path.basename(f) for f in solution_files], + 'test_files': [os.path.basename(f) for f in test_files], + 'solution_content': solution_content, + 'test_content': test_content, + } + + all_tests.append(test_instance) + instance_id += 1 + + return pd.DataFrame(all_tests) + +def add_arguments(parser): + """Add polyglot benchmark specific arguments to the parser.""" + parser.add_argument( + '--eval-languages', + type=str, + help='Comma-separated list of languages to test (e.g., "python,javascript,rust")', + ) + return parser + +if __name__ == '__main__': + # Add custom arguments + parser = parse_arguments.__self__ + add_arguments(parser) + args = parser.parse_args() + + # Load the polyglot benchmark dataset + polyglot_tests = load_polyglot_dataset() + + if polyglot_tests.empty: + logger.error("Failed to load polyglot benchmark dataset") + exit(1) + + logger.info(f"Loaded {len(polyglot_tests)} test instances from polyglot benchmark") + + llm_config = None + if args.llm_config: + llm_config = get_llm_config_arg(args.llm_config) + # modify_params must be False for evaluation purpose, for reproducibility and accuracy of results + llm_config.modify_params = False + # Enable logging of LLM completions + llm_config.log_completions = True + + if llm_config is None: + raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}') + + metadata = make_metadata( + llm_config, + 'PolyglotBenchmark', + args.agent_cls, + args.max_iterations, + args.eval_note, + args.eval_output_dir, + ) + output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl') + + # Parse dataset IDs if provided + eval_ids = None + if args.eval_ids: + eval_ids = str(args.eval_ids).split(',') + logger.info(f'\nUsing specific dataset IDs: {eval_ids}\n') + + # Filter by language if specified + if hasattr(args, 'eval_languages') and args.eval_languages: + languages = [lang.strip().lower() for lang in args.eval_languages.split(',')] + polyglot_tests = polyglot_tests[polyglot_tests['language'].str.lower().isin(languages)] + logger.info(f'\nFiltered to languages: {languages}, {len(polyglot_tests)} instances remaining\n') + + instances = prepare_dataset( + polyglot_tests, + output_file, + args.eval_n_limit, + eval_ids=eval_ids, + ) + + run_evaluation( + instances, + metadata, + output_file, + args.eval_num_workers, + process_instance, + 
\ No newline at end of file
diff --git a/evaluation/benchmarks/polyglot_benchmark/scripts/build_docker.sh b/evaluation/benchmarks/polyglot_benchmark/scripts/build_docker.sh
new file mode 100755
index 000000000000..1c6a2dfff7a1
--- /dev/null
+++ b/evaluation/benchmarks/polyglot_benchmark/scripts/build_docker.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+set -e
+
+# Get the directory of this script
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+BENCHMARK_DIR="$( cd "${SCRIPT_DIR}/.." && pwd )"
+
+# Build the Docker image
+docker build -t ghcr.io/opendevin/eval-polyglot:v1.0.0 -f "${BENCHMARK_DIR}/Dockerfile" "${BENCHMARK_DIR}"
+
+echo "Docker image built successfully: ghcr.io/opendevin/eval-polyglot:v1.0.0"
\ No newline at end of file
diff --git a/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh
new file mode 100755
index 000000000000..ce998a112330
--- /dev/null
+++ b/evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+set -e
+
+# Default values
+MODEL_CONFIG=${1:-"eval_gpt4_1106_preview"}
+GIT_VERSION=${2:-"HEAD"}
+AGENT=${3:-"CodeActAgent"}
+EVAL_LIMIT=${4:-"-1"}
+EVAL_NUM_WORKERS=${5:-"1"}
+EVAL_IDS=${6:-""}
+EVAL_LANGUAGES=${7:-""}
+
+# Set environment variables
+export POLYGLOT_BENCHMARK_PATH=${POLYGLOT_BENCHMARK_PATH:-"/workspace/polyglot-benchmark"}
+export USE_UNIT_TESTS=${USE_UNIT_TESTS:-"true"}
+
+# Add additional arguments based on provided parameters
+ARGS="--agent-cls ${AGENT} --llm-config ${MODEL_CONFIG} --max-iterations 30 --eval-num-workers ${EVAL_NUM_WORKERS}"
+
+if [ "${EVAL_LIMIT}" != "-1" ]; then
+    ARGS="${ARGS} --eval-n-limit ${EVAL_LIMIT}"
+fi
+
+if [ -n "${EVAL_IDS}" ]; then
+    ARGS="${ARGS} --eval-ids ${EVAL_IDS}"
+fi
+
+if [ -n "${EVAL_LANGUAGES}" ]; then
+    ARGS="${ARGS} --eval-languages ${EVAL_LANGUAGES}"
+fi
+
+# Run the evaluation
+cd "$(git rev-parse --show-toplevel)"
+poetry run python -m evaluation.benchmarks.polyglot_benchmark.run_infer ${ARGS}
\ No newline at end of file
diff --git a/evaluation/benchmarks/polyglot_benchmark/scripts/summarize_results.py b/evaluation/benchmarks/polyglot_benchmark/scripts/summarize_results.py
new file mode 100755
index 000000000000..988f3a618bff
--- /dev/null
+++ b/evaluation/benchmarks/polyglot_benchmark/scripts/summarize_results.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python3
+
+import argparse
+import json
+import os
+from collections import defaultdict
+
+def load_jsonl(file_path):
+    """Load data from a jsonl file."""
+    data = []
+    with open(file_path, 'r') as f:
+        for line in f:
+            data.append(json.loads(line))
+    return data
+
+def summarize_results(output_file):
+    """Summarize the results of the polyglot benchmark evaluation."""
+    if not os.path.exists(output_file):
+        print(f"Error: Output file {output_file} does not exist.")
+        return
+
+    results = load_jsonl(output_file)
+
+    # Count total instances
+    total_instances = len(results)
+    print(f"Total instances: {total_instances}")
+
+    # Count by language
+    language_counts = defaultdict(int)
+    language_passed = defaultdict(int)
+
+    # Count passed and failed instances
+    passed_instances = []
+    failed_instances = []
+
+    for result in results:
+        instance = result.get('instance', {})
+        language = instance.get('language', 'unknown')
+        instance_name = instance.get('instance_name', 'unknown')
+        instance_id = result.get('instance_id', 'unknown')
+
+        language_counts[language] += 1
+
+        # Check if all tests passed
+        test_result = result.get('test_result', {})
+        exit_code = test_result.get('exit_code', 1)
+
+        if exit_code == 0:
+            passed_instances.append((instance_id, language, instance_name))
+            language_passed[language] += 1
+        else:
+            failed_instances.append((instance_id, language, instance_name))
+
+    # Print summary
+    print("\nResults by language:")
+    print("--------------------")
+    for language, count in sorted(language_counts.items()):
+        passed = language_passed[language]
+        percentage = (passed / count) * 100 if count > 0 else 0
+        print(f"{language}: {passed}/{count} ({percentage:.1f}%)")
+
+    # Overall pass rate
+    total_passed = len(passed_instances)
+    overall_percentage = (total_passed / total_instances) * 100 if total_instances > 0 else 0
+    print(f"\nOverall pass rate: {total_passed}/{total_instances} ({overall_percentage:.1f}%)")
+
+    # Print passed instances
+    print("\nPassed instances:")
+    print("----------------")
+    for instance_id, language, instance_name in sorted(passed_instances):
+        print(f"ID: {instance_id}, Language: {language}, Name: {instance_name}")
+
+    # Print failed instances
+    print("\nFailed instances:")
+    print("----------------")
+    for instance_id, language, instance_name in sorted(failed_instances):
+        print(f"ID: {instance_id}, Language: {language}, Name: {instance_name}")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Summarize polyglot benchmark results")
+    parser.add_argument("output_file", help="Path to the output.jsonl file")
+    args = parser.parse_args()
+
+    summarize_results(args.output_file)
\ No newline at end of file
diff --git a/evaluation/benchmarks/polyglot_benchmark/test_load_dataset.py b/evaluation/benchmarks/polyglot_benchmark/test_load_dataset.py
new file mode 100755
index 000000000000..708259732b02
--- /dev/null
+++ b/evaluation/benchmarks/polyglot_benchmark/test_load_dataset.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+from pathlib import Path
+
+# Add the parent directory to the Python path
+sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent))
+
+from evaluation.benchmarks.polyglot_benchmark.run_infer import load_polyglot_dataset
+
+def main():
+    # Set the environment variable for the polyglot benchmark path
+    os.environ['POLYGLOT_BENCHMARK_PATH'] = '/workspace/polyglot-benchmark'
+
+    # Load the dataset
+    dataset = load_polyglot_dataset()
+
+    # Print summary
+    print(f"Loaded {len(dataset)} test instances")
+
+    # Print language distribution
+    language_counts = dataset['language'].value_counts()
+    print("\nLanguage distribution:")
+    for language, count in language_counts.items():
+        print(f"{language}: {count}")
+
+    # Print a sample instance
+    if not dataset.empty:
+        print("\nSample instance:")
+        sample = dataset.iloc[0]
+        print(f"ID: {sample.instance_id}")
+        print(f"Name: {sample.instance_name}")
+        print(f"Language: {sample.language}")
+        print(f"Solution files: {sample.solution_files}")
+        print(f"Test files: {sample.test_files}")
+        print(f"Instruction (first 100 chars): {sample.instruction[:100]}...")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/evaluation/benchmarks/polyglot_benchmark/test_run.py b/evaluation/benchmarks/polyglot_benchmark/test_run.py
new file mode 100755
index 000000000000..a8671b0646f1
--- /dev/null
+++ b/evaluation/benchmarks/polyglot_benchmark/test_run.py
@@ -0,0 +1,73 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+import argparse
+from pathlib import Path
+
+# Add the parent directory to the Python path
+sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent))
+
+from evaluation.benchmarks.polyglot_benchmark.run_infer import (
+    load_polyglot_dataset,
+    process_instance,
+    make_metadata,
+    get_llm_config_arg,
+)
+from openhands.core.logger import openhands_logger as logger
+
+def main():
+    parser = argparse.ArgumentParser(description="Test the polyglot benchmark with a single instance")
+    parser.add_argument("--model", default="eval_gpt35_turbo", help="Model configuration name")
+    parser.add_argument("--agent", default="CodeActAgent", help="Agent class name")
+    parser.add_argument("--instance-id", type=int, default=0, help="Instance ID to test")
+    parser.add_argument("--language", help="Filter by language")
+    args = parser.parse_args()
+
+    # Set the environment variable for the polyglot benchmark path
+    os.environ['POLYGLOT_BENCHMARK_PATH'] = '/workspace/polyglot-benchmark'
+
+    # Load the dataset
+    dataset = load_polyglot_dataset()
+
+    if args.language:
+        dataset = dataset[dataset['language'].str.lower() == args.language.lower()]
+        if dataset.empty:
+            print(f"No instances found for language: {args.language}")
+            return
+
+    # Get the instance to test
+    if args.instance_id >= len(dataset):
+        print(f"Instance ID {args.instance_id} is out of range. Max ID: {len(dataset) - 1}")
+        return
+
+    instance = dataset.iloc[args.instance_id]
+    print(f"Testing instance {instance.instance_id}: {instance.instance_name} ({instance.language})")
+
+    # Get LLM config
+    llm_config = get_llm_config_arg(args.model)
+    if llm_config is None:
+        print(f"Could not find LLM config: {args.model}")
+        return
+
+    # Create metadata
+    metadata = make_metadata(
+        llm_config,
+        'PolyglotBenchmark',
+        args.agent,
+        30,  # max_iterations
+        "test",
+        "evaluation/evaluation_outputs/test",
+    )
+
+    # Process the instance
+    try:
+        output = process_instance(instance, metadata, reset_logger=False)
+        print("\nTest completed successfully!")
+        print(f"Exit code: {output.test_result['exit_code']}")
+        print(f"Passed: {output.test_result['exit_code'] == 0}")
+    except Exception as e:
+        print(f"Error processing instance: {e}")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file