AlexCuadron · AlexCuadron · Feb 25, 2025 · Feb 25, 2025 · Feb 25, 2025 · Feb 25, 2025
diff --git a/evaluation/benchmarks/aider_bench/run_infer.py b/evaluation/benchmarks/aider_bench/run_infer.py
@@ -20,6 +20,7 @@
     prepare_dataset,
     reset_logger_for_multiprocessing,
     run_evaluation,
+    update_llm_config_for_completions_logging,
 )
 from openhands.controller.state.state import State
 from openhands.core.config import (
@@ -45,6 +46,7 @@
 
 
 def get_config(
+    instance: pd.Series,
     metadata: EvalMetadata,
 ) -> AppConfig:
     config = AppConfig(
@@ -67,7 +69,13 @@ def get_config(
         workspace_base=None,
         workspace_mount_path=None,
     )
-    config.set_llm_config(metadata.llm_config)
+    # Update llm_config to enable completions logging
+    llm_config = update_llm_config_for_completions_logging(
+        metadata.llm_config,
+        metadata.eval_output_dir,
+        str(instance.instance_id)
+    )
+    config.set_llm_config(llm_config)
     agent_config = config.get_agent_config(metadata.agent_class)
     agent_config.enable_prompt_extensions = False
 
@@ -170,7 +178,7 @@ def process_instance(
     metadata: EvalMetadata,
     reset_logger: bool = True,
 ) -> EvalOutput:
-    config = get_config(metadata)
+    config = get_config(instance, metadata)
 
     # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
     if reset_logger:

diff --git a/evaluation/benchmarks/polyglot_aider_bench/Dockerfile b/evaluation/benchmarks/polyglot_aider_bench/Dockerfile
@@ -0,0 +1,47 @@
+FROM ubuntu:22.04
+
+# Prevent interactive prompts during package installation
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install common dependencies
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    git \
+    python3 \
+    python3-pip \
+    python3-venv \
+    wget \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Python dependencies
+RUN python3 -m pip install --no-cache-dir pytest
+
+# Install Node.js and npm
+RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - \
+    && apt-get install -y nodejs \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Rust
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+ENV PATH="/root/.cargo/bin:${PATH}"
+
+# Install Go
+RUN wget https://go.dev/dl/go1.21.6.linux-amd64.tar.gz \
+    && tar -C /usr/local -xzf go1.21.6.linux-amd64.tar.gz \
+    && rm go1.21.6.linux-amd64.tar.gz
+ENV PATH="/usr/local/go/bin:${PATH}"
+
+# Install Java and Gradle (NOTE(review): the distro gradle package on 22.04 is an old release; confirm it runs on JDK 17)
+RUN apt-get update && apt-get install -y \
+    openjdk-17-jdk \
+    gradle \
+    && rm -rf /var/lib/apt/lists/*
+
+# Set environment variables
+ENV PYTHONUNBUFFERED=1
+ENV AIDER_DOCKER=1
+
+# Create workspace directory
+RUN mkdir -p /workspace
+WORKDIR /workspace
diff --git a/evaluation/benchmarks/polyglot_aider_bench/README.md b/evaluation/benchmarks/polyglot_aider_bench/README.md
@@ -0,0 +1,92 @@
+# Polyglot Aider Benchmark
+
+This benchmark is based on the [Aider Polyglot Benchmark](https://github.com/Aider-AI/aider/tree/main/benchmark), which evaluates how effectively an agent can translate natural language coding requests into executable code that passes unit tests across multiple programming languages.
+
+## Features
+
+- Supports multiple programming languages (Python, JavaScript, Rust, Go, C++, Java)
+- End-to-end evaluation of code editing capabilities
+- Automated test execution and validation
+- Parallel evaluation with multiple workers
+- Detailed metrics and logging
+
+## Usage
+
+1. Make sure you have the required dependencies installed:
+   ```bash
+   pip install -e ".[dev]"
+   ```
+
+2. Run the benchmark using either style:
+
+   **Old Style (Positional Arguments)**:
+   ```bash
+   ./scripts/run_infer.sh <model> <commit> <agent> <max_iters> <num_workers>
+   ```
+   Example:
+   ```bash
+   ./scripts/run_infer.sh 4ominiSky HEAD CodeActAgent 1000 1
+   ```
+
+   **New Style (Named Arguments)**:
+   ```bash
+   ./scripts/run_infer.sh \
+       --agent-cls CodeActAgent \
+       --llm-config configs/llm/gpt-4.yaml \
+       --eval-output-dir eval_output \
+       --eval-num-workers 10
+   ```
+
+### Command Line Arguments
+
+**Old Style (Positional)**:
+1. `model`: Model name (will look for configs/llm/{model}.yaml)
+2. `commit`: Git commit or note to append to output directory
+3. `agent`: Agent class name
+4. `max_iters`: Maximum iterations per test
+5. `num_workers`: Number of parallel workers
+
+**New Style (Named)**:
+- `--agent-cls`: The agent class to use (default: CodeActAgent)
+- `--llm-config`: Path to the LLM configuration file (required)
+- `--eval-output-dir`: Directory to store evaluation outputs (default: eval_output)
+- `--eval-num-workers`: Number of parallel workers (default: 1)
+- `--eval-n-limit`: Limit the number of test cases to run (-1 for all)
+- `--eval-ids`: Comma-separated list of specific test IDs to run
+- `--eval-note`: Optional note to append to the output directory name
+
+## Output Format
+
+The benchmark saves its results in the following structure:
+```
+eval_output/
+├── PolyglotAiderBench/
+│   ├── CodeActAgent/
+│   │   ├── gpt-4_maxiter_10/
+│   │   │   ├── infer_logs/
+│   │   │   │   └── instance_*.log
+│   │   │   ├── llm_completions/
+│   │   │   │   └── instance_*/
+│   │   │   └── output.jsonl
+│   │   └── metadata.json
+```
+
+Each instance's results include:
+- Test execution results
+- LLM completions and costs
+- Error tracking (syntax errors, timeouts, etc.)
+- Full interaction history
+
+## Supported Languages
+
+The benchmark supports the following languages and test frameworks:
+- Python: pytest
+- JavaScript: npm test
+- Rust: cargo test
+- Go: go test
+- C++: make test
+- Java: gradle test
+
+## Docker Support
+
+The benchmark runs in a Docker container to safely execute untrusted code. The container image includes all necessary language toolchains and test frameworks.
diff --git a/evaluation/benchmarks/polyglot_aider_bench/helper/__init__.py b/evaluation/benchmarks/polyglot_aider_bench/helper/__init__.py
@@ -0,0 +1 @@
+"""Helper modules for the polyglot aider benchmark."""
diff --git a/evaluation/benchmarks/polyglot_aider_bench/helper/prompts.py b/evaluation/benchmarks/polyglot_aider_bench/helper/prompts.py
@@ -0,0 +1,15 @@
+"""Prompts used in the polyglot aider benchmark."""
+
+INSTRUCTIONS_ADDENDUM = """
+I've provided the following files that need to be modified:
+{file_list}
+
+Please help me implement the necessary changes to meet the requirements.
+You should ONLY modify these files, and NOT create any new files.
+"""
+
+TEST_FAILURES = """
+The tests failed. Please fix the issues and try again.
+Remember to only modify the following files:
+{file_list}
+"""
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		"""Helper modules for the polyglot aider benchmark."""