25 commits
92e98f6
feat: Enable llm_completions logging in aider_bench
openhands-agent Feb 25, 2025
9315619
Merge pull request #4 from AlexCuadron/feature/aider-bench-llm-comple…
AlexCuadron Feb 25, 2025
dc59367
Merge remote-tracking branch 'origin/main'
AlexCuadron Feb 26, 2025
bc8f20d
Add polyglot benchmark implementation
AlexCuadron Feb 26, 2025
37ba696
Fix argument parser in polyglot benchmark
AlexCuadron Feb 26, 2025
890377d
Improve polyglot benchmark path handling and fix logging error
AlexCuadron Feb 26, 2025
8af6f11
Add Docker configuration options and troubleshooting guide
AlexCuadron Feb 26, 2025
32335ff
Add local Docker image build support for polyglot benchmark
AlexCuadron Feb 26, 2025
5610010
Set Docker image to build automatically by default
AlexCuadron Feb 26, 2025
c9e232e
Fix Docker build issues by adding unzip and simplifying Gradle instal…
AlexCuadron Feb 26, 2025
97e7ca7
Restrict polyglot benchmark to use only the same tools as SWE-Bench (…
AlexCuadron Feb 26, 2025
44bcb39
Fix runtime completion to use Docker runtime for running tests
AlexCuadron Feb 26, 2025
601da45
Add script to test one instance per language in polyglot benchmark
AlexCuadron Feb 26, 2025
84293fd
Add one-per-language testing mode to polyglot benchmark run_infer.sh
AlexCuadron Feb 26, 2025
87d9e15
Update README with one-per-language testing instructions and command-…
AlexCuadron Feb 26, 2025
8a5dc59
Enable LLM completions logging in aider_bench run_infer.py
AlexCuadron Feb 26, 2025
8ffe33e
Include tools information in evaluation output directory names
AlexCuadron Feb 26, 2025
d45b98d
Add evaluation parameter to run_infer.sh scripts for aider_bench and …
AlexCuadron Feb 26, 2025
62d2632
Update README files with documentation for the new evaluation parameter
AlexCuadron Feb 26, 2025
c8dab2c
Fix output directory detection in evaluation scripts
AlexCuadron Feb 26, 2025
fa9a0f8
Fix LLM completions logging to ensure it's enabled in all benchmarks
AlexCuadron Feb 26, 2025
8a4ca1e
Improve output directory detection in evaluation scripts with better …
AlexCuadron Feb 26, 2025
a2d7e63
Fix handling of 'eval' parameter to prevent it from being treated as …
AlexCuadron Feb 26, 2025
013ff2d
Merge pull request #6 from AlexCuadron/polyglot-benchmark-clean
AlexCuadron Feb 26, 2025
880bc10
Add benchmark runner script with retry functionality
AlexCuadron Feb 26, 2025
7 changes: 6 additions & 1 deletion evaluation/benchmarks/aider_bench/README.md
@@ -16,7 +16,7 @@ development environment and LLM.
## Start the evaluation

```bash
./evaluation/benchmarks/aider_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids]
./evaluation/benchmarks/aider_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids] [run_evaluation]
```

- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for
@@ -31,6 +31,7 @@ development environment and LLM.
- `eval-num-workers`: the number of workers to use for evaluation. Default: `1`.
- `eval_ids`, e.g. `"1,3,10"`, limits the evaluation to instances with the
given IDs (comma separated).
- `run_evaluation`: set to `eval` to automatically run evaluation after the benchmark completes.

There are also the following optional environment variables you can set:

@@ -53,7 +54,11 @@ You can update the arguments in the script
- `--eval-ids`: the IDs of the examples to evaluate (comma separated). For example, `"1,3,10"`.

```bash
# Run benchmark without evaluation
./evaluation/benchmarks/aider_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 100 1 "1,3,10"

# Run benchmark with automatic evaluation
./evaluation/benchmarks/aider_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 100 1 "1,3,10" eval
```

### Run Inference on `RemoteRuntime` (experimental)
22 changes: 20 additions & 2 deletions evaluation/benchmarks/aider_bench/run_infer.py
@@ -21,6 +21,7 @@
prepare_dataset,
reset_logger_for_multiprocessing,
run_evaluation,
update_llm_config_for_completions_logging,
)
from openhands.controller.state.state import State
from openhands.core.config import (
@@ -45,6 +46,7 @@


def get_config(
instance: pd.Series,
metadata: EvalMetadata,
) -> AppConfig:
sandbox_config = get_default_sandbox_config_for_eval()
@@ -59,7 +61,13 @@ def get_config(
workspace_base=None,
workspace_mount_path=None,
)
config.set_llm_config(metadata.llm_config)
# Update llm_config to enable completions logging
llm_config = update_llm_config_for_completions_logging(
metadata.llm_config,
metadata.eval_output_dir,
str(instance.instance_id)
)
config.set_llm_config(llm_config)
agent_config = config.get_agent_config(metadata.agent_class)
agent_config.enable_prompt_extensions = False

@@ -162,7 +170,7 @@ def process_instance(
metadata: EvalMetadata,
reset_logger: bool = True,
) -> EvalOutput:
config = get_config(metadata)
config = get_config(instance, metadata)

# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
if reset_logger:
@@ -277,13 +285,23 @@ def process_instance(
if llm_config is None:
raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

# Create details dictionary with agent configuration
agent_details = {
"agent_config": {
"codeact_enable_jupyter": False,
"codeact_enable_browsing": False,
"codeact_enable_llm_editor": False,
}
}

metadata = make_metadata(
llm_config,
'AiderBench',
args.agent_cls,
args.max_iterations,
args.eval_note,
args.eval_output_dir,
details=agent_details,
)
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')

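For context, `update_llm_config_for_completions_logging` is imported from the shared evaluation utilities and is not defined in this diff. A minimal sketch of what such a helper plausibly does, assuming the LLM config object exposes `log_completions` and `log_completions_folder` fields (both names are assumptions here):

```python
import copy
import os


def update_llm_config_for_completions_logging(llm_config, eval_output_dir, instance_id):
    """Return a copy of llm_config with per-instance completion logging enabled.

    Illustrative sketch only: the real helper ships with the evaluation
    utilities, and the two field names below are assumptions.
    """
    new_config = copy.deepcopy(llm_config)
    new_config.log_completions = True  # assumed field name
    new_config.log_completions_folder = os.path.join(
        eval_output_dir, 'llm_completions', instance_id
    )  # assumed field name
    return new_config
```

Keying the log folder by `instance_id` is what lets raw completions from parallel workers be told apart after a run.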
66 changes: 65 additions & 1 deletion evaluation/benchmarks/aider_bench/scripts/run_infer.sh
@@ -9,6 +9,21 @@ AGENT=$3
EVAL_LIMIT=$4
NUM_WORKERS=$5
EVAL_IDS=$6
RUN_EVALUATION=$7 # New parameter to run evaluation after benchmark

# Special case: if the 7th parameter is "eval", set RUN_EVALUATION to "eval"
if [ "$RUN_EVALUATION" = "eval" ]; then
echo "Evaluation mode enabled"
fi

# Special case: if any parameter is "eval", set RUN_EVALUATION to "eval"
for param in "$@"; do
if [ "$param" = "eval" ]; then
RUN_EVALUATION="eval"
echo "Evaluation mode enabled"
break
fi
done

if [ -z "$NUM_WORKERS" ]; then
NUM_WORKERS=1
@@ -51,10 +66,59 @@ if [ -n "$EVAL_LIMIT" ]; then
COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
fi

if [ -n "$EVAL_IDS" ]; then
# Only pass eval-ids if it's not "eval" (which is a special parameter for evaluation mode)
if [ -n "$EVAL_IDS" ] && [ "$EVAL_IDS" != "eval" ]; then
echo "EVAL_IDS: $EVAL_IDS"
COMMAND="$COMMAND --eval-ids $EVAL_IDS"
fi

# Run the command
eval $COMMAND

# Get the output directory - first try the default location
OUTPUT_DIR=$(find evaluation/evaluation_outputs -path "*/AiderBench/$AGENT/*" -type d -name "*$EVAL_NOTE*" 2>/dev/null | sort -r | head -n 1)

# If not found, try to find it anywhere under evaluation_outputs
if [ -z "$OUTPUT_DIR" ]; then
OUTPUT_DIR=$(find . -path "*/evaluation_outputs/*" -path "*/AiderBench/$AGENT/*" -type d -name "*$EVAL_NOTE*" 2>/dev/null | sort -r | head -n 1)
fi

# If still not found, try to find any output.jsonl file
if [ -z "$OUTPUT_DIR" ]; then
OUTPUT_FILE=$(find . -name "output.jsonl" 2>/dev/null | sort -r | head -n 1)
if [ -n "$OUTPUT_FILE" ]; then
OUTPUT_DIR=$(dirname "$OUTPUT_FILE")
fi
else
OUTPUT_FILE="$OUTPUT_DIR/output.jsonl"
fi

# Print the output directory and file for debugging
echo ""
echo "Output directory: $OUTPUT_DIR"
echo "Output file: $OUTPUT_FILE"

# Run evaluation if requested
if [ "$RUN_EVALUATION" = "eval" ]; then
echo ""
echo "======================================"
echo "Running evaluation on results..."
echo "======================================"
echo ""

if [ -f "$OUTPUT_FILE" ]; then
echo "Evaluating results in: $OUTPUT_FILE"
poetry run python evaluation/benchmarks/aider_bench/scripts/summarize_results.py "$OUTPUT_FILE"

# Save the evaluation results
EVAL_RESULTS_FILE="$OUTPUT_DIR/evaluation_results.txt"
echo "Saving evaluation results to: $EVAL_RESULTS_FILE"
poetry run python evaluation/benchmarks/aider_bench/scripts/summarize_results.py "$OUTPUT_FILE" > "$EVAL_RESULTS_FILE"

echo ""
echo "Evaluation complete. Results saved to: $EVAL_RESULTS_FILE"
else
echo "Error: Output file not found: $OUTPUT_FILE"
echo "Cannot run evaluation."
fi
fi
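If the directory detection above ever comes up empty, the same summary can be produced by hand. A minimal sketch, assuming the default `evaluation_outputs` layout that the script itself searches:

```bash
# Find the newest (by name) AiderBench output file and summarize it manually.
OUTPUT_FILE=$(find evaluation/evaluation_outputs -path "*/AiderBench/*" -name "output.jsonl" 2>/dev/null | sort -r | head -n 1)
poetry run python evaluation/benchmarks/aider_bench/scripts/summarize_results.py "$OUTPUT_FILE"
```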
63 changes: 63 additions & 0 deletions evaluation/benchmarks/polyglot_benchmark/Dockerfile
@@ -0,0 +1,63 @@
FROM ubuntu:22.04

# Avoid prompts from apt
ENV DEBIAN_FRONTEND=noninteractive

# Install common dependencies
RUN apt-get update && apt-get install -y \
build-essential \
curl \
git \
python3 \
python3-pip \
python3-dev \
python3-venv \
wget \
software-properties-common \
apt-transport-https \
ca-certificates \
gnupg \
lsb-release \
libboost-all-dev \
cmake \
&& rm -rf /var/lib/apt/lists/*

# Install Python packages
RUN pip3 install --no-cache-dir pytest pytest-timeout

# Install Node.js and npm
RUN curl -fsSL https://deb.nodesource.com/setup_18.x | bash - \
&& apt-get install -y nodejs \
&& rm -rf /var/lib/apt/lists/*

# Install Rust
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"

# Install Go
RUN wget https://go.dev/dl/go1.20.5.linux-amd64.tar.gz \
&& tar -C /usr/local -xzf go1.20.5.linux-amd64.tar.gz \
&& rm go1.20.5.linux-amd64.tar.gz
ENV PATH="/usr/local/go/bin:${PATH}"

# Install Java
RUN apt-get update && apt-get install -y openjdk-17-jdk \
&& rm -rf /var/lib/apt/lists/*
ENV JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64

# Install Gradle
RUN wget https://services.gradle.org/distributions/gradle-7.6-bin.zip \
&& mkdir /opt/gradle \
&& unzip -d /opt/gradle gradle-7.6-bin.zip \
&& rm gradle-7.6-bin.zip
ENV PATH="/opt/gradle/gradle-7.6/bin:${PATH}"

# Create workspace directory
RUN mkdir -p /workspace
WORKDIR /workspace

# Set environment variables
ENV PYTHONUNBUFFERED=1
ENV PYTHONIOENCODING=UTF-8

CMD ["/bin/bash"]
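For a quick local check of the toolchain image, it can be built and entered directly; a hedged example, where the image tag is illustrative rather than something this PR defines:

```bash
# Build the multi-language toolchain image from the new Dockerfile (tag is illustrative)
docker build -t polyglot-benchmark evaluation/benchmarks/polyglot_benchmark/
# Open an interactive shell in the /workspace directory set by the image
docker run --rm -it polyglot-benchmark
```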