Draft

Changes from all commits (112 commits)
92e98f6
feat: Enable llm_completions logging in aider_bench
openhands-agent Feb 25, 2025
9315619
Merge pull request #4 from AlexCuadron/feature/aider-bench-llm-comple…
AlexCuadron Feb 25, 2025
dc59367
Merge remote-tracking branch 'origin/main'
AlexCuadron Feb 26, 2025
bc8f20d
Add polyglot benchmark implementation
AlexCuadron Feb 26, 2025
37ba696
Fix argument parser in polyglot benchmark
AlexCuadron Feb 26, 2025
890377d
Improve polyglot benchmark path handling and fix logging error
AlexCuadron Feb 26, 2025
8af6f11
Add Docker configuration options and troubleshooting guide
AlexCuadron Feb 26, 2025
32335ff
Add local Docker image build support for polyglot benchmark
AlexCuadron Feb 26, 2025
5610010
Set Docker image to build automatically by default
AlexCuadron Feb 26, 2025
c9e232e
Fix Docker build issues by adding unzip and simplifying Gradle instal…
AlexCuadron Feb 26, 2025
97e7ca7
Restrict polyglot benchmark to use only the same tools as SWE-Bench (…
AlexCuadron Feb 26, 2025
44bcb39
Fix runtime completion to use Docker runtime for running tests
AlexCuadron Feb 26, 2025
601da45
Add script to test one instance per language in polyglot benchmark
AlexCuadron Feb 26, 2025
84293fd
Add one-per-language testing mode to polyglot benchmark run_infer.sh
AlexCuadron Feb 26, 2025
87d9e15
Update README with one-per-language testing instructions and command-…
AlexCuadron Feb 26, 2025
8a5dc59
Enable LLM completions logging in aider_bench run_infer.py
AlexCuadron Feb 26, 2025
8ffe33e
Include tools information in evaluation output directory names
AlexCuadron Feb 26, 2025
d45b98d
Add evaluation parameter to run_infer.sh scripts for aider_bench and …
AlexCuadron Feb 26, 2025
62d2632
Update README files with documentation for the new evaluation parameter
AlexCuadron Feb 26, 2025
c8dab2c
Fix output directory detection in evaluation scripts
AlexCuadron Feb 26, 2025
fa9a0f8
Fix LLM completions logging to ensure it's enabled in all benchmarks
AlexCuadron Feb 26, 2025
8a4ca1e
Improve output directory detection in evaluation scripts with better …
AlexCuadron Feb 26, 2025
a2d7e63
Fix handling of 'eval' parameter to prevent it from being treated as …
AlexCuadron Feb 26, 2025
013ff2d
Merge pull request #6 from AlexCuadron/polyglot-benchmark-clean
AlexCuadron Feb 26, 2025
ee6026b
feat: Enable llm_completions logging in aider_bench
openhands-agent Feb 25, 2025
96f6c8a
Add polyglot benchmark implementation
AlexCuadron Feb 26, 2025
ccff971
Fix argument parser in polyglot benchmark
AlexCuadron Feb 26, 2025
e63c293
Improve polyglot benchmark path handling and fix logging error
AlexCuadron Feb 26, 2025
3e98953
Add Docker configuration options and troubleshooting guide
AlexCuadron Feb 26, 2025
95e212b
Add local Docker image build support for polyglot benchmark
AlexCuadron Feb 26, 2025
ec56525
Set Docker image to build automatically by default
AlexCuadron Feb 26, 2025
1117f17
Fix Docker build issues by adding unzip and simplifying Gradle instal…
AlexCuadron Feb 26, 2025
68aeb43
Restrict polyglot benchmark to use only the same tools as SWE-Bench (…
AlexCuadron Feb 26, 2025
1f9c157
Fix runtime completion to use Docker runtime for running tests
AlexCuadron Feb 26, 2025
929c475
Add script to test one instance per language in polyglot benchmark
AlexCuadron Feb 26, 2025
98bddf9
Add one-per-language testing mode to polyglot benchmark run_infer.sh
AlexCuadron Feb 26, 2025
d96491e
Update README with one-per-language testing instructions and command-…
AlexCuadron Feb 26, 2025
65b6c6f
Enable LLM completions logging in aider_bench run_infer.py
AlexCuadron Feb 26, 2025
3018e95
Include tools information in evaluation output directory names
AlexCuadron Feb 26, 2025
a3a0876
Add evaluation parameter to run_infer.sh scripts for aider_bench and …
AlexCuadron Feb 26, 2025
5bbb8ab
Update README files with documentation for the new evaluation parameter
AlexCuadron Feb 26, 2025
f6ea8de
Fix output directory detection in evaluation scripts
AlexCuadron Feb 26, 2025
d279418
Fix LLM completions logging to ensure it's enabled in all benchmarks
AlexCuadron Feb 26, 2025
1a9bd9b
Improve output directory detection in evaluation scripts with better …
AlexCuadron Feb 26, 2025
205a79b
Fix handling of 'eval' parameter to prevent it from being treated as …
AlexCuadron Feb 26, 2025
d8bd1e4
Structured logging mode (#7034)
raymyers Mar 1, 2025
4012d34
Add MATH-500 benchmark with custom finish tool
AlexCuadron Mar 1, 2025
33002e4
Add run_infer.sh script for MATH-500 benchmark
AlexCuadron Mar 1, 2025
750e083
Fix error handling in MATH-500 benchmark
AlexCuadron Mar 1, 2025
0b27dc8
Update README with run_infer.sh usage instructions
AlexCuadron Mar 1, 2025
3534be6
Add support for togetherDeepseek model in run_infer.sh
AlexCuadron Mar 1, 2025
2d647e8
Update README with togetherDeepseek model information
AlexCuadron Mar 1, 2025
ead4068
Fix run_infer.sh script to properly handle togetherDeepseek model
AlexCuadron Mar 1, 2025
edd1152
Update README with instructions for setting the Together API key
AlexCuadron Mar 1, 2025
666a7c5
Fix KeyError in fn_call_converter.py by adding proper key existence c…
AlexCuadron Mar 1, 2025
dac2200
Remove temporary config file creation in math500 run_infer.sh
AlexCuadron Mar 1, 2025
1ee8951
Fix LiteLLM cost calculation for unmapped models
AlexCuadron Mar 1, 2025
d9b35cb
Limit CodeActAgent to only use IPython tool for MATH500 benchmark
AlexCuadron Mar 1, 2025
b264ff1
Fix tool configuration for MATH500 benchmark to be compatible with fu…
AlexCuadron Mar 1, 2025
bd93ed4
Suppress all logging for unmapped models in LiteLLM cost calculation
AlexCuadron Mar 1, 2025
ce71ae9
Create custom Math500CodeActAgent that only uses IPython and Finish t…
AlexCuadron Mar 1, 2025
b10994d
Add ability to specify allowed tools for MATH500 benchmark via run_in…
AlexCuadron Mar 1, 2025
25c151c
Merge changes and fix conflicts
AlexCuadron Mar 1, 2025
70cd04d
Fix EvalMetadata usage by storing allowed_tools in details field
AlexCuadron Mar 1, 2025
681fec2
Update in-context learning example to use IPython for math problems
AlexCuadron Mar 1, 2025
9b7e033
Update first example to show model correcting its mistake using Python
AlexCuadron Mar 1, 2025
b37b022
Enhance function call example to demonstrate model self-correction th…
AlexCuadron Mar 1, 2025
48e1494
Enhance MATH500 benchmark to encourage Python verification at each step
AlexCuadron Mar 1, 2025
89b57c5
Add sympy and other math libraries to MATH500 benchmark environment
AlexCuadron Mar 1, 2025
26491b7
Make MATH500 instructions more general about tool verification rather…
AlexCuadron Mar 1, 2025
0a1c5d9
Fix: Use runtime_extra_deps instead of setup_commands for installing …
AlexCuadron Mar 1, 2025
c24ba5a
Fix: Use jupyter/scipy-notebook image with pre-installed scientific l…
AlexCuadron Mar 1, 2025
6cfb166
Fix: Simplify Docker setup by using standard Python image with pip in…
AlexCuadron Mar 1, 2025
28d2a38
Update instructions to have agent install libraries directly with %pip
AlexCuadron Mar 1, 2025
0ffc6de
Merge branch 'main' into test
AlexCuadron Mar 1, 2025
21b4fe5
Merge pull request #9 from AlexCuadron/test
AlexCuadron Mar 1, 2025
3a03ca3
Add AIME2024 benchmark based on AI-MO/aimo-validation-aime dataset
AlexCuadron Mar 1, 2025
0b19a7a
Merge remote changes
AlexCuadron Mar 1, 2025
c62c109
Update AIME2024 scripts to support positional arguments for compatibi…
AlexCuadron Mar 1, 2025
b673ed8
Fix AIME2024 scripts to match MATH500 format exactly for compatibility
AlexCuadron Mar 1, 2025
e930fc7
Improve answer extraction and normalization for AIME2024 benchmark
AlexCuadron Mar 1, 2025
85344a3
Add eval_infer.sh script for running evaluation on existing output files
AlexCuadron Mar 1, 2025
af98361
Significantly improve answer extraction and add debugging tools for A…
AlexCuadron Mar 1, 2025
ec0607a
Enhance AIME2024 prompt to encourage problem decomposition and struct…
AlexCuadron Mar 2, 2025
a92155b
Update fn_call_converter.py with structured problem-solving example
AlexCuadron Mar 2, 2025
ac6b727
Add solution parameter to FinishTool for benchmark problems
AlexCuadron Mar 2, 2025
d5c0ce1
Improve solution parameter description in FinishTool
AlexCuadron Mar 2, 2025
8bc3df4
Enhance solution parameter instructions and examples for benchmark pr…
AlexCuadron Mar 2, 2025
b560a81
Fix contradictory instructions for solution parameter
AlexCuadron Mar 2, 2025
ca91a3c
Add explicit reminders about properly closing function tags and using…
AlexCuadron Mar 2, 2025
64f44d8
Improve answer normalization for AIME benchmark with numerical compar…
AlexCuadron Mar 2, 2025
566d2b2
Enhance AIME benchmark analysis with detailed answer comparison
AlexCuadron Mar 2, 2025
60c855e
Enforce Python usage before allowing finish function
AlexCuadron Mar 2, 2025
42d2366
Fix missing import for IPythonRunCellAction
AlexCuadron Mar 2, 2025
094295c
Update instructions to focus on programmatic approach instead of sub-…
AlexCuadron Mar 2, 2025
1bb396d
Improve answer normalization for mathematical expressions with sqrt
AlexCuadron Mar 2, 2025
2d90cd4
Update instructions to emphasize step-by-step verification with code
AlexCuadron Mar 2, 2025
4feb0da
Fix directory creation and add error handling in analyze_results.py
AlexCuadron Mar 2, 2025
1222571
Add warnings about floating-point calculations and rounding errors
AlexCuadron Mar 2, 2025
8c88a22
Add final verification step before accepting finish action
AlexCuadron Mar 2, 2025
3c82377
Update MATH500 helper.py to match AIME2024 instructions
AlexCuadron Mar 2, 2025
062db5e
Enhance AIME2024 benchmark with boxed answer format and temperature o…
AlexCuadron Mar 3, 2025
bc9789f
Integrate ThinkingAgent to detect and filter overthinking solutions i…
AlexCuadron Mar 3, 2025
7b6053d
Improve ThinkingAgent integration with file generation and analysis
AlexCuadron Mar 3, 2025
b237ddb
Apply temperature settings and boxed answer directive to Math500 benc…
AlexCuadron Mar 3, 2025
7be62fc
Fix answer normalization to handle currency values properly in Math50…
AlexCuadron Mar 3, 2025
164fcba
sth
AlexCuadron Mar 3, 2025
9ad1892
Fix overthinking analysis in AIME2024 benchmark
AlexCuadron Mar 3, 2025
c96c1e1
Fix LLM completion method in overthinking analysis
AlexCuadron Mar 3, 2025
4b52092
Implement retry mechanism for overthinking solutions
AlexCuadron Mar 3, 2025
e76a772
Merge pull request #12 from AlexCuadron/aime2024-benchmark
AlexCuadron Mar 5, 2025
a461b98
Add thinking prefix and tool response for empty assistant messages
AlexCuadron Mar 6, 2025
7 changes: 6 additions & 1 deletion evaluation/benchmarks/aider_bench/README.md
@@ -16,7 +16,7 @@ development environment and LLM.
## Start the evaluation

```bash
./evaluation/benchmarks/aider_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids]
./evaluation/benchmarks/aider_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids] [run_evaluation]
```

- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for
@@ -31,6 +31,7 @@ development environment and LLM.
- `eval-num-workers`: the number of workers to use for evaluation. Default: `1`.
- `eval_ids`, e.g. `"1,3,10"`, limits the evaluation to instances with the
given IDs (comma separated).
- `run_evaluation`: set to `eval` to automatically run evaluation after the benchmark completes.

There are also the following optional environment variables you can set:

@@ -53,7 +54,11 @@ You can update the arguments in the script
- `--eval-ids`: the IDs of the examples to evaluate (comma separated). For example, `"1,3,10"`.

```bash
# Run benchmark without evaluation
./evaluation/benchmarks/aider_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 100 1 "1,3,10"

# Run benchmark with automatic evaluation
./evaluation/benchmarks/aider_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 100 1 "1,3,10" eval
```

### Run Inference on `RemoteRuntime` (experimental)
22 changes: 20 additions & 2 deletions evaluation/benchmarks/aider_bench/run_infer.py
@@ -21,6 +21,7 @@
prepare_dataset,
reset_logger_for_multiprocessing,
run_evaluation,
update_llm_config_for_completions_logging,
)
from openhands.controller.state.state import State
from openhands.core.config import (
@@ -44,6 +45,7 @@


def get_config(
instance: pd.Series,
metadata: EvalMetadata,
) -> AppConfig:
sandbox_config = get_default_sandbox_config_for_eval()
@@ -58,7 +60,13 @@ def get_config(
workspace_base=None,
workspace_mount_path=None,
)
config.set_llm_config(metadata.llm_config)
# Update llm_config to enable completions logging
llm_config = update_llm_config_for_completions_logging(
metadata.llm_config,
metadata.eval_output_dir,
str(instance.instance_id)
)
config.set_llm_config(llm_config)
agent_config = config.get_agent_config(metadata.agent_class)
agent_config.enable_prompt_extensions = False

@@ -161,7 +169,7 @@ def process_instance(
metadata: EvalMetadata,
reset_logger: bool = True,
) -> EvalOutput:
config = get_config(metadata)
config = get_config(instance, metadata)

# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
if reset_logger:
@@ -275,13 +283,23 @@ def process_instance(
if llm_config is None:
raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

# Create details dictionary with agent configuration
agent_details = {
"agent_config": {
"codeact_enable_jupyter": False,
"codeact_enable_browsing": False,
"codeact_enable_llm_editor": False,
}
}

metadata = make_metadata(
llm_config,
'AiderBench',
args.agent_cls,
args.max_iterations,
args.eval_note,
args.eval_output_dir,
details=agent_details,
)
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')

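
For readers skimming the diff, here is a minimal sketch of what the newly imported `update_llm_config_for_completions_logging` helper is presumably doing: cloning the LLM config and pointing completion logs at a per-instance folder under the evaluation output directory. The helper is imported above alongside `prepare_dataset` and `run_evaluation`; the field names below are assumptions, not the confirmed implementation.

```python
# Hypothetical sketch only -- the actual helper is part of the shared
# evaluation utilities and its field names may differ.
import copy
import os


def update_llm_config_for_completions_logging(llm_config, eval_output_dir, instance_id):
    """Return a copy of llm_config with per-instance completion logging enabled."""
    updated = copy.deepcopy(llm_config)
    updated.log_completions = True  # assumed flag on the LLM config
    # One sub-folder per benchmark instance keeps raw completions separable.
    updated.log_completions_folder = os.path.join(
        eval_output_dir, 'llm_completions', str(instance_id)
    )
    return updated
```

This would also explain why `get_config` now takes `instance`: the log folder is keyed by `instance.instance_id`.
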
66 changes: 65 additions & 1 deletion evaluation/benchmarks/aider_bench/scripts/run_infer.sh
@@ -9,6 +9,21 @@ AGENT=$3
EVAL_LIMIT=$4
NUM_WORKERS=$5
EVAL_IDS=$6
RUN_EVALUATION=$7 # New parameter to run evaluation after benchmark

# Special case: if the 7th parameter is "eval", set RUN_EVALUATION to "eval"
if [ "$RUN_EVALUATION" = "eval" ]; then
echo "Evaluation mode enabled"
fi

# Special case: if any parameter is "eval", set RUN_EVALUATION to "eval"
for param in "$@"; do
if [ "$param" = "eval" ]; then
RUN_EVALUATION="eval"
echo "Evaluation mode enabled"
break
fi
done

if [ -z "$NUM_WORKERS" ]; then
NUM_WORKERS=1
@@ -51,10 +66,59 @@ if [ -n "$EVAL_LIMIT" ]; then
COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
fi

if [ -n "$EVAL_IDS" ]; then
# Only pass eval-ids if it's not "eval" (which is a special parameter for evaluation mode)
if [ -n "$EVAL_IDS" ] && [ "$EVAL_IDS" != "eval" ]; then
echo "EVAL_IDS: $EVAL_IDS"
COMMAND="$COMMAND --eval-ids $EVAL_IDS"
fi

# Run the command
eval $COMMAND

# Get the output directory - first try the default location
OUTPUT_DIR=$(find evaluation/evaluation_outputs -path "*/AiderBench/$AGENT/*" -type d -name "*$EVAL_NOTE*" 2>/dev/null | sort -r | head -n 1)

# If not found, try to find it anywhere under evaluation_outputs
if [ -z "$OUTPUT_DIR" ]; then
OUTPUT_DIR=$(find . -path "*/evaluation_outputs/*" -path "*/AiderBench/$AGENT/*" -type d -name "*$EVAL_NOTE*" 2>/dev/null | sort -r | head -n 1)
fi

# If still not found, try to find any output.jsonl file
if [ -z "$OUTPUT_DIR" ]; then
OUTPUT_FILE=$(find . -name "output.jsonl" 2>/dev/null | sort -r | head -n 1)
if [ -n "$OUTPUT_FILE" ]; then
OUTPUT_DIR=$(dirname "$OUTPUT_FILE")
fi
else
OUTPUT_FILE="$OUTPUT_DIR/output.jsonl"
fi

# Print the output directory and file for debugging
echo ""
echo "Output directory: $OUTPUT_DIR"
echo "Output file: $OUTPUT_FILE"

# Run evaluation if requested
if [ "$RUN_EVALUATION" = "eval" ]; then
echo ""
echo "======================================"
echo "Running evaluation on results..."
echo "======================================"
echo ""

if [ -f "$OUTPUT_FILE" ]; then
echo "Evaluating results in: $OUTPUT_FILE"
poetry run python evaluation/benchmarks/aider_bench/scripts/summarize_results.py "$OUTPUT_FILE"

# Save the evaluation results
EVAL_RESULTS_FILE="$OUTPUT_DIR/evaluation_results.txt"
echo "Saving evaluation results to: $EVAL_RESULTS_FILE"
poetry run python evaluation/benchmarks/aider_bench/scripts/summarize_results.py "$OUTPUT_FILE" > "$EVAL_RESULTS_FILE"

echo ""
echo "Evaluation complete. Results saved to: $EVAL_RESULTS_FILE"
else
echo "Error: Output file not found: $OUTPUT_FILE"
echo "Cannot run evaluation."
fi
fi
103 changes: 103 additions & 0 deletions evaluation/benchmarks/aime2024/README.md
@@ -0,0 +1,103 @@
# AIME2024 Benchmark

This benchmark evaluates the performance of AI agents on problems from the American Invitational Mathematics Examination (AIME). The dataset is sourced from [AI-MO/aimo-validation-aime](https://huggingface.co/datasets/AI-MO/aimo-validation-aime) on Hugging Face.

## Dataset

The AIME is a challenging mathematics competition for high school students in the United States. The problems require advanced mathematical reasoning and problem-solving skills. The dataset contains 90 problems from various AIME competitions.

## Running the Benchmark

### Prerequisites

- Python 3.11+
- OpenHands installed
- Required Python packages: `datasets`, `pandas`, `matplotlib`

### Running a Single Example

To run a single example from the AIME2024 benchmark:

```bash
cd OpenHands
bash evaluation/benchmarks/aime2024/scripts/run_example.sh togetherDeepseek HEAD CodeActAgent 1 1 "0" "" ipython_only
```

This format follows: `<llm-config> <commit-hash> <agent-cls> <eval-limit> <num-workers> <eval-ids> <run-evaluation> <allowed-tools>`

This will run the first problem in the dataset.

### Running the Full Benchmark

To run the full AIME2024 benchmark:

```bash
cd OpenHands
bash evaluation/benchmarks/aime2024/scripts/run_infer.sh togetherDeepseek HEAD CodeActAgent 500 20 "" eval ipython_only
```

### Options

#### Positional Arguments:
1. `MODEL_CONFIG`: LLM configuration to use (required)
2. `COMMIT_HASH`: Git commit hash to use (optional)
3. `AGENT`: Agent class to use (default: "CodeActAgent")
4. `EVAL_LIMIT`: Limit the number of examples to evaluate (default: 0 for full benchmark, 1 for example)
5. `NUM_WORKERS`: Number of workers for parallel evaluation (default: 1)
6. `EVAL_IDS`: Comma-separated list of example IDs to evaluate (default: "" for full benchmark, "0" for example)
7. `RUN_EVALUATION`: Set to "eval" to run evaluation after benchmark
8. `ALLOWED_TOOLS`: Tools allowed for the agent (default: "all")

## Analyzing Results

There are three ways to analyze the results of the benchmark:

### 1. Using the eval_infer.sh script (recommended)

If you already have an output.jsonl file from a previous run, you can analyze it directly:

```bash
bash evaluation/benchmarks/aime2024/scripts/eval_infer.sh <path-to-output-jsonl> [output-directory]
```

Example:
```bash
bash evaluation/benchmarks/aime2024/scripts/eval_infer.sh ./evaluation/evaluation_outputs/AIME2024/CodeActAgent/v0.26.0/output.jsonl
```

### 2. Using the analyze_results.py script directly

```bash
poetry run python evaluation/benchmarks/aime2024/scripts/analyze_results.py <path-to-results-jsonl> --output-dir <output-directory>
```

### 3. Including "eval" in your benchmark run

Simply include "eval" in your command to automatically run the analysis after the benchmark:

```bash
bash evaluation/benchmarks/aime2024/scripts/run_infer.sh togetherDeepseek HEAD CodeActAgent 500 20 "" eval ipython_only
```

All methods will generate:
- A summary of the results in JSON format
- Plots of the overall accuracy and accuracy by problem ID
- A detailed CSV file with the results for each problem

## Benchmark Details

The AIME2024 benchmark evaluates the agent's ability to:
1. Understand complex mathematical problems
2. Apply mathematical reasoning and problem-solving skills
3. Use tools (like Python libraries) to verify calculations and reasoning
4. Arrive at the correct numerical answer

AIME problems typically have integer answers, and the agent is evaluated based on whether it produces the exact correct answer.
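
The grading described above boils down to normalizing both the predicted and the reference answer to an integer and comparing them exactly. A minimal sketch of such a check is shown below; the benchmark's actual extraction and normalization code is more elaborate (see the commits on answer normalization) and may differ.

```python
# Illustrative sketch of an exact-match check for AIME-style integer answers;
# the benchmark's real normalization logic is more involved and may differ.
import re


def normalize_aime_answer(raw: str) -> str | None:
    """Pull an integer answer out of model output, preferring \\boxed{...}."""
    boxed = re.search(r'\\boxed\{([^}]*)\}', raw)
    candidate = boxed.group(1) if boxed else raw
    numbers = re.findall(r'-?\d+', candidate.replace(',', ''))
    return str(int(numbers[-1])) if numbers else None


def is_correct(predicted: str, reference: str) -> bool:
    norm_pred = normalize_aime_answer(predicted)
    return norm_pred is not None and norm_pred == normalize_aime_answer(reference)
```

For example, `is_correct("The answer is \\boxed{116}.", "116")` returns `True`.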

## Example Problem

Here's an example problem from the dataset:

> Quadratic polynomials $P(x)$ and $Q(x)$ have leading coefficients $2$ and $-2,$ respectively. The graphs of both polynomials pass through the two points $(16,54)$ and $(20,53).$ Find $P(0) + Q(0).$

The correct answer is 116.
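
As a quick illustration of the tool-assisted verification the benchmark encourages, the example problem can be checked with a few lines of sympy. This snippet is editorial, not part of the PR:

```python
# Verify the example problem: P and Q are quadratics with leading
# coefficients 2 and -2 whose graphs pass through (16, 54) and (20, 53).
import sympy as sp

x, b, c, d, e = sp.symbols('x b c d e')
P = 2 * x**2 + b * x + c
Q = -2 * x**2 + d * x + e

sol = sp.solve(
    [P.subs(x, 16) - 54, P.subs(x, 20) - 53,
     Q.subs(x, 16) - 54, Q.subs(x, 20) - 53],
    [b, c, d, e],
)

print(sol[c] + sol[e])  # P(0) + Q(0) = c + e -> 116
```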