Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 59 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ Drawing on real-world tasks from the AtCoder Heuristic Contest (AHC), ALE-Bench

*Note: This repository is not an official product of SakanaAI or AtCoder and is therefore not officially supported.*

***Important: Please do not use this repository to participate in AHCs ([AtCoder Heuristic Contest Generative AI Usage Rules - Version 20250616](https://info.atcoder.jp/entry/ahc-llm-rules-en)).***

https://github.com/user-attachments/assets/50a8de5a-b519-4aef-8e54-c60ac9dcbb90

## Setup
Expand Down Expand Up @@ -485,6 +487,55 @@ Set `num_workers` to at most the number of **physical cores** of your instance,
# Confirm with 'yes' or use -auto-approve
```

## MCP (Model Context Protocol) Server
The MCP server is a lightweight HTTP server that provides a simple interface for interacting with the ALE-Bench toolkit. It allows you to run evaluations and manage sessions without needing to write Python code directly.

### Setup
1. Install Node.js and npm
```sh
# Install nvm (Node Version Manager) for easy Node.js management
curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.40.3/install.sh | bash
export NVM_DIR="$HOME/.nvm"
[ -s "$NVM_DIR/nvm.sh" ] && \. "$NVM_DIR/nvm.sh" # This loads nvm
[ -s "$NVM_DIR/bash_completion" ] && \. "$NVM_DIR/bash_completion" # This loads nvm bash_completion
# Install the latest LTS version of Node.js
nvm install --lts
# Install the Model Context Protocol Inspector
npm install -g @modelcontextprotocol/inspector
```
2. Install the MCP server dependencies using uv:
```sh
cd mcp
uv sync
uv sync --extra dev # For development dependencies
```

### Running the MCP Server
```sh
# Ensure you are in the mcp directory (e.g., cd mcp from the project root)
uv run mcp run server.py
uv run mcp dev server.py --with-editable . # For development
```

### Use with Claude Desktop
1. Open the `claude_desktop_config.json` file that configures Claude Desktop and add the following configuration to connect to the MCP server. Replace `/path/to/ALE-Bench` in `args` with the absolute path to your cloned `ALE-Bench` repository directory:
```json
{
"mcpServers": {
"ALE-Bench MCP Server": {
"command": "/bin/bash",
"args": [
"-c",
"cd /path/to/ALE-Bench/mcp && uv run --with ale_bench --with mcp[cli] mcp run /path/to/ALE-Bench/mcp/server.py"
]
}
}
}
```
2. Restart the Claude Desktop application to apply the changes.

<img width="680" alt="MCP_Claude_Desktop" src="https://github.com/user-attachments/assets/d9f22719-5686-406d-aa94-44406c700d6f" />

## Development

- **Environment Setup:**
Expand Down Expand Up @@ -516,13 +567,13 @@ Set `num_workers` to at most the number of **physical cores** of your instance,
- **Python Library Development:**
```sh
# Linting
ruff check src tests
ruff check src mcp tests

# Formatting
ruff format src tests
ruff format src mcp tests

# Static Type Checking
mypy src tests
mypy src mcp tests

# Running Tests
pytest
Expand All @@ -534,10 +585,10 @@ Set `num_workers` to at most the number of **physical cores** of your instance,
Please cite ALE-Bench as follows:

```bibtex
@misc{imajuku2025ale-bench,
title = {{ALE-Bench}: A Benchmark for Long-Horizon Objective-Driven Algorithm Engineering},
author = {Imajuku, Yuki and Horie, Kohki and Iwata, Yoichi and Aoki, Kensho and Takahashi, Naohiro and Akiba, Takuya},
url = {https://github.com/SakanaAI/ALE-Bench},
year = {2025}
@article{imajuku2025ale-bench,
title={ALE-Bench: A Benchmark for Long-Horizon Objective-Driven Algorithm Engineering},
author={Imajuku, Yuki and Horie, Kohki and Iwata, Yoichi and Aoki, Kensho and Takahashi, Naohiro and Akiba, Takuya},
journal={arXiv preprint arXiv:2506.09050},
year={2025}
}
```
2 changes: 1 addition & 1 deletion cloud/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ resource "aws_instance" "ale_bench_instance" {
disable_api_termination = false
instance_initiated_shutdown_behavior = "stop"

user_data = file("setup.sh")
user_data = file(var.setup_file_name)

tags = {
Name = "ale-bench-instance-${count.index}"
Expand Down
2 changes: 1 addition & 1 deletion cloud/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ su - ubuntu -c "cd /home/ubuntu/ && rm awscliv2.zip && rm -rf aws"


# Install uv
su - ubuntu -c "curl -fsSL https://astral.sh/uv/0.6.6/install.sh | sh"
su - ubuntu -c "curl -fsSL https://astral.sh/uv/install.sh | sh"
su - ubuntu -c "source /home/ubuntu/.local/bin/env"


Expand Down
68 changes: 68 additions & 0 deletions cloud/setup_mcp.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#!/usr/bin/bash
# Provisioning script for an ALE-Bench MCP host: installs Docker, Google Chrome,
# build dependencies, the AWS CLI, uv, and Node.js, then clones ALE-Bench and
# builds its Docker images.
#
# Strict mode: abort on any failing command (-e), on use of an unset variable
# (-u), and on a failure anywhere in a pipeline (pipefail); -E makes ERR traps
# apply inside functions and subshells as well.
set -Eeuo pipefail

# Install docker (https://docs.docker.com/engine/install/ubuntu/)
## Add Docker's official GPG key:
sudo apt-get -qq update
sudo apt-get -qq install -y ca-certificates curl wget
# Create the keyring directory and store Docker's key there, world-readable so
# that apt can verify the repository signature.
sudo install -m 0755 -d /etc/apt/keyrings
sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
sudo chmod a+r /etc/apt/keyrings/docker.asc

## Add the repository to Apt sources:
# The entry is built from the host architecture (dpkg) and the Ubuntu codename
# read from /etc/os-release, then written with `sudo tee` because the output
# redirection itself would not run with elevated privileges.
echo \
"deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
$(. /etc/os-release && echo "${UBUNTU_CODENAME:-$VERSION_CODENAME}") stable" | \
sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
# Refresh the package index so the newly added Docker repository is visible.
sudo apt-get -qq update

## Install the Docker packages.
sudo apt-get -qq install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin

## Docker with non-root user
# sudo groupadd docker # Already created
# Add the `ubuntu` user to the `docker` group so it can use Docker without sudo.
sudo usermod -aG docker ubuntu
# NOTE(review): `newgrp` here only affects the short-lived shell started by this
# `su -c`; later `su - ubuntu` sessions presumably pick up the new group from
# the usermod above -- confirm.
su - ubuntu -c "newgrp docker"


# Install Google Chrome
# Download the stable amd64 .deb to /tmp, install it through apt-get (which
# also resolves its dependencies), then delete the downloaded package.
wget -q -O /tmp/google-chrome-stable_current_amd64.deb https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
sudo apt-get -qq update
sudo apt-get -qq install -y /tmp/google-chrome-stable_current_amd64.deb
rm /tmp/google-chrome-stable_current_amd64.deb


# Install other dependencies (including the dependencies for the experiments)
sudo apt-get -qq update
sudo apt-get -qq install -y build-essential make unzip libcairo2-dev libffi-dev
# Remove packages that are no longer needed and clear the apt download cache.
sudo apt-get -qq autoremove --purge -y
sudo apt-get -qq clean
# Delete the cached package index files.
# NOTE(review): unlike the apt-get calls above this runs without sudo, so it
# presumably relies on the script being executed as root -- confirm.
rm -rf /var/lib/apt/lists/*


# Install AWS CLI version 2 (https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html)
# Each step runs in a fresh login shell for the `ubuntu` user, hence the
# repeated `cd /home/ubuntu/`: download the x86_64 installer archive, unpack it,
# run the installer with sudo, then remove the archive and the unpacked files.
su - ubuntu -c "cd /home/ubuntu/ && curl -fsSL https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip -o awscliv2.zip"
su - ubuntu -c "cd /home/ubuntu/ && unzip -qq awscliv2.zip"
su - ubuntu -c "cd /home/ubuntu/ && sudo ./aws/install"
su - ubuntu -c "cd /home/ubuntu/ && rm awscliv2.zip && rm -rf aws"


# Install uv
# Runs the upstream installer (no version pinned in the URL) as the `ubuntu`
# user.
su - ubuntu -c "curl -fsSL https://astral.sh/uv/install.sh | sh"
# NOTE(review): sourcing the env file inside a one-off `su -c` shell does not
# persist into later commands; the subsequent `su - ubuntu` login shells
# presumably find `uv` through the user's profile -- confirm.
su - ubuntu -c "source /home/ubuntu/.local/bin/env"


# Install Node Version Manager (NVM) and Node.js
# Run the nvm v0.40.3 install script as the `ubuntu` user.
su - ubuntu -c "curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.40.3/install.sh | bash"
# Everything below happens in a single shell invocation: nvm.sh is sourced
# first so that `nvm` is available to install and select the latest LTS
# Node.js, after which the MCP Inspector is installed globally with npm.
# The `\$` escapes defer expansion of HOME and NVM_DIR to the `ubuntu` user's
# shell instead of expanding them in this script.
su - ubuntu -c "export NVM_DIR=\"\${HOME}/.nvm\" && [ -s \"\${NVM_DIR}/nvm.sh\" ] && \\. \"\${NVM_DIR}/nvm.sh\" && nvm install --lts && nvm use --lts && npm install -g @modelcontextprotocol/inspector"


# Clone the ALE-Bench repository and setup the environment
su - ubuntu -c "cd /home/ubuntu/ && git clone https://github.com/SakanaAI/ALE-Bench.git"
# Create a Python 3.12.9 virtual environment and sync dependencies, once for the
# repository root and once for the `mcp` sub-project.
su - ubuntu -c "cd /home/ubuntu/ALE-Bench && uv -q venv --python 3.12.9 && uv -q sync"
su - ubuntu -c "cd /home/ubuntu/ALE-Bench/mcp && uv -q venv --python 3.12.9 && uv -q sync"
# Build the Docker images. The escaped `\$(id -u)` / `\$(id -g)` are evaluated
# in the `ubuntu` user's shell, so that user's UID and GID are passed to the
# build script.
su - ubuntu -c "cd /home/ubuntu/ALE-Bench && bash ./scripts/docker_build_all.sh \$(id -u) \$(id -g)"


# Finish
# Print a completion banner. `printf '\033'` emits the ESC character; the SGR
# codes 1;4;5;32 select bold, underline, blink, and green, and `[0m` resets the
# terminal attributes afterwards.
echo "$(printf '\033')[1;4;5;32mALE-Bench setup completed! $(printf '\033')[0m"
6 changes: 6 additions & 0 deletions cloud/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,9 @@ variable "allowed_ssh_cidr" {
type = string
default = "0.0.0.0/0"
}

# Path of the provisioning script to use for the instance. main.tf reads this
# file with file(var.setup_file_name) and passes its contents as user_data.
# Defaults to "setup.sh".
variable "setup_file_name" {
description = "Name of the setup file to be copied to the instance"
type = string
default = "setup.sh"
}
100 changes: 100 additions & 0 deletions mcp/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# Distribution metadata for the ALE-Bench MCP server package.
[project]
name = "ale_bench_mcp_server"
version = "1.0.0"
description = "The MCP server for the ALE-Bench"
readme = "README.md"
license = { file = "LICENSE" }
authors = [{ name = "Yuki-Imajuku", email = "yuki.imjk@gmail.com" }]

# Supported interpreters; keep the "Programming Language" classifiers below in
# step with this range.
requires-python = ">=3.10,<3.14"

keywords = [
    "benchmark",
    "algorithmic programming",
    "atcoder",
    "AHC",
    "AI evaluation",
    "heuristic",
    "optimization",
    "estimation",
    "MCP",
    "server",
]
classifiers = [
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
    "Development Status :: 5 - Production/Stable",
    "Intended Audience :: Developers",
    "Intended Audience :: Science/Research",
    "License :: OSI Approved :: Apache Software License",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
]

# Runtime requirements (PEP 508 strings), sorted alphabetically.
dependencies = [
    "ahocorapy",
    "ale-bench",
    "cairosvg",
    "docker",
    "huggingface_hub",
    "mcp[cli]",
    "pillow",
    "polars>=1",
    "pydantic>=2",
]

# Project links published with the package metadata.
[project.urls]
Repository = "https://github.com/SakanaAI/ALE-Bench"

# Optional extras. `dev` pins the exact lint, type-check, and test tool versions
# (installed with `uv sync --extra dev`).
[project.optional-dependencies]
dev = [
    "mypy==1.15.0",
    "pytest==8.3.4",
    "pytest-mock==3.14.0",
    "ruff==0.9.7",
    "types-requests==2.32.0.20250301",
]

# Tell uv to resolve the `ale-bench` dependency from the upstream Git
# repository at tag v1.0.0.
[tool.uv.sources]
ale-bench = { git = "https://github.com/SakanaAI/ALE-Bench.git", tag = "v1.0.0" }

# Static type-checking configuration.
[tool.mypy]
# Require complete annotations and check every function body.
disallow_untyped_defs = true
disallow_incomplete_defs = true
disallow_untyped_decorators = true
check_untyped_defs = true
no_implicit_optional = true

# Extra warnings.
warn_return_any = true
warn_no_return = true
warn_unreachable = true
warn_redundant_casts = true
warn_unused_ignores = true
warn_unused_configs = true

# Do not fail on imports that have no type information available.
ignore_missing_imports = true

# Custom pytest markers, registered here with their help text.
[tool.pytest.ini_options]
markers = [
    "slow: marks tests as slow (deselect with '-m \"not slow\"')",
    "docker: marks tests as docker (deselect with '-m \"not docker\"')",
]

# Ruff linter/formatter settings shared by `ruff check` and `ruff format`.
[tool.ruff]
# Apply automatic fixes when running `ruff check`.
fix = true
# Must be the *minimum* Python version the package supports so that fixes and
# formatting never emit syntax that older supported interpreters cannot parse.
# Keep in sync with the lower bound of `requires-python` (>=3.10).
target-version = "py310"
line-length = 120

# Formatter settings: use double quotes for string literals.
[tool.ruff.format]
quote-style = "double"

# Lint rule selection.
[tool.ruff.lint]
# C9 = mccabe complexity, E/W = pycodestyle errors/warnings, F = Pyflakes,
# I = isort import ordering.
select = ["C9", "E", "F", "W", "I"]
# C901 (function is too complex) is switched off.
ignore = ["C901"]

# Import-sorting behaviour for the `I` rules.
[tool.ruff.lint.isort]
case-sensitive = true
combine-as-imports = true
split-on-trailing-comma = true

# `ale_bench` is grouped as first-party; imports that cannot be classified fall
# into the first-party section too.
known-first-party = ["ale_bench"]
default-section = "first-party"

# Order in which the import sections are emitted.
section-order = [
    "future",
    "standard-library",
    "third-party",
    "first-party",
    "local-folder",
]
Loading