From 28b8d261b2fd941cf7e43804c29b8e94d40fab61 Mon Sep 17 00:00:00 2001 From: Vinamra Agarwal Date: Tue, 30 Dec 2025 19:21:06 -0800 Subject: [PATCH 1/3] add full vs minimal toolset distinction feature --- .../mcp_universe/fastagent.config.yaml | 1 + .../mcp_universe/test_mcp_universe.py | 38 +++++++++++++++++-- tests/conftest.py | 12 ++++++ 3 files changed, 47 insertions(+), 4 deletions(-) diff --git a/tests/benchmarks/mcp_universe/fastagent.config.yaml b/tests/benchmarks/mcp_universe/fastagent.config.yaml index dffdddc..ff2297e 100644 --- a/tests/benchmarks/mcp_universe/fastagent.config.yaml +++ b/tests/benchmarks/mcp_universe/fastagent.config.yaml @@ -21,3 +21,4 @@ logger: show_tools: true truncate_tools: false progress_display: true + enable_markup: false diff --git a/tests/benchmarks/mcp_universe/test_mcp_universe.py b/tests/benchmarks/mcp_universe/test_mcp_universe.py index ce8ea61..332d44b 100644 --- a/tests/benchmarks/mcp_universe/test_mcp_universe.py +++ b/tests/benchmarks/mcp_universe/test_mcp_universe.py @@ -25,6 +25,30 @@ MAX_ITERATIONS = 500 MAX_TOKENS = 16000 +# Minimal toolset: union of all distinct tools used by successful runs +# Source: why-agents-fail-dataset/mcp_universe/checkpoint-2025-11-06/mcp_universe_tool_stats.csv +MINIMAL_TOOLSET = [ + "add_issue_comment", + "create_branch", + "create_issue", + "create_or_update_file", + "create_pull_request", + "create_repository", + "fork_repository", + "get_file_contents", + "get_issue", + "get_issue_comments", + "get_me", + "get_pull_request", + "list_branches", + "list_issues", + "push_files", + "run_workflow", + "search_code", + "search_issues", + "search_repositories", +] + def _parse_question(question: Any) -> str: """Parse question from various formats into a string.""" @@ -90,7 +114,9 @@ def _get_task_description(task: dict[str, Any]) -> str: return str(task_description) -async def _run_mcp_universe_test(test_id: str, model: str, temperature: float, output_dir: Path) -> Path: +async def _run_mcp_universe_test( + test_id: str, model: str, temperature: float, output_dir: Path, toolset: str +) -> Path: """Run MCP-Universe test and return path to results.""" raw_dir = output_dir / "raw" raw_dir.mkdir(parents=True, exist_ok=True) @@ -106,10 +132,14 @@ async def _run_mcp_universe_test(test_id: str, model: str, temperature: float, o config_path = str(test_dir / "fastagent.config.yaml") agent = FastAgent("MCP-Universe Test", config_path=config_path, ignore_unknown_args=True) + # Apply tool filtering based on toolset parameter + tools_config = {"github": MINIMAL_TOOLSET} if toolset == "minimal" else None + @agent.agent( name="test_agent", model=model, servers=["github"], + tools=tools_config, instruction=test_dir / "instruction.txt", request_params=RequestParams(maxTokens=MAX_TOKENS, max_iterations=MAX_ITERATIONS), ) @@ -236,15 +266,15 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: @pytest.mark.asyncio -async def test_mcp_universe( - test_id: str, model: str, temperature: float, output_dir: Path, request: pytest.FixtureRequest +async def test_mcp_universe( # noqa: PLR0913 + test_id: str, model: str, temperature: float, output_dir: Path, toolset: str, request: pytest.FixtureRequest ) -> None: """Run or validate a MCP-Universe repository management test.""" validate_only = request.config.getoption("--validate-only") # Run test if not in validate-only mode if not validate_only: - await _run_mcp_universe_test(test_id, model, temperature, output_dir) + await _run_mcp_universe_test(test_id, model, temperature, output_dir, toolset) # Validate and get results log_dir = output_dir / "raw" diff --git a/tests/conftest.py b/tests/conftest.py index 8341edd..5592bd9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -36,12 +36,24 @@ def output_dir(request: pytest.FixtureRequest) -> Path: return path +@pytest.fixture +def toolset(request: pytest.FixtureRequest) -> str: + """Toolset from CLI: 'full' (all tools) or 'minimal' (essential tools only).""" + return cast(str, request.config.getoption("--toolset")) + + def pytest_addoption(parser: pytest.Parser) -> None: """Add custom CLI options.""" parser.addoption("--model", default="gpt-4o-mini", help="Model to use") parser.addoption("--temperature", default=0.001, type=float, help="Temperature for LLM (default: 0.001)") parser.addoption("--output-dir", default="outputs", help="Output directory for results") parser.addoption("--validate-only", action="store_true", help="Only validate existing logs") + parser.addoption( + "--toolset", + default="full", + choices=["full", "minimal"], + help="Tool availability: 'full' (all tools) or 'minimal' (19 essential tools)", + ) def pytest_configure(config: pytest.Config) -> None: From 83a2a9dccb1eeb71711a878516bdecac61ae4032 Mon Sep 17 00:00:00 2001 From: Vinamra Agarwal Date: Tue, 30 Dec 2025 19:27:19 -0800 Subject: [PATCH 2/3] fix ruff format for test_mcp_universe.py --- tests/benchmarks/mcp_universe/test_mcp_universe.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/benchmarks/mcp_universe/test_mcp_universe.py b/tests/benchmarks/mcp_universe/test_mcp_universe.py index 332d44b..d6b09a2 100644 --- a/tests/benchmarks/mcp_universe/test_mcp_universe.py +++ b/tests/benchmarks/mcp_universe/test_mcp_universe.py @@ -114,9 +114,7 @@ def _get_task_description(task: dict[str, Any]) -> str: return str(task_description) -async def _run_mcp_universe_test( - test_id: str, model: str, temperature: float, output_dir: Path, toolset: str -) -> Path: +async def _run_mcp_universe_test(test_id: str, model: str, temperature: float, output_dir: Path, toolset: str) -> Path: """Run MCP-Universe test and return path to results.""" raw_dir = output_dir / "raw" raw_dir.mkdir(parents=True, exist_ok=True) From d352143e14846a7fead400623634241211b2840e Mon Sep 17 00:00:00 2001 From: Vinamra Agarwal Date: Wed, 31 Dec 2025 15:07:54 -0800 Subject: [PATCH 3/3] update README.md --- tests/benchmarks/mcp_universe/README.md | 79 ++++++++++--------------- 1 file changed, 30 insertions(+), 49 deletions(-) diff --git a/tests/benchmarks/mcp_universe/README.md b/tests/benchmarks/mcp_universe/README.md index fa47e77..10092bf 100644 --- a/tests/benchmarks/mcp_universe/README.md +++ b/tests/benchmarks/mcp_universe/README.md @@ -4,9 +4,9 @@ This directory contains the integration of the MCP-Universe repository managemen ## Overview -MCP-Universe is a comprehensive benchmark from Salesforce AI Research that evaluates LLMs on realistic tasks using real-world MCP servers. This integration focuses on the **repository management domain** with: +MCP-Universe is a comprehensive benchmark from Salesforce AI Research that evaluates LLMs on realistic tasks using real-world MCP servers. This integration focuses on the repository management domain with: -- **28 pure GitHub tasks** (github_task_0001 through github_task_0030, excluding 0013 and 0020) +- 28 GitHub tasks - Tests realistic GitHub operations including: - Creating repositories and branches - Managing files and commits @@ -18,22 +18,21 @@ MCP-Universe is a comprehensive benchmark from Salesforce AI Research that evalu ### Prerequisites -1. **Docker** - Required to run the GitHub MCP server +1. Docker - Required to run the GitHub MCP server - Install Docker Desktop: https://www.docker.com/products/docker-desktop - - **Start Docker Desktop** before running tests + - Start Docker Desktop before running tests - Verify: `docker --version` - Using pinned version v0.15.0 for research reproducibility + - If you have multiple versions of the GitHub MCP server image, ensure v0.15.0 is tagged as `latest` or is the only version installed -2. **GitHub Personal Access Token** - For GitHub API access - - **CRITICAL**: Use a dedicated test GitHub account for safety +2. GitHub Personal Access Token - For GitHub API access + - Use a test GitHub account for safety - Create token: https://github.com/settings/tokens - - Required scopes: `repo`, `delete_repo` + - Required scopes: All scopes -3. **LLM API Key** - - OpenAI API key for GPT models, OR - - Anthropic API key for Claude models +3. LLM API Key -4. **Python 3.13+** with [uv](https://docs.astral.sh/uv/) +4. Python 3.13+ with [uv](https://docs.astral.sh/uv/) ### Installation @@ -51,37 +50,32 @@ docker pull ghcr.io/github/github-mcp-server:v0.15.0 ### Environment Variables -**Required** - tests will fail without these: - ```bash +# Required; tests will fail without these +# Use a test GitHub account; the agent performs real operations export GITHUB_PERSONAL_ACCESS_TOKEN="your_github_token" export GITHUB_PERSONAL_ACCOUNT_NAME="your_github_username" -``` - -**LLM API Key** - one of these depending on model: -```bash -# For OpenAI models (gpt-4o, gpt-4o-mini, etc.) +# LLM API Key +# For OpenAI models (gpt-5, gpt-4o, gpt-4o-mini, etc.) export OPENAI_API_KEY="your_openai_key" # For Anthropic models (claude-sonnet-4-5, etc.) export ANTHROPIC_API_KEY="your_anthropic_key" ``` -**IMPORTANT**: Use a dedicated test GitHub account. The agent performs real operations including creating and deleting repositories. - ### Running Tests Run all 28 tasks: ```bash -pytest tests/benchmarks/mcp_universe/test_mcp_universe.py --model gpt-4o-mini -v +pytest tests/benchmarks/mcp_universe/test_mcp_universe.py --model gpt-5 -v ``` Run a single task: ```bash -pytest tests/benchmarks/mcp_universe/test_mcp_universe.py::test_mcp_universe[github_task_0001] --model gpt-4o-mini -v +pytest tests/benchmarks/mcp_universe/test_mcp_universe.py::test_mcp_universe[github_task_0001] --model gpt-5 -v ``` Run with different models: @@ -90,7 +84,7 @@ Run with different models: # GPT-4o pytest tests/benchmarks/mcp_universe/test_mcp_universe.py --model gpt-4o -# Claude Sonnet +# Claude Sonnet 4.5 pytest tests/benchmarks/mcp_universe/test_mcp_universe.py --model claude-sonnet-4-5 ``` @@ -102,6 +96,19 @@ pytest tests/benchmarks/mcp_universe/test_mcp_universe.py --model claude-sonnet- | `--temperature` | `0.001` | Temperature for LLM sampling | | `--output-dir` | `outputs` | Base directory for outputs (logs written to `{output_dir}/raw/`) | | `--validate-only` | - | Skip agent execution, only run evaluation against live GitHub | +| `--toolset` | `full` | Tool availability: `full` (all 93 tools) or `minimal` (19 essential tools) | + +### Toolset Comparison + +The `--toolset` flag allows comparing agent performance with different tool availability: + +```bash +# Full toolset (default): All 93 GitHub MCP tools +pytest tests/benchmarks/mcp_universe/test_mcp_universe.py --toolset full -v + +# Minimal toolset: 19 essential tools identified from benchmark analysis +pytest tests/benchmarks/mcp_universe/test_mcp_universe.py --toolset minimal -v +``` ### Validate Mode @@ -127,15 +134,6 @@ This is useful if you previously ran the agent and want to re-check the GitHub s | `instruction.txt` | System instruction for the agent | | `reporting.py` | Human-readable log formatting | -### Environment Variables - -| Variable | Used By | Purpose | -|----------|---------|---------| -| `GITHUB_PERSONAL_ACCESS_TOKEN` | MCP Server, Evaluator | GitHub API authentication | -| `GITHUB_PERSONAL_ACCOUNT_NAME` | Evaluator | Template substitution in task assertions | -| `OPENAI_API_KEY` | FastAgent | OpenAI model access | -| `ANTHROPIC_API_KEY` | FastAgent | Anthropic model access | - ### MCP Server Configuration The GitHub MCP server runs in Docker: @@ -143,20 +141,3 @@ The GitHub MCP server runs in Docker: - Required env var: `GITHUB_PERSONAL_ACCESS_TOKEN` Only the access token is passed to the Docker container. The account name is used locally by the evaluator for template substitution in task assertions (e.g., checking `{{GITHUB_PERSONAL_ACCOUNT_NAME}}/repo-name` exists). - -## Troubleshooting - -### "Docker not found" -Ensure Docker Desktop is running and restart your terminal. - -### "GITHUB_PERSONAL_ACCESS_TOKEN environment variable not set" -Export the required environment variables before running tests. - -### "repository doesn't exist" (false negative) -GitHub's search API has indexing delays for newly created repos. The evaluator patches handle this with direct API calls, but occasional failures may occur. - -### Rate limiting -If you hit GitHub API rate limits, wait a few minutes or use a token with higher limits. - -### Tests pass but some checks fail -Review the `*_readable.log` files in the output directory for detailed execution traces.