79 changes: 30 additions & 49 deletions tests/benchmarks/mcp_universe/README.md
@@ -4,9 +4,9 @@ This directory contains the integration of the MCP-Universe repository managemen

## Overview

MCP-Universe is a comprehensive benchmark from Salesforce AI Research that evaluates LLMs on realistic tasks using real-world MCP servers. This integration focuses on the **repository management domain** with:
MCP-Universe is a comprehensive benchmark from Salesforce AI Research that evaluates LLMs on realistic tasks using real-world MCP servers. This integration focuses on the repository management domain with:

- **28 pure GitHub tasks** (github_task_0001 through github_task_0030, excluding 0013 and 0020)
- 28 GitHub tasks
- Tests realistic GitHub operations including:
- Creating repositories and branches
- Managing files and commits
@@ -18,22 +18,21 @@ MCP-Universe is a comprehensive benchmark from Salesforce AI Research that evalu

### Prerequisites

1. **Docker** - Required to run the GitHub MCP server
1. Docker - Required to run the GitHub MCP server
- Install Docker Desktop: https://www.docker.com/products/docker-desktop
- **Start Docker Desktop** before running tests
- Start Docker Desktop before running tests
- Verify: `docker --version`
- Using pinned version v0.15.0 for research reproducibility
- If you have multiple versions of the GitHub MCP server image, ensure v0.15.0 is tagged as `latest` or is the only version installed

2. **GitHub Personal Access Token** - For GitHub API access
- **CRITICAL**: Use a dedicated test GitHub account for safety
2. GitHub Personal Access Token - For GitHub API access
- Use a test GitHub account for safety
- Create token: https://github.com/settings/tokens
- Required scopes: `repo`, `delete_repo`
- Required scopes: All scopes

3. **LLM API Key**
- OpenAI API key for GPT models, OR
- Anthropic API key for Claude models
3. LLM API Key

4. **Python 3.13+** with [uv](https://docs.astral.sh/uv/)
4. Python 3.13+ with [uv](https://docs.astral.sh/uv/)

### Installation

@@ -51,37 +50,32 @@ docker pull ghcr.io/github/github-mcp-server:v0.15.0

### Environment Variables

**Required** - tests will fail without these:

```bash
# Required; tests will fail without these
# Use a test GitHub account; the agent performs real operations
export GITHUB_PERSONAL_ACCESS_TOKEN="your_github_token"
export GITHUB_PERSONAL_ACCOUNT_NAME="your_github_username"
```

**LLM API Key** - one of these depending on model:

```bash
# For OpenAI models (gpt-4o, gpt-4o-mini, etc.)
# LLM API Key
# For OpenAI models (gpt-5, gpt-4o, gpt-4o-mini, etc.)
export OPENAI_API_KEY="your_openai_key"

# For Anthropic models (claude-sonnet-4-5, etc.)
export ANTHROPIC_API_KEY="your_anthropic_key"
```

**IMPORTANT**: Use a dedicated test GitHub account. The agent performs real operations including creating and deleting repositories.

### Running Tests

Run all 28 tasks:

```bash
pytest tests/benchmarks/mcp_universe/test_mcp_universe.py --model gpt-4o-mini -v
pytest tests/benchmarks/mcp_universe/test_mcp_universe.py --model gpt-5 -v
```

Run a single task:

```bash
pytest tests/benchmarks/mcp_universe/test_mcp_universe.py::test_mcp_universe[github_task_0001] --model gpt-4o-mini -v
pytest tests/benchmarks/mcp_universe/test_mcp_universe.py::test_mcp_universe[github_task_0001] --model gpt-5 -v
```

Run with different models:
@@ -90,7 +84,7 @@ Run with different models:
# GPT-4o
pytest tests/benchmarks/mcp_universe/test_mcp_universe.py --model gpt-4o

# Claude Sonnet
# Claude Sonnet 4.5
pytest tests/benchmarks/mcp_universe/test_mcp_universe.py --model claude-sonnet-4-5
```

@@ -102,6 +96,19 @@ pytest tests/benchmarks/mcp_universe/test_mcp_universe.py --model claude-sonnet-
| `--temperature` | `0.001` | Temperature for LLM sampling |
| `--output-dir` | `outputs` | Base directory for outputs (logs written to `{output_dir}/raw/`) |
| `--validate-only` | - | Skip agent execution, only run evaluation against live GitHub |
| `--toolset` | `full` | Tool availability: `full` (all 93 tools) or `minimal` (19 essential tools) |

### Toolset Comparison

The `--toolset` flag lets you compare agent performance under different levels of tool availability:

```bash
# Full toolset (default): All 93 GitHub MCP tools
pytest tests/benchmarks/mcp_universe/test_mcp_universe.py --toolset full -v

# Minimal toolset: 19 essential tools identified from benchmark analysis
pytest tests/benchmarks/mcp_universe/test_mcp_universe.py --toolset minimal -v
```
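Internally, the flag maps to a per-server tool allowlist handed to the agent. Below is a minimal sketch of the selection logic, mirroring `test_mcp_universe.py` in this PR; the helper name `resolve_tools_config` and the three-name excerpt are illustrative, not part of the test module.

```python
# Sketch of the toolset selection from test_mcp_universe.py (this PR).
# The real MINIMAL_TOOLSET holds all 19 names; this excerpt is illustrative.
MINIMAL_TOOLSET = ["create_repository", "create_branch", "push_files"]

def resolve_tools_config(toolset: str) -> dict[str, list[str]] | None:
    # None exposes every tool the github MCP server offers; a dict restricts
    # the agent to the listed tools for that server.
    return {"github": MINIMAL_TOOLSET} if toolset == "minimal" else None
```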

### Validate Mode

@@ -127,36 +134,10 @@ This is useful if you previously ran the agent and want to re-check the GitHub s
| `instruction.txt` | System instruction for the agent |
| `reporting.py` | Human-readable log formatting |

### Environment Variables

| Variable | Used By | Purpose |
|----------|---------|---------|
| `GITHUB_PERSONAL_ACCESS_TOKEN` | MCP Server, Evaluator | GitHub API authentication |
| `GITHUB_PERSONAL_ACCOUNT_NAME` | Evaluator | Template substitution in task assertions |
| `OPENAI_API_KEY` | FastAgent | OpenAI model access |
| `ANTHROPIC_API_KEY` | FastAgent | Anthropic model access |

### MCP Server Configuration

The GitHub MCP server runs in Docker:
- Image: `ghcr.io/github/github-mcp-server:v0.15.0`
- Required env var: `GITHUB_PERSONAL_ACCESS_TOKEN`

Only the access token is passed to the Docker container. The account name is used locally by the evaluator for template substitution in task assertions (e.g., checking `{{GITHUB_PERSONAL_ACCOUNT_NAME}}/repo-name` exists).
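For illustration, the substitution can be thought of as a plain string replacement over the assertion text before it is evaluated. A hedged sketch, assuming a simple `{{VAR}}` placeholder scheme; `substitute_account` is a hypothetical name, not the evaluator's actual API:

```python
import os

def substitute_account(assertion: str) -> str:
    # Hypothetical sketch: swap the placeholder for the local env value
    # before the assertion is checked against live GitHub state.
    account = os.environ["GITHUB_PERSONAL_ACCOUNT_NAME"]
    return assertion.replace("{{GITHUB_PERSONAL_ACCOUNT_NAME}}", account)

# substitute_account("{{GITHUB_PERSONAL_ACCOUNT_NAME}}/repo-name")
# -> "your_github_username/repo-name"
```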

## Troubleshooting

### "Docker not found"
Ensure Docker Desktop is running and restart your terminal.

### "GITHUB_PERSONAL_ACCESS_TOKEN environment variable not set"
Export the required environment variables before running tests.

### "repository doesn't exist" (false negative)
GitHub's search API has indexing delays for newly created repos. The evaluator patches handle this with direct API calls, but occasional failures may occur.
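The workaround amounts to querying `GET /repos/{owner}/{repo}` directly instead of the search endpoint, which can lag for brand-new repositories. A hedged sketch under that assumption; `repo_exists` is a hypothetical helper, not the evaluator's actual code:

```python
import os
import urllib.error
import urllib.request

def repo_exists(owner: str, name: str) -> bool:
    # A direct GET bypasses the search index, so newly created
    # repositories are visible immediately.
    req = urllib.request.Request(
        f"https://api.github.com/repos/{owner}/{name}",
        headers={"Authorization": f"Bearer {os.environ['GITHUB_PERSONAL_ACCESS_TOKEN']}"},
    )
    try:
        with urllib.request.urlopen(req) as resp:
            return resp.status == 200
    except urllib.error.HTTPError:
        return False
```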

### Rate limiting
If you hit GitHub API rate limits, wait a few minutes or use a token with higher limits.

### Tests pass but some checks fail
Review the `*_readable.log` files in the output directory for detailed execution traces.
1 change: 1 addition & 0 deletions tests/benchmarks/mcp_universe/fastagent.config.yaml
@@ -21,3 +21,4 @@ logger:
show_tools: true
truncate_tools: false
progress_display: true
enable_markup: false
36 changes: 32 additions & 4 deletions tests/benchmarks/mcp_universe/test_mcp_universe.py
@@ -25,6 +25,30 @@
MAX_ITERATIONS = 500
MAX_TOKENS = 16000

# Minimal toolset: union of all distinct tools used by successful runs
# Source: why-agents-fail-dataset/mcp_universe/checkpoint-2025-11-06/mcp_universe_tool_stats.csv
MINIMAL_TOOLSET = [
"add_issue_comment",
"create_branch",
"create_issue",
"create_or_update_file",
"create_pull_request",
"create_repository",
"fork_repository",
"get_file_contents",
"get_issue",
"get_issue_comments",
"get_me",
"get_pull_request",
"list_branches",
"list_issues",
"push_files",
"run_workflow",
"search_code",
"search_issues",
"search_repositories",
]


def _parse_question(question: Any) -> str:
"""Parse question from various formats into a string."""
@@ -90,7 +114,7 @@ def _get_task_description(task: dict[str, Any]) -> str:
return str(task_description)


async def _run_mcp_universe_test(test_id: str, model: str, temperature: float, output_dir: Path) -> Path:
async def _run_mcp_universe_test(test_id: str, model: str, temperature: float, output_dir: Path, toolset: str) -> Path:
"""Run MCP-Universe test and return path to results."""
raw_dir = output_dir / "raw"
raw_dir.mkdir(parents=True, exist_ok=True)
@@ -106,10 +130,14 @@ async def _run_mcp_universe_test(test_id: str, model: str, temperature: float, o
config_path = str(test_dir / "fastagent.config.yaml")
agent = FastAgent("MCP-Universe Test", config_path=config_path, ignore_unknown_args=True)

# Apply tool filtering based on toolset parameter
tools_config = {"github": MINIMAL_TOOLSET} if toolset == "minimal" else None

@agent.agent(
name="test_agent",
model=model,
servers=["github"],
tools=tools_config,
instruction=test_dir / "instruction.txt",
request_params=RequestParams(maxTokens=MAX_TOKENS, max_iterations=MAX_ITERATIONS),
)
@@ -236,15 +264,15 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:


@pytest.mark.asyncio
async def test_mcp_universe(
test_id: str, model: str, temperature: float, output_dir: Path, request: pytest.FixtureRequest
async def test_mcp_universe( # noqa: PLR0913
test_id: str, model: str, temperature: float, output_dir: Path, toolset: str, request: pytest.FixtureRequest
) -> None:
"""Run or validate a MCP-Universe repository management test."""
validate_only = request.config.getoption("--validate-only")

# Run test if not in validate-only mode
if not validate_only:
await _run_mcp_universe_test(test_id, model, temperature, output_dir)
await _run_mcp_universe_test(test_id, model, temperature, output_dir, toolset)

# Validate and get results
log_dir = output_dir / "raw"
12 changes: 12 additions & 0 deletions tests/conftest.py
@@ -36,12 +36,24 @@ def output_dir(request: pytest.FixtureRequest) -> Path:
return path


@pytest.fixture
def toolset(request: pytest.FixtureRequest) -> str:
"""Toolset from CLI: 'full' (all tools) or 'minimal' (essential tools only)."""
return cast(str, request.config.getoption("--toolset"))


def pytest_addoption(parser: pytest.Parser) -> None:
"""Add custom CLI options."""
parser.addoption("--model", default="gpt-4o-mini", help="Model to use")
parser.addoption("--temperature", default=0.001, type=float, help="Temperature for LLM (default: 0.001)")
parser.addoption("--output-dir", default="outputs", help="Output directory for results")
parser.addoption("--validate-only", action="store_true", help="Only validate existing logs")
parser.addoption(
"--toolset",
default="full",
choices=["full", "minimal"],
help="Tool availability: 'full' (all tools) or 'minimal' (19 essential tools)",
)


def pytest_configure(config: pytest.Config) -> None: