2 changes: 0 additions & 2 deletions pyproject.toml
@@ -36,8 +36,6 @@ dev = [
evals = [
"bfcl-eval",
"mcpuniverse",
"jinja2>=3.0.0",
"python-dotenv>=1.0.0",
"appworld",
"appworld-experiments[simplified]",
]
160 changes: 100 additions & 60 deletions tests/benchmarks/mcp_universe/README.md
@@ -8,115 +8,155 @@ MCP-Universe is a comprehensive benchmark from Salesforce AI Research that evalu

- **28 pure GitHub tasks** (github_task_0001 through github_task_0030, excluding 0013 and 0020)
- Tests realistic GitHub operations including:

- Creating repositories and branches
- Managing files and commits
- Creating pull requests
- Copying files between repositories
- Managing issues and labels
- Creating repositories and branches
- Managing files and commits
- Creating pull requests
- Copying files between repositories
- Managing issues and labels

## Quick Start

### Prerequisites

1. **Docker** - REQUIRED to run the GitHub MCP server
1. **Docker** - Required to run the GitHub MCP server
- Install Docker Desktop: https://www.docker.com/products/docker-desktop
- **Start Docker Desktop** before running tests
- Verify installation: `docker --version`
- **Note**: Using pinned version v0.15.0 for research reproducibility (before PR #1091 which added automatic instruction generation)
- If `docker` command is not found, ensure Docker Desktop is running and restart your terminal
- Verify: `docker --version`
- Using pinned version v0.15.0 for research reproducibility

2. **GitHub Personal Access Token** - For GitHub API access
- **CRITICAL**: Use a dedicated test GitHub account for safety
- Create token: https://github.com/settings/tokens
3. **OpenAI API Key** (or Anthropic for Claude models) - For running the LLM agent
- Required scopes: `repo`, `delete_repo`

3. **LLM API Key**
- OpenAI API key for GPT models, OR
- Anthropic API key for Claude models

4. **Python 3.13+** with [uv](https://docs.astral.sh/uv/)
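
A quick sanity check that the prerequisites are in place (output will vary by machine):

```bash
python3 --version   # expect 3.13 or newer
uv --version
docker --version
```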

### Installation

```bash
# Clone the repository (if not already done)
# Clone the repository
git clone https://github.com/chughtapan/wags.git
cd wags

# Install dependencies (pulls the forked MCP-Universe package via eval extras)
UV_GIT_LFS=1 uv pip install -e ".[dev,evals]"
# Install dependencies
uv pip install -e ".[dev,evals]"

# Verify Docker is working
docker run --rm hello-world

# Pre-pull the GitHub MCP server image (recommended for faster test startup)
# Pre-pull the GitHub MCP server image
docker pull ghcr.io/github/github-mcp-server:v0.15.0
```

**Note**: The `.[dev,evals]` extras install:
- `mcpuniverse` from the fork [`vinamra57/MCP-Universe@72389d8`](https://github.com/vinamra57/MCP-Universe/tree/72389d8a04044dceb855f733a938d0344ac58813), which removes heavy 3D dependencies while keeping the repository-management configs
- `bfcl-eval` for Berkeley Function Call Leaderboard evaluation
- Other shared evaluation dependencies

All repository management task JSON files are bundled inside the installed `mcpuniverse` wheel, so no git submodules or manual data checkout are required.
### Environment Variables

### Configuration

Environment variables are automatically loaded from `servers/github/.env`. Create this file with:
**Required** - tests will fail without these:

```bash
# servers/github/.env
GITHUB_PERSONAL_ACCESS_TOKEN=your_github_token_here
GITHUB_PERSONAL_ACCOUNT_NAME=your_github_username
OPENAI_API_KEY=your_openai_key_here
export GITHUB_PERSONAL_ACCESS_TOKEN="your_github_token"
export GITHUB_PERSONAL_ACCOUNT_NAME="your_github_username"
```

**IMPORTANT**: Use a dedicated test GitHub account. The AI agent will perform real operations on GitHub repositories.

Alternatively, you can manually export the environment variables:
**LLM API Key** - set one of these, depending on the model:

```bash
export GITHUB_PERSONAL_ACCESS_TOKEN="your_github_token_here"
export GITHUB_PERSONAL_ACCOUNT_NAME="your_github_username"
export OPENAI_API_KEY="your_openai_key_here"
# For OpenAI models (gpt-4o, gpt-4o-mini, etc.)
export OPENAI_API_KEY="your_openai_key"

# For Anthropic models (claude-sonnet-4-5, etc.)
export ANTHROPIC_API_KEY="your_anthropic_key"
```

**IMPORTANT**: Use a dedicated test GitHub account. The agent performs real operations including creating and deleting repositories.

### Running Tests

Run all 28 repository management tasks:
Run all 28 tasks:

```bash
uv run pytest tests/benchmarks/mcp_universe/test_mcp_universe.py \
--model gpt-4o-mini \
--output-dir outputs/mcp_universe \
-v
pytest tests/benchmarks/mcp_universe/test_mcp_universe.py --model gpt-4o-mini -v
```

Run a single task:

```bash
uv run pytest tests/benchmarks/mcp_universe/test_mcp_universe.py::test_mcp_universe[github_task_0001] \
--model gpt-4o-mini \
--output-dir outputs/mcp_universe \
-v
pytest tests/benchmarks/mcp_universe/test_mcp_universe.py::test_mcp_universe[github_task_0001] --model gpt-4o-mini -v
```

Run with different models:

```bash
# Use GPT-4o
uv run pytest tests/benchmarks/mcp_universe/test_mcp_universe.py \
--model gpt-4o \
--output-dir outputs/mcp_universe

# Use Claude (requires ANTHROPIC_API_KEY)
uv run pytest tests/benchmarks/mcp_universe/test_mcp_universe.py \
--model claude-3-5-sonnet-20241022 \
--output-dir outputs/mcp_universe
# GPT-4o
pytest tests/benchmarks/mcp_universe/test_mcp_universe.py --model gpt-4o

# Claude Sonnet
pytest tests/benchmarks/mcp_universe/test_mcp_universe.py --model claude-sonnet-4-5
```

### CLI Options

| Option | Default | Description |
|--------|---------|-------------|
| `--model` | `gpt-4o-mini` | Model to use for the agent |
| `--temperature` | `0.001` | Temperature for LLM sampling |
| `--output-dir` | `outputs` | Base directory for outputs (logs written to `{output_dir}/raw/`) |
| `--validate-only` | - | Skip agent execution; only run evaluation against live GitHub |
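
For example, the options can be combined in a single invocation (the output directory shown here is illustrative):

```bash
pytest tests/benchmarks/mcp_universe/test_mcp_universe.py \
  --model gpt-4o-mini \
  --temperature 0.001 \
  --output-dir outputs/mcp_universe \
  -v
```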

### Validate Mode

If you have existing output files, you can validate them without re-running the agent:
Run evaluation against live GitHub without running the agent:

```bash
uv run pytest tests/benchmarks/mcp_universe/test_mcp_universe.py \
--validate-only \
--log-dir outputs/mcp_universe/raw \
--output-dir outputs/mcp_universe
pytest tests/benchmarks/mcp_universe/test_mcp_universe.py --validate-only
```

This is useful if you previously ran the agent and want to re-check the GitHub state (e.g., after fixing an evaluator bug).

## Architecture

### Files

| File | Purpose |
|------|---------|
| `test_mcp_universe.py` | Main pytest test file - agent execution and evaluation |
| `evaluator.py` | Runs MCP-Universe evaluators against test results |
| `evaluator_patch.py` | Patches for GitHub MCP Server v0.15.0 compatibility |
| `fastagent.config.yaml` | FastAgent config for GitHub MCP server (agent) |
| `mcp_server_config.json` | MCP server config for evaluator |
| `instruction.txt` | System instruction for the agent |
| `reporting.py` | Human-readable log formatting |

### Environment Variables

| Variable | Used By | Purpose |
|----------|---------|---------|
| `GITHUB_PERSONAL_ACCESS_TOKEN` | MCP Server, Evaluator | GitHub API authentication |
| `GITHUB_PERSONAL_ACCOUNT_NAME` | Evaluator | Template substitution in task assertions |
| `OPENAI_API_KEY` | FastAgent | OpenAI model access |
| `ANTHROPIC_API_KEY` | FastAgent | Anthropic model access |

### MCP Server Configuration

The GitHub MCP server runs in Docker:
- Image: `ghcr.io/github/github-mcp-server:v0.15.0`
- Required env var: `GITHUB_PERSONAL_ACCESS_TOKEN`
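
For reference, a roughly equivalent manual invocation is sketched below; the exact arguments come from `fastagent.config.yaml` and may differ slightly:

```bash
# -i keeps stdin open for the MCP stdio transport;
# -e GITHUB_PERSONAL_ACCESS_TOKEN forwards the token from the host environment
docker run -i --rm \
  -e GITHUB_PERSONAL_ACCESS_TOKEN \
  ghcr.io/github/github-mcp-server:v0.15.0
```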

Only the access token is passed to the Docker container. The account name is used locally by the evaluator for template substitution in task assertions (e.g., checking `{{GITHUB_PERSONAL_ACCOUNT_NAME}}/repo-name` exists).
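
As an illustration, the substituted assertion corresponds to a direct GitHub API lookup like the one below (`repo-name` is a placeholder; the evaluator performs this check programmatically rather than via curl):

```bash
# Returns repository metadata if {{GITHUB_PERSONAL_ACCOUNT_NAME}}/repo-name exists
curl -s \
  -H "Authorization: Bearer $GITHUB_PERSONAL_ACCESS_TOKEN" \
  "https://api.github.com/repos/$GITHUB_PERSONAL_ACCOUNT_NAME/repo-name"
```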

## Troubleshooting

### "Docker not found"
Ensure Docker Desktop is running and restart your terminal.

### "GITHUB_PERSONAL_ACCESS_TOKEN environment variable not set"
Export the required environment variables before running tests.

### "repository doesn't exist" (false negative)
GitHub's search API has indexing delays for newly created repos. The evaluator patches handle this with direct API calls, but occasional failures may occur.

### Rate limiting
If you hit GitHub API rate limits, wait a few minutes or use a token with higher limits.
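
To check how much quota remains on your token, you can query GitHub's standard rate-limit endpoint:

```bash
curl -s \
  -H "Authorization: Bearer $GITHUB_PERSONAL_ACCESS_TOKEN" \
  https://api.github.com/rate_limit
```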

### Tests pass but some checks fail
Review the `*_readable.log` files in the output directory for detailed execution traces.
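
For example, assuming the default `--output-dir`, the traces land under `outputs/raw/` (exact filenames depend on the task IDs):

```bash
ls outputs/raw/*_readable.log
less outputs/raw/github_task_0001_readable.log
```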
3 changes: 0 additions & 3 deletions tests/benchmarks/mcp_universe/fastagent.config.yaml
@@ -10,12 +10,9 @@ mcp:
- --rm
- -e
- GITHUB_PERSONAL_ACCESS_TOKEN
- -e
- GITHUB_PERSONAL_ACCOUNT_NAME
- ghcr.io/github/github-mcp-server:v0.15.0
env:
GITHUB_PERSONAL_ACCESS_TOKEN: ${GITHUB_PERSONAL_ACCESS_TOKEN}
GITHUB_PERSONAL_ACCOUNT_NAME: ${GITHUB_PERSONAL_ACCOUNT_NAME}

logger:
level: info
35 changes: 11 additions & 24 deletions tests/benchmarks/mcp_universe/test_mcp_universe.py
@@ -70,12 +70,14 @@ def _setup_environment(model: str, temperature: float) -> None:
raise ValueError(
"GITHUB_PERSONAL_ACCESS_TOKEN environment variable not set. Please set it before running tests."
)
github_account_name = os.getenv("GITHUB_PERSONAL_ACCOUNT_NAME", "vinamra-test")
if not os.getenv("GITHUB_PERSONAL_ACCOUNT_NAME"):
raise ValueError(
"GITHUB_PERSONAL_ACCOUNT_NAME environment variable not set. Please set it before running tests."
)
os.environ.update(
{
"DEFAULT_MODEL": model,
"TEMPERATURE": str(temperature),
"GITHUB_PERSONAL_ACCOUNT_NAME": github_account_name,
}
)

@@ -223,26 +225,13 @@ def _log_evaluation_results(log_path: Path, evaluation: dict[str, Any]) -> None:


def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
"""Dynamically generate test cases."""
"""Dynamically generate test cases from task JSON files."""
if "test_id" in metafunc.fixturenames:
validate_only = metafunc.config.getoption("--validate-only")

if validate_only:
# Find existing output files to validate
log_dir = Path(metafunc.config.getoption("--log-dir"))
if log_dir.exists():
output_files = list(log_dir.glob("**/*_complete.json"))
test_ids = [f.stem.replace("_complete", "") for f in output_files]
else:
test_ids = []
else:
# Generate test IDs for repository management tasks
test_ids = sorted(
e.name.removesuffix(".json")
for e in _DATA_DIR.iterdir()
if e.is_file() and e.name.startswith("github_task_") and e.name.endswith(".json")
)

test_ids = sorted(
e.name.removesuffix(".json")
for e in _DATA_DIR.iterdir()
if e.is_file() and e.name.startswith("github_task_") and e.name.endswith(".json")
)
metafunc.parametrize("test_id", test_ids)


@@ -257,10 +246,8 @@ async def test_mcp_universe(
if not validate_only:
await _run_mcp_universe_test(test_id, model, temperature, output_dir)

# Determine log directory
log_dir = Path(request.config.getoption("--log-dir")) if validate_only else output_dir / "raw"

# Validate and get results
log_dir = output_dir / "raw"
evaluation = await _validate_test(test_id, model, log_dir)

# Fail test with detailed message if evaluation failed
4 changes: 0 additions & 4 deletions uv.lock

Some generated files are not rendered by default.