2 changes: 0 additions & 2 deletions pyproject.toml
@@ -36,8 +36,6 @@ dev = [
evals = [
"bfcl-eval",
"mcpuniverse",
"jinja2>=3.0.0",
"python-dotenv>=1.0.0",
"appworld",
"appworld-experiments[simplified]",
]
160 changes: 100 additions & 60 deletions tests/benchmarks/mcp_universe/README.md
@@ -8,115 +8,155 @@ MCP-Universe is a comprehensive benchmark from Salesforce AI Research that evalu

- **28 pure GitHub tasks** (github_task_0001 through github_task_0030, excluding 0013 and 0020)
- Tests realistic GitHub operations including:

- Creating repositories and branches
- Managing files and commits
- Creating pull requests
- Copying files between repositories
- Managing issues and labels
- Creating repositories and branches
- Managing files and commits
- Creating pull requests
- Copying files between repositories
- Managing issues and labels

## Quick Start

### Prerequisites

1. **Docker** - REQUIRED to run the GitHub MCP server
1. **Docker** - Required to run the GitHub MCP server
- Install Docker Desktop: https://www.docker.com/products/docker-desktop
- **Start Docker Desktop** before running tests
- Verify installation: `docker --version`
- **Note**: Using pinned version v0.15.0 for research reproducibility (before PR #1091 which added automatic instruction generation)
- If `docker` command is not found, ensure Docker Desktop is running and restart your terminal
- Verify: `docker --version`
- Using pinned version v0.15.0 for research reproducibility

2. **GitHub Personal Access Token** - For GitHub API access
- **CRITICAL**: Use a dedicated test GitHub account for safety
- Create token: https://github.com/settings/tokens
3. **OpenAI API Key** (or Anthropic for Claude models) - For running the LLM agent
- Required scopes: `repo`, `delete_repo`

3. **LLM API Key**
- OpenAI API key for GPT models, OR
- Anthropic API key for Claude models

4. **Python 3.13+** with [uv](https://docs.astral.sh/uv/)
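
A quick sanity check that the prerequisites are in place (output will vary by machine):

```bash
python3 --version   # expect 3.13 or newer
uv --version
docker --version
```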

### Installation

```bash
# Clone the repository (if not already done)
# Clone the repository
git clone https://github.com/chughtapan/wags.git
cd wags

# Install dependencies (pulls the forked MCP-Universe package via eval extras)
UV_GIT_LFS=1 uv pip install -e ".[dev,evals]"
# Install dependencies
uv pip install -e ".[dev,evals]"

# Verify Docker is working
docker run --rm hello-world

# Pre-pull the GitHub MCP server image (recommended for faster test startup)
# Pre-pull the GitHub MCP server image
docker pull ghcr.io/github/github-mcp-server:v0.15.0
```

**Note**: The `.[dev,evals]` extras install:
- `mcpuniverse` from the fork [`vinamra57/MCP-Universe@72389d8`](https://github.com/vinamra57/MCP-Universe/tree/72389d8a04044dceb855f733a938d0344ac58813), which removes heavy 3D dependencies while keeping the repository-management configs
- `bfcl-eval` for Berkeley Function Call Leaderboard evaluation
- Other shared evaluation dependencies

All repository management task JSON files are bundled inside the installed `mcpuniverse` wheel, so no git submodules or manual data checkout are required.
### Environment Variables

### Configuration

Environment variables are automatically loaded from `servers/github/.env`. Create this file with:
**Required** - tests will fail without these:

```bash
# servers/github/.env
GITHUB_PERSONAL_ACCESS_TOKEN=your_github_token_here
GITHUB_PERSONAL_ACCOUNT_NAME=your_github_username
OPENAI_API_KEY=your_openai_key_here
export GITHUB_PERSONAL_ACCESS_TOKEN="your_github_token"
export GITHUB_PERSONAL_ACCOUNT_NAME="your_github_username"
```

**IMPORTANT**: Use a dedicated test GitHub account. The AI agent will perform real operations on GitHub repositories.

Alternatively, you can manually export the environment variables:
**LLM API Key** - set one of these, depending on the model:

```bash
export GITHUB_PERSONAL_ACCESS_TOKEN="your_github_token_here"
export GITHUB_PERSONAL_ACCOUNT_NAME="your_github_username"
export OPENAI_API_KEY="your_openai_key_here"
# For OpenAI models (gpt-4o, gpt-4o-mini, etc.)
export OPENAI_API_KEY="your_openai_key"

# For Anthropic models (claude-sonnet-4-5, etc.)
export ANTHROPIC_API_KEY="your_anthropic_key"
```

**IMPORTANT**: Use a dedicated test GitHub account. The agent performs real operations including creating and deleting repositories.

### Running Tests

Run all 28 repository management tasks:
Run all 28 tasks:

```bash
uv run pytest tests/benchmarks/mcp_universe/test_mcp_universe.py \
--model gpt-4o-mini \
--output-dir outputs/mcp_universe \
-v
pytest tests/benchmarks/mcp_universe/test_mcp_universe.py --model gpt-4o-mini -v
```

Run a single task:

```bash
uv run pytest tests/benchmarks/mcp_universe/test_mcp_universe.py::test_mcp_universe[github_task_0001] \
--model gpt-4o-mini \
--output-dir outputs/mcp_universe \
-v
pytest tests/benchmarks/mcp_universe/test_mcp_universe.py::test_mcp_universe[github_task_0001] --model gpt-4o-mini -v
```

Run with different models:

```bash
# Use GPT-4o
uv run pytest tests/benchmarks/mcp_universe/test_mcp_universe.py \
--model gpt-4o \
--output-dir outputs/mcp_universe

# Use Claude (requires ANTHROPIC_API_KEY)
uv run pytest tests/benchmarks/mcp_universe/test_mcp_universe.py \
--model claude-3-5-sonnet-20241022 \
--output-dir outputs/mcp_universe
# GPT-4o
pytest tests/benchmarks/mcp_universe/test_mcp_universe.py --model gpt-4o

# Claude Sonnet
pytest tests/benchmarks/mcp_universe/test_mcp_universe.py --model claude-sonnet-4-5
```

### CLI Options

| Option | Default | Description |
|--------|---------|-------------|
| `--model` | `gpt-4o-mini` | Model to use for the agent |
| `--temperature` | `0.001` | Temperature for LLM sampling |
| `--output-dir` | `outputs` | Base directory for outputs (logs written to `{output_dir}/raw/`) |
| `--validate-only` | - | Skip agent execution; only run evaluation against live GitHub |
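
For example, the options can be combined in a single invocation (the output directory shown here is illustrative):

```bash
pytest tests/benchmarks/mcp_universe/test_mcp_universe.py \
  --model gpt-4o-mini \
  --temperature 0.001 \
  --output-dir outputs/mcp_universe \
  -v
```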

### Validate Mode

If you have existing output files, you can validate them without re-running the agent:
Run evaluation against live GitHub without running the agent:

```bash
uv run pytest tests/benchmarks/mcp_universe/test_mcp_universe.py \
--validate-only \
--log-dir outputs/mcp_universe/raw \
--output-dir outputs/mcp_universe
pytest tests/benchmarks/mcp_universe/test_mcp_universe.py --validate-only
```

This is useful if you previously ran the agent and want to re-check the GitHub state (e.g., after fixing an evaluator bug).

## Architecture

### Files

| File | Purpose |
|------|---------|
| `test_mcp_universe.py` | Main pytest test file - agent execution and evaluation |
| `evaluator.py` | Runs MCP-Universe evaluators against test results |
| `evaluator_patch.py` | Patches for GitHub MCP Server v0.15.0 compatibility |
| `fastagent.config.yaml` | FastAgent config for GitHub MCP server (agent) |
| `mcp_server_config.json` | MCP server config for evaluator |
| `instruction.txt` | System instruction for the agent |
| `reporting.py` | Human-readable log formatting |

### Environment Variables

| Variable | Used By | Purpose |
|----------|---------|---------|
| `GITHUB_PERSONAL_ACCESS_TOKEN` | MCP Server, Evaluator | GitHub API authentication |
| `GITHUB_PERSONAL_ACCOUNT_NAME` | Evaluator | Template substitution in task assertions |
| `OPENAI_API_KEY` | FastAgent | OpenAI model access |
| `ANTHROPIC_API_KEY` | FastAgent | Anthropic model access |

### MCP Server Configuration

The GitHub MCP server runs in Docker:
- Image: `ghcr.io/github/github-mcp-server:v0.15.0`
- Required env var: `GITHUB_PERSONAL_ACCESS_TOKEN`
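
For reference, a roughly equivalent manual invocation is sketched below; the exact arguments come from `fastagent.config.yaml` and may differ slightly:

```bash
# -i keeps stdin open for the MCP stdio transport;
# -e GITHUB_PERSONAL_ACCESS_TOKEN forwards the token from the host environment
docker run -i --rm \
  -e GITHUB_PERSONAL_ACCESS_TOKEN \
  ghcr.io/github/github-mcp-server:v0.15.0
```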

Only the access token is passed to the Docker container. The account name is used locally by the evaluator for template substitution in task assertions (e.g., checking `{{GITHUB_PERSONAL_ACCOUNT_NAME}}/repo-name` exists).
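
As an illustration, the substituted assertion corresponds to a direct GitHub API lookup like the one below (`repo-name` is a placeholder; the evaluator performs this check programmatically rather than via curl):

```bash
# Returns repository metadata if {{GITHUB_PERSONAL_ACCOUNT_NAME}}/repo-name exists
curl -s \
  -H "Authorization: Bearer $GITHUB_PERSONAL_ACCESS_TOKEN" \
  "https://api.github.com/repos/$GITHUB_PERSONAL_ACCOUNT_NAME/repo-name"
```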

## Troubleshooting

### "Docker not found"
Ensure Docker Desktop is running and restart your terminal.

### "GITHUB_PERSONAL_ACCESS_TOKEN environment variable not set"
Export the required environment variables before running tests.

### "repository doesn't exist" (false negative)
GitHub's search API has indexing delays for newly created repos. The evaluator patches handle this with direct API calls, but occasional failures may occur.

### Rate limiting
If you hit GitHub API rate limits, wait a few minutes or use a token with higher limits.
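
To check how much quota remains on your token, you can query GitHub's standard rate-limit endpoint:

```bash
curl -s \
  -H "Authorization: Bearer $GITHUB_PERSONAL_ACCESS_TOKEN" \
  https://api.github.com/rate_limit
```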

### Tests pass but some checks fail
Review the `*_readable.log` files in the output directory for detailed execution traces.
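
For example, assuming the default `--output-dir`, the traces land under `outputs/raw/` (exact filenames depend on the task IDs):

```bash
ls outputs/raw/*_readable.log
less outputs/raw/github_task_0001_readable.log
```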
3 changes: 0 additions & 3 deletions tests/benchmarks/mcp_universe/fastagent.config.yaml
@@ -10,12 +10,9 @@ mcp:
- --rm
- -e
- GITHUB_PERSONAL_ACCESS_TOKEN
- -e
- GITHUB_PERSONAL_ACCOUNT_NAME
- ghcr.io/github/github-mcp-server:v0.15.0
env:
GITHUB_PERSONAL_ACCESS_TOKEN: ${GITHUB_PERSONAL_ACCESS_TOKEN}
GITHUB_PERSONAL_ACCOUNT_NAME: ${GITHUB_PERSONAL_ACCOUNT_NAME}

logger:
level: info
35 changes: 11 additions & 24 deletions tests/benchmarks/mcp_universe/test_mcp_universe.py
@@ -70,12 +70,14 @@ def _setup_environment(model: str, temperature: float) -> None:
raise ValueError(
"GITHUB_PERSONAL_ACCESS_TOKEN environment variable not set. Please set it before running tests."
)
github_account_name = os.getenv("GITHUB_PERSONAL_ACCOUNT_NAME", "vinamra-test")
if not os.getenv("GITHUB_PERSONAL_ACCOUNT_NAME"):
raise ValueError(
"GITHUB_PERSONAL_ACCOUNT_NAME environment variable not set. Please set it before running tests."
)
os.environ.update(
{
"DEFAULT_MODEL": model,
"TEMPERATURE": str(temperature),
"GITHUB_PERSONAL_ACCOUNT_NAME": github_account_name,
}
)

@@ -223,26 +225,13 @@ def _log_evaluation_results(log_path: Path, evaluation: dict[str, Any]) -> None:


def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
"""Dynamically generate test cases."""
"""Dynamically generate test cases from task JSON files."""
if "test_id" in metafunc.fixturenames:
validate_only = metafunc.config.getoption("--validate-only")

if validate_only:
# Find existing output files to validate
log_dir = Path(metafunc.config.getoption("--log-dir"))
if log_dir.exists():
output_files = list(log_dir.glob("**/*_complete.json"))
test_ids = [f.stem.replace("_complete", "") for f in output_files]
else:
test_ids = []
else:
# Generate test IDs for repository management tasks
test_ids = sorted(
e.name.removesuffix(".json")
for e in _DATA_DIR.iterdir()
if e.is_file() and e.name.startswith("github_task_") and e.name.endswith(".json")
)

test_ids = sorted(
e.name.removesuffix(".json")
for e in _DATA_DIR.iterdir()
if e.is_file() and e.name.startswith("github_task_") and e.name.endswith(".json")
)
metafunc.parametrize("test_id", test_ids)


@@ -257,10 +246,8 @@ async def test_mcp_universe(
if not validate_only:
await _run_mcp_universe_test(test_id, model, temperature, output_dir)

# Determine log directory
log_dir = Path(request.config.getoption("--log-dir")) if validate_only else output_dir / "raw"

# Validate and get results
log_dir = output_dir / "raw"
evaluation = await _validate_test(test_id, model, log_dir)

# Fail test with detailed message if evaluation failed
4 changes: 0 additions & 4 deletions uv.lock

Some generated files are not rendered by default.