2 changes: 2 additions & 0 deletions .buildkite/test-pipeline.yaml
@@ -1144,6 +1144,7 @@ steps:
- pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
- pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
- pytest -v -s tests/v1/distributed/test_dbo.py

##### B200 test #####
- label: Distributed Tests (B200) # optional
@@ -1154,6 +1155,7 @@ steps:
commands:
- pytest -v -s tests/distributed/test_context_parallel.py
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
- pytest -v -s tests/v1/distributed/test_dbo.py

##### RL Integration Tests #####
- label: Prime-RL Integration Test # 15min
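These pipeline entries can also be reproduced outside Buildkite; a minimal local invocation of the new test, assuming at least two visible GPUs (the test skips itself otherwise via the num_gpus_available fixture):

# local run, assuming >= 2 GPUs are visible
pytest -v -s tests/v1/distributed/test_dbo.py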
85 changes: 85 additions & 0 deletions tests/v1/distributed/test_dbo.py
@@ -0,0 +1,85 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Test Dual Batch Overlap (DBO) with Data Parallelism + Expert Parallelism.

DBO is specifically designed for DP+EP scenarios to hide communication latency
by overlapping computation of two batches. This test validates that DBO works
correctly with the DeepSeek-V2-Lite model using GSM8K evaluation.
"""

import pytest

from tests.evals.gsm8k.gsm8k_eval import evaluate_gsm8k
from tests.utils import RemoteOpenAIServer

MODEL_NAME = "deepseek-ai/DeepSeek-V2-Lite-Chat"
DP_SIZE = 2

# GSM8K eval configuration
NUM_QUESTIONS = 256  # Fast eval for CI, but must be large enough to hit the DBO thresholds
NUM_SHOTS = 5  # Few-shot examples
MIN_ACCURACY = 0.62  # Expected 0.64 with a 2% buffer (based on vLLM test data)

# Increase max_num_seqs to trigger DBO for decode batches
# With 64 seqs, decode batches should exceed the 32-token threshold
MAX_NUM_SEQS = 64  # Increased from 16 to trigger decode DBO

# DeepEP backends to test
DEEPEP_BACKENDS = [
"deepep_low_latency",
"deepep_high_throughput",
]

@pytest.mark.parametrize("all2all_backend", DEEPEP_BACKENDS)
def test_dbo_dp_ep_gsm8k(all2all_backend: str, num_gpus_available):
"""
Test DBO with DP+EP using GSM8K evaluation.
"""
required_gpus = DP_SIZE

if num_gpus_available < required_gpus:
pytest.skip(f"Need at least {required_gpus} GPUs (DP={DP_SIZE})")

# Server arguments for DBO + DP + EP
server_args = [
"--max-model-len", "4096",
"--max-num-seqs", str(MAX_NUM_SEQS), # Use larger batch to trigger decode DBO
"--trust-remote-code",
# Note: Not using --enforce-eager to test DBO's alternate CUDA graph dispatching
"--data-parallel-size", str(DP_SIZE),
"--enable-expert-parallel",
"--enable-dbo",

[Inline review thread on this line]
Contributor: Do we want to drop the decode threshold as well?
Collaborator (Author): We could; I already verified that we hit cases above and below
both thresholds, but it's probably good to pin them so that if the defaults get updated
we don't suddenly start testing no-DBO.

        # Fix the thresholds so we know we trigger DBO
        "--dbo-decode-token-threshold", "16",
        "--dbo-prefill-token-threshold", "256",
        "--all2all-backend", all2all_backend,
    ]

    with RemoteOpenAIServer(
        MODEL_NAME,
        server_args,
        max_wait_seconds=600,  # Allow time for model loading with DP+EP
    ) as remote_server:
        # Use host and port directly from RemoteOpenAIServer
        host = f"http://{remote_server.host}"
        port = remote_server.port

        # Run GSM8K evaluation
        results = evaluate_gsm8k(
            num_questions=NUM_QUESTIONS,
            num_shots=NUM_SHOTS,
            host=host,
            port=port,
        )

        # Validate that accuracy is reasonable
        accuracy = results["accuracy"]
        assert accuracy >= MIN_ACCURACY, (
            f"DBO+DP+EP accuracy too low ({all2all_backend}): "
            f"{accuracy:.3f} < {MIN_ACCURACY:.3f} "
            f"(correct: {results['num_correct']}/{results['num_questions']})"
        )
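Since RemoteOpenAIServer boots a real OpenAI-compatible server with the arguments above, the same DBO + DP + EP configuration can be reproduced standalone. The following is a sketch only, assuming two visible GPUs and that the test's server args are forwarded to `vllm serve` unchanged; swap in deepep_high_throughput to cover the other parametrization:

# Sketch: mirrors the test's server_args; 2-GPU setup and pass-through to
# `vllm serve` are assumptions, not part of this PR.
vllm serve deepseek-ai/DeepSeek-V2-Lite-Chat \
    --max-model-len 4096 \
    --max-num-seqs 64 \
    --trust-remote-code \
    --data-parallel-size 2 \
    --enable-expert-parallel \
    --enable-dbo \
    --dbo-decode-token-threshold 16 \
    --dbo-prefill-token-threshold 256 \
    --all2all-backend deepep_low_latency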


