Commit ad9026c

Merge pull request vllm-project#187 from ROCm/upstream_merge_24_09_16
Upstream merge 24/09/16
2 parents: 0958045 + c27753d

252 files changed: +11960 -4059 lines


Diff for: .buildkite/run-amd-test.sh

+24 -1

@@ -71,13 +71,36 @@ mkdir -p ${HF_CACHE}
 HF_MOUNT="/root/.cache/huggingface"

 commands=$@
+echo "Commands:$commands"
+#ignore certain kernels tests
+if [[ $commands == *" kernels "* ]]; then
+  commands="${commands} \
+  --ignore=kernels/test_attention.py \
+  --ignore=kernels/test_attention_selector.py \
+  --ignore=kernels/test_blocksparse_attention.py \
+  --ignore=kernels/test_causal_conv1d.py \
+  --ignore=kernels/test_cutlass.py \
+  --ignore=kernels/test_encoder_decoder_attn.py \
+  --ignore=kernels/test_flash_attn.py \
+  --ignore=kernels/test_flashinfer.py \
+  --ignore=kernels/test_int8_quant.py \
+  --ignore=kernels/test_machete_gemm.py \
+  --ignore=kernels/test_mamba_ssm.py \
+  --ignore=kernels/test_marlin_gemm.py \
+  --ignore=kernels/test_moe.py \
+  --ignore=kernels/test_prefix_prefill.py \
+  --ignore=kernels/test_rand.py \
+  --ignore=kernels/test_sampler.py"
+fi
+
 PARALLEL_JOB_COUNT=8
 # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
 if [[ $commands == *"--shard-id="* ]]; then
   for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
     #replace shard arguments
-    commands=${@//"--shard-id= "/"--shard-id=${GPU} "}
+    commands=${commands//"--shard-id= "/"--shard-id=${GPU} "}
     commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
+    echo "Shard ${GPU} commands:$commands"
     docker run \
         --device /dev/kfd --device /dev/dri \
         --network host \

Diff for: .buildkite/run-cpu-test-ppc64le.sh

+2 -1

@@ -11,8 +11,9 @@ trap remove_docker_container EXIT
 remove_docker_container

 # Run the image, setting --shm-size=4g for tensor parallel.
+source /etc/environment
 #docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
-docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --name cpu-test cpu-test
+docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN=$HF_TOKEN --name cpu-test cpu-test

 # Run basic model test
 docker exec cpu-test bash -c "

Diff for: .buildkite/run-cpu-test.sh

+11 -7

@@ -22,13 +22,17 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"

 # Run basic model test
 docker exec cpu-test bash -c "
-  pip install pytest matplotlib einops transformers_stream_generator
-  pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py \
-  --ignore=tests/models/test_oot_registration.py \
-  --ignore=tests/models/test_registry.py \
-  --ignore=tests/models/test_fp8.py \
-  --ignore=tests/models/test_jamba.py \
-  --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
+  pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator
+  pytest -v -s tests/models/decoder_only/language \
+  --ignore=tests/models/test_fp8.py \
+  --ignore=tests/models/decoder_only/language/test_jamba.py \
+  --ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
+
+# Run compressed-tensor test
+docker exec cpu-test bash -c "
+  pytest -s -v \
+  tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
+  tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynanmic_per_token"

 # online inference
 docker exec cpu-test bash -c "

Diff for: .buildkite/test-pipeline.yaml

+62 -27

@@ -50,6 +50,7 @@ steps:
   - tests/worker
   commands:
   - pytest -v -s async_engine # Async Engine
+  - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
   - pytest -v -s test_inputs.py
   - pytest -v -s multimodal
   - pytest -v -s test_utils.py # Utils
@@ -91,7 +92,7 @@ steps:
   - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
   - pytest -v -s entrypoints/openai
   - pytest -v -s entrypoints/test_chat_utils.py
-
+  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

 - label: Distributed Tests (4 GPUs) # 10min
   working_dir: "/vllm-workspace/tests"
@@ -162,30 +163,13 @@ steps:
   - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
   - python3 offline_inference_encoder_decoder.py

-- label: Models Test # 1hr10min
-  source_file_dependencies:
-  - vllm/
-  - tests/models
-  commands:
-  - pip install -e ./plugins/vllm_add_dummy_model
-  - pytest -v -s models/test_oot_registration.py # it needs a clean process
-  - pytest -v -s models -m \"not vlm\" --ignore=models/test_oot_registration.py
-
 - label: torch compile integration test
   source_file_dependencies:
   - vllm/
   commands:
   - pytest -v -s ./compile/test_full_graph.py
   - pytest -v -s ./compile/test_wrapper.py

-
-- label: Vision Language Models Test # 42min
-  #mirror_hardwares: [amd]
-  source_file_dependencies:
-  - vllm/
-  commands:
-  - pytest -v -s models -m vlm
-
 - label: Prefix Caching Test # 7min
   #mirror_hardwares: [amd]
   source_file_dependencies:
@@ -217,7 +201,8 @@ steps:
   commands:
   # See https://github.com/vllm-project/vllm/issues/5152
   - export VLLM_ATTENTION_BACKEND=XFORMERS
-  - pytest -v -s spec_decode
+  - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
+  - pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py

 - label: LoRA Test %N # 30min each
   mirror_hardwares: [amd]
@@ -228,6 +213,7 @@ steps:
   parallelism: 4

 - label: Kernels Test %N # 30min each
+  mirror_hardwares: [amd]
   source_file_dependencies:
   - csrc/
   - vllm/attention
@@ -282,6 +268,45 @@ steps:
   commands:
   - pytest -v -s tool_use

+##### models test #####
+
+- label: Basic Models Test # 3min
+  source_file_dependencies:
+  - vllm/
+  - tests/models
+  commands:
+  - pip install -e ./plugins/vllm_add_dummy_model
+  - pytest -v -s models/test_oot_registration.py # it needs a clean process
+  - pytest -v -s models/*.py --ignore=models/test_oot_registration.py
+
+- label: Decoder-only Language Models Test # 1h3min
+  #mirror_hardwares: [amd]
+  source_file_dependencies:
+  - vllm/
+  - tests/models/decoder_only/language
+  commands:
+  - pytest -v -s models/decoder_only/language
+
+- label: Decoder-only Multi-Modal Models Test # 56min
+  #mirror_hardwares: [amd]
+  source_file_dependencies:
+  - vllm/
+  - tests/models/decoder_only/audio_language
+  - tests/models/decoder_only/vision_language
+  commands:
+  - pytest -v -s models/decoder_only/audio_language
+  - pytest -v -s models/decoder_only/vision_language
+
+- label: Other Models Test # 5min
+  #mirror_hardwares: [amd]
+  source_file_dependencies:
+  - vllm/
+  - tests/models/embedding/language
+  - tests/models/encoder_decoder/language
+  commands:
+  - pytest -v -s models/embedding/language
+  - pytest -v -s models/encoder_decoder/language
+
 ##### 1 GPU test #####
 ##### multi gpus test #####

@@ -307,11 +332,11 @@ steps:
   - tests/distributed/
   commands:
   - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
-  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
+  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
   - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
   - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
   - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
-  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
+  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'

 - label: Distributed Tests (2 GPUs) # 28min
   #mirror_hardwares: [amd]
@@ -324,11 +349,10 @@ steps:
   - vllm/model_executor/models/
   - tests/distributed/
   commands:
-  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
-  - TARGET_TEST_SUITE=L4 pytest -v -s distributed/test_basic_distributed_correctness.py
-  - pytest -v -s distributed/test_basic_distributed_correctness_enc_dec.py
-  - pytest -v -s distributed/test_chunked_prefill_distributed.py
-  - pytest -v -s distributed/test_multimodal_broadcast.py
+  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
+  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
+  # Avoid importing model tests that cause CUDA reinitialization error
+  - pytest models/encoder_decoder/language/test_bart.py models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
   - pip install -e ./plugins/vllm_add_dummy_model
   - pytest -v -s distributed/test_distributed_oot.py
@@ -386,7 +410,18 @@ steps:
   - vllm/
   - tests/weight_loading
   commands:
-  - bash weight_loading/run_model_weight_loading_test.sh
+  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
+
+- label: Weight Loading Multiple GPU Test - Large Models # optional
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  gpu: a100
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/weight_loading
+  commands:
+  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt


 ##### multi gpus test #####

Diff for: .github/ISSUE_TEMPLATE/400-bug report.yml

+9

@@ -30,6 +30,15 @@ body:
       </details>
   validations:
     required: true
+- type: textarea
+  attributes:
+    label: Model Input Dumps
+    description: |
+      If you are facing crashing due to illegal memory access or other issues with model execution, vLLM may dump the problematic input of the model. In this case, you will see the message `Error in model execution (input dumped to /tmp/err_xxx.pkl)`. If you see this message, please zip the file (because GitHub doesn't support .pkl file format) and upload it here. This will help us to reproduce the issue and facilitate the debugging process.
+    placeholder: |
+      Upload the dumped input file.
+  validations:
+    required: false
 - type: textarea
   attributes:
     label: 🐛 Describe the bug
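The new issue-form field above asks reporters to zip the dumped input before attaching it. As an illustration only (the dump path is a placeholder; substitute the path printed in your own error message), this can be done with Python's standard zipfile module:

# Compress a dumped model input so it can be attached to a GitHub issue
# (GitHub does not accept raw .pkl attachments). The path is a placeholder.
import zipfile

dump_path = "/tmp/err_xxx.pkl"  # replace with the path from your error message
with zipfile.ZipFile(dump_path + ".zip", "w", zipfile.ZIP_DEFLATED) as zf:
    zf.write(dump_path)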

Diff for: .github/PULL_REQUEST_TEMPLATE.md

+10

@@ -39,6 +39,16 @@ FIX #xxxx (*link existing issues this PR will resolve*)
 <li>Please add documentation to <code>docs/source/</code> if the PR modifies the user-facing behaviors of vLLM. It helps vLLM user understand and utilize the new features or changes.</li>
 </ul>

+<h3>Adding or changing kernels</h3>
+<p>Each custom kernel needs a schema and one or more implementations to be registered with PyTorch.</p>
+<ul>
+<li>Make sure custom ops are registered following PyTorch guidelines: <a href="https://pytorch.org/tutorials/advanced/cpp_custom_ops.html#cpp-custom-ops-tutorial">Custom C++ and CUDA Operators</a> and <a href="https://docs.google.com/document/d/1_W62p8WJOQQUzPsJYa7s701JXt0qf2OfLub2sbkHOaU">The Custom Operators Manual</a></li>
+<li>Custom operations that return <code>Tensors</code> require meta-functions. Meta-functions should be implemented and registered in python so that dynamic dims can be handled automatically. See above documents for a description of meta-functions.</li>
+<li>Use <a href="https://pytorch.org/docs/stable/library.html#torch.library.opcheck"><code>torch.libary.opcheck()</code></a> to test the function registration and meta-function for any registered ops. See <code>tests/kernels</code> for examples.</li>
+<li>When changing the C++ signature of an existing op, the schema must be updated to reflect the changes.</li>
+<li>If a new custom type is needed, see the following document: <a href="https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA">Custom Class Support in PT2</a>.
+</ul>
+
 <h3>Notes for Large Changes</h3>
 <p>Please keep the changes as concise as possible. For major architectural changes (>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue (RFC) discussing the technical design and justification. Otherwise, we will tag it with <code>rfc-required</code> and might not go through the PR.</p>
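The kernel checklist added above refers to meta-functions and torch.library.opcheck(). As a minimal, hypothetical sketch of that registration-plus-opcheck flow (the op name and implementation below are invented for illustration and are not vLLM kernels):

# Hypothetical custom op with a Python meta ("fake") function, validated with
# torch.library.opcheck(); not an actual vLLM kernel.
import torch

@torch.library.custom_op("mylib::scaled_add", mutates_args=())
def scaled_add(x: torch.Tensor, y: torch.Tensor, alpha: float) -> torch.Tensor:
    # Eager implementation; a CUDA/HIP kernel would normally back this op.
    return x + alpha * y

@scaled_add.register_fake
def _(x: torch.Tensor, y: torch.Tensor, alpha: float) -> torch.Tensor:
    # Meta-function: describes only the output's shape/dtype so the op can be
    # traced with dynamic dimensions.
    return torch.empty_like(x)

# opcheck exercises the schema, the fake/meta registration, and basic
# dispatcher invariants of the registered op.
torch.library.opcheck(scaled_add, (torch.randn(4, 8), torch.randn(4, 8), 0.5))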

Diff for: CMakeLists.txt

+36 -26

@@ -208,9 +208,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   FetchContent_Declare(
         cutlass
         GIT_REPOSITORY https://github.com/nvidia/cutlass.git
-        # CUTLASS 3.5.1
-        GIT_TAG 06b21349bcf6ddf6a1686a47a137ad1446579db9
+        GIT_TAG v3.5.1
         GIT_PROGRESS TRUE
+
+        # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
+        # Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags.
+        # So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE
+        GIT_SHALLOW TRUE
   )
   FetchContent_MakeAvailable(cutlass)

@@ -244,6 +248,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
       "-gencode arch=compute_90a,code=sm_90a")
   endif()

+
   #
   # Machete kernels

@@ -307,28 +312,11 @@ define_gpu_extension_target(
   USE_SABI 3
   WITH_SOABI)

-if(VLLM_GPU_LANG STREQUAL "HIP")
-  #
-  # custom extension
-  #
-  set(CUSTOM_SRC
-    "csrc/custom/torch_bindings.cpp"
-    "csrc/custom/custom_kernels.cu"
-    "csrc/custom/fused_kernels.cu"
-    "csrc/custom/custom.cu"
-    "csrc/custom/paged_attention/attention_ll4mi.cu"
-  )
-
-  define_gpu_extension_target(
-    _custom_C
-    DESTINATION vllm
-    LANGUAGE ${VLLM_GPU_LANG}
-    SOURCES ${CUSTOM_SRC}
-    COMPILE_FLAGS ${VLLM_GPU_FLAGS}
-    ARCHITECTURES ${VLLM_GPU_ARCHES}
-    USE_SABI 3
-    WITH_SOABI)
-endif()
+# If CUTLASS is compiled on NVCC >= 12.5, it by default uses
+# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the
+# driver API. This causes problems when linking with earlier versions of CUDA.
+# Setting this variable sidesteps the issue by calling the driver directly.
+target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)

 #
 # _moe_C extension
@@ -354,6 +342,28 @@ define_gpu_extension_target(
   WITH_SOABI)


+if(VLLM_GPU_LANG STREQUAL "HIP")
+  #
+  # _rocm_C extension
+  #
+  set(VLLM_ROCM_EXT_SRC
+    "csrc/rocm/torch_bindings.cpp"
+    "csrc/rocm/attention.cu"
+    "csrc/rocm/custom_kernels.cu"
+    "csrc/rocm/fused_kernels.cu"
+    "csrc/rocm/custom.cu")
+
+  define_gpu_extension_target(
+    _rocm_C
+    DESTINATION vllm
+    LANGUAGE ${VLLM_GPU_LANG}
+    SOURCES ${VLLM_ROCM_EXT_SRC}
+    COMPILE_FLAGS ${VLLM_GPU_FLAGS}
+    ARCHITECTURES ${VLLM_GPU_ARCHES}
+    USE_SABI 3
+    WITH_SOABI)
+endif()
+

 if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
   message(STATUS "Enabling C extension.")
@@ -364,6 +374,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
 endif()

 if(VLLM_GPU_LANG STREQUAL "HIP")
-  message(STATUS "Enabling custom extension.")
-  add_dependencies(default _custom_C)
+  message(STATUS "Enabling rocm extension.")
+  add_dependencies(default _rocm_C)
 endif()

Diff for: Dockerfile

+1

@@ -145,6 +145,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
     && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
     && apt-get update -y \
     && apt-get install -y ccache software-properties-common git curl sudo vim python3-pip \
+    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
     && add-apt-repository ppa:deadsnakes/ppa \
     && apt-get update -y \
     && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
