Changes from all commits
34 commits
96f8105
introduce _MHA_Backend; platform vit selection logic; expand qwen25vl…
tjtanaa Oct 30, 2025
9713a3c
add logging and fix interface
tjtanaa Oct 30, 2025
b2ce71c
expand qwen25vl test to test alll backend
tjtanaa Oct 30, 2025
504e2f9
expand qwen2vl test and ensure all pass
tjtanaa Oct 30, 2025
e8d1fca
document pallas is for TPU and update platforms tpu
tjtanaa Oct 30, 2025
4f0e4fe
remove print statements
tjtanaa Oct 30, 2025
4e86fb0
remove qwen2_vl vit attention override
tjtanaa Oct 30, 2025
88c2c1b
remove print statement
tjtanaa Oct 30, 2025
e0791ee
replace _Backend with _MHA_Backend; remove use_upstream_fa parameters…
tjtanaa Oct 30, 2025
8f53deb
update readme
tjtanaa Oct 30, 2025
6a36d55
update MHAttention condition; pass maverick test
tjtanaa Oct 30, 2025
c6f45b9
bug fix keye
tjtanaa Oct 31, 2025
110d943
Merge remote-tracking branch 'origin/main' into rfc-vit-amd
tjtanaa Oct 31, 2025
efb4bdf
fix glm4_1v aiter fa bug
tjtanaa Nov 1, 2025
164a1a2
introduce glm4_1v functionality test
tjtanaa Nov 1, 2025
e6a373e
add doctsocr unit test; passed the test
tjtanaa Nov 1, 2025
9173f8f
add ernie45vl test; remove coremodel marker
tjtanaa Nov 1, 2025
e81ec18
sync upstream
tjtanaa Nov 1, 2025
10dd219
bugfix the get vit backend if else condition
tjtanaa Nov 1, 2025
32441f7
add ovis25 unit test; pass it
tjtanaa Nov 1, 2025
0b1bbc0
bugfix ovis25vl torchsdpa rocm
tjtanaa Nov 1, 2025
f04ee40
reduce test cases of qwen2vl
tjtanaa Nov 1, 2025
7f21257
add qwen3 omni image unit tests
tjtanaa Nov 1, 2025
38fb26a
clean up unit tests unused code; update unit test name
tjtanaa Nov 1, 2025
9f07c0a
Merge remote-tracking branch 'origin/main' into rfc-vit-amd
tjtanaa Nov 1, 2025
b8c835c
clean up maybe_get_vit_flash_attn_backend
tjtanaa Nov 1, 2025
f27f223
bugfix qwen2_5_vl
tjtanaa Nov 1, 2025
81e177f
bugfix MHA Attention logic
tjtanaa Nov 1, 2025
33f1e9b
define a new str to _MHA_Backend mapping function
tjtanaa Nov 1, 2025
b05f5d3
document that FLASH_ATTN is also for XPU
tjtanaa Nov 1, 2025
43be286
sync with main
tjtanaa Nov 3, 2025
5ed59e6
sync upstream
tjtanaa Nov 4, 2025
7f915f4
qwen25vl does not need maybe_get_vit_flash_attn_backend
tjtanaa Nov 4, 2025
a6cf45a
sync with main
tjtanaa Nov 8, 2025
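Taken together, these commits replace the generic `_Backend` selection for vision encoders with a dedicated `_MHA_Backend` enum and a per-platform hook, `get_supported_vit_attn_backends()`, which the new tests below parametrize over (plus `None` for automatic selection). The snippet that follows is only an illustrative sketch of that hook's shape, not code from this PR; the real per-platform lists live under `vllm/platforms/`, and the example return value is an assumption:

# Illustrative sketch only (not code from this PR): the rough shape of the
# platform hook the new tests rely on. Each platform reports which ViT
# attention backends it supports, and the tests parametrize over that list
# together with None (automatic selection).
from vllm.attention.backends.registry import _MHA_Backend


class ExampleCudaLikePlatform:  # hypothetical stand-in for a vllm.platforms class
    @classmethod
    def get_supported_vit_attn_backends(cls) -> list[_MHA_Backend]:
        # Example return value only; the real lists differ per platform
        # (CUDA, ROCm, TPU, XPU) and are defined in vllm/platforms/*.py.
        return [
            _MHA_Backend.FLASH_ATTN,
            _MHA_Backend.VLLM_FLASH_ATTN,
            _MHA_Backend.XFORMERS,
        ]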
107 changes: 107 additions & 0 deletions tests/models/multimodal/generation/test_dots_ocr.py
@@ -0,0 +1,107 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import asdict

import pytest

from vllm import LLM, EngineArgs, SamplingParams
from vllm.attention.backends.registry import _MHA_Backend
from vllm.multimodal.utils import encode_image_base64
from vllm.platforms import current_platform

MODEL_NAME = "rednote-hilab/dots.ocr"

# Exact prompt from dots.ocr
# https://github.com/rednote-hilab/dots.ocr/blob/d72d1d8c5bdd0362eb264f714cdbd1e5daa7cdff/dots_ocr/utils/prompts.py#L3
# ruff: noqa: E501
PROMPT = """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.

1. Bbox format: [x1, y1, x2, y2]

2. Layout Categories: The possible categories are ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title'].

3. Text Extraction & Formatting Rules:
- Picture: For the 'Picture' category, the text field should be omitted.
- Formula: Format its text as LaTeX.
- Table: Format its text as HTML.
- All Others (Text, Title, etc.): Format their text as Markdown.

4. Constraints:
- The output text must be the original text from the image, with no translation.
- All layout elements must be sorted according to human reading order.

5. Final Output: The entire output must be a single JSON object.
"""


@pytest.mark.core_model
@pytest.mark.parametrize("prompt", [PROMPT])
@pytest.mark.parametrize(
    "mm_encoder_attn_backend",
    [None] + current_platform.get_supported_vit_attn_backends(),
)
def test_dots_ocr_vit_attn_backend_functionality(
    image_assets,
    prompt: str,
    mm_encoder_attn_backend: _MHA_Backend | None,
):
    # Use the stop_sign image, which has clear text
    stop_sign_image = [
        asset.pil_image for asset in image_assets if asset.name == "stop_sign"
    ][0]

    image_urls = [f"data:image/jpeg;base64,{encode_image_base64(stop_sign_image)}"]

    engine_args = EngineArgs(
        model=MODEL_NAME,
        trust_remote_code=True,
        max_model_len=32768,
        max_num_seqs=1,
        limit_mm_per_prompt={"image": 1},
        mm_encoder_attn_backend=mm_encoder_attn_backend,
    )

    # From the demo example of dots.ocr
    # https://github.com/rednote-hilab/dots.ocr/blob/d72d1d8c5bdd0362eb264f714cdbd1e5daa7cdff/dots_ocr/model/inference.py#L22

    placeholders = [
        {"type": "image_url", "image_url": {"url": image_url}}
        for image_url in image_urls
    ]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": f"<|img|><|imgpad|><|endofimg|>{prompt}"},
            ],
        },
    ]

    engine_args = asdict(engine_args) | {"seed": 42}
    llm = LLM(**engine_args)

    sampling_params = SamplingParams(
        temperature=0.1,
        max_tokens=16384,
        stop_token_ids=None,
        top_p=0.9,
    )

    outputs = llm.chat(
        messages=messages,
        sampling_params=sampling_params,
    )

    print("-" * 50)
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
        assert len(generated_text) > 10, (
            f"Generated text is too short: {generated_text}"
        )
        assert "stop" in generated_text.lower(), (
            f"Generated text does not contain 'stop': {generated_text}"
        )
        print("-" * 50)
85 changes: 85 additions & 0 deletions tests/models/multimodal/generation/test_ernie45_vl.py
@@ -0,0 +1,85 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import asdict

import pytest
from transformers import AutoProcessor

from vllm import LLM, EngineArgs, SamplingParams
from vllm.attention.backends.registry import _MHA_Backend
from vllm.multimodal.utils import encode_image_base64
from vllm.platforms import current_platform

from ....utils import large_gpu_test

MODEL_NAME = "baidu/ERNIE-4.5-VL-28B-A3B-PT"

QUESTION = "What is the content of each image?"


@large_gpu_test(min_gb=80)
@pytest.mark.parametrize("question", [QUESTION])
@pytest.mark.parametrize(
    "mm_encoder_attn_backend",
    [None] + current_platform.get_supported_vit_attn_backends(),
)
def test_ernie45_vl_vit_attn_backend_functionality(
    image_assets,
    question: str,
    mm_encoder_attn_backend: _MHA_Backend | None,
):
    images = [asset.pil_image for asset in image_assets]

    image_urls = [
        f"data:image/jpeg;base64,{encode_image_base64(image)}" for image in images
    ]

    engine_args = EngineArgs(
        model=MODEL_NAME,
        trust_remote_code=True,
        max_model_len=16384,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_encoder_attn_backend=mm_encoder_attn_backend,
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        },
    ]

    processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    engine_args = asdict(engine_args) | {"seed": 42}
    llm = LLM(**engine_args)

    sampling_params = SamplingParams(
        temperature=0.0, max_tokens=256, stop_token_ids=None
    )

    outputs = llm.generate(
        {
            "prompt": prompt,
            "multi_modal_data": {"image": images},
        },
        sampling_params=sampling_params,
    )

    print("-" * 50)
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
        assert len(generated_text) > 10, (
            f"Generated text is too short: {generated_text}"
        )
        print("-" * 50)
82 changes: 82 additions & 0 deletions tests/models/multimodal/generation/test_glm4_1v.py
@@ -0,0 +1,82 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import asdict

import pytest
from transformers import AutoProcessor

from vllm import LLM, EngineArgs, SamplingParams
from vllm.attention.backends.registry import _MHA_Backend
from vllm.multimodal.utils import encode_image_base64
from vllm.platforms import current_platform

MODEL_NAME = "zai-org/GLM-4.1V-9B-Thinking"

QUESTION = "What is the content of each image?"


@pytest.mark.parametrize("question", [QUESTION])
@pytest.mark.parametrize(
    "mm_encoder_attn_backend",
    [None] + current_platform.get_supported_vit_attn_backends(),
)
def test_glm4_1v_vit_attn_backend_functionality(
    image_assets,
    question: str,
    mm_encoder_attn_backend: _MHA_Backend | None,
):
    images = [asset.pil_image for asset in image_assets]

    image_urls = [
        f"data:image/jpeg;base64,{encode_image_base64(image)}" for image in images
    ]

    engine_args = EngineArgs(
        model=MODEL_NAME,
        trust_remote_code=True,
        max_model_len=32768,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_encoder_attn_backend=mm_encoder_attn_backend,
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        },
    ]

    processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    engine_args = asdict(engine_args) | {"seed": 42}
    llm = LLM(**engine_args)

    sampling_params = SamplingParams(
        temperature=0.0, max_tokens=256, stop_token_ids=None
    )

    outputs = llm.generate(
        {
            "prompt": prompt,
            "multi_modal_data": {"image": images},
        },
        sampling_params=sampling_params,
    )

    print("-" * 50)
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
        assert len(generated_text) > 10, (
            f"Generated text is too short: {generated_text}"
        )
        print("-" * 50)
30 changes: 17 additions & 13 deletions tests/models/multimodal/generation/test_keye.py
@@ -1,35 +1,38 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import asdict
from typing import NamedTuple

import pytest
from PIL.Image import Image
from transformers import AutoProcessor

from vllm import LLM, EngineArgs, SamplingParams
from vllm.attention.backends.registry import _MHA_Backend
from vllm.multimodal.utils import encode_image_base64
from vllm.platforms import current_platform

MODEL_NAME = "Kwai-Keye/Keye-VL-8B-Preview"

QUESTION = "What is the content of each image?"


class ModelRequestData(NamedTuple):
    engine_args: EngineArgs
    prompt: str
    image_data: list[Image]
    stop_token_ids: list[int] | None = None
    chat_template: str | None = None
    sampling_params: SamplingParams | None = None


@pytest.mark.core_model
@pytest.mark.parametrize("question", [QUESTION])
def test_keye_vl(
@pytest.mark.parametrize(
    "mm_encoder_attn_backend",
    [None] + current_platform.get_supported_vit_attn_backends(),
)
def test_keye_vl_vit_attn_backend_functionality(
    image_assets,
    question: str,
    mm_encoder_attn_backend: _MHA_Backend | None,
):
    if mm_encoder_attn_backend is not None and mm_encoder_attn_backend not in {
        _MHA_Backend.FLASH_ATTN,
        _MHA_Backend.XFORMERS,
        _MHA_Backend.VLLM_FLASH_ATTN,
        _MHA_Backend.ROCM_AITER_FA,
    }:
        pytest.skip(f"Keye-VL does not support {mm_encoder_attn_backend} backend now.")

    images = [asset.pil_image for asset in image_assets]

    image_urls = [
@@ -42,6 +45,7 @@ def test_keye_vl(
        max_model_len=8192,
        max_num_seqs=5,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_encoder_attn_backend=mm_encoder_attn_backend,
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
8 changes: 8 additions & 0 deletions tests/models/multimodal/generation/test_maverick.py
@@ -21,6 +21,8 @@
from transformers import AutoConfig, AutoProcessor, AutoTokenizer, GenerationConfig

from vllm import LLM, SamplingParams
from vllm.attention.backends.registry import _MHA_Backend
from vllm.platforms import current_platform
from vllm.v1.executor.abstract import Executor
from vllm.v1.kv_cache_interface import ChunkedLocalAttentionSpec, FullAttentionSpec

@@ -600,6 +602,10 @@ def run_reduced_model(llm: LLM, should_profile: bool = False) -> None:
)
@pytest.mark.parametrize("enforce_eager", [True, False])
@pytest.mark.parametrize("tp,ep", [(2, True)])
@pytest.mark.parametrize(
    "mm_encoder_attn_backend",
    [None] + current_platform.get_supported_vit_attn_backends(),
)
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_dummy_maverick(
    monkeypatch,
@@ -610,6 +616,7 @@ def test_dummy_maverick(
    enforce_eager: bool,
    tp: int,
    ep: bool,
    mm_encoder_attn_backend: _MHA_Backend | None,
    output_dir: str = "/tmp/reduced_maverick",
    force_recreate: bool = True,
    profile: bool = False,
@@ -638,6 +645,7 @@
        enforce_eager=enforce_eager,
        tensor_parallel_size=tp,
        enable_expert_parallel=ep,
        mm_encoder_attn_backend=mm_encoder_attn_backend,
    )

    check_attention_spec_interleaved_rope(