Changes from all commits
34 commits
96f8105
introduce _MHA_Backend; platform vit selection logic; expand qwen25vl…
tjtanaa Oct 30, 2025
9713a3c
add logging and fix interface
tjtanaa Oct 30, 2025
b2ce71c
expand qwen25vl test to test alll backend
tjtanaa Oct 30, 2025
504e2f9
expand qwen2vl test and ensure all pass
tjtanaa Oct 30, 2025
e8d1fca
document pallas is for TPU and update platforms tpu
tjtanaa Oct 30, 2025
4f0e4fe
remove print statements
tjtanaa Oct 30, 2025
4e86fb0
remove qwen2_vl vit attention override
tjtanaa Oct 30, 2025
88c2c1b
remove print statement
tjtanaa Oct 30, 2025
e0791ee
replace _Backend with _MHA_Backend; remove use_upstream_fa parameters…
tjtanaa Oct 30, 2025
8f53deb
update readme
tjtanaa Oct 30, 2025
6a36d55
update MHAttention condition; pass maverick test
tjtanaa Oct 30, 2025
c6f45b9
bug fix keye
tjtanaa Oct 31, 2025
110d943
Merge remote-tracking branch 'origin/main' into rfc-vit-amd
tjtanaa Oct 31, 2025
efb4bdf
fix glm4_1v aiter fa bug
tjtanaa Nov 1, 2025
164a1a2
introduce glm4_1v functionality test
tjtanaa Nov 1, 2025
e6a373e
add doctsocr unit test; passed the test
tjtanaa Nov 1, 2025
9173f8f
add ernie45vl test; remove coremodel marker
tjtanaa Nov 1, 2025
e81ec18
sync upstream
tjtanaa Nov 1, 2025
10dd219
bugfix the get vit backend if else condition
tjtanaa Nov 1, 2025
32441f7
add ovis25 unit test; pass it
tjtanaa Nov 1, 2025
0b1bbc0
bugfix ovis25vl torchsdpa rocm
tjtanaa Nov 1, 2025
f04ee40
reduce test cases of qwen2vl
tjtanaa Nov 1, 2025
7f21257
add qwen3 omni image unit tests
tjtanaa Nov 1, 2025
38fb26a
clean up unit tests unused code; update unit test name
tjtanaa Nov 1, 2025
9f07c0a
Merge remote-tracking branch 'origin/main' into rfc-vit-amd
tjtanaa Nov 1, 2025
b8c835c
clean up maybe_get_vit_flash_attn_backend
tjtanaa Nov 1, 2025
f27f223
bugfix qwen2_5_vl
tjtanaa Nov 1, 2025
81e177f
bugfix MHA Attention logic
tjtanaa Nov 1, 2025
33f1e9b
define a new str to _MHA_Backend mapping function
tjtanaa Nov 1, 2025
b05f5d3
document that FLASH_ATTN is also for XPU
tjtanaa Nov 1, 2025
43be286
sync with main
tjtanaa Nov 3, 2025
5ed59e6
sync upstream
tjtanaa Nov 4, 2025
7f915f4
qwen25vl does not need maybe_get_vit_flash_attn_backend
tjtanaa Nov 4, 2025
a6cf45a
sync with main
tjtanaa Nov 8, 2025
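Taken together, these commits replace the generic `_Backend` selection for vision encoders with a dedicated `_MHA_Backend` enum and a per-platform hook, `get_supported_vit_attn_backends()`, which the new tests below parametrize over (plus `None` for automatic selection). The snippet that follows is only an illustrative sketch of that hook's shape, not code from this PR; the real per-platform lists live under `vllm/platforms/`, and the example return value is an assumption:

# Illustrative sketch only (not code from this PR): the rough shape of the
# platform hook the new tests rely on. Each platform reports which ViT
# attention backends it supports, and the tests parametrize over that list
# together with None (automatic selection).
from vllm.attention.backends.registry import _MHA_Backend


class ExampleCudaLikePlatform:  # hypothetical stand-in for a vllm.platforms class
    @classmethod
    def get_supported_vit_attn_backends(cls) -> list[_MHA_Backend]:
        # Example return value only; the real lists differ per platform
        # (CUDA, ROCm, TPU, XPU) and are defined in vllm/platforms/*.py.
        return [
            _MHA_Backend.FLASH_ATTN,
            _MHA_Backend.VLLM_FLASH_ATTN,
            _MHA_Backend.XFORMERS,
        ]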
107 changes: 107 additions & 0 deletions tests/models/multimodal/generation/test_dots_ocr.py
@@ -0,0 +1,107 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import asdict

import pytest

from vllm import LLM, EngineArgs, SamplingParams
from vllm.attention.backends.registry import _MHA_Backend
from vllm.multimodal.utils import encode_image_base64
from vllm.platforms import current_platform

MODEL_NAME = "rednote-hilab/dots.ocr"

# Exact prompt from dots.ocr
# https://github.com/rednote-hilab/dots.ocr/blob/d72d1d8c5bdd0362eb264f714cdbd1e5daa7cdff/dots_ocr/utils/prompts.py#L3
# ruff: noqa: E501
PROMPT = """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.

1. Bbox format: [x1, y1, x2, y2]

2. Layout Categories: The possible categories are ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title'].

3. Text Extraction & Formatting Rules:
- Picture: For the 'Picture' category, the text field should be omitted.
- Formula: Format its text as LaTeX.
- Table: Format its text as HTML.
- All Others (Text, Title, etc.): Format their text as Markdown.

4. Constraints:
- The output text must be the original text from the image, with no translation.
- All layout elements must be sorted according to human reading order.

5. Final Output: The entire output must be a single JSON object.
"""


@pytest.mark.core_model
@pytest.mark.parametrize("prompt", [PROMPT])
@pytest.mark.parametrize(
    "mm_encoder_attn_backend",
    [None] + current_platform.get_supported_vit_attn_backends(),
)
def test_dots_ocr_vit_attn_backend_functionality(
    image_assets,
    prompt: str,
    mm_encoder_attn_backend: _MHA_Backend | None,
):
    # Use the stop_sign image, which has clear text
    stop_sign_image = [
        asset.pil_image for asset in image_assets if asset.name == "stop_sign"
    ][0]

    image_urls = [f"data:image/jpeg;base64,{encode_image_base64(stop_sign_image)}"]

    engine_args = EngineArgs(
        model=MODEL_NAME,
        trust_remote_code=True,
        max_model_len=32768,
        max_num_seqs=1,
        limit_mm_per_prompt={"image": 1},
        mm_encoder_attn_backend=mm_encoder_attn_backend,
    )

    # From the demo example of dots.ocr
    # https://github.com/rednote-hilab/dots.ocr/blob/d72d1d8c5bdd0362eb264f714cdbd1e5daa7cdff/dots_ocr/model/inference.py#L22

    placeholders = [
        {"type": "image_url", "image_url": {"url": image_url}}
        for image_url in image_urls
    ]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": f"<|img|><|imgpad|><|endofimg|>{prompt}"},
            ],
        },
    ]

    engine_args = asdict(engine_args) | {"seed": 42}
    llm = LLM(**engine_args)

    sampling_params = SamplingParams(
        temperature=0.1,
        max_tokens=16384,
        stop_token_ids=None,
        top_p=0.9,
    )

    outputs = llm.chat(
        messages=messages,
        sampling_params=sampling_params,
    )

    print("-" * 50)
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
        assert len(generated_text) > 10, (
            f"Generated text is too short: {generated_text}"
        )
        assert "stop" in generated_text.lower(), (
            f"Generated text does not contain 'stop': {generated_text}"
        )
        print("-" * 50)
85 changes: 85 additions & 0 deletions tests/models/multimodal/generation/test_ernie45_vl.py
@@ -0,0 +1,85 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import asdict

import pytest
from transformers import AutoProcessor

from vllm import LLM, EngineArgs, SamplingParams
from vllm.attention.backends.registry import _MHA_Backend
from vllm.multimodal.utils import encode_image_base64
from vllm.platforms import current_platform

from ....utils import large_gpu_test

MODEL_NAME = "baidu/ERNIE-4.5-VL-28B-A3B-PT"

QUESTION = "What is the content of each image?"


@large_gpu_test(min_gb=80)
@pytest.mark.parametrize("question", [QUESTION])
@pytest.mark.parametrize(
    "mm_encoder_attn_backend",
    [None] + current_platform.get_supported_vit_attn_backends(),
)
def test_ernie45_vl_vit_attn_backend_functionality(
    image_assets,
    question: str,
    mm_encoder_attn_backend: _MHA_Backend | None,
):
    images = [asset.pil_image for asset in image_assets]

    image_urls = [
        f"data:image/jpeg;base64,{encode_image_base64(image)}" for image in images
    ]

    engine_args = EngineArgs(
        model=MODEL_NAME,
        trust_remote_code=True,
        max_model_len=16384,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_encoder_attn_backend=mm_encoder_attn_backend,
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        },
    ]

    processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    engine_args = asdict(engine_args) | {"seed": 42}
    llm = LLM(**engine_args)

    sampling_params = SamplingParams(
        temperature=0.0, max_tokens=256, stop_token_ids=None
    )

    outputs = llm.generate(
        {
            "prompt": prompt,
            "multi_modal_data": {"image": images},
        },
        sampling_params=sampling_params,
    )

    print("-" * 50)
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
        assert len(generated_text) > 10, (
            f"Generated text is too short: {generated_text}"
        )
        print("-" * 50)
82 changes: 82 additions & 0 deletions tests/models/multimodal/generation/test_glm4_1v.py
@@ -0,0 +1,82 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import asdict

import pytest
from transformers import AutoProcessor

from vllm import LLM, EngineArgs, SamplingParams
from vllm.attention.backends.registry import _MHA_Backend
from vllm.multimodal.utils import encode_image_base64
from vllm.platforms import current_platform

MODEL_NAME = "zai-org/GLM-4.1V-9B-Thinking"

QUESTION = "What is the content of each image?"


@pytest.mark.parametrize("question", [QUESTION])
@pytest.mark.parametrize(
    "mm_encoder_attn_backend",
    [None] + current_platform.get_supported_vit_attn_backends(),
)
def test_glm4_1v_vit_attn_backend_functionality(
    image_assets,
    question: str,
    mm_encoder_attn_backend: _MHA_Backend | None,
):
    images = [asset.pil_image for asset in image_assets]

    image_urls = [
        f"data:image/jpeg;base64,{encode_image_base64(image)}" for image in images
    ]

    engine_args = EngineArgs(
        model=MODEL_NAME,
        trust_remote_code=True,
        max_model_len=32768,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_encoder_attn_backend=mm_encoder_attn_backend,
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        },
    ]

    processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    engine_args = asdict(engine_args) | {"seed": 42}
    llm = LLM(**engine_args)

    sampling_params = SamplingParams(
        temperature=0.0, max_tokens=256, stop_token_ids=None
    )

    outputs = llm.generate(
        {
            "prompt": prompt,
            "multi_modal_data": {"image": images},
        },
        sampling_params=sampling_params,
    )

    print("-" * 50)
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
        assert len(generated_text) > 10, (
            f"Generated text is too short: {generated_text}"
        )
        print("-" * 50)
30 changes: 17 additions & 13 deletions tests/models/multimodal/generation/test_keye.py
@@ -1,35 +1,38 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import asdict
from typing import NamedTuple

import pytest
from PIL.Image import Image
from transformers import AutoProcessor

from vllm import LLM, EngineArgs, SamplingParams
from vllm.attention.backends.registry import _MHA_Backend
from vllm.multimodal.utils import encode_image_base64
from vllm.platforms import current_platform

MODEL_NAME = "Kwai-Keye/Keye-VL-8B-Preview"

QUESTION = "What is the content of each image?"


class ModelRequestData(NamedTuple):
    engine_args: EngineArgs
    prompt: str
    image_data: list[Image]
    stop_token_ids: list[int] | None = None
    chat_template: str | None = None
    sampling_params: SamplingParams | None = None


@pytest.mark.core_model
@pytest.mark.parametrize("question", [QUESTION])
def test_keye_vl(
@pytest.mark.parametrize(
    "mm_encoder_attn_backend",
    [None] + current_platform.get_supported_vit_attn_backends(),
)
def test_keye_vl_vit_attn_backend_functionality(
    image_assets,
    question: str,
    mm_encoder_attn_backend: _MHA_Backend | None,
):
    if mm_encoder_attn_backend is not None and mm_encoder_attn_backend not in {
        _MHA_Backend.FLASH_ATTN,
        _MHA_Backend.XFORMERS,
        _MHA_Backend.VLLM_FLASH_ATTN,
        _MHA_Backend.ROCM_AITER_FA,
    }:
        pytest.skip(f"Keye-VL does not support {mm_encoder_attn_backend} backend now.")

    images = [asset.pil_image for asset in image_assets]

    image_urls = [
@@ -42,6 +45,7 @@ def test_keye_vl(
        max_model_len=8192,
        max_num_seqs=5,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_encoder_attn_backend=mm_encoder_attn_backend,
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
8 changes: 8 additions & 0 deletions tests/models/multimodal/generation/test_maverick.py
@@ -21,6 +21,8 @@
from transformers import AutoConfig, AutoProcessor, AutoTokenizer, GenerationConfig

from vllm import LLM, SamplingParams
from vllm.attention.backends.registry import _MHA_Backend
from vllm.platforms import current_platform
from vllm.v1.executor.abstract import Executor
from vllm.v1.kv_cache_interface import ChunkedLocalAttentionSpec, FullAttentionSpec

@@ -600,6 +602,10 @@ def run_reduced_model(llm: LLM, should_profile: bool = False) -> None:
)
@pytest.mark.parametrize("enforce_eager", [True, False])
@pytest.mark.parametrize("tp,ep", [(2, True)])
@pytest.mark.parametrize(
    "mm_encoder_attn_backend",
    [None] + current_platform.get_supported_vit_attn_backends(),
)
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_dummy_maverick(
    monkeypatch,
@@ -610,6 +616,7 @@ def test_dummy_maverick(
    enforce_eager: bool,
    tp: int,
    ep: bool,
    mm_encoder_attn_backend: _MHA_Backend | None,
    output_dir: str = "/tmp/reduced_maverick",
    force_recreate: bool = True,
    profile: bool = False,
@@ -638,6 +645,7 @@
        enforce_eager=enforce_eager,
        tensor_parallel_size=tp,
        enable_expert_parallel=ep,
        mm_encoder_attn_backend=mm_encoder_attn_backend,
    )

    check_attention_spec_interleaved_rope(