fix: add back image token count check

tjohnson31415 · tjohnson31415 · commit bd789f87c2e6 · 2025-02-27T17:13:07.000-07:00
This check was originally added in vllm-project#11939 But was removed as part of the refactoring in vllm-project#11427 Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py
@@ -54,7 +54,8 @@
     default_weight_loader, maybe_remap_kv_scale_name)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs
+from vllm.multimodal.inputs import (MultiModalEncDecInputs,
+                                    MultiModalFieldConfig, MultiModalKwargs)
 from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
                                    MultiModalDataDict, MultiModalDataItems)
 from vllm.multimodal.processing import (BaseProcessingInfo,
@@ -169,6 +170,26 @@ def get_dummy_processor_inputs(
 class MllamaMultiModalProcessor(EncDecMultiModalProcessor[MllamaProcessingInfo]
                                 ):
 
+    def apply(
+        self,
+        prompt: Union[str, list[int]],
+        mm_data: MultiModalDataDict,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> MultiModalEncDecInputs:
+        mm_inputs = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
+
+        # Check that the number of image tokens matches the number of images
+        num_image_tokens = mm_inputs['prompt_token_ids'].count(
+            self.info.get_hf_config().image_token_index)
+        image_data = mm_data.get("image", [])
+        num_images = 1 if isinstance(image_data, Image) else len(image_data)
+        if num_image_tokens != num_images:
+            raise ValueError(
+                f"The number of image tokens ({num_image_tokens}) must be"
+                f" the same as the number of images ({num_images})")
+
+        return mm_inputs
+
     def _call_hf_processor(
         self,
         prompt: str,