|
54 | 54 | default_weight_loader, maybe_remap_kv_scale_name) |
55 | 55 | from vllm.model_executor.sampling_metadata import SamplingMetadata |
56 | 56 | from vllm.multimodal import MULTIMODAL_REGISTRY |
57 | | -from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs |
| 57 | +from vllm.multimodal.inputs import (MultiModalEncDecInputs, |
| 58 | + MultiModalFieldConfig, MultiModalKwargs) |
58 | 59 | from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, |
59 | 60 | MultiModalDataDict, MultiModalDataItems) |
60 | 61 | from vllm.multimodal.processing import (BaseProcessingInfo, |
@@ -169,6 +170,26 @@ def get_dummy_processor_inputs( |
169 | 170 | class MllamaMultiModalProcessor(EncDecMultiModalProcessor[MllamaProcessingInfo] |
170 | 171 | ): |
171 | 172 |
|
def apply(
    self,
    prompt: Union[str, list[int]],
    mm_data: MultiModalDataDict,
    hf_processor_mm_kwargs: Mapping[str, object],
) -> MultiModalEncDecInputs:
    """Process the prompt via the parent class, then validate image counts.

    The parent ``apply`` does the actual processing; this override only
    adds a consistency check on its result and returns it unchanged.

    Args:
        prompt: Prompt text, or an already-tokenized list of token ids.
        mm_data: Multimodal inputs; only the ``"image"`` entry is inspected
            here. A missing entry is treated as zero images.
        hf_processor_mm_kwargs: Extra keyword arguments, forwarded as-is
            to the parent ``apply``.

    Returns:
        The inputs produced by the parent ``apply``.

    Raises:
        ValueError: If the number of image placeholder tokens in
            ``prompt_token_ids`` differs from the number of images
            supplied in ``mm_data``.
    """
    processed = super().apply(prompt, mm_data, hf_processor_mm_kwargs)

    # Count occurrences of the image placeholder token id (taken from the
    # HF config) in the processed prompt.
    token_ids = processed['prompt_token_ids']
    image_token_id = self.info.get_hf_config().image_token_index
    num_image_tokens = token_ids.count(image_token_id)

    # A bare ``Image`` instance is one image; anything else is sized with
    # ``len``.
    # NOTE(review): a single image supplied as some other non-sequence or
    # array-like type would be counted by its ``len`` -- confirm callers
    # only pass ``Image`` or a sequence of images here.
    images = mm_data.get("image", [])
    if isinstance(images, Image):
        num_images = 1
    else:
        num_images = len(images)

    if num_image_tokens == num_images:
        return processed

    raise ValueError(
        f"The number of image tokens ({num_image_tokens}) must be"
        f" the same as the number of images ({num_images})")
| 192 | + |
172 | 193 | def _call_hf_processor( |
173 | 194 | self, |
174 | 195 | prompt: str, |
|
0 commit comments