Add support for Prithvi geospatial model in serving mode #20307

Draft · wants to merge 15 commits into main
3 changes: 2 additions & 1 deletion examples/offline_inference/prithvi_geospatial_mae.py
@@ -143,7 +143,8 @@ def __init__(self):
         self.model = LLM(
             model=os.path.join(os.path.dirname(__file__), "./model"),
             skip_tokenizer_init=True,
-            dtype="float32",
+            dtype="float16",
+            enforce_eager=True,
         )

     def run(self, input_data, location_coords):
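For orientation, here is a minimal sketch of what the updated example exercises end to end. The constructor arguments come from the diff above and the prompt layout mirrors the new test below; the local "./model" path and the output handling are assumptions for illustration, not the example's actual code.

import torch
from vllm import LLM

# Assumed local checkpoint path; the real example resolves it next to its own file.
llm = LLM(
    model="./model",
    skip_tokenizer_init=True,  # Prithvi ships no tokenizer
    dtype="float16",
    enforce_eager=True,
)

prompt = {
    "prompt_token_ids": [1],  # dummy token; the model consumes no text
    "multi_modal_data": {
        "pixel_values": torch.full((6, 512, 512), 1.0, dtype=torch.float16),
        "location_coords": torch.full((1, 2), 1.0, dtype=torch.float16),
    },
}

# encode() drives the pooling path; no autoregressive generation happens.
outputs = llm.encode(prompt)
print(outputs[0].outputs.data.shape)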
50 changes: 50 additions & 0 deletions tests/models/multimodal/pooling/test_prithvi_mae.py
@@ -0,0 +1,50 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest
import torch

from ....conftest import VllmRunner


def generate_test_mm_data():
    mm_data = {
        "pixel_values": torch.full((6, 512, 512), 1.0, dtype=torch.float16),
        "location_coords": torch.full((1, 2), 1.0, dtype=torch.float16),
    }
    return mm_data


def _run_test(
    vllm_runner: type[VllmRunner],
    model: str,
) -> None:

    mm_data = generate_test_mm_data()
    prompt = {
        # This model takes no text input
        "prompt_token_ids": [1],
        "multi_modal_data": mm_data
    }
    with vllm_runner(model,
                     task="embed",
                     dtype=torch.float16,
                     enforce_eager=True,
                     skip_tokenizer_init=True) as vllm_model:
        vllm_model.encode(prompt)


MODELS = ["christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM"]


@pytest.mark.parametrize("model", MODELS)
def test_models_image(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
) -> None:
    _run_test(
        vllm_runner,
        model,
    )
13 changes: 11 additions & 2 deletions vllm/config.py
@@ -614,6 +614,8 @@ def __post_init__(self) -> None:
         self.served_model_name = get_served_model_name(self.model,
                                                        self.served_model_name)
         self.multimodal_config = self._init_multimodal_config()
+        self.model_supports_multimodal_raw_input = (
+            self._init_model_supports_multimodal_raw_input())
         if not self.skip_tokenizer_init:
             self._verify_tokenizer_mode()

@@ -706,6 +708,9 @@ def _init_multimodal_config(self) -> Optional["MultiModalConfig"]:

         return None

+    def _init_model_supports_multimodal_raw_input(self):
+        return self.registry.supports_multimodal_raw_input(self.architectures)
+
     def _get_encoder_config(self):
         return get_sentence_transformer_tokenizer_config(
             self.model, self.revision)

@@ -1100,10 +1105,10 @@ def get_sliding_window(self) -> Optional[Union[int, list[Optional[int]]]]:
         return self.get_hf_config_sliding_window()

     def get_vocab_size(self) -> int:
-        return self.hf_text_config.vocab_size
+        return getattr(self.hf_text_config, "vocab_size", 0)

     def get_hidden_size(self) -> int:
-        return self.hf_text_config.hidden_size
+        return getattr(self.hf_text_config, "hidden_size", 0)

     @property
     def is_deepseek_mla(self) -> bool:

@@ -1397,6 +1402,10 @@ def uses_mrope(self) -> bool:
     @property
     def is_multimodal_model(self) -> bool:
         return self.multimodal_config is not None

+    @property
+    def is_pooling_model(self) -> bool:
+        return self.registry.is_pooling_model(self.architectures)
+
     @property
     def is_cross_encoder(self) -> bool:
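These two flags are consumed elsewhere in the engine; a hedged illustration of the intended branching follows (the consuming call sites are assumptions, only the flags themselves come from this diff).

from vllm.config import ModelConfig


def route(model_config: ModelConfig) -> str:
    # Hedged illustration; the real consuming call sites are not shown in
    # this diff, only the two flags are.
    if model_config.model_supports_multimodal_raw_input:
        # multimodal kwargs (pixel_values, location_coords) reach the model
        # without the usual text-embedding merge
        pass
    if model_config.is_pooling_model:
        return "pooling"  # embed/encode path rather than generation
    return "generate"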
99 changes: 72 additions & 27 deletions vllm/entrypoints/chat_utils.py
@@ -75,6 +75,17 @@ class ChatCompletionContentPartImageEmbedsParam(TypedDict, total=False):
     """The type of the content part."""


+class ChatCompletionContentPartTensorsParam(TypedDict, total=False):
+    tensors: Required[Union[str, dict[str, str]]]
+    """
+    The tensors. It can be either:
+    - A single base64 string.
+    - A dictionary where each value is a base64 string.
+    """
+    type: Required[Literal["tensors"]]
+    """The type of the content part."""
+
+
 class VideoURL(TypedDict, total=False):
     url: Required[str]
     """
@@ -129,6 +140,7 @@ class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False):
     ChatCompletionContentPartVideoParam, ChatCompletionContentPartRefusalParam,
     CustomChatCompletionContentSimpleImageParam,
     ChatCompletionContentPartImageEmbedsParam,
+    ChatCompletionContentPartTensorsParam,
     CustomChatCompletionContentSimpleAudioParam,
     CustomChatCompletionContentSimpleVideoParam, str]
@@ -468,7 +480,7 @@ def resolve_chat_template_content_format(


-ModalityStr = Literal["image", "audio", "video", "image_embeds"]
+ModalityStr = Literal["image", "audio", "video", "image_embeds", "tensors"]
 _T = TypeVar("_T")

@@ -572,6 +584,8 @@ def _placeholder_str(self, modality: ModalityStr,
                 return self._cached_token_str(self._tokenizer,
                                               hf_config.video_token_index)
             raise TypeError(f"Unknown {modality} model type: {model_type}")
+        elif modality == "tensors":
+            return None
         else:
             raise TypeError(f"Unknown modality: {modality}")
@@ -630,6 +644,13 @@ def all_mm_data(self) -> Optional[MultiModalDataDict]:
                 raise ValueError(
                     "Only one message can have {'type': 'image_embeds'}")
             mm_inputs["image"] = image_embeds_lst[0]
+
+        if "tensors" in items_by_modality:
+            tensors_lst = items_by_modality["tensors"]
+            if len(tensors_lst) > 1:
+                raise ValueError(
+                    "Only one message can have {'type': 'tensors'}")
+            mm_inputs["tensors"] = tensors_lst[0]
         if "image" in items_by_modality:
             mm_inputs["image"] = items_by_modality["image"]  # A list of images
         if "audio" in items_by_modality:
@@ -663,6 +684,12 @@ async def all_mm_data(self) -> Optional[MultiModalDataDict]:
                 raise ValueError(
                     "Only one message can have {'type': 'image_embeds'}")
             mm_inputs["image"] = image_embeds_lst[0]
+        if "tensors" in items_by_modality:
+            tensors_lst = items_by_modality["tensors"]
+            if len(tensors_lst) > 1:
+                raise ValueError(
+                    "Only one message can have {'type': 'tensors'}")
+            mm_inputs["tensors"] = tensors_lst[0]
         if "image" in items_by_modality:
             mm_inputs["image"] = items_by_modality["image"]  # A list of images
         if "audio" in items_by_modality:
@@ -695,8 +722,9 @@ def parse_image(self, image_url: str) -> None:
         raise NotImplementedError

     @abstractmethod
-    def parse_image_embeds(self,
-                           image_embeds: Union[str, dict[str, str]]) -> None:
+    def parse_tensors(self,
+                      tensor_encodings: Union[str, dict[str, str]],
+                      modality_str: ModalityStr) -> None:
         raise NotImplementedError

     @abstractmethod
@@ -729,18 +757,22 @@ def parse_image(self, image_url: str) -> None:
         placeholder = self._tracker.add("image", image)
         self._add_placeholder(placeholder)

-    def parse_image_embeds(self,
-                           image_embeds: Union[str, dict[str, str]]) -> None:
-        if isinstance(image_embeds, dict):
-            embeds = {
-                k: self._connector.fetch_image_embedding(v)
-                for k, v in image_embeds.items()
+    def parse_tensors(self,
+                      tensor_encodings: Union[str, dict[str, str]],
+                      modality_str: ModalityStr) -> None:
+        if modality_str not in ["image_embeds", "tensors"]:
+            raise Exception("tensors are acceptable only as part "
+                            "of 'image_embeds' or 'tensors' modalities.")
+        if isinstance(tensor_encodings, dict):
+            tensors = {
+                k: self._connector.fetch_tensor_encoding(v)
+                for k, v in tensor_encodings.items()
             }
-            placeholder = self._tracker.add("image_embeds", embeds)
+            placeholder = self._tracker.add(modality_str, tensors)

-        if isinstance(image_embeds, str):
-            embedding = self._connector.fetch_image_embedding(image_embeds)
-            placeholder = self._tracker.add("image_embeds", embedding)
+        if isinstance(tensor_encodings, str):
+            tensor = self._connector.fetch_tensor_encoding(tensor_encodings)
+            placeholder = self._tracker.add(modality_str, tensor)

         self._add_placeholder(placeholder)
Comment on lines +760 to 778 (Contributor, critical):

There's a potential UnboundLocalError in parse_tensors. If tensor_encodings is neither a dict nor a str, placeholder will not be assigned before being used in self._add_placeholder(placeholder).

To fix this, add an else clause that raises a TypeError for unsupported types and ensure placeholder is always assigned.

Suggested change:

    def parse_tensors(self,
                      tensor_encodings: Union[str, dict[str, str]],
                      modality_str: ModalityStr) -> None:
        if modality_str not in ["image_embeds", "tensors"]:
            raise Exception("tensors are acceptable only as part "
                            "of 'image_embeds' or 'tensors' modalities.")
        if isinstance(tensor_encodings, dict):
            tensors = {
                k: self._connector.fetch_tensor_encoding(v)
                for k, v in tensor_encodings.items()
            }
            placeholder = self._tracker.add(modality_str, tensors)
        elif isinstance(tensor_encodings, str):
            tensor = self._connector.fetch_tensor_encoding(tensor_encodings)
            placeholder = self._tracker.add(modality_str, tensor)
        else:
            raise TypeError(
                f"Unsupported type for tensor_encodings: {type(tensor_encodings)}")
        self._add_placeholder(placeholder)

@@ -780,23 +812,27 @@ def parse_image(self, image_url: str) -> None:
         placeholder = self._tracker.add("image", image_coro)
         self._add_placeholder(placeholder)

-    def parse_image_embeds(self,
-                           image_embeds: Union[str, dict[str, str]]) -> None:
+    def parse_tensors(self,
+                      tensor_encodings: Union[str, dict[str, str]],
+                      modality_str: ModalityStr) -> None:
         future: asyncio.Future[Union[str, dict[str, str]]] = asyncio.Future()

-        if isinstance(image_embeds, dict):
-            embeds = {
-                k: self._connector.fetch_image_embedding(v)
-                for k, v in image_embeds.items()
+        if modality_str not in ["image_embeds", "tensors"]:
+            raise Exception("tensors are acceptable only as part "
+                            "of 'image_embeds' or 'tensors' modalities.")
+        if isinstance(tensor_encodings, dict):
+            tensors = {
+                k: self._connector.fetch_tensor_encoding(v)
+                for k, v in tensor_encodings.items()
             }
-            future.set_result(embeds)
+            future.set_result(tensors)

-        if isinstance(image_embeds, str):
-            embedding = self._connector.\
-                fetch_image_embedding(image_embeds)
-            future.set_result(embedding)
+        if isinstance(tensors, str):
+            tensor = self._connector.\
+                fetch_tensor_encoding(tensor_encodings)
+            future.set_result(tensor)
Comment on lines 819 to +833 (Contributor, critical):

This asynchronous version of parse_tensors has a critical bug. On line 830, isinstance(tensors, str) will raise a NameError if tensor_encodings is a string, because tensors is only defined in the preceding if block. This should be isinstance(tensor_encodings, str).

Additionally, similar to the synchronous version, using if/elif/else would make the logic more robust.

Suggested change:

        if modality_str not in ["image_embeds", "tensors"]:
            raise Exception("tensors are acceptable only as part "
                            "of 'image_embeds' or 'tensors' modalities.")
        if isinstance(tensor_encodings, dict):
            tensors = {
                k: self._connector.fetch_tensor_encoding(v)
                for k, v in tensor_encodings.items()
            }
            future.set_result(tensors)
        elif isinstance(tensor_encodings, str):
            tensor = self._connector.\
                fetch_tensor_encoding(tensor_encodings)
            future.set_result(tensor)
        else:
            raise TypeError(
                f"Unsupported type for tensor_encodings: {type(tensor_encodings)}")


placeholder = self._tracker.add("image_embeds", future)
placeholder = self._tracker.add(modality_str, future)
self._add_placeholder(placeholder)
Comment on lines +815 to 836 (Contributor, high):

In parse_tensors, there's a potential UnboundLocalError. The tensors variable is only assigned within the if isinstance(tensor_encodings, dict): block. If tensor_encodings is a string, tensors will not be defined, leading to an error when used in isinstance(tensors, str).

To fix this, use tensor_encodings instead of tensors in the isinstance check.

        if isinstance(tensor_encodings, dict):
            tensors= {
                k: self._connector.fetch_tensor_encoding(v)
                for k, v in tensor_encodings.items()
            }
            future.set_result(tensors)

        if isinstance(tensor_encodings, str):
            tensor= self._connector.\
                fetch_tensor_encoding(tensor_encodings)
            future.set_result(tensor)


    def parse_audio(self, audio_url: str) -> None:

@@ -819,6 +855,8 @@ def parse_video(self, video_url: str) -> None:
         self._add_placeholder(placeholder)


+
+
 def validate_chat_template(chat_template: Optional[Union[Path, str]]):
     """Raises if the provided chat template appears invalid."""
     if chat_template is None:
@@ -915,6 +953,7 @@ def _get_full_multimodal_text_prompt(placeholder_counts: dict[str, int],
 # No need to validate using Pydantic again
 _TextParser = partial(cast, ChatCompletionContentPartTextParam)
 _ImageEmbedsParser = partial(cast, ChatCompletionContentPartImageEmbedsParam)
+_TensorsParser = partial(cast, ChatCompletionContentPartTensorsParam)
 _InputAudioParser = partial(cast, ChatCompletionContentPartInputAudioParam)
 _RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam)
 # Need to validate url objects
@@ -935,6 +974,8 @@ def _get_full_multimodal_text_prompt(placeholder_counts: dict[str, int],
     lambda part: _ImageParser(part).get("image_url", {}).get("url", None),
     "image_embeds":
     lambda part: _ImageEmbedsParser(part).get("image_embeds", None),
+    "tensors":
+    lambda part: _TensorsParser(part).get("tensors", None),
     "audio_url":
     lambda part: _AudioParser(part).get("audio_url", {}).get("url", None),
     "input_audio":
@@ -1004,7 +1045,7 @@ def _parse_chat_message_content_mm_part(


 VALID_MESSAGE_CONTENT_MM_PART_TYPES = ("text", "refusal", "image_url",
-                                       "image_embeds",
+                                       "image_embeds", "tensors",
                                        "audio_url", "input_audio", "video_url")

@@ -1081,8 +1122,12 @@ def _parse_chat_message_content_part(
         return {'type': 'image'} if wrap_dicts else None
     if part_type == "image_embeds":
         content = cast(Union[str, dict[str, str]], content)
-        mm_parser.parse_image_embeds(content)
+        mm_parser.parse_tensors(content, "image_embeds")
         return {'type': 'image'} if wrap_dicts else None
+    if part_type == "tensors":
+        content = cast(Union[str, dict[str, str]], content)
+        mm_parser.parse_tensors(content, "tensors")
+        return {'type': 'tensors'} if wrap_dicts else None
     if part_type == "audio_url":
         str_content = cast(str, content)
         mm_parser.parse_audio(str_content)
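Taken together, these parser changes let a chat-style request carry raw tensors. A hedged end-to-end sketch follows: the message schema comes from the new TypedDict and parser above, while the endpoint path, the tensor_to_b64 helper, and the wire format of each base64 payload are assumptions for illustration.

import base64
import io

import requests
import torch


def tensor_to_b64(t: torch.Tensor) -> str:
    # Assumed wire format; the server-side decoder may expect a different one.
    buf = io.BytesIO()
    torch.save(t, buf)
    return base64.b64encode(buf.getvalue()).decode("utf-8")


payload = {
    "model": "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM",
    "encoding_format": "tensor",  # new option, see protocol.py below
    "messages": [{
        "role": "user",
        "content": [{
            "type": "tensors",
            "tensors": {
                "pixel_values": tensor_to_b64(
                    torch.full((6, 512, 512), 1.0, dtype=torch.float16)),
                "location_coords": tensor_to_b64(
                    torch.full((1, 2), 1.0, dtype=torch.float16)),
            },
        }],
    }],
}

resp = requests.post("http://localhost:8000/pooling", json=payload)
print(resp.status_code)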
2 changes: 1 addition & 1 deletion vllm/entrypoints/openai/protocol.py
@@ -1109,7 +1109,7 @@ class EmbeddingChatRequest(OpenAIBaseModel):
     model: Optional[str] = None
     messages: list[ChatCompletionMessageParam]

-    encoding_format: Literal["float", "base64"] = "float"
+    encoding_format: Literal["float", "base64", "tensor"] = "float"
     dimensions: Optional[int] = None
     user: Optional[str] = None
     truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
14 changes: 13 additions & 1 deletion vllm/entrypoints/openai/serving_engine.py
@@ -807,6 +807,8 @@ async def _preprocess_chat(
                 messages=messages,
                 **_chat_template_kwargs,
             )
+        elif tokenizer is None:
+            request_prompt = "placeholder"
         else:
             request_prompt = apply_hf_chat_template(
                 tokenizer=tokenizer,
@@ -831,7 +833,17 @@ async def _preprocess_chat(
             request = tool_parser(tokenizer).adjust_request(  # type: ignore
                 request=request)

-        if isinstance(request_prompt, str):
+        if tokenizer is None:
+            prompt_inputs = {}
+            if "prompt_token_ids" not in request.additional_data:
+                raise Exception("Request must contain "
+                                "additional_data['prompt_token_ids'] "
+                                "when the tokenizer is not initialised")
+
+            prompt_inputs["prompt_token_ids"] = request.additional_data[
+                "prompt_token_ids"]
+
+        elif isinstance(request_prompt, str):
            prompt_inputs = await self._tokenize_prompt_input_async(
                 request,
                 tokenizer,
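A hedged sketch of the request body this branch expects for a tokenizer-less model, inferred from the check above; whether additional_data is accepted verbatim on the wire depends on the request model and is an assumption here.

# Assumption: the serving layer exposes additional_data as a request field,
# matching the check on additional_data['prompt_token_ids'] above.
payload = {
    "model": "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM",
    "additional_data": {"prompt_token_ids": [1]},  # dummy token; no text input
    "messages": [...],  # e.g. the "tensors" content part shown earlier
}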
14 changes: 11 additions & 3 deletions vllm/entrypoints/openai/serving_pooling.py
@@ -25,6 +25,7 @@
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.entrypoints.utils import _validate_truncation_size
 from vllm.logger import init_logger
+from vllm.multimodal.image import ImageEmbeddingMediaIO
 from vllm.outputs import PoolingOutput, PoolingRequestOutput
 from vllm.utils import merge_async_iterators
@@ -33,7 +34,7 @@

 def _get_data(
     output: PoolingOutput,
-    encoding_format: Literal["float", "base64"],
+    encoding_format: Literal["float", "base64", "tensors"],
 ) -> Union[list[float], str]:
     if encoding_format == "float":
         return output.data.tolist()
@@ -43,6 +44,9 @@ def _get_data(
         pt_float32 = output.data.to(dtype=torch.float32)
         pooling_bytes = np.array(pt_float32, dtype="float32").tobytes()
         return base64.b64encode(pooling_bytes).decode("utf-8")
+    elif encoding_format == "tensor":
+        tensor_encoding_io = ImageEmbeddingMediaIO()
+        return tensor_encoding_io.encode_base64(output.data)
Contributor:
I'm not sure I understand this. The pooler output is a torch.Tensor but ImageEmbeddingMediaIO.encode_base64 expects an image. How does this work?

Author:

I highlighted this in the RFC under "Additional features explored to enable tensors".

ImageMediaIO expects an image; ImageEmbeddingMediaIO expects tensors. I thought about reusing ImageEmbeddingMediaIO to encode the tensor output, but I had a hard time reconstructing the tensor on the user side when it was encoded via encode_base64. That is why I added encode_tensor.

If this is too confusing, we can add a dedicated class like TensorIO that performs the encoding, so that we keep a separation of concerns.

Author (@mgazz, Jul 2, 2025):
Update: I ran a couple of tests using ImageEmbeddingMediaIO and the behaviour is strange.

Here is an example:

from vllm.multimodal.image import ImageEmbeddingMediaIO
import torch

pixel_values = torch.full((6, 512, 512), 1.0, dtype=torch.float16)
image_embeds_media_io = ImageEmbeddingMediaIO()
encoded = image_embeds_media_io.encode_base64(pixel_values)
decoded = image_embeds_media_io.load_base64("", encoded)

Here is the error:

(myenv) mgazz@mgazz-vllm-devpod-6c47989df9-hstsz:~/vllm$ python test.py 
Traceback (most recent call last):
  File "/workspace/vllm/test.py", line 7, in <module>
    decoded = image_embeds_media_io.load_base64("",encoded)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/vllm/vllm/multimodal/image.py", line 91, in load_base64
    return self.load_bytes(base64.b64decode(data))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/vllm/vllm/multimodal/image.py", line 88, in load_bytes
    return torch.load(buffer, weights_only=True)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/vllm/myenv/lib/python3.12/site-packages/torch/serialization.py", line 1548, in load
    raise pickle.UnpicklingError(_get_wo_message(str(e))) from None
_pickle.UnpicklingError: Weights only load failed. In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source.
Please file an issue with the following so that we can make `weights_only=True` compatible with your use case: WeightsUnpickler error: Unsupported operand 0

Check the documentation of torch.load to learn more about types accepted by default with weights_only https://pytorch.org/docs/stable/generated/torch.load.html.

If we encode via encode_tensor, which is part of this PR, it works fine:

from vllm.multimodal.image import ImageEmbeddingMediaIO
import torch

pixel_values = torch.full((6, 512, 512), 1.0, dtype=torch.float16)
image_embeds_media_io = ImageEmbeddingMediaIO()
encoded = image_embeds_media_io.encode_tensor(pixel_values)
decoded = image_embeds_media_io.load_base64("", encoded)
print(type(decoded))

This looks like a bug in the current implementation. Maybe the solution is updating encode_base64 with the implementation used in encode_tensor.

Author:

I also see Gemini is suggesting against using tensor.save ... #20307 (comment)

Author:
I opened a separate issue on this: #20427
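For reference, a bare torch.save/torch.load round trip through base64 does survive weights_only=True for plain tensors, so an encode_base64 built that way would round-trip. A minimal sketch follows; this is not the PR's actual encode_tensor implementation, which is not shown here.

import base64
import io

import torch


def encode_tensor_b64(t: torch.Tensor) -> str:
    buf = io.BytesIO()
    torch.save(t, buf)  # bare tensors are weights_only-safe to reload
    return base64.b64encode(buf.getvalue()).decode("utf-8")


def decode_tensor_b64(s: str) -> torch.Tensor:
    buf = io.BytesIO(base64.b64decode(s))
    return torch.load(buf, weights_only=True)


t = torch.full((6, 512, 512), 1.0, dtype=torch.float16)
assert torch.equal(decode_tensor_b64(encode_tensor_b64(t)), t)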


     assert_never(encoding_format)

@@ -99,7 +103,11 @@ async def create_pooling(
             prompt_adapter_request,
         ) = self._maybe_get_adapters(request)

-        tokenizer = await self.engine_client.get_tokenizer(lora_request)
+        if not self.model_config.skip_tokenizer_init:
+            tokenizer = await self.engine_client.get_tokenizer(lora_request
+                                                               )
+        else:
+            tokenizer = None

         if prompt_adapter_request is not None:
             raise NotImplementedError("Prompt adapter is not supported "
@@ -205,7 +213,7 @@ def request_output_to_pooling_response(
     request_id: str,
     created_time: int,
     model_name: str,
-    encoding_format: Literal["float", "base64"],
+    encoding_format: Literal["float", "base64", "tensors"],
 ) -> PoolingResponse:
     items: list[PoolingResponseData] = []
     num_prompt_tokens = 0
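Finally, a hedged sketch of how a client might consume a response produced by the new encoding_format="tensor" branch of _get_data above, reusing the payload from the earlier request sketch. The response field layout and the wire format of the encoded tensor are assumptions (see the encode_base64/encode_tensor discussion above); adjust the decode step to match the server.

import base64
import io

import requests
import torch

resp = requests.post("http://localhost:8000/pooling", json=payload).json()
b64_tensor = resp["data"][0]["data"]  # assumed field layout
buf = io.BytesIO(base64.b64decode(b64_tensor))
embedding = torch.load(buf, weights_only=True)  # assumes a torch.save payload
print(embedding.shape)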