93 changes: 93 additions & 0 deletions tests/entrypoints/openai/test_skip_tokenizer.py
@@ -0,0 +1,93 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import base64
import io

import numpy as np
import pytest
import requests
import torch

from ...utils import RemoteOpenAIServer

MODEL_NAME = "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM"
DTYPE = "float16"


@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
    # Simple autouse wrapper to run both engines for each test
    # This can be promoted up to conftest.py to run for every
    # test in a package
    pass


@pytest.fixture(scope="module")
def server():
    args = [
        "--task",
        "embed",
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        DTYPE,
        "--enforce-eager",
        "--trust-remote-code",
        "--skip-tokenizer-init",
        "--max-num-seqs",
        "32"
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_request(server: RemoteOpenAIServer, model_name: str):

    pixel_values = torch.full((6, 512, 512), 1.0, dtype=torch.float16)
    location_coords = torch.full((1, 2), 1.0, dtype=torch.float16)

    buffer_tiff = io.BytesIO()
    torch.save(pixel_values, buffer_tiff)
    buffer_tiff.seek(0)
    binary_data = buffer_tiff.read()
    base64_tensor_embedding = base64.b64encode(binary_data).decode('utf-8')

    buffer_coord = io.BytesIO()
    torch.save(location_coords, buffer_coord)
    buffer_coord.seek(0)
    binary_data = buffer_coord.read()
    base64_coord_embedding = base64.b64encode(binary_data).decode('utf-8')

    prompt = {
        "model": model_name,
        "additional_data": {
            "prompt_token_ids": [1]
        },
        "encoding_format": "base64",
        "messages": [{
            "role": "user",
            "content": [{
                "type": "image_embeds",
                "image_embeds": {
                    "pixel_values": base64_tensor_embedding,
                    "location_coords": base64_coord_embedding,
                },
            }],
        }]
    }

    # test single pooling
    response = requests.post(server.url_for("pooling"), json=prompt)
    response.raise_for_status()

    output = response.json()["data"][0]['data']

    np_response = np.frombuffer(base64.b64decode(output), dtype=np.float32)

    assert len(np_response) == 524288
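
For reference, the request above serialises each tensor by writing it with torch.save and base64-encoding the resulting bytes before placing it in the image_embeds fields. A minimal round-trip sketch of that scheme, using only standard torch/io/base64 calls (an illustration under that assumption, not code from this PR):

import base64
import io

import torch


def encode_tensor(t: torch.Tensor) -> str:
    # torch.save -> raw bytes -> base64 text, mirroring the test above
    buf = io.BytesIO()
    torch.save(t, buf)
    return base64.b64encode(buf.getvalue()).decode("utf-8")


def decode_tensor(b64: str) -> torch.Tensor:
    # inverse direction: base64 text -> raw bytes -> torch.load
    return torch.load(io.BytesIO(base64.b64decode(b64)))


pixel_values = torch.full((6, 512, 512), 1.0, dtype=torch.float16)
assert torch.equal(decode_tensor(encode_tensor(pixel_values)), pixel_values)
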
20 changes: 14 additions & 6 deletions vllm/engine/multiprocessing/client.py
@@ -97,11 +97,16 @@ def __init__(self, ipc_path: str, engine_config: VllmConfig,
        self.model_config = engine_config.model_config
        self.decoding_config = engine_config.decoding_config

        # Create the tokenizer group.
        self.tokenizer = init_tokenizer_from_configs(
            model_config=self.model_config,
            scheduler_config=engine_config.scheduler_config,
            lora_config=engine_config.lora_config)
        if self.vllm_config.model_config.skip_tokenizer_init:
            self.tokenizer = None

        else:
            # Create the tokenizer group.
            self.tokenizer = init_tokenizer_from_configs(
                model_config=self.model_config,
                scheduler_config=engine_config.scheduler_config,
                lora_config=engine_config.lora_config)

        self.input_preprocessor = InputPreprocessor(self.model_config,
                                                    self.tokenizer)
Contributor review comment (severity: critical):

If self.tokenizer is None, this will raise an AttributeError. It's critical to ensure self.tokenizer is checked for None before being used here to prevent a crash. Consider adding a condition to skip this line if self.tokenizer is None.

        self.input_preprocessor = InputPreprocessor(self.model_config,
                                                    self.tokenizer if self.tokenizer else None)

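Constructing InputPreprocessor with a None tokenizer does not by itself fail; the AttributeError the comment warns about would surface later, when the tokenizer attribute is actually used. A small self-contained sketch of that behaviour and of the guard pattern the serving-layer changes in this PR rely on (an illustrative stand-in class, not vLLM code):

from typing import Optional


class PreprocessorSketch:
    # Stand-in for InputPreprocessor; the class and method names here are
    # illustrative only.

    def __init__(self, tokenizer: Optional[object]) -> None:
        # Storing tokenizer=None does not raise by itself...
        self.tokenizer = tokenizer

    def tokenize(self, text: str) -> list[int]:
        # ...an AttributeError would only appear here, when the tokenizer
        # is used. The guard falls back to a fixed pre-tokenized prompt id,
        # mirroring the prompt_token_ids=[1] used elsewhere in this PR.
        if self.tokenizer is None:
            return [1]
        return self.tokenizer.encode(text)


assert PreprocessorSketch(None).tokenize("hello") == [1]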

@@ -375,7 +380,10 @@ async def get_input_preprocessor(self) -> InputPreprocessor:
        return self.input_preprocessor

    async def get_tokenizer(self, lora_request: Optional[LoRARequest] = None):
        return await self.tokenizer.get_lora_tokenizer_async(lora_request)
        if self.tokenizer is None:
            return None
        else:
            return await self.tokenizer.get_lora_tokenizer_async(lora_request)

    async def get_vllm_config(self) -> VllmConfig:
        return self.vllm_config
14 changes: 12 additions & 2 deletions vllm/entrypoints/openai/serving_engine.py
@@ -880,7 +880,10 @@ async def _preprocess_chat(
        _chat_template_kwargs.update(chat_template_kwargs or {})

        request_prompt: Union[str, list[int]]
        if isinstance(tokenizer, MistralTokenizer):

        if tokenizer is None:
            request_prompt = "placeholder"
        elif isinstance(tokenizer, MistralTokenizer):
            request_prompt = apply_mistral_chat_template(
                tokenizer,
                messages=messages,
@@ -910,7 +913,14 @@
            request = tool_parser(tokenizer).adjust_request(  # type: ignore
                request=request)

        if isinstance(request_prompt, str):
        if tokenizer is None:
            assert isinstance(request_prompt, str), (
                "Prompt has to be a string "
                "when the tokenizer is not initialised")
            prompt_inputs = TextTokensPrompt(prompt=request_prompt,
                                             prompt_token_ids=[1])
        elif isinstance(request_prompt, str):
            prompt_inputs = await self._tokenize_prompt_input_async(
                request,
                tokenizer,
6 changes: 5 additions & 1 deletion vllm/entrypoints/openai/serving_pooling.py
@@ -96,7 +96,11 @@ async def create_pooling(
                self.max_model_len, truncate_prompt_tokens)
            lora_request = self._maybe_get_adapters(request)

            tokenizer = await self.engine_client.get_tokenizer(lora_request)
            if self.model_config.skip_tokenizer_init:
                tokenizer = None
            else:
                tokenizer = await self.engine_client.get_tokenizer(lora_request
                                                                    )

            if isinstance(request, PoolingChatRequest):
                (
5 changes: 4 additions & 1 deletion vllm/model_executor/models/prithvi_geospatial_mae.py
@@ -103,7 +103,10 @@ def apply(
        mm_kwargs = {}

        for k, v in mm_data.items():
            mm_kwargs[k] = v
            if isinstance(v, dict) and k == "image":
                mm_kwargs.update(v)
            else:
                mm_kwargs[k] = v
        mm_placeholders = {"image": [PlaceholderRange(offset=0, length=0)]}

        # This model receives in input a multi-dimensional tensor representing
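
To make the mm_kwargs change concrete: when the multimodal data arrives as a dict nested under the "image" key (the shape used by the image_embeds payload in the test earlier in this PR), the new branch spreads that dict's entries directly into mm_kwargs instead of nesting them. A small illustration with placeholder tensors (assumed shapes; not code from the PR):

import torch

pixel_values = torch.zeros((6, 512, 512))   # placeholder tensor
location_coords = torch.zeros((1, 2))       # placeholder tensor
mm_data = {"image": {"pixel_values": pixel_values,
                     "location_coords": location_coords}}

mm_kwargs = {}
for k, v in mm_data.items():
    if isinstance(v, dict) and k == "image":
        mm_kwargs.update(v)   # new behaviour: flatten the nested dict
    else:
        mm_kwargs[k] = v      # unchanged for any other entry

# mm_kwargs is now {"pixel_values": ..., "location_coords": ...}
# rather than {"image": {...}} as before this change.
assert set(mm_kwargs) == {"pixel_values", "location_coords"}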