Merged
32 commits
7f850be
Pass token type ids as pooling param to the model runner
maxdebayser Jul 31, 2025
809384e
fix errors
maxdebayser Jul 31, 2025
6f330b7
fix cudagraph problem
maxdebayser Jul 31, 2025
794aaf2
compress token type ids
maxdebayser Jul 31, 2025
a6f949d
forgot to(gpu)
maxdebayser Jul 31, 2025
56dba67
Address review comments
maxdebayser Jul 31, 2025
cdf802a
Merge branch 'upstream_main' into v1_token_type_ids
maxdebayser Jul 31, 2025
3fe425a
fix mistake
maxdebayser Jul 31, 2025
4b19f4c
address review comments
maxdebayser Jul 31, 2025
5d0999c
fix type hints
maxdebayser Jul 31, 2025
2074d29
address review comments
maxdebayser Jul 31, 2025
148ab54
Merge branch 'upstream_main' into v1_token_type_ids
maxdebayser Jul 31, 2025
accf2f7
Merge branch 'upstream_main' into v1_token_type_ids
maxdebayser Aug 1, 2025
cb935de
change comment order
maxdebayser Aug 1, 2025
a250e5b
fix test error message
maxdebayser Aug 2, 2025
939165f
Merge branch 'upstream_main' into v1_token_type_ids
maxdebayser Aug 2, 2025
2add932
fix error msg inconsistency
maxdebayser Aug 4, 2025
4df6cd2
sync with gpu after changing input tensors
maxdebayser Aug 4, 2025
0123dc5
Merge branch 'upstream_main' into v1_token_type_ids
maxdebayser Aug 4, 2025
e486790
increase test tolerance
maxdebayser Aug 4, 2025
164d890
Merge branch 'upstream_main' into v1_token_type_ids
maxdebayser Aug 5, 2025
7e3b671
add TODO comment
maxdebayser Aug 5, 2025
2cac159
Merge branch 'upstream_main' into v1_token_type_ids
maxdebayser Aug 5, 2025
29ca69b
rename method
maxdebayser Aug 5, 2025
ed5a7ef
fix editing mistake
maxdebayser Aug 5, 2025
656059b
Merge branch 'upstream_main' into v1_token_type_ids
maxdebayser Aug 5, 2025
d9a8835
Merge branch 'upstream_main' into v1_token_type_ids
maxdebayser Aug 6, 2025
3d089dd
Merge branch 'upstream_main' into v1_token_type_ids
maxdebayser Aug 7, 2025
0471896
Merge branch 'upstream_main' into v1_token_type_ids
maxdebayser Aug 9, 2025
db612f7
rename argument
maxdebayser Aug 10, 2025
5184a3d
Merge branch 'upstream_main' into v1_token_type_ids
maxdebayser Aug 10, 2025
96e3871
rename argument
maxdebayser Aug 10, 2025
4 changes: 3 additions & 1 deletion tests/entrypoints/openai/test_rerank.py
@@ -126,7 +126,9 @@ def test_invocations(server: RemoteOpenAIServer):
invocation_output["results"]):
assert rerank_result.keys() == invocations_result.keys()
assert rerank_result["relevance_score"] == pytest.approx(
invocations_result["relevance_score"], rel=0.01)
invocations_result["relevance_score"], rel=0.05)
# TODO: reset this tolerance to 0.01 once we find
# an alternative to flash_attn with bfloat16


@pytest.mark.asyncio
4 changes: 3 additions & 1 deletion tests/entrypoints/openai/test_score.py
@@ -220,7 +220,9 @@ def test_invocations(self, server: RemoteOpenAIServer, model: dict[str,
invocation_output["data"]):
assert score_data.keys() == invocation_data.keys()
assert score_data["score"] == pytest.approx(
invocation_data["score"], rel=0.01)
invocation_data["score"], rel=0.05)
# TODO: reset this tolerance to 0.01 once we find
# an alternative to flash_attn with bfloat16

def test_activation(self, server: RemoteOpenAIServer, model: dict[str,
Any]):
9 changes: 9 additions & 0 deletions tests/models/language/pooling/test_scoring.py
@@ -23,6 +23,15 @@
"The capital of Germany is Berlin.",
]


@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass


DTYPE = "half"


54 changes: 26 additions & 28 deletions vllm/entrypoints/llm.py
@@ -28,11 +28,15 @@
apply_mistral_chat_template,
parse_chat_messages,
resolve_chat_template_content_format)
# yapf conflicts with isort for this block
# yapf: disable
from vllm.entrypoints.score_utils import (ScoreContentPartParam,
ScoreMultiModalParam,
_cosine_similarity,
_validate_score_input_lens,
compress_token_type_ids,
get_score_prompt)
# yapf: enable
from vllm.entrypoints.utils import (_validate_truncation_size,
log_non_default_args)
from vllm.inputs import PromptType, SingletonPrompt, TextPrompt, TokensPrompt
@@ -1329,6 +1333,7 @@ def _cross_encoding_score(

model_config = self.llm_engine.model_config
pooling_params.verify("score", model_config)
pooling_params_list = list[PoolingParams]()

tokenization_kwargs: dict[str, Any] = {}

@@ -1339,38 +1344,31 @@

input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)]

if model_config.is_multimodal_model:
for q, d in input_pairs:
_, engine_prompt = get_score_prompt(
model_config=model_config,
data_1=q,
data_2=d,
tokenizer=tokenizer,
tokenization_kwargs=tokenization_kwargs,
)
model_config = self.llm_engine.model_config

parsed_prompts.append(engine_prompt)
else:
for q, t in input_pairs:
if model_config.use_pad_token:
# cross_encoder models defaults to using pad_token.
prompt_inputs = tokenizer(
text=q, # type: ignore[arg-type]
text_pair=t, # type: ignore[arg-type]
**tokenization_kwargs)
else:
# `llm as reranker` models defaults to not using pad_token.
prompt_inputs = tokenizer(
text=q + t, # type: ignore[operator]
**tokenization_kwargs)
engine_prompt = TokensPrompt(
prompt_token_ids=prompt_inputs["input_ids"],
token_type_ids=prompt_inputs.get("token_type_ids"))
parsed_prompts.append(engine_prompt)
for q, d in input_pairs:
_, engine_prompt = get_score_prompt(
model_config=model_config,
data_1=q,
data_2=d,
tokenizer=tokenizer,
tokenization_kwargs=tokenization_kwargs,
)

if envs.VLLM_USE_V1 and (token_type_ids := engine_prompt.pop(
"token_type_ids", None)):
params = pooling_params.clone()
compressed = compress_token_type_ids(token_type_ids)
params.extra_kwargs = {"compressed_token_type_ids": compressed}
pooling_params_list.append(params)
else:
pooling_params_list.append(pooling_params)

parsed_prompts.append(engine_prompt)

self._validate_and_add_requests(
prompts=parsed_prompts,
params=pooling_params,
params=pooling_params_list,
use_tqdm=use_tqdm,
lora_request=lora_request,
)
82 changes: 31 additions & 51 deletions vllm/entrypoints/openai/serving_score.py
@@ -7,6 +7,7 @@

from fastapi import Request

from vllm import envs
from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.logger import RequestLogger
@@ -17,11 +18,15 @@
ScoreResponseData, UsageInfo)
from vllm.entrypoints.openai.serving_engine import OpenAIServing
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
# yapf conflicts with isort for this block
# yapf: disable
from vllm.entrypoints.score_utils import (ScoreContentPartParam,
ScoreMultiModalParam,
_cosine_similarity,
_validate_score_input_lens,
compress_token_type_ids,
get_score_prompt)
# yapf: enable
from vllm.entrypoints.utils import _validate_truncation_size
from vllm.inputs.data import TokensPrompt
from vllm.logger import init_logger
@@ -158,6 +163,8 @@ def _preprocess_score(
tokenizer=tokenizer,
tokenization_kwargs=tokenization_kwargs,
)
self._validate_input(request, engine_prompt["prompt_token_ids"],
full_prompt)
if request.mm_processor_kwargs is not None:
engine_prompt["mm_processor_kwargs"] = request.mm_processor_kwargs

@@ -188,64 +195,27 @@ async def _cross_encoding_score(

input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)]

if self.model_config.is_multimodal_model:
preprocess_async = make_async(self._preprocess_score,
executor=self._tokenizer_executor)

preprocess_async = make_async(self._preprocess_score,
executor=self._tokenizer_executor)
preprocessed_prompts = await asyncio.gather(
*(preprocess_async(request=request,
tokenizer=tokenizer,
tokenization_kwargs=tokenization_kwargs,
data_1=t1,
data_2=t2) for t1, t2 in input_pairs))

preprocessed_prompts = await asyncio.gather(
*(preprocess_async(request=request,
tokenizer=tokenizer,
tokenization_kwargs=tokenization_kwargs,
data_1=t1,
data_2=t2) for t1, t2 in input_pairs))

for full_prompt, engine_prompt in preprocessed_prompts:
request_prompts.append(full_prompt)
engine_prompts.append(engine_prompt)

else:
tokenize_async = make_async(tokenizer.__call__,
executor=self._tokenizer_executor)
use_pad_token = self.model_config.use_pad_token

if use_pad_token:
# cross_encoder models defaults to using pad_token.
tokenized_prompts = await asyncio.gather(*(
tokenize_async(
text=t1, # type: ignore[arg-type]
text_pair=t2, # type: ignore[arg-type]
**tokenization_kwargs) for t1, t2 in input_pairs))
else:
# `llm as reranker` models defaults to not using pad_token.
tokenized_prompts = await asyncio.gather(*(
tokenize_async(
text=t1 + # type: ignore[operator]
t2,
**tokenization_kwargs) for t1, t2 in input_pairs))

for prompt_inputs, (t1, t2) in zip(tokenized_prompts, input_pairs):
sep_token = tokenizer.sep_token if (tokenizer.sep_token
and use_pad_token) else ''
request_prompt = f"{t1}{sep_token}{t2}"

input_ids = prompt_inputs["input_ids"]
text_token_prompt = \
self._validate_input(request, input_ids, request_prompt)
engine_prompt = TokensPrompt(
prompt_token_ids=text_token_prompt["prompt_token_ids"],
token_type_ids=prompt_inputs.get("token_type_ids"))

request_prompts.append(request_prompt)
engine_prompts.append(engine_prompt)
for full_prompt, engine_prompt in preprocessed_prompts:
request_prompts.append(full_prompt)
engine_prompts.append(engine_prompt)

# Schedule the request and get the result generator.
generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []

pooling_params = request.to_pooling_params()
default_pooling_params = request.to_pooling_params()

try:
pooling_params.verify("score", self.model_config)
default_pooling_params.verify("score", self.model_config)
except ValueError as e:
return self.create_error_response(str(e))

@@ -254,9 +224,19 @@

self._log_inputs(request_id_item,
request_prompts[i],
params=pooling_params,
params=default_pooling_params,
lora_request=lora_request)

if envs.VLLM_USE_V1 and (token_type_ids := engine_prompt.pop(
"token_type_ids", None)):
pooling_params = default_pooling_params.clone()
compressed = compress_token_type_ids(token_type_ids)
pooling_params.extra_kwargs = {
"compressed_token_type_ids": compressed
}
else:
pooling_params = (default_pooling_params)

generator = self.engine_client.encode(
engine_prompt,
pooling_params,
40 changes: 37 additions & 3 deletions vllm/entrypoints/score_utils.py
@@ -184,15 +184,49 @@ def get_score_prompt(
model_config,
tokenizer,
)
from vllm.model_executor.model_loader import get_model_cls

full_prompt = apply_score_template(model_config, prompt_1, prompt_2)

prompt_inputs = tokenizer(full_prompt, **tokenization_kwargs)
model = get_model_cls(model_config)
if supports_score_template(model):
full_prompt = apply_score_template(model_config, prompt_1, prompt_2)
prompt_inputs = tokenizer(full_prompt, **tokenization_kwargs)
elif model_config.use_pad_token:
# cross_encoder models defaults to using pad_token.
prompt_inputs = tokenizer(text=prompt_1,
text_pair=prompt_2,
**tokenization_kwargs)
full_prompt = tokenizer.decode(prompt_inputs["input_ids"])
else:
# `llm as reranker` models defaults to not using pad_token.
full_prompt = prompt_1 + prompt_2
prompt_inputs = tokenizer(text=full_prompt, **tokenization_kwargs)

engine_prompt = TokensPrompt(prompt_token_ids=prompt_inputs["input_ids"])

if (token_type_ids := prompt_inputs.get("token_type_ids")) is not None:
engine_prompt["token_type_ids"] = token_type_ids

post_process_tokens(model_config, engine_prompt)

if mm_data is not None:
engine_prompt["multi_modal_data"] = mm_data
return full_prompt, engine_prompt


def compress_token_type_ids(token_type_ids: list[int]) -> int:
(Review comment from the author, maxdebayser: This is to minimize the amount of data that is transferred between processes.)

"""
Return position of the first 1 or the length of the list
if not found.
"""
first_one = len(token_type_ids)
err_msg = "Token type ids are expected to be a sequence"\
" of zeros followed by a sequence of ones"
for i, type_id in enumerate(token_type_ids):
if type_id == 0 and first_one < i:
raise ValueError(err_msg)
elif type_id == 1 and first_one > i:
first_one = i
elif type_id > 1:
raise ValueError(err_msg)

return first_one
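
As the author's review comment above notes, the point of the compression is to minimize the data transferred between processes: only a single integer travels to the model runner, carried as pooling_params.extra_kwargs["compressed_token_type_ids"]. Below is a minimal sketch of the round trip, assuming a vLLM build that includes this PR; the expand_token_type_ids helper is illustrative only and not part of the PR, it just shows how a receiving side could rebuild the 0/1 mask from that integer.

from vllm.entrypoints.score_utils import compress_token_type_ids

# Token type ids for a cross-encoder pair: zeros for the first segment (query),
# ones for the second segment (document).
token_type_ids = [0, 0, 0, 1, 1, 1, 1]

# Only the position of the first 1 is transferred between processes.
compressed = compress_token_type_ids(token_type_ids)
assert compressed == 3


def expand_token_type_ids(first_one: int, seq_len: int) -> list[int]:
    # Hypothetical inverse on the receiving side: rebuild the 0/1 mask.
    return [0] * first_one + [1] * (seq_len - first_one)


assert expand_token_type_ids(compressed, len(token_type_ids)) == token_type_ids

# Inputs that are not a run of zeros followed by a run of ones are rejected.
try:
    compress_token_type_ids([0, 1, 0, 1])
except ValueError:
    pass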