Commit daed30c

[Bugfix] Fix feature size calculation for LLaVA-NeXT (vllm-project#6982)
1 parent 2f4e108 commit daed30c

5 files changed: +98 −50 lines changed

tests/models/test_llava_next.py

Lines changed: 70 additions & 18 deletions
@@ -1,7 +1,7 @@
-from typing import List, Optional, Tuple, Type
+from typing import List, Optional, Tuple, Type, overload

 import pytest
-from transformers import AutoConfig, AutoTokenizer
+from transformers import AutoTokenizer

 from vllm.multimodal.utils import rescale_image_size
 from vllm.sequence import SampleLogprobs
@@ -50,6 +50,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
     return hf_output_ids, hf_output_str, out_logprobs


+@overload
 def run_test(
     hf_runner: Type[HfRunner],
     vllm_runner: Type[VllmRunner],
@@ -62,13 +63,55 @@ def run_test(
     num_logprobs: int,
     tensor_parallel_size: int,
     distributed_executor_backend: Optional[str] = None,
+):
+    ...
+
+
+@overload
+def run_test(
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
+    image_assets: _ImageAssets,
+    model: str,
+    *,
+    sizes: List[Tuple[int, int]],
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+    tensor_parallel_size: int,
+    distributed_executor_backend: Optional[str] = None,
+):
+    ...
+
+
+def run_test(
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
+    image_assets: _ImageAssets,
+    model: str,
+    *,
+    size_factors: Optional[List[float]] = None,
+    sizes: Optional[List[Tuple[int, int]]] = None,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+    tensor_parallel_size: int,
+    distributed_executor_backend: Optional[str] = None,
 ):
     images = [asset.pil_image for asset in image_assets]

-    inputs_per_image = [(
-        [prompt for _ in size_factors],
-        [rescale_image_size(image, factor) for factor in size_factors],
-    ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
+    if size_factors is not None:
+        inputs_per_image = [(
+            [prompt for _ in size_factors],
+            [rescale_image_size(image, factor) for factor in size_factors],
+        ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
+    elif sizes is not None:
+        inputs_per_image = [(
+            [prompt for _ in sizes],
+            [image.resize(size) for size in sizes],
+        ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
+    else:
+        raise ValueError("You must provide either `size_factors` or `sizes`")

     # max_model_len should be greater than image_feature_size
     with vllm_runner(model,
@@ -150,15 +193,24 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
     )


-@pytest.mark.parametrize("height_and_width_and_result", [(1669, 2560, 2144),
-                                                          (183, 488, 776)])
-def test_image_feature_size(height_and_width_and_result):
-    # Avoid initializing CUDA too early in distributed tests
-    from vllm.model_executor.models.llava_next import (
-        get_llava_next_image_feature_size)
-
-    height, width, result = height_and_width_and_result
-    config = AutoConfig.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
-    assert get_llava_next_image_feature_size(config,
-                                             input_height=height,
-                                             input_width=width) == result
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize(
+    "sizes",
+    [[(1669, 2560), (2560, 1669), (183, 488), (488, 183)]],
+)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_models_fixed_sizes(hf_runner, vllm_runner, image_assets, model, sizes,
+                            dtype, max_tokens, num_logprobs) -> None:
+    run_test(
+        hf_runner,
+        vllm_runner,
+        image_assets,
+        model,
+        sizes=sizes,
+        dtype=dtype,
+        max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
+        tensor_parallel_size=1,
+    )
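Note: the two `@overload` stubs in this test only declare the alternative call signatures (`size_factors=` vs. `sizes=`); the single untyped implementation body performs the actual dispatch at runtime. Below is a minimal, self-contained sketch of the same pattern; the function name `pick_inputs` is illustrative and not taken from the diff.

from typing import List, Optional, Tuple, overload


@overload
def pick_inputs(*, size_factors: List[float]) -> List[float]: ...


@overload
def pick_inputs(*, sizes: List[Tuple[int, int]]) -> List[Tuple[int, int]]: ...


def pick_inputs(*, size_factors: Optional[List[float]] = None,
                sizes: Optional[List[Tuple[int, int]]] = None):
    # Exactly one of the keyword arguments is expected, mirroring run_test.
    if size_factors is not None:
        return size_factors
    if sizes is not None:
        return sizes
    raise ValueError("You must provide either `size_factors` or `sizes`")


assert pick_inputs(sizes=[(183, 488)]) == [(183, 488)]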

vllm/model_executor/models/fuyu.py

Lines changed: 1 addition & 1 deletion
@@ -169,7 +169,7 @@ def input_processor_for_fuyu(ctx: InputContext, llm_inputs: LLMInputs):
         raise TypeError(f"Invalid image type: {type(image_data)}")

     # process prompts
-    prompt = llm_inputs["prompt"]
+    prompt = llm_inputs.get("prompt")
     prompt_token_ids = llm_inputs["prompt_token_ids"]
     tokenizer = cached_get_tokenizer(model_config.model)
     # dim0 is batch_size, dim1 is subseq_size which will always be 1
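The switch from `llm_inputs["prompt"]` to `llm_inputs.get("prompt")` lets the input processor handle requests that carry only token IDs: `dict.get` returns `None` instead of raising `KeyError`. A rough sketch of the behaviour being relied on, with a plain dict standing in for the real `LLMInputs`:

llm_inputs = {"prompt_token_ids": [1, 2, 3]}  # no "prompt" key supplied

prompt = llm_inputs.get("prompt")             # None rather than a KeyError
prompt_token_ids = llm_inputs["prompt_token_ids"]

assert prompt is None
assert prompt_token_ids == [1, 2, 3]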

vllm/model_executor/models/internvl.py

Lines changed: 3 additions & 3 deletions
@@ -20,7 +20,7 @@
 from vllm.model_executor.models import ModelRegistry
 from vllm.model_executor.models.intern_vit import InternVisionModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensors
+from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.base import MultiModalInputs
 from vllm.multimodal.image import cached_get_tokenizer
 from vllm.sequence import IntermediateTensors, SamplerOutput
@@ -43,7 +43,7 @@

 class InternVLImagePixelInputs(TypedDict):
     type: Literal["pixel_values"]
-    data: BatchedTensors
+    data: Union[torch.Tensor, List[torch.Tensor]]
     """
     Shape: `(batch_size, 1 + num_patches, num_channels, height, width)`

@@ -193,7 +193,7 @@ def input_processor_for_internvl(ctx: InputContext, llm_inputs: LLMInputs):
     tokenizer = cached_get_tokenizer(model_config.tokenizer,
                                      trust_remote_code=True)

-    prompt = llm_inputs["prompt"]
+    prompt = llm_inputs.get("prompt")
     prompt_token_ids = llm_inputs["prompt_token_ids"]
     if prompt is None:
         prompt = tokenizer.decode(prompt_token_ids)
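Here, as in the other model files in this commit, the `BatchedTensors` alias is replaced by the explicit `Union[torch.Tensor, List[torch.Tensor]]` annotation on the pixel-input TypedDict. A minimal sketch of the resulting pattern; the class name is illustrative:

from typing import List, Literal, TypedDict, Union

import torch


class ImagePixelInputs(TypedDict):
    type: Literal["pixel_values"]
    # Either a single stacked tensor, or a list of per-image tensors
    # (e.g. when the number of patches differs across the batch).
    data: Union[torch.Tensor, List[torch.Tensor]]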

vllm/model_executor/models/llava_next.py

Lines changed: 22 additions & 26 deletions
@@ -21,7 +21,7 @@
 from vllm.model_executor.models.clip import CLIPVisionModel
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensors
+from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.sequence import IntermediateTensors, SamplerOutput

 from .clip import (dummy_image_for_clip, dummy_seq_data_for_clip,
@@ -43,7 +43,7 @@

 class LlavaNextImagePixelInputs(TypedDict):
     type: Literal["pixel_values"]
-    data: BatchedTensors
+    data: Union[torch.Tensor, List[torch.Tensor]]
     """
     Shape: `(batch_size, 1 + num_patches, num_channels, height, width)`

@@ -62,31 +62,26 @@ class LlavaNextImagePixelInputs(TypedDict):
 LlavaNextImageInputs = LlavaNextImagePixelInputs


-# Taken from: https://github.com/huggingface/text-generation-inference/blob/v2.0.4/server/text_generation_server/models/vlm_causal_lm.py#L91
-# NOTE: new_height and new_width are further incremented to properly invert the
-# floordiv operation: https://github.com/huggingface/transformers/blob/v4.42.2/src/transformers/models/llava_next/modeling_llava_next.py#L133
+# Based on: https://github.com/huggingface/text-generation-inference/blob/v2.2.0/server/text_generation_server/models/vlm_causal_lm.py#L79
 def _get_llava_next_num_unpadded_features(
-    height: int,
-    width: int,
+    original_height: int,
+    original_width: int,
     npatches: int,
     num_patch_height: int,
     num_patch_width: int,
 ) -> Tuple[int, int]:
     current_height = npatches * num_patch_height
     current_width = npatches * num_patch_width
-    current_height = torch.tensor(current_height).to("cuda")
-    current_width = torch.tensor(current_width).to("cuda")

-    aspect_ratio: float = width / height
-    current_aspect_ratio: float = current_width / current_height
+    aspect_ratio = original_width / original_height
+    current_aspect_ratio = current_width / current_height
+
     if aspect_ratio > current_aspect_ratio:
-        scale_factor = current_width / width
-        new_height = int(height * scale_factor)
+        new_height = (original_height * current_width) // original_width
         padding = (current_height - new_height) // 2
         current_height -= padding * 2
     else:
-        scale_factor = current_height / height
-        new_width = int(width * scale_factor)
+        new_width = (original_width * current_height) // original_height
         padding = (current_width - new_width) // 2
         current_width -= padding * 2

@@ -95,7 +90,7 @@ def _get_llava_next_num_unpadded_features(
     return (unpadded_features, newline_features)


-# Based on: https://github.com/huggingface/text-generation-inference/blob/v2.0.4/server/text_generation_server/models/vlm_causal_lm.py#L111
+# Based on: https://github.com/huggingface/text-generation-inference/blob/v2.2.0/server/text_generation_server/models/vlm_causal_lm.py#L106
 def get_llava_next_image_feature_size(
     hf_config: LlavaNextConfig,
     *,
@@ -111,9 +106,7 @@ def get_llava_next_image_feature_size(
     )
     base_feature_size = num_patches * num_patches

-    # Note: We follow the "wrong" width/height order
-    # [ref: PR huggingface/transformers#31588]
-    num_patch_width, num_patch_height = get_anyres_image_grid_shape(
+    num_patch_height, num_patch_width = get_anyres_image_grid_shape(
         image_size=(input_height, input_width),
         grid_pinpoints=hf_config.image_grid_pinpoints,
         patch_size=vision_config.image_size,
@@ -349,11 +342,12 @@ def _merge_image_patch_embeddings(self, image_size: torch.Tensor,
             if patch_embeddings.shape[0] > 1:
                 other_patch_embeds = patch_embeddings[1:]

+                # Move to CPU to avoid floating-point errors
+                orig_height, orig_width = image_size.tolist()
+
                 # image_aspect_ratio == "anyres"
-                # Note: We follow the "wrong" width/height order
-                # [ref: PR huggingface/transformers#31588]
-                num_patch_width, num_patch_height = get_anyres_image_grid_shape(
-                    image_size,
+                num_patch_height, num_patch_width = get_anyres_image_grid_shape(
+                    (orig_height, orig_width),
                     self.config.image_grid_pinpoints,
                     self.config.vision_config.image_size,
                 )
@@ -365,7 +359,7 @@ def _merge_image_patch_embeddings(self, image_size: torch.Tensor,
                     .permute(4, 0, 2, 1, 3).contiguous() \
                     .flatten(1, 2).flatten(2, 3)
                 other_patch_embeds = unpad_image(other_patch_embeds,
-                                                 image_size)
+                                                 (orig_height, orig_width))
                 other_patch_embeds = torch.cat((
                     other_patch_embeds,
                     self.image_newline[:, None, None] \
@@ -398,7 +392,7 @@ def _merge_image_patch_embeddings(self, image_size: torch.Tensor,
     def _process_image_pixels(
         self,
         inputs: LlavaNextImagePixelInputs,
-    ) -> BatchedTensors:
+    ) -> Union[torch.Tensor, List[torch.Tensor]]:
         assert self.vision_tower is not None

         pixel_values = inputs["data"]
@@ -425,7 +419,9 @@ def _process_image_pixels(
         ]

     def _process_image_input(
-            self, image_input: LlavaNextImageInputs) -> BatchedTensors:
+        self,
+        image_input: LlavaNextImageInputs,
+    ) -> Union[torch.Tensor, List[torch.Tensor]]:
         patch_embeddings = self._process_image_pixels(image_input)

         image_sizes = image_input.get("image_sizes")
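The core of the fix is `_get_llava_next_num_unpadded_features`: the unpadding arithmetic is now done with plain Python integers and floor division on the original image size, instead of CUDA tensors and a float `scale_factor` (the new inline comment calls this out as avoiding floating-point errors). Below is a standalone sketch of the new arithmetic so it can be run outside vLLM; the two feature-count lines at the end are assumptions based on the unchanged code between the hunks, and the example arguments are purely illustrative.

from typing import Tuple


def num_unpadded_features(original_height: int, original_width: int,
                          npatches: int, num_patch_height: int,
                          num_patch_width: int) -> Tuple[int, int]:
    # Size of the vision-tower patch grid covering all selected anyres tiles.
    current_height = npatches * num_patch_height
    current_width = npatches * num_patch_width

    aspect_ratio = original_width / original_height
    current_aspect_ratio = current_width / current_height

    if aspect_ratio > current_aspect_ratio:
        # Image is wider than the grid: vertical padding is cropped away.
        new_height = (original_height * current_width) // original_width
        padding = (current_height - new_height) // 2
        current_height -= padding * 2
    else:
        # Image is taller than the grid: horizontal padding is cropped away.
        new_width = (original_width * current_height) // original_height
        padding = (current_width - new_width) // 2
        current_width -= padding * 2

    # Assumed from the surrounding (unchanged) context: one feature per
    # remaining grid cell, plus one newline feature per remaining row.
    unpadded_features = current_height * current_width
    newline_features = current_height
    return unpadded_features, newline_features


# Example call with illustrative grid parameters (a 336px CLIP ViT-L/14 tower
# gives npatches = 336 // 14 = 24); real values depend on the model config.
print(num_unpadded_features(1669, 2560, 24, 2, 2))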

vllm/model_executor/models/phi3v.py

Lines changed: 2 additions & 2 deletions
@@ -36,7 +36,7 @@
 from vllm.model_executor.models.clip import CLIPVisionModel
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensors
+from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.image import cached_get_tokenizer
 from vllm.sequence import IntermediateTensors, SamplerOutput

@@ -261,7 +261,7 @@ def add_image_newline(self, image_features_hd):

 class Phi3VImagePixelInputs(TypedDict):
     type: Literal["pixel_values"]
-    data: BatchedTensors
+    data: Union[torch.Tensor, List[torch.Tensor]]
     """
     Shape: `(batch_size, 1 + num_patches, num_channels, height, width)`


0 commit comments