Commit 5e85f4f

[VLM] Use SequenceData.from_token_counts to create dummy data (vllm-project#8687)
1 parent: 71c6049

12 files changed: +73 −80 lines

vllm/inputs/registry.py  (+1 −1)

@@ -125,7 +125,7 @@ def _default_dummy_data_factory(
         # Avoid circular import
         from vllm.sequence import SequenceData

-        dummy_seq_data = SequenceData.from_counts({0: seq_len})
+        dummy_seq_data = SequenceData.from_token_counts((0, seq_len))
         dummy_multi_modal_data = None

         return dummy_seq_data, dummy_multi_modal_data
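
The one-line change above is the pattern repeated throughout this commit: the old SequenceData.from_counts took a Mapping[int, int] of token id to count, while the renamed SequenceData.from_token_counts takes ordered (token_id, count) tuples as positional arguments. Below is a minimal standalone sketch of those semantics, mirroring the vllm/sequence.py hunk at the end of this diff so the behavior can be tried without vLLM installed; the "l" typecode for VLLM_TOKEN_ID_ARRAY_TYPE is an assumption.

# Standalone sketch only; not the vLLM class itself.
from array import array
from functools import reduce
from typing import Tuple

VLLM_TOKEN_ID_ARRAY_TYPE = "l"  # assumed typecode for token-id arrays

def from_token_counts(*token_counts: Tuple[int, int]) -> array:
    """Expand ordered (token_id, count) pairs into a flat array of token ids."""
    if len(token_counts) == 0:
        return array(VLLM_TOKEN_ID_ARRAY_TYPE, [])
    arrs = [
        array(VLLM_TOKEN_ID_ARRAY_TYPE, [token_id]) * count
        for token_id, count in token_counts
    ]
    return reduce(array.__add__, arrs)

# Old API: from_counts({0: seq_len})        -- a dict keyed by token id.
# New API: from_token_counts((0, seq_len))  -- positional tuples, so segment
# order is preserved and the same id may appear in more than one segment.
print(list(from_token_counts((0, 4))))              # [0, 0, 0, 0]
print(list(from_token_counts((32000, 3), (0, 2))))  # [32000, 32000, 32000, 0, 0]

The ordered-tuple form matters most for the Qwen2-VL hunk further down, where several distinct placeholder ids must appear in a fixed order.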

vllm/model_executor/models/blip.py  (+6 −7)

@@ -1,6 +1,5 @@
 """Minimal implementation of BlipVisionModel intended to be only used
 within a vision language model."""
-from array import array
 from typing import Optional, Union

 import torch
@@ -19,7 +18,7 @@
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.multimodal.utils import (cached_get_tokenizer,
                                    repeat_and_pad_placeholder_tokens)
-from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData
+from vllm.sequence import SequenceData

 try:
     from xformers import ops as xops
@@ -53,6 +52,7 @@ def get_max_blip_image_tokens(
 def dummy_seq_data_for_blip(
     hf_config: Union[BlipVisionConfig, Blip2VisionConfig],
     seq_len: int,
+    num_images: int,
     *,
     image_token_id: int,
     image_feature_size_override: Optional[int] = None,
@@ -62,11 +62,10 @@ def dummy_seq_data_for_blip(
     else:
         image_feature_size = image_feature_size_override

-    token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE,
-                      [image_token_id]) * image_feature_size
-    token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE,
-                       [0]) * (seq_len - image_feature_size)
-    return SequenceData(token_ids)
+    return SequenceData.from_token_counts(
+        (image_token_id, image_feature_size * num_images),
+        (0, seq_len - image_feature_size * num_images),
+    )


 def dummy_image_for_blip(
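
Besides switching constructors, dummy_seq_data_for_blip gains a num_images parameter, so the placeholder count scales with the number of images in the dummy prompt. A quick arithmetic sketch with illustrative values (none of these numbers are BLIP's real sizes or token ids):

# Illustrative values only.
seq_len, image_feature_size, num_images, image_token_id = 32, 8, 2, 7

image_tokens = image_feature_size * num_images  # 16 placeholder positions
pad_tokens = seq_len - image_tokens             # 16 remaining positions, padded with 0

token_ids = [image_token_id] * image_tokens + [0] * pad_tokens
assert len(token_ids) == seq_len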

vllm/model_executor/models/blip2.py  (+5 −8)

@@ -1,4 +1,3 @@
-from array import array
 from typing import (Iterable, List, Literal, Mapping, Optional, Tuple,
                     TypedDict, Union)

@@ -18,8 +17,7 @@
 from vllm.model_executor.models.opt import OPTModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
-                           SequenceData)
+from vllm.sequence import IntermediateTensors, SequenceData

 from .blip import (BlipVisionModel, dummy_image_for_blip,
                    get_max_blip_image_tokens)
@@ -429,11 +427,10 @@ def dummy_seq_data_for_blip2(
     else:
         image_feature_size = image_feature_size_override

-    token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE,
-                      [image_token_id]) * image_feature_size * num_images
-    token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE,
-                       [0]) * (seq_len - image_feature_size * num_images)
-    return SequenceData(token_ids)
+    return SequenceData.from_token_counts(
+        (image_token_id, image_feature_size * num_images),
+        (0, seq_len - image_feature_size * num_images),
+    )


 def dummy_data_for_blip2(ctx: InputContext, seq_len: int,

vllm/model_executor/models/chameleon.py  (+5 −8)

@@ -1,4 +1,3 @@
-from array import array
 from functools import cached_property
 from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional,
                     Tuple, TypedDict)
@@ -32,8 +31,7 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.utils import (cached_get_tokenizer,
                                    repeat_and_pad_placeholder_tokens)
-from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
-                           SequenceData)
+from vllm.sequence import IntermediateTensors, SequenceData
 from vllm.utils import print_warning_once

 from .interfaces import SupportsMultiModal
@@ -72,11 +70,10 @@ def dummy_seq_data_for_chameleon(
     else:
         image_feature_size = image_feature_size_override

-    token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE,
-                      [image_token_id]) * image_feature_size * num_images
-    token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE,
-                       [0]) * (seq_len - image_feature_size * num_images)
-    return SequenceData(token_ids)
+    return SequenceData.from_token_counts(
+        (image_token_id, image_feature_size * num_images),
+        (0, seq_len - image_feature_size * num_images),
+    )


 def dummy_image_for_chameleon(

vllm/model_executor/models/clip.py  (+5 −7)

@@ -1,6 +1,5 @@
 """Minimal implementation of CLIPVisionModel intended to be only used
 within a vision language model."""
-from array import array
 from typing import Iterable, List, Optional, Tuple, Union

 import torch
@@ -20,7 +19,7 @@
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.multimodal.utils import (cached_get_tokenizer,
                                    repeat_and_pad_placeholder_tokens)
-from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData
+from vllm.sequence import SequenceData

 try:
     from xformers import ops as xops
@@ -62,11 +61,10 @@ def dummy_seq_data_for_clip(
     else:
         image_feature_size = image_feature_size_override

-    token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE,
-                      [image_token_id]) * image_feature_size * num_images
-    token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE,
-                       [0]) * (seq_len - image_feature_size * num_images)
-    return SequenceData(token_ids)
+    return SequenceData.from_token_counts(
+        (image_token_id, image_feature_size * num_images),
+        (0, seq_len - image_feature_size * num_images),
+    )


 def dummy_image_for_clip(

vllm/model_executor/models/minicpmv.py  (+2 −5)

@@ -23,7 +23,6 @@
 """Inference-only MiniCPM-V model compatible with HuggingFace weights."""
 import math
 import re
-from array import array
 from functools import partial
 from typing import (Any, Callable, Iterable, List, Mapping, Optional, Tuple,
                     TypedDict)
@@ -56,8 +55,7 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.image import cached_get_image_processor
 from vllm.multimodal.utils import cached_get_tokenizer
-from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
-                           SequenceData)
+from vllm.sequence import IntermediateTensors, SequenceData

 from .idefics2_vision_model import Idefics2VisionTransformer

@@ -259,8 +257,7 @@ def get_max_minicpmv_image_tokens(ctx: InputContext):


 def dummy_seq_data_for_minicpmv(seq_len: int, num_images: int):
-    token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, [0]) * seq_len
-    return SequenceData(token_ids)
+    return SequenceData.from_token_counts((0, seq_len))


 def dummy_image_for_minicpmv(hf_config: PretrainedConfig, num_images: int):

vllm/model_executor/models/pixtral.py  (+5 −9)

@@ -1,4 +1,3 @@
-from array import array
 from dataclasses import dataclass, fields
 from itertools import tee
 from typing import Iterable, List, Mapping, Optional, Tuple, Union
@@ -24,8 +23,7 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.base import MultiModalInputs
 from vllm.multimodal.utils import cached_get_tokenizer
-from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
-                           SequenceData)
+from vllm.sequence import IntermediateTensors, SequenceData

 from .interfaces import SupportsMultiModal
 from .utils import init_vllm_registered_model
@@ -63,13 +61,11 @@ def dummy_data_for_pixtral(ctx: InputContext, seq_len: int,
     image_feature_size = (size**2) // (patch_size**2)

     num_image_tokens = image_feature_size * num_images
+    seq_data = SequenceData.from_token_counts(
+        (image_token_id, num_image_tokens),
+        (0, seq_len - num_image_tokens),
+    )

-    token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE,
-                      [image_token_id]) * num_image_tokens
-    token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE,
-                       [0]) * (seq_len - num_image_tokens)
-
-    seq_data = SequenceData(token_ids)
     mm_data = {"image": num_images * [image]}
     return seq_data, mm_data


vllm/model_executor/models/qwen.py  (+5 −5)

@@ -7,7 +7,6 @@

 import math
 import re
-from array import array
 from functools import partial
 from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping,
                     Optional, Tuple, TypedDict, Union)
@@ -45,8 +44,7 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.base import MultiModalInputs
 from vllm.multimodal.utils import cached_get_tokenizer
-from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
-                           SequenceData)
+from vllm.sequence import IntermediateTensors, SequenceData
 from vllm.utils import is_list_of

 from .utils import flatten_bn, is_pp_missing_parameter, make_layers
@@ -819,7 +817,7 @@ def dummy_data_for_qwen(
     # The presence of a visual config indicates this is a multimodal model.
     # If we don't have it, the model is considered an LLM for warmup purposes.
     if not hasattr(hf_config, "visual"):
-        seq_data = SequenceData(array(VLLM_TOKEN_ID_ARRAY_TYPE, [0] * seq_len))
+        seq_data = SequenceData.from_token_counts((0, seq_len))
         mm_data = None
         return seq_data, mm_data

@@ -846,11 +844,13 @@ def dummy_data_for_qwen(
     if len(toks) < seq_len:
         toks += [0] * (seq_len - len(toks))

+    seq_data = SequenceData.from_seqs(toks)
+
     # Build the input images; width/height doesn't actually matter here since
     # the data will get resized and the # of tokens per image is constant
     image = Image.new("RGB", (224, 224), color=0)
     mm_data = {"image": image if num_images == 1 else [image] * num_images}
-    return SequenceData(array(VLLM_TOKEN_ID_ARRAY_TYPE, toks)), mm_data
+    return seq_data, mm_data


 @MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_qwen)

vllm/model_executor/models/qwen2_vl.py  (+9 −12)

@@ -22,7 +22,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only Qwen2-VL model compatible with HuggingFace weights."""
-from array import array
 from functools import lru_cache, partial
 from typing import (Iterable, List, Mapping, Optional, Tuple, Type, TypedDict,
                     Union)
@@ -66,8 +65,7 @@
 from vllm.multimodal.base import MultiModalData
 from vllm.multimodal.image import cached_get_image_processor
 from vllm.platforms import current_platform
-from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
-                           SequenceData)
+from vllm.sequence import IntermediateTensors, SequenceData
 from vllm.transformers_utils.processor import get_processor

 logger = init_logger(__name__)
@@ -681,15 +679,14 @@ def dummy_data_for_qwen2_vl(
             "--limit-mm-per-prompt.")

     hf_config = ctx.get_hf_config(Qwen2VLConfig)
-    token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE,
-                      [hf_config.vision_start_token_id])
-    token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE,
-                       [hf_config.image_token_id]) * max_llm_image_tokens
-    token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE,
-                       [hf_config.vision_end_token_id])
-    token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE,
-                       [0]) * (seq_len - max_llm_image_tokens - 2)
-    dummy_seqdata = SequenceData(token_ids)
+
+    dummy_seqdata = SequenceData.from_token_counts(
+        (hf_config.vision_start_token_id, 1),
+        (hf_config.image_token_id, max_llm_image_tokens),
+        (hf_config.vision_end_token_id, 1),
+        (0, seq_len - max_llm_image_tokens - 2),
+    )
+
     dummy_image = Image.new("RGB", (max_resized_width, max_resized_height),
                             color=0)
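
The Qwen2-VL hunk shows why the new helper takes ordered tuples rather than a mapping: the dummy prompt interleaves several distinct ids in a fixed order (vision start, image placeholders, vision end, then padding), which a dict keyed by token id could not express. A small layout sketch with placeholder ids and sizes chosen only for illustration, not Qwen2-VL's real special tokens:

# Illustrative ids and lengths only.
seq_len, max_llm_image_tokens = 16, 6
vision_start, image_tok, vision_end, pad = 1, 2, 3, 0

segments = [
    (vision_start, 1),
    (image_tok, max_llm_image_tokens),
    (vision_end, 1),
    (pad, seq_len - max_llm_image_tokens - 2),
]
token_ids = [tok for tok, count in segments for _ in range(count)]

assert len(token_ids) == seq_len
print(token_ids)  # [1, 2, 2, 2, 2, 2, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0]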

vllm/model_executor/models/siglip.py  (+5 −7)

@@ -2,7 +2,6 @@
 within a vision language model."""

 import math
-from array import array
 from typing import Iterable, List, Optional, Tuple, Union

 import torch
@@ -24,7 +23,7 @@
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.multimodal.utils import (cached_get_tokenizer,
                                    repeat_and_pad_placeholder_tokens)
-from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData
+from vllm.sequence import SequenceData

 try:
     from xformers import ops as xops
@@ -67,11 +66,10 @@ def dummy_seq_data_for_siglip(
     else:
         image_feature_size = image_feature_size_override

-    token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE,
-                      [image_token_id]) * image_feature_size
-    token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE,
-                       [0]) * (seq_len - image_feature_size)
-    return SequenceData(token_ids)
+    return SequenceData.from_token_counts(
+        (image_token_id, image_feature_size * num_images),
+        (0, seq_len - image_feature_size * num_images),
+    )


 def dummy_image_for_siglip(

vllm/model_executor/models/ultravox.py  (+22 −8)

@@ -77,15 +77,11 @@ def get_ultravox_max_audio_tokens(ctx: InputContext):
     return math.ceil(feature_extractor.chunk_length * _AUDIO_TOKENS_PER_SECOND)


-def dummy_data_for_ultravox(
+def dummy_seq_data_for_ultravox(
     ctx: InputContext,
     seq_len: int,
-    mm_counts: Mapping[str, int],
+    audio_count: int,
 ):
-    feature_extractor = whisper_feature_extractor(ctx)
-
-    audio_count = mm_counts["audio"]
-
     audio_placeholder = array(
         VLLM_TOKEN_ID_ARRAY_TYPE,
         [_AUDIO_PLACEHOLDER_TOKEN]) * get_ultravox_max_audio_tokens(ctx)
@@ -96,10 +92,28 @@ def dummy_data_for_ultravox(
     other_token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE,
                             [0]) * (seq_len - len(audio_token_ids))

+    return SequenceData(audio_token_ids + other_token_ids)
+
+
+def dummy_audio_for_ultravox(
+    ctx: InputContext,
+    audio_count: int,
+):
+    feature_extractor = whisper_feature_extractor(ctx)
     audio_and_sr = (np.array([0.0] * feature_extractor.chunk_length), 1)
-    mm_dict = {"audio": [audio_and_sr] * audio_count}
+    return {"audio": [audio_and_sr] * audio_count}
+
+
+def dummy_data_for_ultravox(
+    ctx: InputContext,
+    seq_len: int,
+    mm_counts: Mapping[str, int],
+):
+    audio_count = mm_counts["audio"]
+    seq_data = dummy_seq_data_for_ultravox(ctx, seq_len, audio_count)
+    mm_dict = dummy_audio_for_ultravox(ctx, audio_count)

-    return (SequenceData(audio_token_ids + other_token_ids), mm_dict)
+    return (seq_data, mm_dict)


 def input_mapper_for_ultravox(ctx: InputContext, data: object):

vllm/sequence.py  (+3 −3)

@@ -171,13 +171,13 @@ class SequenceData(msgspec.Struct,
     _mrope_position_delta: Optional[int] = None

     @staticmethod
-    def from_counts(counts_by_token: Mapping[int, int]) -> "SequenceData":
-        if len(counts_by_token) == 0:
+    def from_token_counts(*token_counts: Tuple[int, int]) -> "SequenceData":
+        if len(token_counts) == 0:
            return SequenceData.from_seqs([])

        arrs = [
            array(VLLM_TOKEN_ID_ARRAY_TYPE, [token_id]) * count
-            for token_id, count in counts_by_token.items()
+            for token_id, count in token_counts
        ]

        return SequenceData(reduce(array.__add__, arrs))
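
Based on the definition above, a short usage sketch of the renamed constructor; the token id and lengths are illustrative, and the prompt_token_ids accessor is assumed from the surrounding SequenceData API rather than shown in this diff.

from vllm.sequence import SequenceData

# Dummy prompt of length 10: four (illustrative) image-placeholder tokens,
# then zero-padding for the rest, mirroring the call sites earlier in the diff.
image_token_id, image_feature_size, seq_len = 99, 4, 10

seq_data = SequenceData.from_token_counts(
    (image_token_id, image_feature_size),
    (0, seq_len - image_feature_size),
)
# Expected expansion: [99, 99, 99, 99, 0, 0, 0, 0, 0, 0]
print(seq_data.prompt_token_ids)  # accessor name assumed

# With no arguments the helper falls back to an empty sequence via from_seqs([]).
empty = SequenceData.from_token_counts()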
