@@ -209,14 +209,15 @@ def _run_test(
     # will hurt multiprocessing backend with fork method (the default method).
 
     # max_model_len should be greater than image_feature_size
-    with vllm_runner(model,
-                     dtype=dtype,
-                     max_model_len=8192,
-                     max_num_seqs=3,
-                     tensor_parallel_size=tensor_parallel_size,
-                     distributed_executor_backend=distributed_executor_backend,
-                     limit_mm_per_prompt={"image": _LIMIT_IMAGE_PER_PROMPT
-                                          }) as vllm_model:
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            max_model_len=19212,  # 3 max size images
+            max_num_seqs=3,
+            tensor_parallel_size=tensor_parallel_size,
+            distributed_executor_backend=distributed_executor_backend,
+            limit_mm_per_prompt={"image":
+                                 _LIMIT_IMAGE_PER_PROMPT}) as vllm_model:
         vllm_outputs_per_image = [
             vllm_model.generate_greedy_logprobs(prompts,
                                                 max_tokens,
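
The new `max_model_len` is sized for three maximum-size images rather than being an arbitrary bump. A minimal sketch of the arithmetic, using the 1601 tokens-per-tile and 4-tiles-per-image figures that appear in the encoder-length test cases at the end of this diff:

```python
# Each max-size image occupies 4 tiles of 1601 encoder tokens each.
num_tokens_per_tile = 1601
max_tiles_per_image = 4
tokens_per_max_size_image = max_tiles_per_image * num_tokens_per_tile  # 6404
# Three such images give the new max_model_len.
assert 3 * tokens_per_max_size_image == 19212
```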
@@ -507,7 +508,7 @@ def test_regression(vllm_runner, image_assets, model, dtype, max_tokens,
                      model,
                      dtype=dtype,
                      max_model_len=8192,
-                     max_num_seqs=2,
+                     max_num_seqs=4,
                      tensor_parallel_size=1,
                      limit_mm_per_prompt={"image":
                                           _LIMIT_IMAGE_PER_PROMPT}) as vllm_model:
@@ -552,6 +553,23 @@ def test_regression(vllm_runner, image_assets, model, dtype, max_tokens,
                                             num_logprobs,
                                             images=images)
 
+        # Mixed batch with text and images with different numbers of tiles
+        prompts = [
+            "<|begin_of_text|>Hello!",
+            "<|begin_of_text|>Some text before.<|image|>What is in the image?",  # noqa: E501
+            "<|begin_of_text|>Some text before.<|image|>What is in the image?",  # noqa: E501
+        ]
+        images = [
+            None,
+            [stop_sign],
+            # smaller image must be 2nd for the repro
+            [stop_sign.resize((448, 448))],
+        ]
+        vllm_model.generate_greedy_logprobs(prompts,
+                                            max_tokens,
+                                            num_logprobs,
+                                            images=images)
+
 
 class DummyModel:
     image_token_id = MLLAMA_IMAGE_TOKEN_ID
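
The "different numbers of tiles" in the mixed batch comes from the resize. Assuming Mllama's 560x560 tiles with 14x14 patches (an assumption, but one consistent with the 1601 tokens-per-tile constant used in the tests below), a 448x448 image fits in a single tile while the full-size stop-sign asset spans several:

```python
# Hypothetical tile math, assuming 560x560 tiles and 14x14 patches.
tile_size, patch_size = 560, 14
tokens_per_tile = (tile_size // patch_size) ** 2 + 1  # +1, e.g. a CLS token
assert tokens_per_tile == 1601
# A 448x448 image fits within one 560x560 tile; the original stop-sign
# image needs more, so the two image prompts get different tile counts.
```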
@@ -674,3 +692,26 @@ def test_get_full_text_row_masked_out_mask(input_indices) -> None:
674692 f"full_text_row_masked_out_mask[{ idx } ] must be " \
675693 f"'{ must_be_masked } ' "
676694 idx += 1
695+
696+
697+ @pytest .mark .core_model
698+ @pytest .mark .parametrize ("encoder_seq_lens, num_tiles, expected" , [
699+ ([6404 ], [[4 ]], [6404 ]),
700+ ([0 , 6404 ], [[4 ]], [6404 ]),
701+ ([0 , 1601 , 8005 ], [[1 ], [4 , 1 ]], [1601 , 8005 ]),
702+ ([0 , 19212 , 0 , 3202 ], [[4 , 4 , 4 ], [2 ]], [19212 , 3202 ]),
703+ ])
704+ def test_parse_and_validate_encoder_lens (encoder_seq_lens , num_tiles ,
705+ expected ) -> None :
706+
707+ dummy = DummyModel ()
708+ num_tokens_per_tile = 1601
709+ actual_encoder_seq_lens = MllamaForConditionalGeneration \
710+ ._get_and_validate_encoder_lens (
711+ dummy ,
712+ encoder_seq_lens ,
713+ num_tiles ,
714+ num_tokens_per_tile ,
715+ )
716+ assert actual_encoder_seq_lens == expected , \
717+ f"Expected { expected } but got { actual_encoder_seq_lens } "