Commit 1b3043d

Enabled VLMs via CLI

Signed-off-by: Asmita Goswami <[email protected]>
Parent: 188e751

3 files changed: +105, -21

QEfficient/base/common.py (+17, -5)

```diff
@@ -12,13 +12,21 @@
 QEFFAutoModel provides a common interface for loading the HuggingFace models using either the HF card name of local path of downloaded model.
 """
 
+import importlib
+from collections import OrderedDict
 from typing import Any
 
+import transformers.models.auto.modeling_auto as mapping
 from transformers import AutoConfig
-from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
 
 from QEfficient.base.modeling_qeff import QEFFBaseModel
-from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM
+
+MODEL_CLASS_MAPPING = OrderedDict(
+    [
+        (tuple(mapping.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values()), "QEFFAutoModelForCausalLM"),
+        (tuple(mapping.MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values()), "QEFFAutoModelForImageTextToText"),
+    ]
+)
 
 
 class QEFFCommonLoader:
@@ -42,9 +50,13 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) ->
         config = AutoConfig.from_pretrained(pretrained_model_name_or_path)
         architecture = config.architectures[0] if config.architectures else None
 
-        if architecture in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values():
-            model_class = QEFFAutoModelForCausalLM
-        else:
+        model_class = None
+        for key_tuple, class_name in MODEL_CLASS_MAPPING.items():
+            if architecture in key_tuple:
+                module = importlib.import_module("QEfficient.transformers.models.modeling_auto")
+                model_class = getattr(module, class_name)
+                break
+        if model_class is None:
             raise NotImplementedError(
                 f"Unknown architecture={architecture}, either use specific auto model class for loading the model or raise an issue for support!"
             )
```
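For orientation, here is a minimal, standalone sketch of the dispatch this hunk introduces: the architecture string from `AutoConfig` is matched against tuples of class names taken from the transformers auto-model tables, and the wrapper class is then resolved lazily from `QEfficient.transformers.models.modeling_auto` via `importlib` (presumably to avoid importing that module at top level). Only `transformers` is needed to run it; `"gpt2"` is an illustrative model card.

```python
# Standalone sketch of the architecture -> wrapper-class dispatch above.
# Requires only `transformers`; "gpt2" is an illustrative model card.
from collections import OrderedDict

import transformers.models.auto.modeling_auto as mapping
from transformers import AutoConfig

MODEL_CLASS_MAPPING = OrderedDict(
    [
        (tuple(mapping.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values()), "QEFFAutoModelForCausalLM"),
        (tuple(mapping.MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values()), "QEFFAutoModelForImageTextToText"),
    ]
)

config = AutoConfig.from_pretrained("gpt2")
architecture = config.architectures[0] if config.architectures else None  # "GPT2LMHeadModel"

# First mapping whose architecture tuple contains the model's architecture wins.
class_name = next((name for archs, name in MODEL_CLASS_MAPPING.items() if architecture in archs), None)
print(class_name)  # -> "QEFFAutoModelForCausalLM"
```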

QEfficient/cloud/infer.py (+72, -14)

```diff
@@ -10,6 +10,11 @@
 import sys
 from typing import List, Optional
 
+import requests
+from PIL import Image
+from transformers import AutoConfig, AutoProcessor, TextStreamer
+from transformers.models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES
+
 from QEfficient.base.common import QEFFCommonLoader
 from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer
 from QEfficient.utils.logging_utils import logger
@@ -36,6 +41,7 @@ def main(
     allow_mxint8_mdp_io: bool = False,
     enable_qnn: Optional[bool] = False,
     qnn_config: Optional[str] = None,
+    img_size: Optional[int] = None,
     **kwargs,
 ) -> None:
     """
@@ -65,18 +71,16 @@
     :allow_mxint8_mdp_io (bool): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.``
     :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.``
     :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.``
+    :kwargs: Pass any compiler option as input. Any flag that is supported by `qaic-exec` can be passed. Params are converted to flags as below:
+        -allocator_dealloc_delay=1 -> -allocator-dealloc-delay=1
+        -qpc_crc=True -> -qpc-crc
 
     .. code-block:: bash
 
         python -m QEfficient.cloud.infer OPTIONS
 
     """
     cache_dir = check_and_assign_cache_dir(local_model_dir, cache_dir)
-    tokenizer = load_hf_tokenizer(
-        pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name),
-        cache_dir=cache_dir,
-        hf_token=hf_token,
-    )
 
     if "--mxfp6" in sys.argv:
         if args.mxfp6:
@@ -85,6 +89,9 @@ def main(
     if args.mxint8:
         logger.warning("mxint8 is going to be deprecated in a future release, use -mxint8_kv_cache instead.")
 
+    image_path = kwargs.pop("image_path", None)
+    image_url = kwargs.pop("image_url", None)
+
     qeff_model = QEFFCommonLoader.from_pretrained(
         pretrained_model_name_or_path=model_name,
         cache_dir=cache_dir,
@@ -110,20 +117,70 @@
         allow_mxint8_mdp_io=allow_mxint8_mdp_io,
         enable_qnn=enable_qnn,
         qnn_config=qnn_config,
+        img_size=img_size,
         **kwargs,
     )
 
+    tokenizer = load_hf_tokenizer(
+        pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name),
+        cache_dir=cache_dir,
+        hf_token=hf_token,
+    )
+
     #########
     # Execute
     #########
-    _ = qeff_model.generate(
-        tokenizer,
-        prompts=prompt,
-        device_id=device_group,
-        prompt=prompt,
-        prompts_txt_file_path=prompts_txt_file_path,
-        generation_len=generation_len,
-    )
+    config = AutoConfig.from_pretrained(model_name)
+    architecture = config.architectures[0] if config.architectures else None
+
+    if architecture in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values():
+        processor = AutoProcessor.from_pretrained(model_name, use_fast=False)
+
+        raw_image = None
+        if image_url is not None:
+            raw_image = Image.open(requests.get(image_url, stream=True).raw)
+        elif image_path is not None:
+            raw_image = Image.open(image_path)
+        else:
+            raise FileNotFoundError(
+                'Neither Image URL nor Image Path is found, either provide "image_url" or "image_path"'
+            )
+
+        conversation = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image"},
+                    {"type": "text", "text": prompt[0]},  # Currently accepting only 1 prompt
+                ],
+            },
+        ]
+
+        # Renders the {"role", "content"} conversation into the model's chat-prompt string (tokenize=False returns text, not token ids).
+        input_text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
+
+        split_inputs = processor(
+            text=input_text,
+            images=raw_image,
+            return_tensors="pt",
+            add_special_tokens=False,
+        )
+        streamer = TextStreamer(processor.tokenizer)
+        _ = qeff_model.generate(
+            inputs=split_inputs,
+            streamer=streamer,
+            device_ids=device_group,
+            generation_len=generation_len,
+        )
+    else:
+        _ = qeff_model.generate(
+            tokenizer,
+            prompts=prompt,
+            device_id=device_group,
+            prompt=prompt,
+            prompts_txt_file_path=prompts_txt_file_path,
+            generation_len=generation_len,
+        )
 
 
 if __name__ == "__main__":
```
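The image-text branch above follows the stock Hugging Face multimodal flow: build a chat conversation, render it with the processor's chat template, then let the processor pack text and pixels together. A minimal sketch of that flow with plain `transformers`, assuming a chat-capable VLM checkpoint; the model card and image URL are illustrative placeholders:

```python
# Sketch of the preprocessing performed by the new image-text branch, using only
# transformers/PIL/requests. Model card and URL are illustrative placeholders.
import requests
from PIL import Image
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
raw_image = Image.open(requests.get("https://example.com/cat.png", stream=True).raw)

conversation = [
    {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Describe the image."}]},
]

# tokenize=False returns the formatted prompt *string*; tokenization and image
# preprocessing both happen in the processor call below.
input_text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
inputs = processor(text=input_text, images=raw_image, return_tensors="pt")
# `inputs` now carries input_ids, attention_mask and pixel_values, ready for generate().
```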
```diff
@@ -226,10 +283,11 @@ def main(
             Sample Config: QEfficient/compile/qnn_config.json",
     )
     parser.add_argument(
-        "qnn_config",
+        "--qnn_config",
         nargs="?",
         type=str,
     )
+    parser.add_argument("--img-size", "--img_size", default=None, type=int, required=False, help="Size of Image")
 
     args, compiler_options = parser.parse_known_args()
     compiler_options_dict = {}
```
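For reference, the `kwargs`-to-compiler-flag convention documented in the docstring hunk above can be illustrated with a small helper. This helper is hypothetical, not code from the repo; it only demonstrates the documented `-foo_bar=1 -> -foo-bar=1` and `-flag=True -> -flag` conversion:

```python
# Hypothetical helper illustrating the documented conversion of leftover CLI
# params into qaic-exec flags; not the repo's actual parsing code.
def to_compiler_flags(extra_args):
    flags = []
    for arg in extra_args:
        name, _, value = arg.partition("=")
        name = name.replace("_", "-")
        if value == "True":
            flags.append(name)  # boolean params become bare flags
        else:
            flags.append(f"{name}={value}")
    return flags

print(to_compiler_flags(["-allocator_dealloc_delay=1", "-qpc_crc=True"]))
# ['-allocator-dealloc-delay=1', '-qpc-crc']
```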

QEfficient/transformers/models/modeling_auto.py (+16, -2)

```diff
@@ -603,6 +603,8 @@ def compile(
         )
 
         output_names = self.model.get_output_names(kv_offload=True)
+        vision_onnx_path = compiler_options.get("vision_onnx_path", None)
+        lang_onnx_path = compiler_options.get("lang_onnx_path", None)
 
         specializations, compiler_options = self.model.get_specializations(
             batch_size=batch_size,
```
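Because the two paths are read out of `compiler_options`, a caller can presumably supply pre-exported ONNX graphs for the dual-QPC (kv_offload) path as extra keyword arguments to `compile()`. A hedged usage sketch; the paths are placeholders and the other argument names are assumed to match the repo's usual compile options:

```python
# Hedged usage sketch: compile() collects extra keyword arguments into
# compiler_options, so pre-exported ONNX graphs could be passed like this.
# Paths are placeholders; num_cores/num_devices are assumed compile options.
qeff_model.compile(
    num_cores=16,
    num_devices=1,
    vision_onnx_path="exports/vision_encoder.onnx",  # read via compiler_options.get(...)
    lang_onnx_path="exports/language_model.onnx",
)
```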
```diff
@@ -814,14 +816,17 @@ def kv_offload_generate(
         total_time = decode_end - prefill_start
         total_perf = num_token / total_time
 
-        return CloudAI100ExecInfoNew(
+        exec_info = CloudAI100ExecInfoNew(
             batch_size=batch_size,
             generated_ids=generated_ids,
             perf_metrics=PerfMetrics(
                 prefill_time=prefill_time, decode_perf=decode_perf, total_perf=total_perf, total_time=total_time
             ),
         )
 
+        print(exec_info)
+        return exec_info
+
 
 class _QEFFAutoModelForImageTextToTextSingleQPC(QEFFTransformersBase, MultimodalUtilityMixin):
     _hf_auto_class = AutoModelForImageTextToText
@@ -1104,14 +1109,17 @@ def cloud_ai_100_generate(
         total_time = decode_end - prefill_start
         total_perf = num_token / total_time
 
-        return CloudAI100ExecInfoNew(
+        exec_info = CloudAI100ExecInfoNew(
             batch_size=batch_size,
             generated_ids=generated_ids,
             perf_metrics=PerfMetrics(
                 prefill_time=prefill_time, decode_perf=decode_perf, total_perf=total_perf, total_time=total_time
             ),
         )
 
+        print(exec_info)
+        return exec_info
+
     @property
     def model_hash(self) -> str:
         mhash = hashlib.sha256()
```
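The `exec_info` that both generate paths now print is built from simple timing ratios. A worked example with made-up timings; the decode-only formula below is an assumption for illustration, since the real code times the decode phase separately:

```python
# Worked example of the perf arithmetic above; all timings are made up.
prefill_start, decode_end = 0.00, 2.50  # seconds
prefill_time = 0.40                     # time spent in prefill
num_token = 210                         # total generated tokens

total_time = decode_end - prefill_start                 # 2.50 s (prefill + decode)
total_perf = num_token / total_time                     # 84.0 tok/s end to end
decode_perf = num_token / (total_time - prefill_time)   # 100.0 tok/s (assumed decode-only form)

print(f"prefill={prefill_time:.2f}s decode={decode_perf:.1f}tok/s total={total_perf:.1f}tok/s")
```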
```diff
@@ -1163,6 +1171,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, kv_offload: Optiona
         if kwargs.get("low_cpu_mem_usage", None):
             logger.warning("Updating low_cpu_mem_usage=False")
 
+        if kwargs.pop("continuous_batching", None):
+            raise NotImplementedError("Continuous batching is not supported for image-text-to-text models yet.")
+
         kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
         model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, **kwargs)
         return cls(model, kv_offload=kv_offload, **kwargs)
@@ -1480,6 +1491,9 @@ def compile(
         decode_specialization.update({"num_logits_to_keep": num_speculative_tokens + 1}) if self.is_tlm else ...
         specializations.append(decode_specialization)
 
+        if compiler_options.pop("img_size", None):
+            logger.warning("img_size is not a valid argument for Text-to-Text Model.")
+
         if enable_qnn:
             if compiler_options:
                 logger.warning("Extra arguments to QNN compilation are supported via qnn_config.json only")
```
