import sys
from typing import List, Optional

+ import requests
+ from PIL import Image
+ from transformers import PreTrainedModel, TextStreamer
+ from transformers.models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES
+
from QEfficient.base.common import QEFFCommonLoader
- from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer
+ from QEfficient.utils import check_and_assign_cache_dir, load_hf_processor, load_hf_tokenizer
from QEfficient.utils.logging_utils import logger


+ # TODO: Remove after adding support for VLM compile and execute
+ def execute_vlm_model(
+     qeff_model: PreTrainedModel,
+     model_name: str,
+     image_url: str,
+     image_path: str,
+     prompt: Optional[str] = None,  # type: ignore
+     device_group: Optional[List[int]] = None,
+     local_model_dir: Optional[str] = None,
+     cache_dir: Optional[str] = None,
+     hf_token: Optional[str] = None,
+     generation_len: Optional[int] = None,
+ ):
+     """
+     This method generates output by executing the compiled ``qpc`` on ``Cloud AI 100`` hardware cards.
+     ``Mandatory`` Args:
+         :qeff_model (PreTrainedModel): QEfficient model object.
+         :model_name (str): Hugging Face Model Card name, Example: ``llava-hf/llava-1.5-7b-hf``
+         :image_url (str): Image URL to be used for inference. ``Defaults to None.``
+         :image_path (str): Image path to be used for inference. ``Defaults to None.``
+     ``Optional`` Args:
+         :prompt (str): Sample prompt for the model text generation. ``Defaults to None.``
+         :device_group (List[int]): Device IDs to be used for compilation. If ``len(device_group) > 1``, a multi-card setup is enabled. ``Defaults to None.``
+         :local_model_dir (str): Path to custom model weights and config files. ``Defaults to None.``
+         :cache_dir (str): Cache dir where downloaded HuggingFace files are stored. ``Defaults to None.``
+         :hf_token (str): HuggingFace login token to access private repos. ``Defaults to None.``
+         :generation_len (int): Number of tokens to be generated. ``Defaults to None.``
+     Returns:
+         :dict: Output from the ``AI_100`` runtime.
+     """
+     if not (image_url or image_path):
+         raise ValueError('Neither Image URL nor Image Path is found; provide either "image_url" or "image_path".')
+     raw_image = Image.open(requests.get(image_url, stream=True).raw) if image_url else Image.open(image_path)
+
+     processor = load_hf_processor(
+         pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name),
+         cache_dir=cache_dir,
+         hf_token=hf_token,
+     )
+
+     # Added for the VLM models supported in QEff version 1.20 (mllama and llava)
+     conversation = [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "image"},
+                 {"type": "text", "text": prompt[0]},
+             ],
+         }
+     ]
+
+     # Render the conversation (a list of dicts with "role" and "content" keys) into a chat-formatted prompt string.
+     input_text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
+
+     split_inputs = processor(
+         text=input_text,
+         images=raw_image,
+         return_tensors="pt",
+         add_special_tokens=False,
+     )
+     streamer = TextStreamer(processor.tokenizer)
+     output = qeff_model.generate(
+         inputs=split_inputs,
+         streamer=streamer,
+         device_ids=device_group,
+         generation_len=generation_len,
+     )
+     return output
+
+
def main(
    model_name: str,
    num_cores: int,
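
As an illustration (not part of this change), the new helper above could be invoked directly as sketched below, assuming a qeff_model that has already been loaded and compiled the way main() does further down; the model card, image URL, and prompt text are placeholders.

# Illustrative sketch only: calling the new VLM helper directly.
# Assumes `qeff_model` was already loaded and compiled (as main() does below).
output = execute_vlm_model(
    qeff_model=qeff_model,                    # compiled QEfficient VLM object
    model_name="llava-hf/llava-1.5-7b-hf",    # placeholder model card
    image_url="https://example.com/cat.png",  # placeholder image URL
    image_path=None,
    prompt=["Describe the image"],            # passed as a list; the helper reads prompt[0]
    device_group=[0],
    generation_len=128,
)
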
@@ -65,18 +140,16 @@ def main(
    :allow_mxint8_mdp_io (bool): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.``
    :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.``
    :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.``
+     :kwargs: Pass any compiler option as input. Any flag that is supported by `qaic-exec` can be passed. Params are converted to flags as below:
+         -allocator_dealloc_delay=1 -> -allocator-dealloc-delay=1
+         -qpc_crc=True -> -qpc-crc

    .. code-block:: bash

        python -m QEfficient.cloud.infer OPTIONS

    """
    cache_dir = check_and_assign_cache_dir(local_model_dir, cache_dir)
-     tokenizer = load_hf_tokenizer(
-         pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name),
-         cache_dir=cache_dir,
-         hf_token=hf_token,
-     )

    if "--mxfp6" in sys.argv:
        if args.mxfp6:
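
As an illustration (not part of this change), the param-to-flag conversion documented above amounts to the following sketch, which only restates the two examples from the docstring (underscores become hyphens, and a value of True drops its =True):

# Illustration only: how extra compiler kwargs map to qaic-exec flags.
kwargs = {"allocator_dealloc_delay": 1, "qpc_crc": True}
flags = [
    f"-{key.replace('_', '-')}" if value is True else f"-{key.replace('_', '-')}={value}"
    for key, value in kwargs.items()
]
# flags == ["-allocator-dealloc-delay=1", "-qpc-crc"]
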
@@ -93,6 +166,17 @@ def main(
        local_model_dir=local_model_dir,
    )

+     image_path = kwargs.pop("image_path", None)
+     image_url = kwargs.pop("image_url", None)
+
+     config = qeff_model.model.config
+     architecture = config.architectures[0] if config.architectures else None
+
+     if architecture not in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values() and (
+         kwargs.pop("img_size", None) or image_path or image_url
+     ):
+         logger.warning(f"Skipping image arguments as they are not valid for {architecture}")
+
    #########
    # Compile
    #########
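
As an illustration (not part of this change), the architecture check added above can be reproduced in isolation; the model card below is a placeholder, and the architecture string is read from the checkpoint's config:

# Illustration only: what the VLM architecture check inspects.
from transformers import AutoConfig
from transformers.models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES

config = AutoConfig.from_pretrained("llava-hf/llava-1.5-7b-hf")  # placeholder model card
architecture = config.architectures[0] if config.architectures else None
# For a LLaVA checkpoint this is "LlavaForConditionalGeneration", which appears in
# MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values(), so the image arguments are kept.
is_vlm = architecture in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values()
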
@@ -116,14 +200,34 @@ def main(
    #########
    # Execute
    #########
-     _ = qeff_model.generate(
-         tokenizer,
-         prompts=prompt,
-         device_id=device_group,
-         prompt=prompt,
-         prompts_txt_file_path=prompts_txt_file_path,
-         generation_len=generation_len,
-     )
+     if architecture in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values():
+         exec_info = execute_vlm_model(
+             qeff_model=qeff_model,
+             model_name=model_name,
+             prompt=prompt,
+             image_url=image_url,
+             image_path=image_path,
+             device_group=device_group,
+             local_model_dir=local_model_dir,
+             cache_dir=cache_dir,
+             hf_token=hf_token,
+             generation_len=generation_len,
+         )
+         print(exec_info)
+     else:
+         tokenizer = load_hf_tokenizer(
+             pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name),
+             cache_dir=cache_dir,
+             hf_token=hf_token,
+         )
+         _ = qeff_model.generate(
+             tokenizer,
+             prompts=prompt,
+             device_id=device_group,
+             prompt=prompt,
+             prompts_txt_file_path=prompts_txt_file_path,
+             generation_len=generation_len,
+         )


if __name__ == "__main__":
@@ -219,23 +323,25 @@ def main(
    parser.add_argument(
        "--enable_qnn",
        "--enable-qnn",
-         action="store_true",
+         nargs="?",
+         const=True,
+         type=str,
        default=False,
        help="Enables QNN. Optionally, a configuration file can be provided with [--enable_qnn CONFIG_FILE].\
            If not provided, the default configuration will be used.\
            Sample Config: QEfficient/compile/qnn_config.json",
    )
-     parser.add_argument(
-         "qnn_config",
-         nargs="?",
-         type=str,
-     )

    args, compiler_options = parser.parse_known_args()
+
+     if isinstance(args.enable_qnn, str):
+         args.qnn_config = args.enable_qnn
+         args.enable_qnn = True
+
    compiler_options_dict = {}
    for i in range(0, len(compiler_options)):
        if compiler_options[i].startswith("--"):
-             key = compiler_options[i].lstrip("-")
+             key = compiler_options[i].lstrip("-").replace("-", "_")
            value = (
                compiler_options[i + 1]
                if i + 1 < len(compiler_options) and not compiler_options[i + 1].startswith("-")
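
The final hunk is cut off above, but the --enable_qnn rework is fully visible. As an illustration (not part of this change), and assuming a standard argparse.ArgumentParser carrying only this argument, the resulting behavior is:

# Illustration only: behavior of the reworked --enable_qnn argument.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--enable_qnn", nargs="?", const=True, type=str, default=False)

parser.parse_args([])                            # Namespace(enable_qnn=False)
parser.parse_args(["--enable_qnn"])              # Namespace(enable_qnn=True)
parser.parse_args(["--enable_qnn", "cfg.json"])  # Namespace(enable_qnn='cfg.json')

# The post-parse block in the diff then moves the string form into qnn_config:
args = parser.parse_args(["--enable_qnn", "cfg.json"])  # "cfg.json" is a placeholder path
if isinstance(args.enable_qnn, str):
    args.qnn_config = args.enable_qnn
    args.enable_qnn = True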