Commit 1b3043d

Enabled VLMs via CLI

Signed-off-by: Asmita Goswami <[email protected]>
Parent: 188e751

3 files changed: +105, -21

QEfficient/base/common.py (+17, -5)

```diff
@@ -12,13 +12,21 @@
 QEFFAutoModel provides a common interface for loading the HuggingFace models using either the HF card name of local path of downloaded model.
 """
 
+import importlib
+from collections import OrderedDict
 from typing import Any
 
+import transformers.models.auto.modeling_auto as mapping
 from transformers import AutoConfig
-from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
 
 from QEfficient.base.modeling_qeff import QEFFBaseModel
-from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM
+
+MODEL_CLASS_MAPPING = OrderedDict(
+    [
+        (tuple(mapping.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values()), "QEFFAutoModelForCausalLM"),
+        (tuple(mapping.MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values()), "QEFFAutoModelForImageTextToText"),
+    ]
+)
 
 
 class QEFFCommonLoader:
@@ -42,9 +50,13 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) ->
         config = AutoConfig.from_pretrained(pretrained_model_name_or_path)
         architecture = config.architectures[0] if config.architectures else None
 
-        if architecture in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values():
-            model_class = QEFFAutoModelForCausalLM
-        else:
+        model_class = None
+        for key_tuple, class_name in MODEL_CLASS_MAPPING.items():
+            if architecture in key_tuple:
+                module = importlib.import_module("QEfficient.transformers.models.modeling_auto")
+                model_class = getattr(module, class_name)
+                break
+        if model_class is None:
             raise NotImplementedError(
                 f"Unknown architecture={architecture}, either use specific auto model class for loading the model or raise an issue for support!"
             )
```
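For orientation, here is a minimal, standalone sketch of the dispatch this hunk introduces: the architecture string from `AutoConfig` is matched against tuples of class names taken from the transformers auto-model tables, and the wrapper class is then resolved lazily from `QEfficient.transformers.models.modeling_auto` via `importlib` (presumably to avoid importing that module at top level). Only `transformers` is needed to run it; `"gpt2"` is an illustrative model card.

```python
# Standalone sketch of the architecture -> wrapper-class dispatch above.
# Requires only `transformers`; "gpt2" is an illustrative model card.
from collections import OrderedDict

import transformers.models.auto.modeling_auto as mapping
from transformers import AutoConfig

MODEL_CLASS_MAPPING = OrderedDict(
    [
        (tuple(mapping.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values()), "QEFFAutoModelForCausalLM"),
        (tuple(mapping.MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values()), "QEFFAutoModelForImageTextToText"),
    ]
)

config = AutoConfig.from_pretrained("gpt2")
architecture = config.architectures[0] if config.architectures else None  # "GPT2LMHeadModel"

# First mapping whose architecture tuple contains the model's architecture wins.
class_name = next((name for archs, name in MODEL_CLASS_MAPPING.items() if architecture in archs), None)
print(class_name)  # -> "QEFFAutoModelForCausalLM"
```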

QEfficient/cloud/infer.py (+72, -14)

```diff
@@ -10,6 +10,11 @@
 import sys
 from typing import List, Optional
 
+import requests
+from PIL import Image
+from transformers import AutoConfig, AutoProcessor, TextStreamer
+from transformers.models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES
+
 from QEfficient.base.common import QEFFCommonLoader
 from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer
 from QEfficient.utils.logging_utils import logger
@@ -36,6 +41,7 @@ def main(
     allow_mxint8_mdp_io: bool = False,
     enable_qnn: Optional[bool] = False,
     qnn_config: Optional[str] = None,
+    img_size: Optional[int] = None,
     **kwargs,
 ) -> None:
     """
@@ -65,18 +71,16 @@
     :allow_mxint8_mdp_io (bool): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.``
     :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.``
     :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.``
+    :kwargs: Pass any compiler option as input. Any flag that is supported by `qaic-exec` can be passed. Params are converted to flags as below:
+        -allocator_dealloc_delay=1 -> -allocator-dealloc-delay=1
+        -qpc_crc=True -> -qpc-crc
 
     .. code-block:: bash
 
         python -m QEfficient.cloud.infer OPTIONS
 
     """
     cache_dir = check_and_assign_cache_dir(local_model_dir, cache_dir)
-    tokenizer = load_hf_tokenizer(
-        pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name),
-        cache_dir=cache_dir,
-        hf_token=hf_token,
-    )
 
     if "--mxfp6" in sys.argv:
         if args.mxfp6:
@@ -85,6 +89,9 @@ def main(
     if args.mxint8:
         logger.warning("mxint8 is going to be deprecated in a future release, use -mxint8_kv_cache instead.")
 
+    image_path = kwargs.pop("image_path", None)
+    image_url = kwargs.pop("image_url", None)
+
     qeff_model = QEFFCommonLoader.from_pretrained(
         pretrained_model_name_or_path=model_name,
         cache_dir=cache_dir,
@@ -110,20 +117,70 @@
         allow_mxint8_mdp_io=allow_mxint8_mdp_io,
         enable_qnn=enable_qnn,
         qnn_config=qnn_config,
+        img_size=img_size,
         **kwargs,
     )
 
+    tokenizer = load_hf_tokenizer(
+        pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name),
+        cache_dir=cache_dir,
+        hf_token=hf_token,
+    )
+
     #########
     # Execute
     #########
-    _ = qeff_model.generate(
-        tokenizer,
-        prompts=prompt,
-        device_id=device_group,
-        prompt=prompt,
-        prompts_txt_file_path=prompts_txt_file_path,
-        generation_len=generation_len,
-    )
+    config = AutoConfig.from_pretrained(model_name)
+    architecture = config.architectures[0] if config.architectures else None
+
+    if architecture in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values():
+        processor = AutoProcessor.from_pretrained(model_name, use_fast=False)
+
+        raw_image = None
+        if image_url is not None:
+            raw_image = Image.open(requests.get(image_url, stream=True).raw)
+        elif image_path is not None:
+            raw_image = Image.open(image_path)
+        else:
+            raise FileNotFoundError(
+                'Neither Image URL nor Image Path is found, either provide "image_url" or "image_path"'
+            )
+
+        conversation = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image"},
+                    {"type": "text", "text": prompt[0]},  # Currently accepting only 1 prompt
+                ],
+            },
+        ]
+
+        # Renders the {"role", "content"} conversation into the model's chat-prompt string (tokenize=False returns text, not token ids).
+        input_text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
+
+        split_inputs = processor(
+            text=input_text,
+            images=raw_image,
+            return_tensors="pt",
+            add_special_tokens=False,
+        )
+        streamer = TextStreamer(processor.tokenizer)
+        _ = qeff_model.generate(
+            inputs=split_inputs,
+            streamer=streamer,
+            device_ids=device_group,
+            generation_len=generation_len,
+        )
+    else:
+        _ = qeff_model.generate(
+            tokenizer,
+            prompts=prompt,
+            device_id=device_group,
+            prompt=prompt,
+            prompts_txt_file_path=prompts_txt_file_path,
+            generation_len=generation_len,
+        )
 
 
 if __name__ == "__main__":
```
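The image-text branch above follows the stock Hugging Face multimodal flow: build a chat conversation, render it with the processor's chat template, then let the processor pack text and pixels together. A minimal sketch of that flow with plain `transformers`, assuming a chat-capable VLM checkpoint; the model card and image URL are illustrative placeholders:

```python
# Sketch of the preprocessing performed by the new image-text branch, using only
# transformers/PIL/requests. Model card and URL are illustrative placeholders.
import requests
from PIL import Image
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
raw_image = Image.open(requests.get("https://example.com/cat.png", stream=True).raw)

conversation = [
    {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Describe the image."}]},
]

# tokenize=False returns the formatted prompt *string*; tokenization and image
# preprocessing both happen in the processor call below.
input_text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
inputs = processor(text=input_text, images=raw_image, return_tensors="pt")
# `inputs` now carries input_ids, attention_mask and pixel_values, ready for generate().
```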
```diff
@@ -226,10 +283,11 @@ def main(
             Sample Config: QEfficient/compile/qnn_config.json",
     )
     parser.add_argument(
-        "qnn_config",
+        "--qnn_config",
         nargs="?",
         type=str,
     )
+    parser.add_argument("--img-size", "--img_size", default=None, type=int, required=False, help="Size of Image")
 
     args, compiler_options = parser.parse_known_args()
     compiler_options_dict = {}
```
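For reference, the `kwargs`-to-compiler-flag convention documented in the docstring hunk above can be illustrated with a small helper. This helper is hypothetical, not code from the repo; it only demonstrates the documented `-foo_bar=1 -> -foo-bar=1` and `-flag=True -> -flag` conversion:

```python
# Hypothetical helper illustrating the documented conversion of leftover CLI
# params into qaic-exec flags; not the repo's actual parsing code.
def to_compiler_flags(extra_args):
    flags = []
    for arg in extra_args:
        name, _, value = arg.partition("=")
        name = name.replace("_", "-")
        if value == "True":
            flags.append(name)  # boolean params become bare flags
        else:
            flags.append(f"{name}={value}")
    return flags

print(to_compiler_flags(["-allocator_dealloc_delay=1", "-qpc_crc=True"]))
# ['-allocator-dealloc-delay=1', '-qpc-crc']
```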

QEfficient/transformers/models/modeling_auto.py (+16, -2)

```diff
@@ -603,6 +603,8 @@ def compile(
         )
 
         output_names = self.model.get_output_names(kv_offload=True)
+        vision_onnx_path = compiler_options.get("vision_onnx_path", None)
+        lang_onnx_path = compiler_options.get("lang_onnx_path", None)
 
         specializations, compiler_options = self.model.get_specializations(
             batch_size=batch_size,
```
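Because the two paths are read out of `compiler_options`, a caller can presumably supply pre-exported ONNX graphs for the dual-QPC (kv_offload) path as extra keyword arguments to `compile()`. A hedged usage sketch; the paths are placeholders and the other argument names are assumed to match the repo's usual compile options:

```python
# Hedged usage sketch: compile() collects extra keyword arguments into
# compiler_options, so pre-exported ONNX graphs could be passed like this.
# Paths are placeholders; num_cores/num_devices are assumed compile options.
qeff_model.compile(
    num_cores=16,
    num_devices=1,
    vision_onnx_path="exports/vision_encoder.onnx",  # read via compiler_options.get(...)
    lang_onnx_path="exports/language_model.onnx",
)
```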
```diff
@@ -814,14 +816,17 @@ def kv_offload_generate(
         total_time = decode_end - prefill_start
         total_perf = num_token / total_time
 
-        return CloudAI100ExecInfoNew(
+        exec_info = CloudAI100ExecInfoNew(
             batch_size=batch_size,
             generated_ids=generated_ids,
             perf_metrics=PerfMetrics(
                 prefill_time=prefill_time, decode_perf=decode_perf, total_perf=total_perf, total_time=total_time
             ),
         )
 
+        print(exec_info)
+        return exec_info
+
 
 class _QEFFAutoModelForImageTextToTextSingleQPC(QEFFTransformersBase, MultimodalUtilityMixin):
     _hf_auto_class = AutoModelForImageTextToText
@@ -1104,14 +1109,17 @@ def cloud_ai_100_generate(
         total_time = decode_end - prefill_start
         total_perf = num_token / total_time
 
-        return CloudAI100ExecInfoNew(
+        exec_info = CloudAI100ExecInfoNew(
             batch_size=batch_size,
             generated_ids=generated_ids,
             perf_metrics=PerfMetrics(
                 prefill_time=prefill_time, decode_perf=decode_perf, total_perf=total_perf, total_time=total_time
             ),
         )
 
+        print(exec_info)
+        return exec_info
+
     @property
     def model_hash(self) -> str:
         mhash = hashlib.sha256()
```
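The `exec_info` that both generate paths now print is built from simple timing ratios. A worked example with made-up timings; the decode-only formula below is an assumption for illustration, since the real code times the decode phase separately:

```python
# Worked example of the perf arithmetic above; all timings are made up.
prefill_start, decode_end = 0.00, 2.50  # seconds
prefill_time = 0.40                     # time spent in prefill
num_token = 210                         # total generated tokens

total_time = decode_end - prefill_start                 # 2.50 s (prefill + decode)
total_perf = num_token / total_time                     # 84.0 tok/s end to end
decode_perf = num_token / (total_time - prefill_time)   # 100.0 tok/s (assumed decode-only form)

print(f"prefill={prefill_time:.2f}s decode={decode_perf:.1f}tok/s total={total_perf:.1f}tok/s")
```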
```diff
@@ -1163,6 +1171,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, kv_offload: Optiona
         if kwargs.get("low_cpu_mem_usage", None):
             logger.warning("Updating low_cpu_mem_usage=False")
 
+        if kwargs.pop("continuous_batching", None):
+            raise NotImplementedError("Continuous batching is not supported for image-text-to-text models yet.")
+
         kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
         model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, **kwargs)
         return cls(model, kv_offload=kv_offload, **kwargs)
@@ -1480,6 +1491,9 @@ def compile(
         decode_specialization.update({"num_logits_to_keep": num_speculative_tokens + 1}) if self.is_tlm else ...
         specializations.append(decode_specialization)
 
+        if compiler_options.pop("img_size", None):
+            logger.warning("img_size is not a valid argument for Text-to-Text Model.")
+
         if enable_qnn:
             if compiler_options:
                 logger.warning("Extra arguments to QNN compilation are supported via qnn_config.json only")
```
