Enabled VLMs via CLI on v1.19.3 #297

Status: Closed · wants to merge 14 commits
9 changes: 5 additions & 4 deletions QEfficient/base/common.py
@@ -15,10 +15,9 @@
from typing import Any

from transformers import AutoConfig
from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES

from QEfficient.base.modeling_qeff import QEFFBaseModel
from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM
from QEfficient.transformers.modeling_utils import model_class_mapping


class QEFFCommonLoader:
@@ -42,8 +41,10 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) ->
config = AutoConfig.from_pretrained(pretrained_model_name_or_path)
architecture = config.architectures[0] if config.architectures else None

if architecture in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values():
model_class = QEFFAutoModelForCausalLM
class_name = model_class_mapping.get(architecture)
if class_name:
module = __import__("QEfficient.transformers.models.modeling_auto")
model_class = getattr(module, class_name)
else:
raise NotImplementedError(
f"Unknown architecture={architecture}, either use specific auto model class for loading the model or raise an issue for support!"
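Note on the loader change above: `__import__("QEfficient.transformers.models.modeling_auto")` returns the top-level `QEfficient` package (that is how `__import__` behaves without a `fromlist`), so the `getattr` lookup relies on the auto classes also being re-exported at the package root. Below is a minimal, self-contained sketch of the same architecture-to-class resolution, using `importlib` and an illustrative subset of the mapping rather than the real table.

import importlib

# Illustrative subset of the architecture -> QEFF class-name table; the real
# mapping is built in QEfficient/transformers/modeling_utils.py (see the diff
# further down) from the transformers auto-model mappings.
MODEL_CLASS_MAPPING = {
    "LlamaForCausalLM": "QEFFAutoModelForCausalLM",
    "MllamaForConditionalGeneration": "QEFFAutoModelForImageTextToText",
}

def resolve_qeff_class(architecture: str):
    # Return the QEFF auto class registered for a given HF architecture name.
    class_name = MODEL_CLASS_MAPPING.get(architecture)
    if class_name is None:
        raise NotImplementedError(f"Unknown architecture={architecture}")
    # importlib.import_module returns the named submodule itself, so this does
    # not depend on a package-root re-export.
    module = importlib.import_module("QEfficient.transformers.models.modeling_auto")
    return getattr(module, class_name)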
6 changes: 5 additions & 1 deletion QEfficient/base/modeling_qeff.py
@@ -23,7 +23,7 @@
from QEfficient.base.pytorch_transforms import PytorchTransform
from QEfficient.compile.qnn_compiler import compile as qnn_compile
from QEfficient.generation.cloud_infer import QAICInferenceSession
from QEfficient.utils import constants
from QEfficient.utils import constants, dump_qconfig
from QEfficient.utils._utils import load_json
from QEfficient.utils.cache import QEFF_HOME, to_hashable

@@ -211,6 +211,7 @@ def _export(
self.onnx_path = onnx_path
return onnx_path

@dump_qconfig
def _compile(
self,
onnx_path: Optional[str] = None,
@@ -336,8 +337,10 @@ def _compile(
)

self.qpc_path = qpc_path

return qpc_path

@dump_qconfig
def _qnn_compile(
self,
onnx_path: Optional[str] = None,
@@ -435,4 +438,5 @@ def _qnn_compile(
)

self.qpc_path = qpc_path

return qpc_path
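The `@dump_qconfig` decorator applied to `_compile` and `_qnn_compile` lives in `QEfficient.utils`; its implementation is not shown in this diff. The sketch below is only an assumption of the general pattern: run the wrapped compile, then write the call's parameters (plus the model config exposed through the new `get_model_config` properties) to a JSON file next to the returned QPC, without ever failing the compilation itself.

import functools
import json
import os

def dump_qconfig_sketch(func):
    # Hypothetical stand-in for QEfficient.utils.dump_qconfig; the real decorator
    # may record different fields and use a different file name.
    @functools.wraps(func)
    def wrapper(self, *args, **kwargs):
        qpc_path = func(self, *args, **kwargs)
        try:
            qconfig = {
                "compile_kwargs": {k: str(v) for k, v in kwargs.items()},
                "model_config": getattr(self, "get_model_config", {}),
            }
            with open(os.path.join(os.path.dirname(str(qpc_path)), "qconfig.json"), "w") as fp:
                json.dump(qconfig, fp, indent=4)
        except Exception as exc:
            # Bookkeeping must never break compilation.
            print(f"Skipping qconfig dump: {exc}")
        return qpc_path
    return wrapper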
87 changes: 68 additions & 19 deletions QEfficient/cloud/infer.py
@@ -10,8 +10,13 @@
import sys
from typing import List, Optional

import requests
from PIL import Image
from transformers import AutoProcessor, TextStreamer
from transformers.models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES

from QEfficient.base.common import QEFFCommonLoader
from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer
from QEfficient.utils import check_and_assign_cache_dir, constants, load_hf_tokenizer
from QEfficient.utils.logging_utils import logger


@@ -65,18 +70,16 @@ def main(
:allow_mxint8_mdp_io (bool): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.``
:enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.``
:qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.``
:kwargs: Pass any compiler option as input. Any flag that is supported by `qaic-exec` can be passed. Params are converted to flags as below:
-allocator_dealloc_delay=1 -> -allocator-dealloc-delay=1
-qpc_crc=True -> -qpc-crc

.. code-block:: bash

python -m QEfficient.cloud.infer OPTIONS

"""
cache_dir = check_and_assign_cache_dir(local_model_dir, cache_dir)
tokenizer = load_hf_tokenizer(
pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name),
cache_dir=cache_dir,
hf_token=hf_token,
)

if "--mxfp6" in sys.argv:
if args.mxfp6:
@@ -93,6 +96,16 @@
local_model_dir=local_model_dir,
)

image_path = kwargs.pop("image_path", None)
image_url = kwargs.pop("image_url", None)

config = qeff_model.model.config
architecture = config.architectures[0] if config.architectures else None
if architecture not in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values():
img_size = kwargs.pop("img_size", None)
if img_size or image_path or image_url:
logger.warning(f"Skipping image arguments as they are not valid for {architecture}")

#########
# Compile
#########
@@ -116,14 +129,47 @@
#########
# Execute
#########
_ = qeff_model.generate(
tokenizer,
prompts=prompt,
device_id=device_group,
prompt=prompt,
prompts_txt_file_path=prompts_txt_file_path,
generation_len=generation_len,
)
if architecture in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values():
processor = AutoProcessor.from_pretrained(model_name, use_fast=False)

if not (image_url or image_path):
raise ValueError('Neither Image URL nor Image Path is found, either provide "image_url" or "image_path"')
raw_image = Image.open(requests.get(image_url, stream=True).raw) if image_url else Image.open(image_path)

conversation = constants.Constants.conversation
conversation[0]["content"][1].update({"text": prompt[0]}) # Currently accepting only 1 prompt

# Converts a list of dictionaries with `"role"` and `"content"` keys to a list of token ids.
input_text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)

split_inputs = processor(
text=input_text,
images=raw_image,
return_tensors="pt",
add_special_tokens=False,
)
streamer = TextStreamer(processor.tokenizer)
output = qeff_model.generate(
inputs=split_inputs,
streamer=streamer,
device_ids=device_group,
generation_len=generation_len,
)
print(output)
Review comment (Contributor):
We should not be printing it this way in infer. Can we set a verbose level and print it accordingly from the Auto classes themselves, @ochougul @quic-amitraj?

else:
tokenizer = load_hf_tokenizer(
pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name),
cache_dir=cache_dir,
hf_token=hf_token,
)
_ = qeff_model.generate(
tokenizer,
prompts=prompt,
device_id=device_group,
prompt=prompt,
prompts_txt_file_path=prompts_txt_file_path,
generation_len=generation_len,
)


if __name__ == "__main__":
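For reference, the new image-text execute path can also be driven outside the CLI. The sketch below mirrors the branch above with standard `transformers` preprocessing; the checkpoint name, image URL, and the `QEFFAutoModelForImageTextToText.from_pretrained`/`compile` calls are illustrative assumptions, while the `generate(...)` keywords are taken verbatim from this diff.

import requests
from PIL import Image
from transformers import AutoProcessor, TextStreamer

from QEfficient import QEFFAutoModelForImageTextToText  # assumed package-root export

model_name = "llava-hf/llava-1.5-7b-hf"       # placeholder VLM checkpoint
image_url = "https://example.com/sample.png"  # placeholder image
prompt = "Describe the image."

processor = AutoProcessor.from_pretrained(model_name, use_fast=False)
raw_image = Image.open(requests.get(image_url, stream=True).raw)

# Build a single-turn conversation and render it through the chat template.
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": prompt},
        ],
    }
]
input_text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
inputs = processor(text=input_text, images=raw_image, return_tensors="pt", add_special_tokens=False)

qeff_model = QEFFAutoModelForImageTextToText.from_pretrained(model_name)
qeff_model.compile(num_devices=1)  # compile arguments assumed; see the class docs for the full set
output = qeff_model.generate(
    inputs=inputs,
    streamer=TextStreamer(processor.tokenizer),
    device_ids=[0],
    generation_len=128,
)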
@@ -220,18 +266,21 @@ def main(
"--enable_qnn",
"--enable-qnn",
action="store_true",
nargs="?",
const=True,
type=str,
default=False,
help="Enables QNN. Optionally, a configuration file can be provided with [--enable_qnn CONFIG_FILE].\
If not provided, the default configuration will be used.\
Sample Config: QEfficient/compile/qnn_config.json",
)
parser.add_argument(
"qnn_config",
nargs="?",
type=str,
)

args, compiler_options = parser.parse_known_args()

if isinstance(args.enable_qnn, str):
args.qnn_config = args.enable_qnn
args.enable_qnn = True

compiler_options_dict = {}
for i in range(0, len(compiler_options)):
if compiler_options[i].startswith("--"):
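The reworked `--enable_qnn` argument uses `nargs="?"` with `const=True` so the flag works both bare (enable QNN with the default config) and with an explicit config path, and the post-parse `isinstance(..., str)` check splits the two cases into `enable_qnn`/`qnn_config`. A standalone sketch of just that parsing pattern (the example arguments are illustrative):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--enable_qnn",
    nargs="?",      # the value after the flag is optional
    const=True,     # bare `--enable_qnn` -> True
    type=str,       # `--enable_qnn cfg.json` -> "cfg.json"
    default=False,  # flag absent -> False
)
args, extra = parser.parse_known_args(["--enable_qnn", "my_qnn_config.json"])

qnn_config = None
if isinstance(args.enable_qnn, str):  # a config path was supplied with the flag
    qnn_config = args.enable_qnn
    args.enable_qnn = True

print(args.enable_qnn, qnn_config)  # True my_qnn_config.json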
4 changes: 4 additions & 0 deletions QEfficient/peft/auto.py
@@ -107,6 +107,10 @@ def model_hash(self) -> str:
mhash = mhash.hexdigest()[:16]
return mhash

@property
def get_model_config(self) -> dict:
return self.model.get_base_model().config.__dict__

def load_adapter(self, model_id: str, adapter_name: str):
"""Loads a new adapter from huggingface hub or local path

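Both PEFT wrappers now expose the wrapped Hugging Face config as a plain dict through `get_model_config`, which lets a generic consumer (for example the qconfig dump above) serialize or hash the configuration without knowing the wrapper type. A small hedged example of such a consumer; `config_hash` is a hypothetical helper, not part of this PR:

import hashlib
import json

def config_hash(qeff_model) -> str:
    # Works with any wrapper exposing the `get_model_config` property added in this PR.
    cfg = qeff_model.get_model_config
    blob = json.dumps(cfg, sort_keys=True, default=str).encode()
    return hashlib.sha256(blob).hexdigest()[:16]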
4 changes: 4 additions & 0 deletions QEfficient/peft/lora/auto.py
@@ -90,6 +90,10 @@ def model_hash(self) -> str:
mhash = mhash.hexdigest()[:16]
return mhash

@property
def get_model_config(self) -> dict:
return self.model.model.config.__dict__

def download_adapter(
self,
adapter_model_id: str,
10 changes: 10 additions & 0 deletions QEfficient/transformers/modeling_utils.py
@@ -10,6 +10,7 @@

import torch
import torch.nn as nn
import transformers.models.auto.modeling_auto as mapping
from transformers.models.codegen.modeling_codegen import (
CodeGenAttention,
CodeGenBlock,
@@ -272,6 +273,15 @@
}


model_class_mapping = {
**{architecture: "QEFFAutoModelForCausalLM" for architecture in mapping.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values()},
**{
architecture: "QEFFAutoModelForImageTextToText"
for architecture in mapping.MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values()
},
}


def _prepare_cross_attention_mask(
cross_attention_mask: torch.Tensor,
num_vision_tokens: int,
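`model_class_mapping` is built from the `transformers` auto-model name tables, so the set of architectures routed to each QEFF class tracks the installed `transformers` version. A quick way to inspect those tables (assumes a `transformers` release recent enough to define the image-text-to-text mapping):

from transformers.models.auto.modeling_auto import (
    MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
    MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES,
)

causal_archs = set(MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values())
vlm_archs = set(MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values())

print(f"{len(causal_archs)} architectures -> QEFFAutoModelForCausalLM")
print(f"{len(vlm_archs)} architectures -> QEFFAutoModelForImageTextToText")
print("LlavaForConditionalGeneration" in vlm_archs)  # expected True on recent transformers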