 # SPDX-License-Identifier: BSD-3-Clause
 #
 # -----------------------------------------------------------------------------
+from typing import List

 import numpy as np
 import torch

     get_num_layers_from_config,
     get_padding_shape_from_config,
     padding_check_and_fix,
-    get_padding_shape_vlm,
 )
@@ -206,27 +206,108 @@ def update_ort_outputs(self, ort_outputs):


 class InputHandlerVLM:
-    def __init__(self, batch_size, config, image, conversation, processor, prompt, ctx_len, n_layer):
+    def __init__(
+        self, batch_size, config, image, conversation, processor, prompt, prompt_len, ctx_len, max_gen_len, n_layer
+    ):
         self.ctx_len = ctx_len
+        self.prompt_len = prompt_len
+        self.max_gen_len = max_gen_len
         self.config = config
         self.image = image
         self.prompt = prompt
         self.batch_size = batch_size
-        self.padding_shape = get_padding_shape_vlm(config, ctx_len, batch_size)
         self.n_layer = n_layer
         self.processor = processor
         self.conversation = conversation

+    def prepare_pytorch_inputs(self):
+        """
+        Creates the prefill-stage tensor inputs for the PyTorch model.
+
+        Return:
+            :Dict: input_ids, position_ids, past_key_values
+        """
+        inputs = self.processor(images=self.image, text=self.prompt, return_tensors="pt")
+        if hasattr(self.config, "text_config"):
+            txt_cfg = self.config.text_config
+        else:
+            txt_cfg = self.config.llm_config
+
+        num_hidden_layers = txt_cfg.num_hidden_layers
+        num_key_value_heads = txt_cfg.num_key_value_heads
+        head_dim = txt_cfg.hidden_size // txt_cfg.num_attention_heads
+        if hasattr(txt_cfg, "cross_attention_layers"):
+            cross_attention_layers = txt_cfg.cross_attention_layers
+
+        vis_cfg = self.config.vision_config
+        num_patches = (vis_cfg.image_size // vis_cfg.patch_size) ** 2 + 1
+        image_tokens_len = vis_cfg.max_num_tiles * num_patches
+
+        inputs["position_ids"] = inputs.pop("attention_mask").cumsum(1) - 1
+        inputs["past_key_values"] = []
+        for i in range(num_hidden_layers):
+            # Cross-attention layers get KV caches sized to the image tokens (specific to mllama as of now)
+            if hasattr(txt_cfg, "cross_attention_layers") and i in cross_attention_layers:
+                idx = cross_attention_layers.index(i)
+                assert idx == ((i - 3) // 5), f"{i}, {(i - 3) // 5}"
+                inputs["past_key_values"].append(
+                    (
+                        torch.zeros(1, num_key_value_heads, image_tokens_len, head_dim),
+                        torch.zeros(1, num_key_value_heads, image_tokens_len, head_dim),
+                    )
+                )
+            else:
+                inputs["past_key_values"].append(
+                    (
+                        torch.zeros(1, num_key_value_heads, self.ctx_len, head_dim),
+                        torch.zeros(1, num_key_value_heads, self.ctx_len, head_dim),
+                    )
+                )
+
+        return inputs
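
A note on the `assert idx == ((i - 3) // 5)` guard above: in mllama-style configs the cross-attention layers are interleaved at a fixed stride of 5 starting at layer 3, so a layer's position in `cross_attention_layers` is recoverable from its layer index. A minimal sketch, with the layer list assumed from that layout (e.g. a 40-layer text config):

```python
# Assumed mllama layout: cross-attention at layers 3, 8, 13, ...
cross_attention_layers = [3, 8, 13, 18, 23, 28, 33, 38]
for i in cross_attention_layers:
    idx = cross_attention_layers.index(i)
    assert idx == (i - 3) // 5  # list position follows the stride-5, offset-3 pattern
```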
+
     def prepare_vlm_ort_inputs(self):
+        if hasattr(self.config, "text_config"):
+            txt_cfg = self.config.text_config
+        else:
+            txt_cfg = self.config.llm_config
+        num_hidden_layers = txt_cfg.num_hidden_layers
+        num_key_value_heads = txt_cfg.num_key_value_heads
+        head_dim = txt_cfg.hidden_size // txt_cfg.num_attention_heads
+        if hasattr(txt_cfg, "cross_attention_layers"):
+            cross_attention_layers = txt_cfg.cross_attention_layers
+        vis_cfg = self.config.vision_config
+        num_patches = (vis_cfg.image_size // vis_cfg.patch_size) ** 2 + 1
+        image_tokens_len = vis_cfg.max_num_tiles * num_patches
+
         inputs = self.processor(images=self.image, text=self.prompt, return_tensors="np")
         if "attention_mask" in inputs.keys():
-            inputs["position_ids"] = inputs.pop("attention_mask").cumsum(1)
+            inputs["position_ids"] = inputs.pop("attention_mask").cumsum(1) - 1
         inputs["past_key_values"] = []
-        for i in range(self.n_layer[0]):
-            inputs["past_key." + str(i)] = np.zeros((self.padding_shape), dtype=np.float32)
-            inputs["past_value." + str(i)] = np.zeros((self.padding_shape), dtype=np.float32)

-        return inputs
+        # Split processor outputs: image-related tensors feed the vision graph, the rest the language graph
+        vision_inputs = {
+            k: v for k, v in inputs.items() if k in {"pixel_values", "aspect_ratio_ids", "aspect_ratio_mask"}
+        }
+
+        for i in range(num_hidden_layers):
+            if hasattr(txt_cfg, "cross_attention_layers") and i in cross_attention_layers:
+                idx = cross_attention_layers.index(i)
+                assert idx == ((i - 3) // 5), f"{i}, {(i - 3) // 5}"
+                inputs["past_key." + str(i)] = np.zeros(
+                    (self.batch_size, num_key_value_heads, image_tokens_len, head_dim), dtype=np.float32
+                )
+                inputs["past_value." + str(i)] = np.zeros(
+                    (self.batch_size, num_key_value_heads, image_tokens_len, head_dim), dtype=np.float32
+                )
+            else:
+                inputs["past_key." + str(i)] = np.zeros(
+                    (self.batch_size, num_key_value_heads, self.ctx_len, head_dim), dtype=np.float32
+                )
+                inputs["past_value." + str(i)] = np.zeros(
+                    (self.batch_size, num_key_value_heads, self.ctx_len, head_dim), dtype=np.float32
+                )
+        lang_inputs = {k: v for k, v in inputs.items() if k not in vision_inputs}
+        return vision_inputs, lang_inputs
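
The `- 1` added to the `cumsum` call fixes an off-by-one: the cumulative sum of an all-ones attention mask is 1-based, while the model expects 0-based position ids. A quick worked example:

```python
import numpy as np

attention_mask = np.ones((1, 5), dtype=np.int64)
print(attention_mask.cumsum(1))      # [[1 2 3 4 5]] -- old behaviour, off by one
print(attention_mask.cumsum(1) - 1)  # [[0 1 2 3 4]] -- 0-based position ids
```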

     def update_vlm_ort_outputs(self, ort_outputs):
         """
@@ -238,7 +319,6 @@ def update_vlm_ort_outputs(self, ort_outputs):
         Return:
             updated_outputs (Dict): Updated past_key_values, logits, pixel_values
         """
-
         present_key_values = []
         for i in range(self.n_layer[0]):
             if "past_key." + str(i) + "_RetainedState" in ort_outputs:
@@ -252,6 +332,9 @@ def update_vlm_ort_outputs(self, ort_outputs):
         outputs["pixel_values_RetainedState"] = (
             ort_outputs["pixel_values_RetainedState"] if "pixel_values_RetainedState" in ort_outputs else None
         )
+        outputs["image_features_RetainedState"] = (
+            ort_outputs["image_features_RetainedState"] if "image_features_RetainedState" in ort_outputs else None
+        )
         return outputs

     def update_vlm_ort_inputs(self, inputs, ort_outputs):
@@ -265,7 +348,6 @@ def update_vlm_ort_inputs(self, inputs, ort_outputs):
         Return:
             :Dict: Updated input_ids, position_ids, pixel_values and past_key_values
         """
-
         updated_inputs = {}
         updated_inputs["input_ids"] = ort_outputs["logits"].argmax(-1)
         updated_inputs["position_ids"] = np.max(inputs["position_ids"], axis=1, keepdims=True) + 1
@@ -274,4 +356,96 @@ def update_vlm_ort_inputs(self, inputs, ort_outputs):
             updated_inputs["past_value." + str(i)] = ort_outputs["past_key_values"][i * 2 + 1]
         if "pixel_values_RetainedState" in ort_outputs.keys():
             updated_inputs["pixel_values"] = ort_outputs["pixel_values_RetainedState"]
+        if "image_features_RetainedState" in ort_outputs.keys():
+            updated_inputs["image_features"] = ort_outputs["image_features_RetainedState"]
+
+        if "cross_attention_mask" in inputs.keys():
+            bs, _, num_images, img_tiles = inputs["cross_attention_mask"].shape
+            updated_inputs["cross_attention_mask"] = torch.ones(
+                (bs, 1, num_images, img_tiles), dtype=torch.int64
+            ).numpy()
+
+        for k, v in inputs.items():
+            if k not in updated_inputs.keys():
+                updated_inputs[k] = v
         return updated_inputs
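
Taken together, the two update helpers can drive a greedy decode loop against an ONNX Runtime session. The sketch below is illustrative only: the model path, the pre-constructed `handler` (an `InputHandlerVLM`), and the assumption that the exported graph emits `logits` and `past_key.<i>_RetainedState` outputs follow this file's naming conventions rather than a checked-in example; the vision graph is omitted for brevity.

```python
import onnxruntime as ort

session = ort.InferenceSession("language_decoder.onnx")  # hypothetical export path
output_names = [out.name for out in session.get_outputs()]

vision_inputs, lang_inputs = handler.prepare_vlm_ort_inputs()
lang_inputs.pop("past_key_values", None)  # placeholder list, not a real graph input

generated_ids = []
inputs = lang_inputs
for _ in range(handler.max_gen_len):
    raw = dict(zip(output_names, session.run(None, inputs)))
    outputs = handler.update_vlm_ort_outputs(raw)            # repack retained-state KV caches
    inputs = handler.update_vlm_ort_inputs(inputs, outputs)  # next token + advanced position ids
    generated_ids.append(inputs["input_ids"])
```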
+
+
+class InputHandlerInternVL(InputHandlerVLM):
+    def __init__(self, batch_size, config, image, processor, prompt, prompt_len, ctx_len, max_gen_len, n_layer):
+        self.ctx_len = ctx_len
+        self.prompt_len = prompt_len
+        self.max_gen_len = max_gen_len
+        self.config = config
+        self.image = image
+        self.prompt = prompt
+        self.batch_size = batch_size
+        self.n_layer = n_layer
+        self.processor = processor
+
+    def prepare_pytorch_inputs(self):
+        question = "<image>\n" + self.prompt
+        # Load and tile the image; max_num caps the number of tiles
+        pixel_values = self.processor.load_image(self.image, max_num=12)
+        # Chat-template information for prompt preprocessing
+        messages: List[List[str]] = []
+        roles = ("<|im_start|>user\n", "<|im_start|>assistant\n")
+        prompt = self.processor(pixel_values, question, messages, roles)
+        inputs = self.processor.tokenizer(prompt, return_tensors="pt")
+        inputs["pixel_values"] = pixel_values.clone()
+
+        if hasattr(self.config, "text_config"):
+            txt_cfg = self.config.text_config
+        else:
+            txt_cfg = self.config.llm_config
+
+        num_hidden_layers = txt_cfg.num_hidden_layers
+        num_key_value_heads = txt_cfg.num_key_value_heads
+        head_dim = txt_cfg.hidden_size // txt_cfg.num_attention_heads
+
+        inputs["position_ids"] = inputs.pop("attention_mask").cumsum(1) - 1
+        inputs["past_key_values"] = []
+        for i in range(num_hidden_layers):
+            inputs["past_key_values"].append(
+                (
+                    torch.zeros(1, num_key_value_heads, self.ctx_len, head_dim),
+                    torch.zeros(1, num_key_value_heads, self.ctx_len, head_dim),
+                )
+            )
+
+        return inputs
+
+    def prepare_vlm_ort_inputs(self):
+        if hasattr(self.config, "text_config"):
+            txt_cfg = self.config.text_config
+        else:
+            txt_cfg = self.config.llm_config
+        num_hidden_layers = txt_cfg.num_hidden_layers
+        num_key_value_heads = txt_cfg.num_key_value_heads
+        head_dim = txt_cfg.hidden_size // txt_cfg.num_attention_heads
+
+        question = "<image>\n" + self.prompt
+        pixel_values = self.processor.load_image(self.image, max_num=12)
+        # Chat-template information for prompt preprocessing
+        messages: List[List[str]] = []
+        roles = ("<|im_start|>user\n", "<|im_start|>assistant\n")
+        prompt = self.processor(pixel_values, question, messages, roles)
+        inputs = self.processor.tokenizer(prompt, return_tensors="np")
+        inputs["pixel_values"] = pixel_values.numpy()
+
+        if "attention_mask" in inputs.keys():
+            inputs["position_ids"] = inputs.pop("attention_mask").cumsum(1) - 1
+        inputs["past_key_values"] = []
+
+        vision_inputs = {
+            k: v for k, v in inputs.items() if k in {"pixel_values", "aspect_ratio_ids", "aspect_ratio_mask"}
+        }
+
+        for i in range(num_hidden_layers):
+            inputs["past_key." + str(i)] = np.zeros(
+                (self.batch_size, num_key_value_heads, self.ctx_len, head_dim), dtype=np.float32
+            )
+            inputs["past_value." + str(i)] = np.zeros(
+                (self.batch_size, num_key_value_heads, self.ctx_len, head_dim), dtype=np.float32
+            )
+        lang_inputs = {k: v for k, v in inputs.items() if k not in vision_inputs}
+        return vision_inputs, lang_inputs
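
For completeness, a hedged sketch of how the new subclass might be driven. Every constructor argument below is a placeholder; `internvl_processor` stands in for an object exposing `load_image`, a `tokenizer`, and the chat-templating `__call__` that the methods above assume:

```python
# Hypothetical wiring of the InternVL handler; all values are placeholders.
handler = InputHandlerInternVL(
    batch_size=1,
    config=config,                 # InternVL config with a text_config/llm_config
    image=image,                   # image accepted by processor.load_image
    processor=internvl_processor,  # wraps chat templating + tokenizer
    prompt="Describe this image.",
    prompt_len=128,
    ctx_len=4096,
    max_gen_len=256,
    n_layer=n_layer,
)
vision_inputs, lang_inputs = handler.prepare_vlm_ort_inputs()
# vision_inputs -> vision-encoder ONNX graph, lang_inputs -> language-decoder graph
```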