Commit eff9472

Gemma 3 minor fixes (#476)

CI enablement and other minor fixes for Gemma3.

Signed-off-by: Ann Kuruvilla <[email protected]>

1 parent 61b1445 · commit eff9472

File tree

7 files changed: +39 −43 lines

QEfficient/transformers/cache_utils.py

Lines changed: 0 additions & 2 deletions

@@ -288,7 +288,6 @@ def from_legacy_cache(
 class QEffHybridCache(HybridCache):
     def __init__(self, config, batch_size, max_cache_len):
         super().__init__(config, batch_size, max_cache_len=max_cache_len)
-        # breakpoint()
         self.key_cache: List[torch.Tensor] = []
         self.value_cache: List[torch.Tensor] = []

@@ -327,7 +326,6 @@ def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor], Tuple[torch.Tensor]]:
         """Converts the `DynamicCache` instance into the its equivalent in the legacy cache format. Used for
         backward compatibility."""
         legacy_cache = ()
-        # breakpoint()
         for layer_idx in range(len(self)):
             legacy_cache += ((self.key_cache[layer_idx], self.value_cache[layer_idx]),)
         return legacy_cache

QEfficient/transformers/models/gemma3/modeling_gemma3.py

Lines changed: 2 additions & 3 deletions

@@ -238,9 +238,9 @@ def forward(
         )
         kv_seq_len = past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
         if self.is_sliding:
-            cos, sin = self.rotary_emb_local(value_states, seq_len=constants.GEMMA3_MAX_POSITION_EMBEDDINGS)
+            cos, sin = self.rotary_emb_local(value_states, seq_len=self.config.max_position_embeddings)
         else:
-            cos, sin = self.rotary_emb(value_states, seq_len=constants.GEMMA3_MAX_POSITION_EMBEDDINGS)
+            cos, sin = self.rotary_emb(value_states, seq_len=self.config.max_position_embeddings)

         query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
         if past_key_value is not None:

@@ -687,7 +687,6 @@ def get_specializations(
                 "mm_tokens_per_image": mm_tokens_per_image,
             },
         ]
-
         specializations = {}

         if kv_offload:
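The rotary-embedding change swaps the hard-coded `constants.GEMMA3_MAX_POSITION_EMBEDDINGS` for the checkpoint's own `config.max_position_embeddings`, so the cos/sin tables are sized for whatever context length the loaded model declares. A minimal sketch of the pattern, assuming a simplified rotary module (`SimpleRotaryEmbedding` and its signature are illustrative, not the QEfficient class):

import torch

class SimpleRotaryEmbedding(torch.nn.Module):
    """Illustrative rotary embedding whose cache length comes from the config."""

    def __init__(self, dim: int, max_position_embeddings: int, base: float = 10000.0):
        super().__init__()
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
        # Size the cos/sin tables from the model config rather than a global
        # constant, so every checkpoint gets tables matching its context length.
        t = torch.arange(max_position_embeddings).float()
        freqs = torch.outer(t, inv_freq)
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos(), persistent=False)
        self.register_buffer("sin_cached", emb.sin(), persistent=False)

    def forward(self, x: torch.Tensor, seq_len: int):
        return self.cos_cached[:seq_len].to(x.dtype), self.sin_cached[:seq_len].to(x.dtype)

# Hypothetical usage mirroring the patched call site:
# rotary = SimpleRotaryEmbedding(dim=head_dim, max_position_embeddings=config.max_position_embeddings)
# cos, sin = rotary(value_states, seq_len=config.max_position_embeddings)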

README.md

Lines changed: 1 addition & 0 deletions

@@ -7,6 +7,7 @@

 *Latest news* :fire: <br>
 - [06/2025] Added support for Llama4 Multi-Model [meta-llama/Llama-4-Scout-17B-16E-Instruct](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct)
+- [06/2025] Added support for Gemma3 Multi-Modal-Model [google/gemma-3-4b-it](https://huggingface.co/google/gemma-3-4b-it)
 - [06/2025] Added support of model `hpcai-tech/grok-1` [hpcai-tech/grok-1](https://huggingface.co/hpcai-tech/grok-1)
 - [04/2025] Added support of model `ibm-granite/granite-vision-3.2-2b`[ibm-granite/granite-vision-3.2-2b](https://huggingface.co/ibm-granite/granite-vision-3.2-2b)
 - [03/2025] Added support for swiftkv model [Snowflake/Llama-3.1-SwiftKV-8B-Instruct](https://huggingface.co/Snowflake/Llama-3.1-SwiftKV-8B-Instruct)

docs/source/validate.md

Lines changed: 2 additions & 0 deletions

@@ -63,6 +63,8 @@
 | **MllamaForConditionalGeneration** | Llama 3.2 | [meta-llama/Llama-3.2-11B-Vision Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct)<br>[meta-llama/Llama-3.2-90B-Vision](https://huggingface.co/meta-llama/Llama-3.2-90B-Vision) |
 |**LlavaNextForConditionalGeneration** | Granite Vision | [ibm-granite/granite-vision-3.2-2b](https://huggingface.co/ibm-granite/granite-vision-3.2-2b)
 |**Llama4ForConditionalGeneration** | Llama-4-Scout | [Llama-4-Scout-17B-16E-Instruct](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct)
+|**Gemma3ForConditionalGeneration** | Gemma3 | [google/gemma-3-4b-it](https://huggingface.co/google/gemma-3-4b-it)
+
 ### Audio Models
 (Automatic Speech Recognition) - Transcription Task
 **QEff Auto Class:** `QEFFAutoModelForSpeechSeq2Seq`

examples/gemma3_example/fp32_mm.yaml

Lines changed: 2 additions & 2 deletions

@@ -370,7 +370,7 @@ FP32NodeInstanceNames:
   - /language_model/model/layers.4/self_attn/Mul_6_output_0
   - /language_model/model/layers.4/self_attn/Mul_7_output_0
   - /language_model/model/layers.4/self_attn/Mul_8_output_0
-  - /language_model/model/layers.4/self_attn/Mul_9_output_0 [274/1312]
+  - /language_model/model/layers.4/self_attn/Mul_9_output_0
   - /language_model/model/layers.5/self_attn/Mul_output_0
   - /language_model/model/layers.5/self_attn/Mul_1_output_0
   - /language_model/model/layers.5/self_attn/Mul_2_output_0

@@ -415,7 +415,7 @@ FP32NodeInstanceNames:
   - /language_model/model/layers.9/self_attn/Mul_1_output_0
   - /language_model/model/layers.9/self_attn/Mul_2_output_0
   - /language_model/model/layers.9/self_attn/Mul_3_output_0
-  - /language_model/model/layers.9/self_attn/Mul_4_output_0 [229/1312]
+  - /language_model/model/layers.9/self_attn/Mul_4_output_0
   - /language_model/model/layers.9/self_attn/Mul_5_output_0
   - /language_model/model/layers.9/self_attn/Mul_6_output_0
   - /language_model/model/layers.9/self_attn/Mul_7_output_0
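The bracketed fragments being deleted (`[274/1312]`, `[229/1312]`) look like terminal-pager position markers accidentally pasted into the YAML, which would corrupt the affected node names. A quick sanity-check sketch, assuming PyYAML and the example's own file path:

import re
import yaml  # pip install pyyaml

# Flag FP32 node names containing stray whitespace or bracketed fragments,
# such as the pager residue this commit removes.
with open("examples/gemma3_example/fp32_mm.yaml") as f:
    cfg = yaml.safe_load(f)

for name in cfg.get("FP32NodeInstanceNames", []):
    if re.search(r"\s|\[", name):
        print("suspicious node name:", name)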

examples/gemma3_example/gemma3_mm.py

Lines changed: 10 additions & 13 deletions

@@ -7,7 +7,7 @@

 import torch
 import transformers
-from transformers import AutoConfig, AutoModelForImageTextToText, AutoProcessor, TextStreamer
+from transformers import AutoConfig, AutoProcessor

 from QEfficient import QEFFAutoModelForImageTextToText

@@ -16,12 +16,14 @@
 # For Testing Purpose Only
 config.text_config.num_hidden_layers = 1
 config.vision_config.num_hidden_layers = 2
-
-model = AutoModelForImageTextToText.from_pretrained(model_id, attn_implementation="eager", config=config)
-model.eval()
 tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
 processor = AutoProcessor.from_pretrained(model_id)
-qeff_model = QEFFAutoModelForImageTextToText(model, kv_offload=True)
+
+# pass HF_TOKEN if gated model
+# For running the model in single QPC approach use kv_offload=False. For Dual QPC approach use kv_offload=True ###
+qeff_model = QEFFAutoModelForImageTextToText.from_pretrained(
+    model_id, config=config, attn_implementation="eager", kv_offload=True
+)

 ### use skip_vision=Ture, if want to run only text, or false ###
 skip_vision = True

@@ -59,9 +61,7 @@
     return_tensors="pt",
 )

-streamer = TextStreamer(tokenizer)
-output = qeff_model.generate(inputs=inputs, device_ids=[0], generation_len=100)
-print(output.generated_ids)
+output = qeff_model.generate(inputs=inputs, generation_len=100)
 print(tokenizer.batch_decode(output.generated_ids))
 print(output)

@@ -72,7 +72,7 @@
     ctx_len=3072,
     img_size=896,
     num_cores=16,
-    num_devices=8,
+    num_devices=1,
     mxfp6_matmul=False,
     mxint8_kv_cache=False,
     aic_enable_depth_first=True,

@@ -103,9 +103,6 @@
     return_tensors="pt",
 )
 inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32)
-streamer = TextStreamer(tokenizer)
-output = qeff_model.generate(inputs=inputs, device_ids=[0, 1, 2, 3], generation_len=100)
-print(output.generated_ids)
+output = qeff_model.generate(inputs=inputs, generation_len=100)
 print(tokenizer.batch_decode(output.generated_ids))
 print(output)
-print()
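With this change the example builds the QEfficient model directly via `from_pretrained(..., kv_offload=...)` instead of wrapping a separately loaded HF model, and `generate()` no longer takes explicit `device_ids`. A condensed sketch of the resulting flow; the `compile()` call shape and its `prefill_seq_len` value are assumptions reconstructed from the visible arguments, and the text-only prompt is illustrative:

import transformers
from transformers import AutoConfig, AutoProcessor

from QEfficient import QEFFAutoModelForImageTextToText

model_id = "google/gemma-3-4b-it"  # gated repo: export HF_TOKEN before running

config = AutoConfig.from_pretrained(model_id)
config.text_config.num_hidden_layers = 1   # tiny config, for testing only
config.vision_config.num_hidden_layers = 2

tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_id)

# kv_offload=True -> dual-QPC split (vision/language); kv_offload=False -> single QPC
qeff_model = QEFFAutoModelForImageTextToText.from_pretrained(
    model_id, config=config, attn_implementation="eager", kv_offload=True
)

# Assumed compile() signature; argument values mirror the example diff above.
qeff_model.compile(
    prefill_seq_len=128,  # assumption: not visible in the hunk
    ctx_len=3072,
    img_size=896,
    num_cores=16,
    num_devices=1,
    mxfp6_matmul=False,
    mxint8_kv_cache=False,
    aic_enable_depth_first=True,
)

# Illustrative text-only prompt; the real example also feeds pixel_values.
inputs = tokenizer("Describe a cat in detail.", return_tensors="pt")
output = qeff_model.generate(inputs=inputs, generation_len=100)
print(tokenizer.batch_decode(output.generated_ids))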

tests/transformers/models/test_image_text_to_text_models.py

Lines changed: 22 additions & 23 deletions

@@ -88,29 +88,28 @@
         "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud",
         4,
     ),
-    # FIX: Accuracy in AIC
-    # (
-    #     "google/gemma-3-4b-it",
-    #     True,
-    #     1,
-    #     128,
-    #     3072,
-    #     896,
-    #     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png",
-    #     "Can you describe the image in detail.",
-    #     6,
-    # ),
-    # (
-    #     "google/gemma-3-4b-it",
-    #     False,
-    #     1,
-    #     128,
-    #     3072,
-    #     896,
-    #     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png",
-    #     "Can you describe the image in detail.",
-    #     6,
-    # ),
+    (
+        "google/gemma-3-4b-it",
+        True,
+        1,
+        128,
+        3072,
+        896,
+        "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png",
+        "Can you describe the image in detail.",
+        1,
+    ),
+    (
+        "google/gemma-3-4b-it",
+        False,
+        1,
+        128,
+        3072,
+        896,
+        "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png",
+        "Can you describe the image in detail.",
+        1,
+    ),
     # (
     #     "meta-llama/Llama-3.2-11B-Vision-Instruct",
     #     True,
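These tuples drive a parametrized test. Reading the fields in order, they appear to be: model ID, kv_offload flag, batch size, prompt length, context length, image size, image URL, query, and a layer count, with the last field reduced from 6 to 1, which fits the commit's stated goal of CI enablement with truncated models. A hedged sketch of how such a table typically feeds pytest (the parameter names and test body are illustrative, not the repository's actual harness):

import pytest

# Field names below are my reading of the tuple layout; the repository may differ.
TEST_CASES = [
    ("google/gemma-3-4b-it", True, 1, 128, 3072, 896,
     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png",
     "Can you describe the image in detail.", 1),
]

@pytest.mark.parametrize(
    "model_id, kv_offload, batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer",
    TEST_CASES,
)
def test_image_text_to_text(model_id, kv_offload, batch_size, prompt_len,
                            ctx_len, img_size, img_url, query, n_layer):
    # Illustrative body: a real test would export, compile, and compare AIC
    # generation against the PyTorch reference using `n_layer` hidden layers.
    assert batch_size >= 1 and ctx_len >= prompt_len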
