Dynamic cache support on llama4 (quic#494)

quic-rishinr · quic-dhirajku · commit e3f5ab4a7a9b · 2025-08-04T09:56:56.000Z
Signed-off-by: Rishin <rishinr@qti.qualcomm.com>
Signed-off-by: Dhiraj Kumar Sah <dhirajku@qti.qualcomm.com>
diff --git a/QEfficient/transformers/models/llama4/modeling_llama4.py b/QEfficient/transformers/models/llama4/modeling_llama4.py
@@ -32,7 +32,7 @@
     repeat_kv,
 )
 
-from QEfficient.transformers.cache_utils import QEffHybridChunkedCache
+from QEfficient.transformers.cache_utils import QEffDynamicCache
 from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask
 from QEfficient.utils import constants
 from QEfficient.utils._utils import IOInfo
@@ -638,7 +638,7 @@ def forward(
         return_legacy_cache = False
         if use_cache and not isinstance(past_key_values, Cache):
             return_legacy_cache = True
-            past_key_values = QEffHybridChunkedCache.from_legacy_cache(self.config, past_key_values)
+            past_key_values = QEffDynamicCache.from_legacy_cache(past_key_values)
 
         if cache_position is None:
             past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0