Commit 455d0cb

NickLucche committed
update to use new attention_type interface
Signed-off-by: NickLucche <[email protected]>
1 parent 3eae4f6 commit 455d0cb

File tree

1 file changed: +3, -7

  • vllm/model_executor/models/t5.py

vllm/model_executor/models/t5.py (+3, -7)
@@ -202,7 +202,8 @@ def __init__(self,
                               1.0,
                               cache_config=cache_config,
                               quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
+                              prefix=f"{prefix}.attn",
+                              attn_type=self.attn_type)

         # Only the first SelfAttention block in encoder decoder has this
         # embedding layer, the others reuse its output.
@@ -418,12 +419,7 @@ def forward(
         # Encoder/Decoder Self-Attention Layer, attn bias already cached.
         assert attn_bias is not None

-        attn_output = self.attn(q,
-                                k,
-                                v,
-                                kv_cache,
-                                attn_metadata,
-                                attn_type=self.attn_type)
+        attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
         output, _ = self.out_proj(attn_output)
         return output


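For context, the change moves T5's attention from supplying attn_type on every forward call to binding it once when the Attention layer is constructed. A minimal before/after sketch, assuming vLLM's Attention layer and AttentionType enum; the "..." stands for the remaining constructor arguments, and AttentionType.ENCODER is used purely as an illustrative value (in t5.py the actual value comes from self.attn_type):

    from vllm.attention import Attention, AttentionType

    # Old interface: the attention type was passed on each forward call.
    #   attn = Attention(num_heads, head_size, 1.0, ...)
    #   out = attn(q, k, v, kv_cache, attn_metadata,
    #              attn_type=AttentionType.ENCODER)
    #
    # New interface (this commit): the attention type is fixed at
    # construction time, so forward() drops the keyword.
    #   attn = Attention(num_heads, head_size, 1.0, ...,
    #                    attn_type=AttentionType.ENCODER)
    #   out = attn(q, k, v, kv_cache, attn_metadata)

Binding the type at construction presumably lets the backend select its encoder/decoder KV-cache handling once at init rather than re-checking a keyword on every forward pass.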