
Commit 4ec01a5

Fix for Replicate_kv_heads script (#412)
Since we have moved to transformers version 4.50.0, the attention implementation has changed, and the number of attention heads and the hidden size are no longer attributes of the attention class. We have added these parameters as optional arguments to the replicate_kv_heads script. Users can now pass them explicitly; if not passed, they are read from the model's config.json file. --------- Signed-off-by: Hem Agnihotri <[email protected]>
1 parent f1fb026 commit 4ec01a5

File tree: 2 files changed, +71 −13 lines

  scripts/replicate_kv_head/README.md
  scripts/replicate_kv_head/replicate_kv_heads.py

scripts/replicate_kv_head/README.md

Lines changed: 3 additions & 1 deletion
@@ -30,4 +30,6 @@ Replace `<hf_token>` with your actual token.

 ### Arguments

 - **--model_name**: Model card name to use (default: “meta-llama/Meta-Llama-3-8B-Instruct”).
 - **--prompt**: Prompt to use for the model (default: “My name is”).
-- **--repeat**: Factor to repeat key-value heads (default: 2).
+- **--repeat**: Factor to repeat key-value heads (default: 2).
+- **--num_attention_heads**: Number of attention heads (default: None). Optional; if not passed explicitly, it is read from config.json.
+- **--hidden_size**: Hidden size (default: None). Optional; if not passed explicitly, it is read from config.json.
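
To make the fallback behaviour concrete, here is a minimal sketch (not part of the commit) of what happens when `--num_attention_heads` and `--hidden_size` are omitted: the values come from the model's config.json, loaded here via transformers' `AutoConfig`. The model name is the README default and may require a Hugging Face token to download.

```python
from transformers import AutoConfig

# Illustrative only: mirrors the fallback added in this commit.
num_attention_heads = None  # e.g. --num_attention_heads was not passed
hidden_size = None          # e.g. --hidden_size was not passed

config = AutoConfig.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")

if num_attention_heads is None:
    num_attention_heads = config.num_attention_heads
if hidden_size is None:
    hidden_size = config.hidden_size

print(num_attention_heads, hidden_size)
```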

scripts/replicate_kv_head/replicate_kv_heads.py

Lines changed: 68 additions & 12 deletions
@@ -6,6 +6,7 @@
 # -----------------------------------------------------------------------------

 import argparse
+from typing import Optional

 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
@@ -70,46 +71,78 @@ def duplicate_weights_for_linear_layer(
     )


-def main(args):
+def replicate_kv_heads(
+    model_name: str = "meta-llama/Meta-Llama-3-8B-Instruct",
+    prompt: str = "My name is",
+    repeat: int = 2,
+    full_batch_size: Optional[int] = None,
+    num_hidden_layers: Optional[int] = None,
+    num_attention_heads: Optional[int] = None,
+    hidden_size: Optional[int] = None,
+):
+    """
+    Replicate the KV heads. The script performs the following steps:
+    1. Runs inference with the original model.
+    2. Replicates the KV heads.
+    3. Runs inference on the modified model to validate the changes.
+    4. Exports the modified model to ONNX format.
+
+    ``Mandatory`` Args:
+        :model_name (str): Model card name to use, defaults to meta-llama/Meta-Llama-3-8B-Instruct.
+        :prompt (str): Prompt to use for the model, defaults to "My name is".
+        :repeat (int): Factor to repeat key-value heads.
+    ``Optional`` Args:
+        :full_batch_size (int): Set full batch size to enable continuous batching mode, default is None.
+        :num_hidden_layers (int): Number of hidden layers to use, default is None.
+        :num_attention_heads (int): Number of attention heads; if not passed explicitly, it is picked from config.json.
+        :hidden_size (int): Hidden size to use; if not passed explicitly, it is picked from config.json.
+
+    """
     # Load the model and tokenizer
-    model_name = args.model_name
     model_base_name = model_name.split("/")[-1]
     # Replace quantizers for loading Quantized AWQ/GPTQ models on CPU.
     replace_transformers_quantizers()
     # Prepare kwargs for model loading
     model_kwargs = {"attn_implementation": "eager"}
-    if args.num_hidden_layers:
-        model_kwargs["num_hidden_layers"] = args.num_hidden_layers
+
+    if num_hidden_layers:
+        model_kwargs["num_hidden_layers"] = num_hidden_layers

     pretrained_model_name_or_path = login_and_download_hf_lm(model_name)
     model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, **model_kwargs)

     # Undo the effect of replace_transformers_quantizers
     undo_transformers_quantizers()
     tokenizer = AutoTokenizer.from_pretrained(model_name)
-    inputs = tokenizer(args.prompt, return_tensors="pt")
+    inputs = tokenizer(prompt, return_tensors="pt")

     # Generate original outputs and tokens
     with torch.inference_mode():
         _ = model(**inputs)  # original output
         orig_tokens = model.generate(**inputs, max_new_tokens=10, num_beams=1, do_sample=False)

     # Modify the number of key-value heads
-    repeat = args.repeat
     orig_kv_heads = model.config.num_key_value_heads
     new_kv_heads = repeat * orig_kv_heads
     model.config.num_key_value_heads = new_kv_heads

     print("Original KV heads:", orig_kv_heads)
     print("Modified KV heads:", new_kv_heads)

+    # Check whether hidden size and number of attention heads were passed explicitly
+    if num_attention_heads is None:
+        num_attention_heads = model.config.num_attention_heads
+
+    if hidden_size is None:
+        hidden_size = model.config.hidden_size
+
     # Update the model's attention layers with new key-value heads
     for block in model.model.layers:
         attn = block.self_attn
         attn.num_key_value_heads = new_kv_heads
-        attn.num_key_value_groups = block.self_attn.num_heads // new_kv_heads
-        duplicate_weights_for_linear_layer(attn.k_proj, orig_kv_heads, repeat, attn.head_dim, attn.hidden_size)
-        duplicate_weights_for_linear_layer(attn.v_proj, orig_kv_heads, repeat, attn.head_dim, attn.hidden_size)
+        attn.num_key_value_groups = num_attention_heads // new_kv_heads
+        duplicate_weights_for_linear_layer(attn.k_proj, orig_kv_heads, repeat, attn.head_dim, hidden_size)
+        duplicate_weights_for_linear_layer(attn.v_proj, orig_kv_heads, repeat, attn.head_dim, hidden_size)

     # Generate modified outputs and tokens
     with torch.inference_mode():
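
For context, the weight duplication that `duplicate_weights_for_linear_layer` performs on `k_proj`/`v_proj` amounts to repeating each key-value head's block of rows in the projection weight. The helper itself is not shown in this diff; the sketch below only illustrates the general technique under assumed shapes (the function name and example sizes are illustrative), not the repository's implementation.

```python
import torch

def repeat_kv_head_rows(weight: torch.Tensor, orig_kv_heads: int, repeat: int, head_dim: int) -> torch.Tensor:
    # weight of k_proj/v_proj is assumed to have shape [orig_kv_heads * head_dim, hidden_size]
    hidden_size = weight.shape[1]
    w = weight.view(orig_kv_heads, head_dim, hidden_size)   # split rows into per-head blocks
    w = torch.repeat_interleave(w, repeats=repeat, dim=0)   # each head block appears `repeat` times
    return w.reshape(orig_kv_heads * repeat * head_dim, hidden_size)

# Example: 8 KV heads, head_dim 128, hidden_size 4096, repeated 2x -> 16 KV heads
w = torch.randn(8 * 128, 4096)
print(repeat_kv_head_rows(w, orig_kv_heads=8, repeat=2, head_dim=128).shape)  # torch.Size([2048, 4096])
```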
@@ -126,13 +159,13 @@ def main(args):
     )

     # Export the modified model
-    q_model = QEFFAutoModelForCausalLM(model, continuous_batching=(True if args.full_batch_size else False))
+    q_model = QEFFAutoModelForCausalLM(model, continuous_batching=(True if full_batch_size else False))
     export(
         model_name,
         q_model,
         tokenizer=tokenizer,
         onnx_dir_path=f"{model_base_name}-{new_kv_heads}kvheads",
-        full_batch_size=(args.full_batch_size if args.full_batch_size else None),
+        full_batch_size=(full_batch_size if full_batch_size else None),
     )

@@ -162,6 +195,29 @@ def main(args):
         default=None,
         help="Number of hidden layers to use, default is None",
     )
+    parser.add_argument(
+        "--num_attention_heads",
+        "--num-attention-heads",
+        type=int,
+        default=None,
+        help="Number of attention heads; if not passed explicitly, it is picked from config.json",
+    )
+    parser.add_argument(
+        "--hidden_size",
+        "--hidden-size",
+        type=int,
+        default=None,
+        help="Hidden size to use; if not passed explicitly, it is picked from config.json",
+    )

     args = parser.parse_args()
-    main(args)
+
+    replicate_kv_heads(
+        model_name=args.model_name,
+        prompt=args.prompt,
+        repeat=args.repeat,
+        full_batch_size=args.full_batch_size,
+        num_hidden_layers=args.num_hidden_layers,
+        num_attention_heads=args.num_attention_heads,
+        hidden_size=args.hidden_size,
+    )
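
Because the logic now lives in `replicate_kv_heads` rather than a CLI-only `main`, it can also be called directly from Python. A hypothetical direct call might look like the following (the import assumes the script's directory is on `sys.path`; argument values are illustrative). `num_attention_heads` and `hidden_size` are left unset, so they fall back to the model's config.json as described in the commit message.

```python
from replicate_kv_heads import replicate_kv_heads

# num_attention_heads and hidden_size are omitted, so the function reads
# them from the model's config.json.
replicate_kv_heads(
    model_name="meta-llama/Meta-Llama-3-8B-Instruct",
    prompt="My name is",
    repeat=2,
)
```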
