Commit 4f60ecd

Author: Shubham Agrawal

Changes to support QNN Compilation path in QEFFBaseModel class.
Signed-off-by: Shubham Agrawal <[email protected]>

1 parent fc89e8b · commit 4f60ecd

File tree: 5 files changed, +159 −209 lines changed


QEfficient/base/modeling_qeff.py

Lines changed: 36 additions & 17 deletions
@@ -98,7 +98,12 @@ def compile(self, *args, **kwargs) -> Path:
         :num_cores (int): Number of cores to utilize in each device ``Defaults to 16``.
         :mxfp6_matmul (bool): Use MXFP6 to compress weights for MatMul nodes to run faster on device. ``Defaults to False``.
         :mxint8_kv_cache (bool): Use MXINT8 to compress KV-cache on device to access and update KV-cache faster. ``Defaults to False``.
-        :compiler_options: Pass any compiler option as input. Any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below:
+        :compiler_options: Pass any compiler option as input.
+            The following flags can be passed in compiler_options to enable the QNN compilation path:
+            :enable_qnn (bool): Enables QNN compilation. ``Defaults to False`` if not passed.
+            :qnn_config (str): Path of the QNN config parameters file. ``Defaults to None`` if not passed.
+            Any other parameter passed is ignored in the QNN compilation path, since overriding or extra QNN parameters are expected via the config file.
+            For the QAIC compilation path, any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below:
             - aic_num_cores=16 -> -aic-num-cores=16
             - convert_to_fp16=True -> -convert-to-fp16
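
The two paths described above can be exercised from the public API roughly as follows (a usage sketch; the model card and config path are placeholders):

    from QEfficient import QEFFAutoModelForCausalLM

    model = QEFFAutoModelForCausalLM.from_pretrained("gpt2")  # placeholder model card

    # QAIC path (default): extra kwargs become qaic-exec flags,
    # e.g. aic_num_cores=16 -> -aic-num-cores=16
    model.compile(num_cores=16, mxfp6_matmul=True)

    # QNN path: enable_qnn routes compilation through _qnn_compile;
    # other kwargs are ignored, and QNN overrides come from the config file
    model.compile(num_cores=16, enable_qnn=True, qnn_config="qnn_config.json")  # placeholder path
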
@@ -217,6 +222,7 @@ def _compile(
         onnx_path: Optional[str] = None,
         compile_dir: Optional[str] = None,
         *,
+        mxint8_kv_cache: bool = False,
         specializations: Optional[List[Dict[str, int]]] = None,
         custom_io: Optional[Dict[str, str]] = None,
         mdp_ts_num_devices: int = 1,
@@ -233,10 +239,32 @@ def _compile(
         :custom_io (dict): Custom IO to specify the input and outputs in different formats than default
         :mdp_ts_num_devices (int): Number of devices to partition to use Multi-Device Partitioning with tensor-slicing.
         :num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model.
-        :compiler_options: Pass any compiler option as input. Any flag that is supported by `qaic-exec` can be passed. Params are converted to flags as below:
+        :compiler_options: Pass any compiler option as input.
+            The following flags can be passed in compiler_options to enable the QNN compilation path:
+            :enable_qnn (bool): Enables QNN compilation. ``Defaults to False`` if not passed.
+            :qnn_config (str): Path of the QNN config parameters file. ``Defaults to None`` if not passed.
+            Any other parameter passed is ignored in the QNN compilation path, since overriding or extra QNN parameters are expected via the config file.
+            For the QAIC compilation path, any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below:
             - aic_num_cores=16 -> -aic-num-cores=16
             - convert_to_fp16=True -> -convert-to-fp16
+
         """
+        enable_qnn = compiler_options["enable_qnn"] if "enable_qnn" in compiler_options else False
+        qnn_config = compiler_options["qnn_config"] if "qnn_config" in compiler_options else None
+
+        if enable_qnn:
+            return self._qnn_compile(
+                onnx_path,
+                compile_dir,
+                specializations=specializations,
+                custom_io=custom_io,
+                mdp_ts_num_devices=mdp_ts_num_devices,
+                num_cores=compiler_options.get("aic_num_cores", 16),
+                mxfp6_matmul=compiler_options.get("mxfp6_matmul", False),
+                mxint8_kv_cache=mxint8_kv_cache,
+                qnn_config=qnn_config,
+            )
+
         if onnx_path is None and self.onnx_path is None:
             self.export()
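
The param-to-flags conversion mentioned in the docstring above can be pictured with a small sketch (illustrative only, not the library's actual helper):

    def kwargs_to_qaic_flags(**options):
        # aic_num_cores=16 -> "-aic-num-cores=16"; convert_to_fp16=True -> "-convert-to-fp16"
        flags = []
        for name, value in options.items():
            flag = "-" + name.replace("_", "-")
            flags.append(flag if value is True else f"{flag}={value}")
        return flags

    print(kwargs_to_qaic_flags(aic_num_cores=16, convert_to_fp16=True))
    # ['-aic-num-cores=16', '-convert-to-fp16']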

@@ -346,17 +374,13 @@ def _qnn_compile(
         onnx_path: Optional[str] = None,
         compile_dir: Optional[str] = None,
         *,
+        custom_io: Optional[Dict[str, str]] = None,
         specializations: Optional[List[Dict[str, int]]] = None,
-        prefill_seq_len: int = 32,
-        ctx_len: int = 128,
-        batch_size: int = 1,
-        full_batch_size: Optional[int] = None,
         mdp_ts_num_devices: int = 1,
         num_cores: int = 16,
         mxfp6_matmul: bool = False,
         mxint8_kv_cache: bool = False,
         qnn_config: Optional[str] = None,
-        kv_cache_batch_size: Optional[int] = None,
     ) -> str:
         """
         Interface for QNN compiler
@@ -365,16 +389,11 @@ def _qnn_compile(
         :onnx_path (str): Onnx file to compile
         :compile_dir (str): Directory path to compile the qpc. A suffix is added to the directory path to avoid reusing same qpc for different parameters.
         :specializations (list): List of specializations to compile for
-        :prefill_seq_len (int, optional): The length of the Prefill prompt should be less that ``prefill_seq_len``. ``Defaults to 32``.
-        :ctx_len (int, optional): Maximum ``ctx`` that the compiled model can remember. ``Defaults to 128``.
-        :batch_size (int, optional): Batch size. ``Defaults to 1``.
-        :full_batch_size (int, optional): Continuous batching batch size.
         :mdp_ts_num_devices (int): Number of devices to partition to use Multi-Device Partitioning with tensor-slicing.
         :num_cores (int): Number of cores used to compile the model.
         :mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to True``.
         :mxint8_kv_cache (bool, optional): Whether to use ``mxint8`` compression for KV cache. ``Defaults to False``.
         :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.``
-        :kv_cache_batch_size (int): kv_cache_batch_size for Prefix Caching. ``Defaults to None.``
         """
         if onnx_path is None and self.onnx_path is None:
             self.export()
@@ -390,6 +409,9 @@ def _qnn_compile(
         if specializations is not None:
             compile_hash.update(to_hashable(specializations))

+        if custom_io is not None:
+            compile_hash.update(to_hashable(custom_io))
+
         if qnn_config is not None:
             qnn_config_values = load_json(qnn_config)
             compile_hash.update(to_hashable(qnn_config_values))
@@ -426,15 +448,12 @@ def _qnn_compile(
             qpc_base_path=compile_dir,
             num_cores=num_cores,
             device_group=list(range(mdp_ts_num_devices)),
-            batch_size=batch_size,
-            prompt_len=prefill_seq_len,
-            ctx_len=ctx_len,
             mxfp6=mxfp6_matmul,
             mxint8=mxint8_kv_cache,
-            full_batch_size=full_batch_size,
             qnn_config=qnn_config,
             qnn_binary_dir=qpc_path,
-            kv_cache_batch_size=kv_cache_batch_size,
+            specializations=specializations,
+            custom_io=custom_io,
         )

         self.qpc_path = qpc_path
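
For context, the two inputs that `_qnn_compile` now hashes and forwards in place of the removed batch_size/prompt_len/ctx_len parameters look roughly like this (illustrative values; the key names assume the specializations built in modeling_auto.py):

    specializations = [
        {"batch_size": 1, "seq_len": 32, "ctx_len": 128},  # prefill
        {"batch_size": 1, "seq_len": 1, "ctx_len": 128},   # decode
    ]
    custom_io = {"past_key.0": "mxint8", "past_key.0_RetainedState": "mxint8"}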

QEfficient/compile/qnn_compiler.py

Lines changed: 10 additions & 22 deletions
@@ -7,11 +7,14 @@

 import os
 import shutil
-from typing import List, Optional
+from typing import Dict, List, Optional

 from QEfficient.utils._utils import create_json, execute_command, load_json
 from QEfficient.utils.constants import QnnConstants
-from QEfficient.utils.generate_qnn_network_specialization_config import fetch_nodes_info, generate_data_format_config
+from QEfficient.utils.generate_qnn_network_specialization_config import (
+    generate_data_format_config,
+    generate_qnn_specialization,
+)
 from QEfficient.utils.logging_utils import logger

@@ -31,9 +34,6 @@ def __init__(
         device_group: Optional[List[int]] = None,
         compiler_enable_depth_first: bool = False,
         compiler_max_out_channel_split: int = -1,
-        batch_size: int = 1,
-        prompt_len: int = 32,
-        ctx_len: int = 128,
         compiler_mxfp6_matmul_weights: bool = True,
         qnn_target: str = QnnConstants.TARGET,
         qnn_config_path: Optional[str] = None,
@@ -48,9 +48,6 @@ def __init__(
         self.device_group = device_group
         self.compiler_enable_depth_first = compiler_enable_depth_first
         self.compiler_max_out_channel_split = compiler_max_out_channel_split
-        self.batch_size = batch_size
-        self.prompt_len = prompt_len
-        self.ctx_len = ctx_len
         self.compiler_mxfp6_matmul_weights = compiler_mxfp6_matmul_weights
         self.qnn_config_path = qnn_config_path
         self.qnn_binary_dir = qnn_binary_dir
@@ -327,16 +324,15 @@ def compile(
     device_group: Optional[List[int]] = None,
     aic_enable_depth_first: bool = False,
     mos: int = -1,
-    batch_size: int = 1,
-    prompt_len: int = 32,
-    ctx_len: int = 128,
     mxfp6: bool = True,
     mxint8: bool = False,
     allow_mxint8_mdp_io: Optional[bool] = False,
     full_batch_size=None,
     qnn_config: Optional[str] = None,
     qnn_binary_dir: Optional[str] = None,
     kv_cache_batch_size: Optional[int] = None,
+    custom_io: Optional[Dict[str, str]] = None,
+    specializations: Optional[List[Dict[str, int]]] = None,
     **kwargs,
 ) -> str:
     """
@@ -377,16 +373,11 @@ def compile(
     # TODO To make custom_io_config.yaml configurable as not all models need it.
     custom_io_file_path = os.path.join(qpc_base_path, "custom_io_config.yaml")

-    kv_precision = "uint8" if mxint8 else "float16"
-    fetch_nodes_info(
+    generate_qnn_specialization(
         onnx_graph_path=onnx_path,
-        batch_size=batch_size,
-        sequence_length=prompt_len,
-        context_length=ctx_len,
+        specializations=specializations,
+        custom_io=custom_io,
         file_path=custom_io_file_path,
-        full_batch_size=full_batch_size,
-        kv_precision=kv_precision,
-        kv_cache_batch_size=kv_cache_batch_size,
     )

     if not os.path.isfile(custom_io_file_path):
@@ -403,9 +394,6 @@ def compile(
         custom_io_path=custom_io_file_path,
         compiler_enable_depth_first=aic_enable_depth_first,
         compiler_max_out_channel_split=mos,
-        batch_size=batch_size,
-        prompt_len=prompt_len,
-        ctx_len=ctx_len,
         compiler_mxfp6_matmul_weights=mxfp6,
         qnn_binary_dir=qnn_binary_dir,
         mxint8=mxint8,

QEfficient/peft/auto.py

Lines changed: 1 addition & 0 deletions
@@ -251,6 +251,7 @@ def compile(
             custom_io=custom_io,
             mdp_ts_num_devices=num_devices,
             aic_num_cores=num_cores,
+            mxint8_kv_cache=mxint8_kv_cache,
             **compiler_options,
         )

QEfficient/transformers/models/modeling_auto.py

Lines changed: 30 additions & 63 deletions
@@ -598,21 +598,17 @@ def compile(
         mxfp6_matmul: bool = False,
         mxint8_kv_cache: bool = False,
         num_speculative_tokens: Optional[int] = None,
-        enable_qnn: bool = False,
-        qnn_config: Optional[str] = None,
         **compiler_options,
     ) -> str:
         if (
             any(
                 param is not None
-                for param in [full_batch_size, kv_cache_batch_size, num_speculative_tokens, qnn_config]
+                for param in [full_batch_size, kv_cache_batch_size, num_speculative_tokens]
             )
-            or enable_qnn
         ):
             raise ValueError(
-                f"Expected 'full_batch_size', 'kv_cache_batch_size', 'num_speculative_tokens', and 'qnn_config' to be None, and 'enable_qnn' to be False but got: "
+                f"Expected 'full_batch_size', 'kv_cache_batch_size', and 'num_speculative_tokens' to be None but got: "
                 f"full_batch_size={full_batch_size}, kv_cache_batch_size={kv_cache_batch_size}, num_speculative_tokens={num_speculative_tokens}, "
-                f"enable_qnn={enable_qnn}, qnn_config={qnn_config}"
             )

         output_names = self.model.get_output_names(kv_offload=True)
@@ -651,6 +647,7 @@ def compile(
             mdp_ts_num_devices=num_devices,
             aic_num_cores=num_cores,
             custom_io=custom_io_vision,
+            mxint8_kv_cache=mxint8_kv_cache,
             **compiler_options,
         )

@@ -675,6 +672,7 @@ def compile(
             mdp_ts_num_devices=num_devices,
             aic_num_cores=num_cores,
             custom_io=custom_io_lang,
+            mxint8_kv_cache=mxint8_kv_cache,
             **compiler_options,
         )
         return self.qpc_path
@@ -915,21 +913,17 @@ def compile(
         mxfp6_matmul: bool = False,
         mxint8_kv_cache: bool = False,
         num_speculative_tokens: Optional[int] = None,
-        enable_qnn: bool = False,
-        qnn_config: Optional[str] = None,
         **compiler_options,
     ) -> str:
         if (
             any(
                 param is not None
-                for param in [full_batch_size, kv_cache_batch_size, num_speculative_tokens, qnn_config]
+                for param in [full_batch_size, kv_cache_batch_size, num_speculative_tokens]
             )
-            or enable_qnn
         ):
             raise ValueError(
-                f"Expected 'full_batch_size', 'kv_cache_batch_size', 'num_speculative_tokens', and 'qnn_config' to be None, and 'enable_qnn' to be False but got: "
+                f"Expected 'full_batch_size', 'kv_cache_batch_size', and 'num_speculative_tokens' to be None but got: "
                 f"full_batch_size={full_batch_size}, kv_cache_batch_size={kv_cache_batch_size}, num_speculative_tokens={num_speculative_tokens}, "
-                f"enable_qnn={enable_qnn}, qnn_config={qnn_config}"
            )

         output_names = self.model.get_output_names()
@@ -967,6 +961,7 @@ def compile(
             custom_io=custom_io,
             mdp_ts_num_devices=num_devices,
             aic_num_cores=num_cores,
+            mxint8_kv_cache=mxint8_kv_cache,
             **compiler_options,
         )
         return self.qpc_path
@@ -1476,8 +1471,6 @@ def compile(
         mxfp6_matmul: bool = False,
         mxint8_kv_cache: bool = False,
         num_speculative_tokens: Optional[int] = None,
-        enable_qnn: bool = False,
-        qnn_config: Optional[str] = None,
         **compiler_options,
     ) -> str:
         """
@@ -1499,8 +1492,6 @@ def compile(
         :num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model.
         :mos (int, optional): Effort level to reduce on-chip memory. Defaults to -1, meaning no effort. ``Defaults to -1``.
         :aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``.
-        :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.``
-        :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.``

         Returns:
             :str: Path of the compiled ``qpc`` package.
@@ -1562,48 +1553,29 @@ def compile(
             decode_specialization.update({"num_logits_to_keep": num_speculative_tokens + 1}) if self.is_tlm else ...
             specializations.append(decode_specialization)

-        if enable_qnn:
-            if compiler_options:
-                logger.warning("Extra arguments to QNN compilation are supported via qnn_config.json only")
-
-            qpc_path = self._qnn_compile(
-                onnx_path,
-                compile_dir,
-                specializations=specializations,
-                prefill_seq_len=prefill_seq_len,
-                ctx_len=ctx_len,
-                batch_size=batch_size,
-                full_batch_size=full_batch_size,
-                mdp_ts_num_devices=num_devices,
-                num_cores=num_cores,
-                mxfp6_matmul=mxfp6_matmul,
-                mxint8_kv_cache=mxint8_kv_cache,
-                qnn_config=qnn_config,
-                kv_cache_batch_size=kv_cache_batch_size,
-            )
-        else:
-            # Custom IO
-            custom_io = {}
-            kv_cache_dtype = "mxint8" if mxint8_kv_cache else "float16"
-            for suffix in ["", "_RetainedState"]:
-                for i in range(self.num_layers):
-                    for kv in ["key", "value"]:
-                        custom_io[f"past_{kv}.{i}{suffix}"] = kv_cache_dtype
-
-            qpc_path = self._compile(
-                onnx_path,
-                compile_dir,
-                compile_only=True,
-                retained_state=True,
-                specializations=specializations,
-                convert_to_fp16=True,
-                mxfp6_matmul=mxfp6_matmul,
-                custom_io=custom_io,
-                mdp_ts_num_devices=num_devices,
-                num_speculative_tokens=num_speculative_tokens,
-                aic_num_cores=num_cores,
-                **compiler_options,
-            )
+        # Custom IO
+        custom_io = {}
+        kv_cache_dtype = "mxint8" if mxint8_kv_cache else "float16"
+        for suffix in ["", "_RetainedState"]:
+            for i in range(self.num_layers):
+                for kv in ["key", "value"]:
+                    custom_io[f"past_{kv}.{i}{suffix}"] = kv_cache_dtype
+
+        qpc_path = self._compile(
+            onnx_path,
+            compile_dir,
+            compile_only=True,
+            retained_state=True,
+            specializations=specializations,
+            convert_to_fp16=True,
+            mxfp6_matmul=mxfp6_matmul,
+            custom_io=custom_io,
+            mdp_ts_num_devices=num_devices,
+            num_speculative_tokens=num_speculative_tokens,
+            aic_num_cores=num_cores,
+            mxint8_kv_cache=mxint8_kv_cache,
+            **compiler_options,
+        )
         return qpc_path

     # FIXME: Update this method to match with transformers AutoModelForCausalLM.generate
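
For a model with num_layers = 2 and mxint8_kv_cache=False, the custom-IO loop above produces a dict along these lines:

    custom_io = {
        "past_key.0": "float16",
        "past_value.0": "float16",
        "past_key.1": "float16",
        "past_value.1": "float16",
        "past_key.0_RetainedState": "float16",
        "past_value.0_RetainedState": "float16",
        "past_key.1_RetainedState": "float16",
        "past_value.1_RetainedState": "float16",
    }
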
@@ -1747,8 +1719,6 @@ def compile(
         mxfp6_matmul: bool = False,
         mxint8_kv_cache: bool = False,
         num_speculative_tokens: Optional[int] = None,
-        enable_qnn: bool = False,
-        qnn_config: Optional[str] = None,
         **compiler_options,
     ) -> str:
         """
@@ -1790,9 +1760,6 @@ def compile(
         if num_speculative_tokens:
             logger.warning("Speculative decoding is not yet enabled for AutoModelForSpeechSeq2Seq")

-        if enable_qnn or qnn_config:
-            logger.warning("QNN compile is not yet enabled for AutoModelForSpeechSeq2Seq")
-
         return self._compile(
             onnx_path,
             compile_dir,
