quic · shubhagr-qc · Jul 7, 2025
@@ -224,6 +224,7 @@ def _compile(
         custom_io: Optional[Dict[str, str]] = None,
         mdp_ts_num_devices: int = 1,
         num_speculative_tokens: Optional[int] = None,
+        mxfp6_matmul: bool = constants.DEFAULT_AIC_MXPF6_MATMUL,
         enable_qnn: Optional[bool] = False,
         qnn_config: Optional[str] = None,
         **compiler_options,
@@ -239,6 +240,7 @@ def _compile(
             :custom_io (dict): Custom IO to specify the input and outputs in different formats than default
             :mdp_ts_num_devices (int): Number of devices to partition to use Multi-Device Partitioning with tensor-slicing.
             :num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model.
+            :mxfp6_matmul (bool): Use MXFP6 to compress weights for MatMul nodes to run faster on device. ``Defaults to False.``
             :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.``
             :qnn_config (str): Path of QNN Config parameters file. Any extra parameters for QNN compilation can be passed via this file. ``Defaults to None.``
             :compiler_options: Pass any compiler option as input.
@@ -269,7 +271,7 @@ def _compile(
                 custom_io=custom_io,
                 device_group=list(range(mdp_ts_num_devices)),
                 num_cores=compiler_options.get("aic_num_cores", constants.DEFAULT_AIC_NUM_CORES),
-                mxfp6=compiler_options.get("mxfp6_matmul", constants.DEFAULT_AIC_MXPF6_MATMUL),
+                mxfp6=mxfp6_matmul,
                 mxint8=mxint8_kv_cache,
                 qnn_config=qnn_config,
             )
@@ -281,6 +283,9 @@ def _compile(
         if mdp_ts_json_path := compiler_options.pop("mdp_load_partition_config", None):
             command.append(f"-mdp-load-partition-config={mdp_ts_json_path}")
 
+        if mxfp6_matmul:
+            command.append("-mxfp6-matmul")
+
         for key, value in compiler_options.items():
             option = "-" + key.replace("_", "-")
             if isinstance(value, bool):

@@ -106,8 +106,17 @@ def parse_qnn_config(self):
         for key, value in config_data.items():
             if key == QnnConstants.CONVERTER_ARGS_EXTENSION_STR:
                 self.check_extension_arg(key, value, QnnConstants.IMMUTABLE_CONVERTER_ARGS)
-            if key == QnnConstants.CONTEXT_BIN_ARGS_EXTENSION_STR:
+            elif key == QnnConstants.CONTEXT_BIN_ARGS_EXTENSION_STR:
                 self.check_extension_arg(key, value, QnnConstants.IMMUTABLE_CONTEXT_BIN_GEN_ARGS)
+            elif key == QnnConstants.QNN_COMPILATION_BACKEND_STR:
+                immutable_param = [
+                    sub_key for sub_key in value.keys() if sub_key in QnnConstants.IMMUTABLE_COMPILATION_BACKEND_ARGS
+                ]
+                if immutable_param:
+                    raise AttributeError(
+                        f"Immutable Parameters {immutable_param} found in {QnnConstants.QNN_COMPILATION_BACKEND_STR}. Please remove them from QNN Configuration file."
+                    )
+
             self.qnn_config[key] = value
 
     def create_qnn_tensor_slicing_json(self) -> str:

@@ -190,6 +190,10 @@ class QnnConstants:
         "--config_file ",
     ]
 
+    IMMUTABLE_COMPILATION_BACKEND_ARGS = [
+        "compiler_mxfp6_matmul_weights",
+    ]
+
     QNN_SAMPLE_CONFIG = {
         "converter_args_extension": "--onnx_defer_loading",
         "context_binary_generator_args_extension": "--log_level debug",