Skip to content

Commit ee2faf1

Browse files
author
Shubham Agrawal
committed
Added enable_qnn as arg in _compile
Signed-off-by: Shubham Agrawal <[email protected]>
1 parent 2f352b6 commit ee2faf1

File tree

5 files changed

+35
-77
lines changed

5 files changed

+35
-77
lines changed

QEfficient/base/modeling_qeff.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,6 @@ def compile(self, *args, **kwargs) -> Path:
102102
Following flag can be passed in compiler_options to enable QNN Compilation path.
103103
:enable_qnn (bool): Enables QNN Compilation. ``Defaults to False. if not passed.``
104104
:qnn_config (str): Path of QNN Config parameters file. ``Defaults to None. if not passed``
105-
any other parameter passed will be ignored in QNN compilation path as we expect overriding or extra parameters for QNN via config file.
106105
for QAIC compilation path, any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below:
107106
- aic_num_cores=16 -> -aic-num-cores=16
108107
- convert_to_fp16=True -> -convert-to-fp16
@@ -244,12 +243,7 @@ def _compile(
244243
:num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model.
245244
:enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.``
246245
:qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.``
247-
:compiler_options: Pass any compiler option as input.
248-
Following flag can be passed in compiler_options to enable QNN Compilation path.
249-
:enable_qnn (bool): Enables QNN Compilation. ``Defaults to False. if not passed.``
250-
:qnn_config (str): Path of QNN Config parameters file. ``Defaults to None. if not passed``
251-
any other parameter passed will be ignored in QNN compilation path as we expect overriding or extra parameters for QNN via config file.
252-
for QAIC compilation path, any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below:
246+
:compiler_options: Pass any compiler option as input. Any flag that is supported by `qaic-exec` can be passed. Params are converted to flags as below:
253247
- aic_num_cores=16 -> -aic-num-cores=16
254248
- convert_to_fp16=True -> -convert-to-fp16
255249

QEfficient/utils/constants.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ class QnnConstants:
161161
]
162162

163163
QNN_SAMPLE_CONFIG = {
164-
"converter_args_extension": "",
164+
"converter_args_extension": "--onnx_defer_loading",
165165
"context_binary_generator_args_extension": "--log_level debug",
166166
"qnn_compilation_backend": {
167167
"compiler_enable_depth_first": True,

tests/transformers/models/test_causal_lm_models.py

Lines changed: 21 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -130,27 +130,16 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
130130
if not get_available_device_id():
131131
pytest.skip("No available devices to run model on Cloud AI 100")
132132

133-
if enable_qnn:
134-
qpc_path = qeff_model.compile(
135-
prefill_seq_len=prompt_len,
136-
ctx_len=ctx_len,
137-
num_cores=14,
138-
mxfp6=False,
139-
aic_enable_depth_first=False,
140-
num_speculative_tokens=num_speculative_tokens,
141-
enable_qnn=enable_qnn,
142-
qnn_config=qnn_config,
143-
)
144-
else:
145-
qpc_path = qeff_model.compile(
146-
prefill_seq_len=prompt_len,
147-
ctx_len=ctx_len,
148-
num_cores=14,
149-
mxfp6=False,
150-
aic_enable_depth_first=False,
151-
num_speculative_tokens=num_speculative_tokens,
152-
)
153-
133+
qpc_path = qeff_model.compile(
134+
prefill_seq_len=prompt_len,
135+
ctx_len=ctx_len,
136+
num_cores=14,
137+
mxfp6=False,
138+
aic_enable_depth_first=False,
139+
num_speculative_tokens=num_speculative_tokens,
140+
enable_qnn=enable_qnn,
141+
qnn_config=qnn_config,
142+
)
154143
exec_info = qeff_model.generate(tokenizer, prompts=Constants.INPUT_STR)
155144
cloud_ai_100_tokens = exec_info.generated_ids[0] # Because we always run for single input and single batch size
156145
gen_len = ort_tokens.shape[-1]
@@ -182,29 +171,17 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
182171
if not get_available_device_id():
183172
pytest.skip("No available devices to run model on Cloud AI 100")
184173

185-
if enable_qnn:
186-
qpc_path = qeff_model.compile(
187-
prefill_seq_len=prompt_len,
188-
ctx_len=ctx_len,
189-
num_cores=14,
190-
mxfp6=False,
191-
aic_enable_depth_first=False,
192-
full_batch_size=full_batch_size,
193-
num_speculative_tokens=num_speculative_tokens,
194-
enable_qnn=enable_qnn,
195-
qnn_config=qnn_config,
196-
)
197-
else:
198-
qpc_path = qeff_model.compile(
199-
prefill_seq_len=prompt_len,
200-
ctx_len=ctx_len,
201-
num_cores=14,
202-
mxfp6=False,
203-
aic_enable_depth_first=False,
204-
full_batch_size=full_batch_size,
205-
num_speculative_tokens=num_speculative_tokens,
206-
)
207-
174+
qpc_path = qeff_model.compile(
175+
prefill_seq_len=prompt_len,
176+
ctx_len=ctx_len,
177+
num_cores=14,
178+
mxfp6=False,
179+
aic_enable_depth_first=False,
180+
full_batch_size=full_batch_size,
181+
num_speculative_tokens=num_speculative_tokens,
182+
enable_qnn=enable_qnn,
183+
qnn_config=qnn_config,
184+
)
208185
exec_info_fbs = qeff_model.generate(tokenizer, prompts=fbs_prompts)
209186

210187
assert all(

tests/transformers/models/test_embedding_models.py

Lines changed: 5 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -73,17 +73,11 @@ def check_embed_pytorch_vs_ort_vs_ai100(
7373
print("Mad for onnx and PyTorch is ", mad)
7474
assert mad <= 10**-5, f"MAD is too high for onnx and Pytorch: {mad}"
7575

76-
if enable_qnn:
77-
qeff_model.compile(
78-
num_cores=14,
79-
enable_qnn=enable_qnn,
80-
qnn_config=qnn_config,
81-
)
82-
else:
83-
qeff_model.compile(
84-
num_cores=14,
85-
)
86-
76+
qeff_model.compile(
77+
num_cores=14,
78+
enable_qnn=enable_qnn,
79+
qnn_config=qnn_config,
80+
)
8781
ai100_output = qeff_model.generate(inputs=inputs)
8882

8983
# Compare ONNX and AI 100 outputs

tests/transformers/models/test_speech_seq2seq_models.py

Lines changed: 7 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -335,20 +335,13 @@ def check_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100(
335335
if not get_available_device_id():
336336
pytest.skip("No available devices to run model on Cloud AI 100")
337337

338-
if enable_qnn:
339-
qeff_model.compile(
340-
ctx_len=ctx_len,
341-
num_cores=16,
342-
batch_size=batch_size,
343-
enable_qnn=enable_qnn,
344-
qnn_config=qnn_config,
345-
)
346-
else:
347-
qeff_model.compile(
348-
ctx_len=ctx_len,
349-
num_cores=16,
350-
batch_size=batch_size,
351-
)
338+
qeff_model.compile(
339+
ctx_len=ctx_len,
340+
num_cores=16,
341+
batch_size=batch_size,
342+
enable_qnn=enable_qnn,
343+
qnn_config=qnn_config,
344+
)
352345

353346
exec_info = qeff_model.generate(
354347
inputs=processor(data, sampling_rate=sample_rate, return_tensors="pt"), generation_len=ctx_len

0 commit comments

Comments (0)