Commit c3fe1bb

QNN Compilation path Support in QEFFBaseModel class.
Signed-off-by: Shubham Agrawal <[email protected]>

Added enable_qnn as arg in _compile.
Signed-off-by: Shubham Agrawal <[email protected]>

Reduced number of models tested for QNN, and skipped Whisper model.
Signed-off-by: Shubham Agrawal <[email protected]>
1 parent bdcd7e5 commit c3fe1bb

14 files changed: +333 −434 lines

QEfficient/base/modeling_qeff.py

+32 −16
@@ -98,7 +98,11 @@ def compile(self, *args, **kwargs) -> Path:
         :num_cores (int): Number of cores to utilize in each device ``Defaults to 16``.
         :mxfp6_matmul (bool): Use MXFP6 to compress weights for MatMul nodes to run faster on device. ``Defaults to False``.
         :mxint8_kv_cache (bool): Use MXINT8 to compress KV-cache on device to access and update KV-cache faster. ``Defaults to False``.
-        :compiler_options: Pass any compiler option as input. Any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below:
+        :compiler_options: Pass any compiler option as input.
+            The following flags can be passed in compiler_options to enable the QNN compilation path:
+                :enable_qnn (bool): Enables QNN compilation. ``Defaults to False if not passed.``
+                :qnn_config (str): Path of the QNN config parameters file. ``Defaults to None if not passed.``
+            For the QAIC compilation path, any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below:
             - aic_num_cores=16 -> -aic-num-cores=16
             - convert_to_fp16=True -> -convert-to-fp16
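For orientation, a hedged usage sketch of the flags documented above: enable_qnn and qnn_config ride through **compiler_options of the public compile() API. The model class, checkpoint, and config path below are illustrative assumptions, not fixed by this commit.

# Minimal sketch, assuming a QEFFAutoModelForCausalLM-style wrapper built on
# QEFFBaseModel; "gpt2" and "qnn_config.json" are placeholders.
from QEfficient import QEFFAutoModelForCausalLM

model = QEFFAutoModelForCausalLM.from_pretrained("gpt2")
qpc_path = model.compile(
    num_cores=16,
    mxfp6_matmul=False,
    enable_qnn=True,               # routes compilation to the QNN path
    qnn_config="qnn_config.json",  # optional QNN config parameters file
)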
@@ -217,10 +221,13 @@ def _compile(
         onnx_path: Optional[str] = None,
         compile_dir: Optional[str] = None,
         *,
+        mxint8_kv_cache: bool = False,
         specializations: Optional[List[Dict[str, int]]] = None,
         custom_io: Optional[Dict[str, str]] = None,
         mdp_ts_num_devices: int = 1,
         num_speculative_tokens: Optional[int] = None,
+        enable_qnn: Optional[bool] = False,
+        qnn_config: Optional[str] = None,
         **compiler_options,
     ) -> str:
         """
@@ -229,14 +236,31 @@ def _compile(
         Args:
             :onnx_path (str): ONNX file to compile
             :compile_dir (str): Directory path to compile the qpc. A suffix is added to the directory path to avoid reusing same qpc for different parameters.
+            :mxint8_kv_cache (bool, optional): Whether to use ``mxint8`` compression for KV cache. ``Defaults to False``.
             :specializations (list): List of specializations to compile for
             :custom_io (dict): Custom IO to specify the input and outputs in different formats than default
             :mdp_ts_num_devices (int): Number of devices to partition to use Multi-Device Partitioning with tensor-slicing.
             :num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model.
+            :enable_qnn (bool): Enables QNN compilation. ``Defaults to False``.
+            :qnn_config (str): Path of the QNN config parameters file. ``Defaults to None``.
             :compiler_options: Pass any compiler option as input. Any flag that is supported by `qaic-exec` can be passed. Params are converted to flags as below:
             - aic_num_cores=16 -> -aic-num-cores=16
             - convert_to_fp16=True -> -convert-to-fp16
+
         """
+        if enable_qnn:
+            return self._qnn_compile(
+                onnx_path,
+                compile_dir,
+                specializations=specializations,
+                custom_io=custom_io,
+                mdp_ts_num_devices=mdp_ts_num_devices,
+                num_cores=compiler_options.get("aic_num_cores", 16),
+                mxfp6_matmul=compiler_options.get("mxfp6_matmul", False),
+                mxint8_kv_cache=mxint8_kv_cache,
+                qnn_config=qnn_config,
+            )
+
         if onnx_path is None and self.onnx_path is None:
             self.export()
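The params-to-flags conversion restated in this docstring is mechanical; here is a stand-alone sketch of the rule (illustrative, not the library's actual converter).

# Snake-case kwargs become hyphenated qaic-exec flags; booleans toggle the
# bare flag. This mirrors only the documented examples above.
def to_flags(**compiler_options):
    flags = []
    for name, value in compiler_options.items():
        flag = "-" + name.replace("_", "-")
        if value is True:
            flags.append(flag)               # convert_to_fp16=True -> -convert-to-fp16
        elif value is not False:
            flags.append(f"{flag}={value}")  # aic_num_cores=16 -> -aic-num-cores=16
    return flags

print(to_flags(aic_num_cores=16, convert_to_fp16=True))
# ['-aic-num-cores=16', '-convert-to-fp16']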

@@ -346,35 +370,27 @@ def _qnn_compile(
         onnx_path: Optional[str] = None,
         compile_dir: Optional[str] = None,
         *,
+        custom_io: Optional[Dict[str, str]] = None,
         specializations: Optional[List[Dict[str, int]]] = None,
-        prefill_seq_len: int = 32,
-        ctx_len: int = 128,
-        batch_size: int = 1,
-        full_batch_size: Optional[int] = None,
         mdp_ts_num_devices: int = 1,
         num_cores: int = 16,
         mxfp6_matmul: bool = False,
         mxint8_kv_cache: bool = False,
         qnn_config: Optional[str] = None,
-        kv_cache_batch_size: Optional[int] = None,
     ) -> str:
         """
         Interface for QNN compiler

         Args:
             :onnx_path (str): ONNX file to compile
             :compile_dir (str): Directory path to compile the qpc. A suffix is added to the directory path to avoid reusing same qpc for different parameters.
+            :custom_io (dict): Custom IO to specify the input and outputs in different formats than default
             :specializations (list): List of specializations to compile for
-            :prefill_seq_len (int, optional): The length of the Prefill prompt should be less than ``prefill_seq_len``. ``Defaults to 32``.
-            :ctx_len (int, optional): Maximum ``ctx`` that the compiled model can remember. ``Defaults to 128``.
-            :batch_size (int, optional): Batch size. ``Defaults to 1``.
-            :full_batch_size (int, optional): Continuous batching batch size.
             :mdp_ts_num_devices (int): Number of devices to partition to use Multi-Device Partitioning with tensor-slicing.
             :num_cores (int): Number of cores used to compile the model.
             :mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to True``.
             :mxint8_kv_cache (bool, optional): Whether to use ``mxint8`` compression for KV cache. ``Defaults to False``.
             :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.``
-            :kv_cache_batch_size (int): kv_cache_batch_size for Prefix Caching. ``Defaults to None.``
         """
         if onnx_path is None and self.onnx_path is None:
             self.export()
@@ -390,6 +406,9 @@ def _qnn_compile(
         if specializations is not None:
             compile_hash.update(to_hashable(specializations))

+        if custom_io is not None:
+            compile_hash.update(to_hashable(custom_io))
+
         if qnn_config is not None:
             qnn_config_values = load_json(qnn_config)
             compile_hash.update(to_hashable(qnn_config_values))
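Folding custom_io into the compile hash matters because the hash suffixes the qpc directory: two compiles that differ only in custom IO must not reuse the same binaries. A self-contained sketch of the idea, with to_hashable stubbed as stable JSON bytes (an assumption about its behavior) and sha256 standing in for the actual hash object:

import hashlib
import json

def to_hashable(obj):
    # Stand-in for QEfficient's to_hashable: stable, order-independent bytes.
    return json.dumps(obj, sort_keys=True).encode()

def qpc_suffix(specializations, custom_io):
    h = hashlib.sha256(to_hashable(specializations))
    if custom_io is not None:
        h.update(to_hashable(custom_io))
    return h.hexdigest()[:16]

specs = [{"batch_size": 1, "seq_len": 32, "ctx_len": 128}]
print(qpc_suffix(specs, {"past_key.0": "mxint8"}) != qpc_suffix(specs, None))  # True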
@@ -426,15 +445,12 @@ def _qnn_compile(
             qpc_base_path=compile_dir,
             num_cores=num_cores,
             device_group=list(range(mdp_ts_num_devices)),
-            batch_size=batch_size,
-            prompt_len=prefill_seq_len,
-            ctx_len=ctx_len,
             mxfp6=mxfp6_matmul,
             mxint8=mxint8_kv_cache,
-            full_batch_size=full_batch_size,
             qnn_config=qnn_config,
             qnn_binary_dir=qpc_path,
-            kv_cache_batch_size=kv_cache_batch_size,
+            specializations=specializations,
+            custom_io=custom_io,
         )

         self.qpc_path = qpc_path

QEfficient/compile/compile_helper.py

+14 −15
@@ -13,6 +13,7 @@
 from typing import List, Optional, Tuple

 from QEfficient.compile.qnn_compiler import compile as qnn_compile
+from QEfficient.utils._utils import load_json, load_yaml
 from QEfficient.utils.logging_utils import logger
@@ -180,36 +181,34 @@ def compile(
             full_batch_size=full_batch_size,
         )

+    # Select the customIO config based on the mx flag.
+    custom_io_file_name = "custom_io_int8.yaml" if mxint8 else "custom_io_fp16.yaml"
+
+    if custom_io_file_path is None:
+        custom_io_file_path = os.path.join(os.path.dirname(onnx_path), custom_io_file_name)
+
+    if not os.path.isfile(custom_io_file_path):
+        raise FileNotFoundError(
+            f"Custom IO file {custom_io_file_name} is not present at the expected path {custom_io_file_path}. Please pass the correct file path or rerun infer/export API"
+        )
+
     if enable_qnn:
         qpc_path = qnn_compile(
             onnx_path=onnx_path,
             qpc_base_path=qpc_path,
             num_cores=num_cores,
-            batch_size=batch_size,
-            prompt_len=prompt_len,
-            ctx_len=ctx_len,
             mxfp6=mxfp6,
             mxint8=mxint8,
             allow_mxint8_mdp_io=allow_mxint8_mdp_io,
             aic_enable_depth_first=aic_enable_depth_first,
             mos=mos,
             device_group=device_group,
-            full_batch_size=full_batch_size,
             qnn_config=qnn_config,
+            specializations=(load_json(specialization_json_path))["specializations"],
+            custom_io=load_yaml(custom_io_file_path),
         )
         logger.info(f"QNN Compiled QPC files can be found here: {qpc_path}")
     else:
-        # Select the customIO config based on the mx flag.
-        custom_io_file_name = "custom_io_int8.yaml" if mxint8 else "custom_io_fp16.yaml"
-
-        if custom_io_file_path is None:
-            custom_io_file_path = os.path.join(os.path.dirname(onnx_path), custom_io_file_name)
-
-        if not os.path.isfile(custom_io_file_path):
-            raise FileNotFoundError(
-                f"Custom IO file {custom_io_file_name} is not present at the expected path {custom_io_file_path}. Please pass the correct file path or rerun infer/export API"
-            )
-
         _, qpc_path = compile_kv_model_on_cloud_ai_100(
             onnx_path=onnx_path,
             specializations_json=specialization_json_path,
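The QNN branch now derives its inputs from artifacts on disk: the specialization list from the specialization JSON (under its "specializations" key, as the call above shows) and the custom IO mapping from the selected YAML. A stand-alone sketch of that wiring; load_qnn_inputs is a hypothetical helper, and stdlib json stands in for load_json:

import json
import os

def load_qnn_inputs(specialization_json_path: str, onnx_path: str, mxint8: bool):
    # Hypothetical helper illustrating the wiring above, not library code.
    with open(specialization_json_path) as f:
        specializations = json.load(f)["specializations"]
    custom_io_file = os.path.join(
        os.path.dirname(onnx_path),
        "custom_io_int8.yaml" if mxint8 else "custom_io_fp16.yaml",
    )
    if not os.path.isfile(custom_io_file):
        raise FileNotFoundError(f"Custom IO file not found: {custom_io_file}")
    return specializations, custom_io_file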

QEfficient/compile/qnn_compiler.py

+19 −31
@@ -7,11 +7,14 @@

 import os
 import shutil
-from typing import List, Optional
+from typing import Dict, List, Optional

 from QEfficient.utils._utils import create_json, execute_command, load_json
 from QEfficient.utils.constants import QnnConstants
-from QEfficient.utils.generate_qnn_network_specialization_config import fetch_nodes_info, generate_data_format_config
+from QEfficient.utils.generate_qnn_network_specialization_config import (
+    generate_data_format_config,
+    generate_qnn_specialization,
+)
 from QEfficient.utils.logging_utils import logger
@@ -31,15 +34,13 @@ def __init__(
         device_group: Optional[List[int]] = None,
         compiler_enable_depth_first: bool = False,
         compiler_max_out_channel_split: int = -1,
-        batch_size: int = 1,
-        prompt_len: int = 32,
-        ctx_len: int = 128,
         compiler_mxfp6_matmul_weights: bool = True,
         qnn_target: str = QnnConstants.TARGET,
         qnn_config_path: Optional[str] = None,
         qnn_binary_dir: Optional[str] = None,
         mxint8: Optional[bool] = False,
         compiler_mxint8_mdp_io: Optional[bool] = False,
+        decode_only: Optional[bool] = False,
         **kwargs,
     ) -> None:
         self.onnx_path = onnx_path
@@ -48,9 +49,6 @@ def __init__(
         self.device_group = device_group
         self.compiler_enable_depth_first = compiler_enable_depth_first
         self.compiler_max_out_channel_split = compiler_max_out_channel_split
-        self.batch_size = batch_size
-        self.prompt_len = prompt_len
-        self.ctx_len = ctx_len
         self.compiler_mxfp6_matmul_weights = compiler_mxfp6_matmul_weights
         self.qnn_config_path = qnn_config_path
         self.qnn_binary_dir = qnn_binary_dir
@@ -59,6 +57,7 @@ def __init__(
         self.custom_io_path = custom_io_path
         self.dlc_model_path = os.path.join(qpc_base_path, f"{QnnConstants.MODEL_NAME}.dlc")
         self.qnn_target = qnn_target
+        self.decode_only = decode_only
         self.qnn_sdk_path = os.getenv(QnnConstants.QNN_SDK_PATH_ENV_VAR_NAME)
         if not self.qnn_sdk_path:
             raise EnvironmentError(
@@ -141,7 +140,7 @@ def create_qnn_compile_backend_json(self) -> str:
             "compiler_hardware_version": QnnConstants.COMPILER_HARDWARE_VERSION,
             "compiler_convert_to_FP16": QnnConstants.COMPILER_CONVERT_TO_FP16,
             "compiler_retained_state": QnnConstants.COMPILER_RETAINED_STATE,
-            "graph_names": QnnConstants.GRAPH_NAMES,
+            "graph_names": QnnConstants.GRAPH_NAMES_DECODE_ONLY if self.decode_only else QnnConstants.GRAPH_NAMES,
             "compiler_enable_depth_first": self.compiler_enable_depth_first,
             "compiler_mxfp6_matmul_weights": self.compiler_mxfp6_matmul_weights,
             "compiler_num_of_cores": self.num_cores,
@@ -327,16 +326,13 @@ def compile(
     device_group: Optional[List[int]] = None,
     aic_enable_depth_first: bool = False,
     mos: int = -1,
-    batch_size: int = 1,
-    prompt_len: int = 32,
-    ctx_len: int = 128,
     mxfp6: bool = True,
     mxint8: bool = False,
     allow_mxint8_mdp_io: Optional[bool] = False,
-    full_batch_size=None,
     qnn_config: Optional[str] = None,
    qnn_binary_dir: Optional[str] = None,
-    kv_cache_batch_size: Optional[int] = None,
+    custom_io: Optional[Dict[str, str]] = None,
+    specializations: Optional[List[Dict[str, int]]] = None,
     **kwargs,
 ) -> str:
     """
@@ -352,16 +348,13 @@ def compile(
         :device_group (List[int]): Used for finding the number of devices to compile for.
         :aic_enable_depth_first (bool): Enables ``DFS`` with default memory size. ``Defaults to False.``
         :mos (int): Effort level to reduce the on-chip memory. ``Defaults to -1.``
-        :batch_size (int): Batch size to compile the model for. ``Defaults to 1.``
-        :full_batch_size (int): Set full batch size to enable continuous batching mode. ``Defaults to None.``
-        :prompt_len (int): Prompt length for the model to compile. ``Defaults to 32``
-        :ctx_len (int): Maximum context length to compile the model. ``Defaults to 128``
         :mxfp6 (bool): Enable compilation for ``MXFP6`` precision. ``Defaults to True.``
-        :allow_mxint8_mdp_io (bool): Allows MXINT8 compression of MDP IO traffic ``Defaults to False.``
         :mxint8 (bool): Compress Present/Past KV to ``MXINT8`` using ``CustomIO`` config. ``Defaults to False.``
+        :allow_mxint8_mdp_io (bool): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.``
         :qnn_config (str): Path to ``qnn_config.json`` file (formatted as a string). ``Defaults to None.``
         :qnn_binary_dir (str): Path for saving qnn binaries.
-        :kv_cache_batch_size (int): kv_cache_batch_size for Prefix Caching. ``Defaults to None.``
+        :custom_io (dict): Custom IO to specify the input and outputs in different formats than default
+        :specializations (list): List of specializations to compile for

     Returns:
         :str: Path to compiled ``qpc`` package.
@@ -377,23 +370,20 @@ def compile(
     # TODO To make custom_io_config.yaml configurable as not all models need it.
     custom_io_file_path = os.path.join(qpc_base_path, "custom_io_config.yaml")

-    kv_precision = "uint8" if mxint8 else "float16"
-    fetch_nodes_info(
+    generate_qnn_specialization(
         onnx_graph_path=onnx_path,
-        batch_size=batch_size,
-        sequence_length=prompt_len,
-        context_length=ctx_len,
+        specializations=specializations,
+        custom_io=custom_io,
         file_path=custom_io_file_path,
-        full_batch_size=full_batch_size,
-        kv_precision=kv_precision,
-        kv_cache_batch_size=kv_cache_batch_size,
     )

     if not os.path.isfile(custom_io_file_path):
         raise FileNotFoundError(
             f"file {custom_io_file_path} needs to exist in the qpc_base_path for Compilation. Please rerun infer/compile Api"
         )

+    decode_only = True if len(specializations) == 1 else False
+
     qnn_obj = QNN(
         onnx_path=onnx_path,
         qpc_base_path=qpc_base_path,
@@ -403,13 +393,11 @@ def compile(
         custom_io_path=custom_io_file_path,
         compiler_enable_depth_first=aic_enable_depth_first,
         compiler_max_out_channel_split=mos,
-        batch_size=batch_size,
-        prompt_len=prompt_len,
-        ctx_len=ctx_len,
         compiler_mxfp6_matmul_weights=mxfp6,
         qnn_binary_dir=qnn_binary_dir,
         mxint8=mxint8,
         compiler_mxint8_mdp_io=allow_mxint8_mdp_io,
+        decode_only=decode_only,
     )

     compiled_binary_path = qnn_obj.compile()
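Putting the new interface together, a hedged call sketch for the updated qnn_compiler.compile: the keyword names mirror the signature above, while the paths, specialization values, and custom IO mapping are illustrative assumptions.

from QEfficient.compile.qnn_compiler import compile as qnn_compile

qpc_path = qnn_compile(
    onnx_path="model.onnx",  # placeholder path
    qpc_base_path="qpc",     # placeholder path
    num_cores=16,
    mxfp6=False,
    mxint8=False,
    specializations=[
        {"batch_size": 1, "seq_len": 32, "ctx_len": 128},  # prefill graph
        {"batch_size": 1, "seq_len": 1, "ctx_len": 128},   # decode graph
    ],
    custom_io={"past_key.0": "float16"},  # assumed IO-name/precision mapping
)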

QEfficient/peft/auto.py

+1
@@ -251,6 +251,7 @@ def compile(
             custom_io=custom_io,
             mdp_ts_num_devices=num_devices,
             aic_num_cores=num_cores,
+            mxint8_kv_cache=mxint8_kv_cache,
             **compiler_options,
         )
256257
