Skip to content

Commit 4ba2c26

Browse files
committed
QNN Compilation path Support in QEFFBaseModel class.
Signed-off-by: Shubham Agrawal <[email protected]>
1 parent fc89e8b commit 4ba2c26

12 files changed

+334
-449
lines changed

QEfficient/base/modeling_qeff.py

+38-17
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,12 @@ def compile(self, *args, **kwargs) -> Path:
9898
:num_cores (int): Number of cores to utilize in each device ``Defaults to 16``.
9999
:mxfp6_matmul (bool): Use MXFP6 to compress weights for MatMul nodes to run faster on device. ``Defaults to False``.
100100
:mxint8_kv_cache (bool): Use MXINT8 to compress KV-cache on device to access and update KV-cache faster. ``Defaults to False``.
101-
:compiler_options: Pass any compiler option as input. Any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below:
101+
:compiler_options: Pass any compiler option as input.
102+
The following flags can be passed in compiler_options to enable the QNN compilation path.
103+
:enable_qnn (bool): Enables QNN Compilation. ``Defaults to False if not passed.``
104+
:qnn_config (str): Path of QNN Config parameters file. ``Defaults to None if not passed.``
105+
Any other parameter passed will be ignored in the QNN compilation path, as we expect overriding or extra parameters for QNN to come via the config file.
106+
For the QAIC compilation path, any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below:
102107
- aic_num_cores=16 -> -aic-num-cores=16
103108
- convert_to_fp16=True -> -convert-to-fp16
104109
@@ -217,6 +222,7 @@ def _compile(
217222
onnx_path: Optional[str] = None,
218223
compile_dir: Optional[str] = None,
219224
*,
225+
mxint8_kv_cache: bool = False,
220226
specializations: Optional[List[Dict[str, int]]] = None,
221227
custom_io: Optional[Dict[str, str]] = None,
222228
mdp_ts_num_devices: int = 1,
@@ -229,14 +235,37 @@ def _compile(
229235
Args:
230236
:onnx_path (str): Onnx file to compile
231237
:compile_dir (str): Directory path to compile the qpc. A suffix is added to the directory path to avoid reusing same qpc for different parameters.
238+
:mxint8_kv_cache (bool, optional): Whether to use ``mxint8`` compression for KV cache. ``Defaults to False``.
232239
:specializations (list): List of specializations to compile for
233240
:custom_io (dict): Custom IO to specify the input and outputs in different formats than default
234241
:mdp_ts_num_devices (int): Number of devices to partition to use Multi-Device Partitioning with tensor-slicing.
235242
:num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model.
236-
:compiler_options: Pass any compiler option as input. Any flag that is supported by `qaic-exec` can be passed. Params are converted to flags as below:
243+
:compiler_options: Pass any compiler option as input.
244+
The following flags can be passed in compiler_options to enable the QNN compilation path.
245+
:enable_qnn (bool): Enables QNN Compilation. ``Defaults to False if not passed.``
246+
:qnn_config (str): Path of QNN Config parameters file. ``Defaults to None if not passed.``
247+
Any other parameter passed will be ignored in the QNN compilation path, as we expect overriding or extra parameters for QNN to come via the config file.
248+
For the QAIC compilation path, any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below:
237249
- aic_num_cores=16 -> -aic-num-cores=16
238250
- convert_to_fp16=True -> -convert-to-fp16
251+
239252
"""
253+
enable_qnn = compiler_options["enable_qnn"] if "enable_qnn" in compiler_options else False
254+
qnn_config = compiler_options["qnn_config"] if "qnn_config" in compiler_options else None
255+
256+
if enable_qnn:
257+
return self._qnn_compile(
258+
onnx_path,
259+
compile_dir,
260+
specializations=specializations,
261+
custom_io=custom_io,
262+
mdp_ts_num_devices=mdp_ts_num_devices,
263+
num_cores=compiler_options.get("aic_num_cores", 16),
264+
mxfp6_matmul=compiler_options.get("mxfp6_matmul", False),
265+
mxint8_kv_cache=mxint8_kv_cache,
266+
qnn_config=qnn_config,
267+
)
268+
240269
if onnx_path is None and self.onnx_path is None:
241270
self.export()
242271

@@ -346,35 +375,27 @@ def _qnn_compile(
346375
onnx_path: Optional[str] = None,
347376
compile_dir: Optional[str] = None,
348377
*,
378+
custom_io: Optional[Dict[str, str]] = None,
349379
specializations: Optional[List[Dict[str, int]]] = None,
350-
prefill_seq_len: int = 32,
351-
ctx_len: int = 128,
352-
batch_size: int = 1,
353-
full_batch_size: Optional[int] = None,
354380
mdp_ts_num_devices: int = 1,
355381
num_cores: int = 16,
356382
mxfp6_matmul: bool = False,
357383
mxint8_kv_cache: bool = False,
358384
qnn_config: Optional[str] = None,
359-
kv_cache_batch_size: Optional[int] = None,
360385
) -> str:
361386
"""
362387
Interface for QNN compiler
363388
364389
Args:
365390
:onnx_path (str): Onnx file to compile
366391
:compile_dir (str): Directory path to compile the qpc. A suffix is added to the directory path to avoid reusing same qpc for different parameters.
392+
:custom_io (dict): Custom IO to specify the input and outputs in different formats than default
367393
:specializations (list): List of specializations to compile for
368-
:prefill_seq_len (int, optional): The length of the Prefill prompt should be less that ``prefill_seq_len``. ``Defaults to 32``.
369-
:ctx_len (int, optional): Maximum ``ctx`` that the compiled model can remember. ``Defaults to 128``.
370-
:batch_size (int, optional): Batch size. ``Defaults to 1``.
371-
:full_batch_size (int, optional): Continuous batching batch size.
372394
:mdp_ts_num_devices (int): Number of devices to partition to use Multi-Device Partitioning with tensor-slicing.
373395
:num_cores (int): Number of cores used to compile the model.
374396
:mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to True``.
375397
:mxint8_kv_cache (bool, optional): Whether to use ``mxint8`` compression for KV cache. ``Defaults to False``.
376398
:qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.``
377-
:kv_cache_batch_size (int): kv_cache_batch_size for Prefix Caching. ``Defaults to None.``
378399
"""
379400
if onnx_path is None and self.onnx_path is None:
380401
self.export()
@@ -390,6 +411,9 @@ def _qnn_compile(
390411
if specializations is not None:
391412
compile_hash.update(to_hashable(specializations))
392413

414+
if custom_io is not None:
415+
compile_hash.update(to_hashable(custom_io))
416+
393417
if qnn_config is not None:
394418
qnn_config_values = load_json(qnn_config)
395419
compile_hash.update(to_hashable(qnn_config_values))
@@ -426,15 +450,12 @@ def _qnn_compile(
426450
qpc_base_path=compile_dir,
427451
num_cores=num_cores,
428452
device_group=list(range(mdp_ts_num_devices)),
429-
batch_size=batch_size,
430-
prompt_len=prefill_seq_len,
431-
ctx_len=ctx_len,
432453
mxfp6=mxfp6_matmul,
433454
mxint8=mxint8_kv_cache,
434-
full_batch_size=full_batch_size,
435455
qnn_config=qnn_config,
436456
qnn_binary_dir=qpc_path,
437-
kv_cache_batch_size=kv_cache_batch_size,
457+
specializations=specializations,
458+
custom_io=custom_io,
438459
)
439460

440461
self.qpc_path = qpc_path

QEfficient/compile/compile_helper.py

-4
Original file line numberDiff line numberDiff line change
@@ -185,16 +185,12 @@ def compile(
185185
onnx_path=onnx_path,
186186
qpc_base_path=qpc_path,
187187
num_cores=num_cores,
188-
batch_size=batch_size,
189-
prompt_len=prompt_len,
190-
ctx_len=ctx_len,
191188
mxfp6=mxfp6,
192189
mxint8=mxint8,
193190
allow_mxint8_mdp_io=allow_mxint8_mdp_io,
194191
aic_enable_depth_first=aic_enable_depth_first,
195192
mos=mos,
196193
device_group=device_group,
197-
full_batch_size=full_batch_size,
198194
qnn_config=qnn_config,
199195
)
200196
logger.info(f"QNN Compiled QPC files can be found here: {qpc_path}")

QEfficient/compile/qnn_compiler.py

+13-30
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,14 @@
77

88
import os
99
import shutil
10-
from typing import List, Optional
10+
from typing import Dict, List, Optional
1111

1212
from QEfficient.utils._utils import create_json, execute_command, load_json
1313
from QEfficient.utils.constants import QnnConstants
14-
from QEfficient.utils.generate_qnn_network_specialization_config import fetch_nodes_info, generate_data_format_config
14+
from QEfficient.utils.generate_qnn_network_specialization_config import (
15+
generate_data_format_config,
16+
generate_qnn_specialization,
17+
)
1518
from QEfficient.utils.logging_utils import logger
1619

1720

@@ -31,9 +34,6 @@ def __init__(
3134
device_group: Optional[List[int]] = None,
3235
compiler_enable_depth_first: bool = False,
3336
compiler_max_out_channel_split: int = -1,
34-
batch_size: int = 1,
35-
prompt_len: int = 32,
36-
ctx_len: int = 128,
3737
compiler_mxfp6_matmul_weights: bool = True,
3838
qnn_target: str = QnnConstants.TARGET,
3939
qnn_config_path: Optional[str] = None,
@@ -48,9 +48,6 @@ def __init__(
4848
self.device_group = device_group
4949
self.compiler_enable_depth_first = compiler_enable_depth_first
5050
self.compiler_max_out_channel_split = compiler_max_out_channel_split
51-
self.batch_size = batch_size
52-
self.prompt_len = prompt_len
53-
self.ctx_len = ctx_len
5451
self.compiler_mxfp6_matmul_weights = compiler_mxfp6_matmul_weights
5552
self.qnn_config_path = qnn_config_path
5653
self.qnn_binary_dir = qnn_binary_dir
@@ -327,16 +324,13 @@ def compile(
327324
device_group: Optional[List[int]] = None,
328325
aic_enable_depth_first: bool = False,
329326
mos: int = -1,
330-
batch_size: int = 1,
331-
prompt_len: int = 32,
332-
ctx_len: int = 128,
333327
mxfp6: bool = True,
334328
mxint8: bool = False,
335329
allow_mxint8_mdp_io: Optional[bool] = False,
336-
full_batch_size=None,
337330
qnn_config: Optional[str] = None,
338331
qnn_binary_dir: Optional[str] = None,
339-
kv_cache_batch_size: Optional[int] = None,
332+
custom_io: Optional[Dict[str, str]] = None,
333+
specializations: Optional[List[Dict[str, int]]] = None,
340334
**kwargs,
341335
) -> str:
342336
"""
@@ -352,16 +346,13 @@ def compile(
352346
:device_group (List[int]): Used for finding the number of devices to compile for.
353347
:aic_enable_depth_first (bool): Enables ``DFS`` with default memory size. ``Defaults to False.``
354348
:mos (int): Effort level to reduce the on-chip memory. ``Defaults to -1.``
355-
:batch_size (int): Batch size to compile the model for. ``Defaults to 1.``
356-
:full_batch_size (int): Set full batch size to enable continuous batching mode. ``Default to None``
357-
:prompt_len (int): Prompt length for the model to compile. ``Defaults to 32``
358-
:ctx_len (int): Maximum context length to compile the model. ``Defaults to 128``
359349
:mxfp6 (bool): Enable compilation for ``MXFP6`` precision. ``Defaults to True.``
360-
:allow_mxint8_mdp_io (bool): Allows MXINT8 compression of MDP IO traffic ``Defaults to False.``
361350
:mxint8 (bool): Compress Present/Past KV to ``MXINT8`` using ``CustomIO`` config. ``Defaults to False.``
351+
:allow_mxint8_mdp_io (bool): Allows MXINT8 compression of MDP IO traffic ``Defaults to False.``
362352
:qnn_config (str): Path to ``qnn_config.json`` file (formatted as a string). ``Defaults to None.``
363353
:qnn_binary_dir (str): Path for saving qnn binaries.
364-
:kv_cache_batch_size (int): kv_cache_batch_size for Prefix Caching. ``Defaults to None.``
354+
:custom_io (dict): Custom IO to specify the input and outputs in different formats than default
355+
:specializations (list): List of specializations to compile for
365356
366357
Returns:
367358
:str: Path to compiled ``qpc`` package.
@@ -377,16 +368,11 @@ def compile(
377368
# TODO To make custom_io_config.yaml configurable as not all models need it.
378369
custom_io_file_path = os.path.join(qpc_base_path, "custom_io_config.yaml")
379370

380-
kv_precision = "uint8" if mxint8 else "float16"
381-
fetch_nodes_info(
371+
generate_qnn_specialization(
382372
onnx_graph_path=onnx_path,
383-
batch_size=batch_size,
384-
sequence_length=prompt_len,
385-
context_length=ctx_len,
373+
specializations=specializations,
374+
custom_io=custom_io,
386375
file_path=custom_io_file_path,
387-
full_batch_size=full_batch_size,
388-
kv_precision=kv_precision,
389-
kv_cache_batch_size=kv_cache_batch_size,
390376
)
391377

392378
if not os.path.isfile(custom_io_file_path):
@@ -403,9 +389,6 @@ def compile(
403389
custom_io_path=custom_io_file_path,
404390
compiler_enable_depth_first=aic_enable_depth_first,
405391
compiler_max_out_channel_split=mos,
406-
batch_size=batch_size,
407-
prompt_len=prompt_len,
408-
ctx_len=ctx_len,
409392
compiler_mxfp6_matmul_weights=mxfp6,
410393
qnn_binary_dir=qnn_binary_dir,
411394
mxint8=mxint8,

QEfficient/peft/auto.py

+1
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,7 @@ def compile(
251251
custom_io=custom_io,
252252
mdp_ts_num_devices=num_devices,
253253
aic_num_cores=num_cores,
254+
mxint8_kv_cache=mxint8_kv_cache,
254255
**compiler_options,
255256
)
256257

0 commit comments

Comments
 (0)