@@ -98,7 +98,12 @@ def compile(self, *args, **kwargs) -> Path:
98
98
:num_cores (int): Number of cores to utilize in each device ``Defaults to 16``.
99
99
:mxfp6_matmul (bool): Use MXFP6 to compress weights for MatMul nodes to run faster on device. ``Defaults to False``.
100
100
:mxint8_kv_cache (bool): Use MXINT8 to compress KV-cache on device to access and update KV-cache faster. ``Defaults to False``.
101
- :compiler_options: Pass any compiler option as input. Any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below:
101
+ :compiler_options: Pass any compiler option as input.
102
+ The following flags can be passed in compiler_options to enable the QNN compilation path.
103
+ :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False`` if not passed.
104
+ :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None`` if not passed.
105
+ Any other parameter passed will be ignored in the QNN compilation path, as overriding or extra parameters for QNN are expected via the config file.
106
+ For the QAIC compilation path, any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below:
102
107
- aic_num_cores=16 -> -aic-num-cores=16
103
108
- convert_to_fp16=True -> -convert-to-fp16
104
109
@@ -217,6 +222,7 @@ def _compile(
217
222
onnx_path : Optional [str ] = None ,
218
223
compile_dir : Optional [str ] = None ,
219
224
* ,
225
+ mxint8_kv_cache : bool = False ,
220
226
specializations : Optional [List [Dict [str , int ]]] = None ,
221
227
custom_io : Optional [Dict [str , str ]] = None ,
222
228
mdp_ts_num_devices : int = 1 ,
@@ -229,14 +235,37 @@ def _compile(
229
235
Args:
230
236
:onnx_path (str): Onnx file to compile
231
237
:compile_dir (str): Directory path to compile the qpc. A suffix is added to the directory path to avoid reusing same qpc for different parameters.
238
+ :mxint8_kv_cache (bool, optional): Whether to use ``mxint8`` compression for KV cache. ``Defaults to False``.
232
239
:specializations (list): List of specializations to compile for
233
240
:custom_io (dict): Custom IO to specify the input and outputs in different formats than default
234
241
:mdp_ts_num_devices (int): Number of devices to partition to use Multi-Device Partitioning with tensor-slicing.
235
242
:num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model.
236
- :compiler_options: Pass any compiler option as input. Any flag that is supported by `qaic-exec` can be passed. Params are converted to flags as below:
243
+ :compiler_options: Pass any compiler option as input.
244
+ The following flags can be passed in compiler_options to enable the QNN compilation path.
245
+ :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False`` if not passed.
246
+ :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None`` if not passed.
247
+ Apart from ``aic_num_cores`` and ``mxfp6_matmul``, any other parameter passed will be ignored in the QNN compilation path, as overriding or extra parameters for QNN are expected via the config file.
248
+ For the QAIC compilation path, any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below:
237
249
- aic_num_cores=16 -> -aic-num-cores=16
238
250
- convert_to_fp16=True -> -convert-to-fp16
251
+
239
252
"""
253
+ enable_qnn = compiler_options ["enable_qnn" ] if "enable_qnn" in compiler_options else False
254
+ qnn_config = compiler_options ["qnn_config" ] if "qnn_config" in compiler_options else None
255
+
256
+ if enable_qnn :
257
+ return self ._qnn_compile (
258
+ onnx_path ,
259
+ compile_dir ,
260
+ specializations = specializations ,
261
+ custom_io = custom_io ,
262
+ mdp_ts_num_devices = mdp_ts_num_devices ,
263
+ num_cores = compiler_options .get ("aic_num_cores" , 16 ),
264
+ mxfp6_matmul = compiler_options .get ("mxfp6_matmul" , False ),
265
+ mxint8_kv_cache = mxint8_kv_cache ,
266
+ qnn_config = qnn_config ,
267
+ )
268
+
240
269
if onnx_path is None and self .onnx_path is None :
241
270
self .export ()
242
271
@@ -346,35 +375,27 @@ def _qnn_compile(
346
375
onnx_path : Optional [str ] = None ,
347
376
compile_dir : Optional [str ] = None ,
348
377
* ,
378
+ custom_io : Optional [Dict [str , str ]] = None ,
349
379
specializations : Optional [List [Dict [str , int ]]] = None ,
350
- prefill_seq_len : int = 32 ,
351
- ctx_len : int = 128 ,
352
- batch_size : int = 1 ,
353
- full_batch_size : Optional [int ] = None ,
354
380
mdp_ts_num_devices : int = 1 ,
355
381
num_cores : int = 16 ,
356
382
mxfp6_matmul : bool = False ,
357
383
mxint8_kv_cache : bool = False ,
358
384
qnn_config : Optional [str ] = None ,
359
- kv_cache_batch_size : Optional [int ] = None ,
360
385
) -> str :
361
386
"""
362
387
Interface for QNN compiler
363
388
364
389
Args:
365
390
:onnx_path (str): Onnx file to compile
366
391
:compile_dir (str): Directory path to compile the qpc. A suffix is added to the directory path to avoid reusing same qpc for different parameters.
392
+ :custom_io (dict): Custom IO to specify the input and outputs in different formats than default
367
393
:specializations (list): List of specializations to compile for
368
- :prefill_seq_len (int, optional): The length of the Prefill prompt should be less that ``prefill_seq_len``. ``Defaults to 32``.
369
- :ctx_len (int, optional): Maximum ``ctx`` that the compiled model can remember. ``Defaults to 128``.
370
- :batch_size (int, optional): Batch size. ``Defaults to 1``.
371
- :full_batch_size (int, optional): Continuous batching batch size.
372
394
:mdp_ts_num_devices (int): Number of devices to partition to use Multi-Device Partitioning with tensor-slicing.
373
395
:num_cores (int): Number of cores used to compile the model.
374
396
:mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to False``.
375
397
:mxint8_kv_cache (bool, optional): Whether to use ``mxint8`` compression for KV cache. ``Defaults to False``.
376
398
:qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.``
377
- :kv_cache_batch_size (int): kv_cache_batch_size for Prefix Caching. ``Defaults to None.``
378
399
"""
379
400
if onnx_path is None and self .onnx_path is None :
380
401
self .export ()
@@ -390,6 +411,9 @@ def _qnn_compile(
390
411
if specializations is not None :
391
412
compile_hash .update (to_hashable (specializations ))
392
413
414
+ if custom_io is not None :
415
+ compile_hash .update (to_hashable (custom_io ))
416
+
393
417
if qnn_config is not None :
394
418
qnn_config_values = load_json (qnn_config )
395
419
compile_hash .update (to_hashable (qnn_config_values ))
@@ -426,15 +450,12 @@ def _qnn_compile(
426
450
qpc_base_path = compile_dir ,
427
451
num_cores = num_cores ,
428
452
device_group = list (range (mdp_ts_num_devices )),
429
- batch_size = batch_size ,
430
- prompt_len = prefill_seq_len ,
431
- ctx_len = ctx_len ,
432
453
mxfp6 = mxfp6_matmul ,
433
454
mxint8 = mxint8_kv_cache ,
434
- full_batch_size = full_batch_size ,
435
455
qnn_config = qnn_config ,
436
456
qnn_binary_dir = qpc_path ,
437
- kv_cache_batch_size = kv_cache_batch_size ,
457
+ specializations = specializations ,
458
+ custom_io = custom_io ,
438
459
)
439
460
440
461
self .qpc_path = qpc_path
0 commit comments