@@ -598,21 +598,12 @@ def compile(
598
598
mxfp6_matmul : bool = False ,
599
599
mxint8_kv_cache : bool = False ,
600
600
num_speculative_tokens : Optional [int ] = None ,
601
- enable_qnn : bool = False ,
602
- qnn_config : Optional [str ] = None ,
603
601
** compiler_options ,
604
602
) -> str :
605
- if (
606
- any (
607
- param is not None
608
- for param in [full_batch_size , kv_cache_batch_size , num_speculative_tokens , qnn_config ]
609
- )
610
- or enable_qnn
611
- ):
603
+ if any (param is not None for param in [full_batch_size , kv_cache_batch_size , num_speculative_tokens ]):
612
604
raise ValueError (
613
- f"Expected 'full_batch_size', 'kv_cache_batch_size', 'num_speculative_tokens', and 'qnn_config' to be None, and 'enable_qnn' to be False but got: "
605
+ f"Expected 'full_batch_size', 'kv_cache_batch_size', 'num_speculative_tokens' to be None but got: "
614
606
f"full_batch_size={ full_batch_size } , kv_cache_batch_size={ kv_cache_batch_size } , num_speculative_tokens={ num_speculative_tokens } , "
615
- f"enable_qnn={ enable_qnn } , qnn_config={ qnn_config } "
616
607
)
617
608
618
609
output_names = self .model .get_output_names (kv_offload = True )
@@ -651,6 +642,7 @@ def compile(
651
642
mdp_ts_num_devices = num_devices ,
652
643
aic_num_cores = num_cores ,
653
644
custom_io = custom_io_vision ,
645
+ mxint8_kv_cache = mxint8_kv_cache ,
654
646
** compiler_options ,
655
647
)
656
648
@@ -675,6 +667,7 @@ def compile(
675
667
mdp_ts_num_devices = num_devices ,
676
668
aic_num_cores = num_cores ,
677
669
custom_io = custom_io_lang ,
670
+ mxint8_kv_cache = mxint8_kv_cache ,
678
671
** compiler_options ,
679
672
)
680
673
return self .qpc_path
@@ -915,21 +908,12 @@ def compile(
915
908
mxfp6_matmul : bool = False ,
916
909
mxint8_kv_cache : bool = False ,
917
910
num_speculative_tokens : Optional [int ] = None ,
918
- enable_qnn : bool = False ,
919
- qnn_config : Optional [str ] = None ,
920
911
** compiler_options ,
921
912
) -> str :
922
- if (
923
- any (
924
- param is not None
925
- for param in [full_batch_size , kv_cache_batch_size , num_speculative_tokens , qnn_config ]
926
- )
927
- or enable_qnn
928
- ):
913
+ if any (param is not None for param in [full_batch_size , kv_cache_batch_size , num_speculative_tokens ]):
929
914
raise ValueError (
930
- f"Expected 'full_batch_size', 'kv_cache_batch_size', 'num_speculative_tokens', and 'qnn_config' to be None, and 'enable_qnn' to be False but got: "
915
+ f"Expected 'full_batch_size', 'kv_cache_batch_size', 'num_speculative_tokens' to be None but got: "
931
916
f"full_batch_size={ full_batch_size } , kv_cache_batch_size={ kv_cache_batch_size } , num_speculative_tokens={ num_speculative_tokens } , "
932
- f"enable_qnn={ enable_qnn } , qnn_config={ qnn_config } "
933
917
)
934
918
935
919
output_names = self .model .get_output_names ()
@@ -967,6 +951,7 @@ def compile(
967
951
custom_io = custom_io ,
968
952
mdp_ts_num_devices = num_devices ,
969
953
aic_num_cores = num_cores ,
954
+ mxint8_kv_cache = mxint8_kv_cache ,
970
955
** compiler_options ,
971
956
)
972
957
return self .qpc_path
@@ -1476,8 +1461,6 @@ def compile(
1476
1461
mxfp6_matmul : bool = False ,
1477
1462
mxint8_kv_cache : bool = False ,
1478
1463
num_speculative_tokens : Optional [int ] = None ,
1479
- enable_qnn : bool = False ,
1480
- qnn_config : Optional [str ] = None ,
1481
1464
** compiler_options ,
1482
1465
) -> str :
1483
1466
"""
@@ -1499,8 +1482,6 @@ def compile(
1499
1482
:num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model.
1500
1483
:mos (int, optional): Effort level to reduce on-chip memory. Defaults to -1, meaning no effort. ``Defaults to -1``.
1501
1484
:aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``.
1502
- :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.``
1503
- :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.``
1504
1485
1505
1486
Returns:
1506
1487
:str: Path of the compiled ``qpc`` package.
@@ -1562,48 +1543,29 @@ def compile(
1562
1543
decode_specialization .update ({"num_logits_to_keep" : num_speculative_tokens + 1 }) if self .is_tlm else ...
1563
1544
specializations .append (decode_specialization )
1564
1545
1565
- if enable_qnn :
1566
- if compiler_options :
1567
- logger .warning ("Extra arguments to QNN compilation are supported via qnn_config.json only" )
1568
-
1569
- qpc_path = self ._qnn_compile (
1570
- onnx_path ,
1571
- compile_dir ,
1572
- specializations = specializations ,
1573
- prefill_seq_len = prefill_seq_len ,
1574
- ctx_len = ctx_len ,
1575
- batch_size = batch_size ,
1576
- full_batch_size = full_batch_size ,
1577
- mdp_ts_num_devices = num_devices ,
1578
- num_cores = num_cores ,
1579
- mxfp6_matmul = mxfp6_matmul ,
1580
- mxint8_kv_cache = mxint8_kv_cache ,
1581
- qnn_config = qnn_config ,
1582
- kv_cache_batch_size = kv_cache_batch_size ,
1583
- )
1584
- else :
1585
- # Custom IO
1586
- custom_io = {}
1587
- kv_cache_dtype = "mxint8" if mxint8_kv_cache else "float16"
1588
- for suffix in ["" , "_RetainedState" ]:
1589
- for i in range (self .num_layers ):
1590
- for kv in ["key" , "value" ]:
1591
- custom_io [f"past_{ kv } .{ i } { suffix } " ] = kv_cache_dtype
1592
-
1593
- qpc_path = self ._compile (
1594
- onnx_path ,
1595
- compile_dir ,
1596
- compile_only = True ,
1597
- retained_state = True ,
1598
- specializations = specializations ,
1599
- convert_to_fp16 = True ,
1600
- mxfp6_matmul = mxfp6_matmul ,
1601
- custom_io = custom_io ,
1602
- mdp_ts_num_devices = num_devices ,
1603
- num_speculative_tokens = num_speculative_tokens ,
1604
- aic_num_cores = num_cores ,
1605
- ** compiler_options ,
1606
- )
1546
+ # Custom IO
1547
+ custom_io = {}
1548
+ kv_cache_dtype = "mxint8" if mxint8_kv_cache else "float16"
1549
+ for suffix in ["" , "_RetainedState" ]:
1550
+ for i in range (self .num_layers ):
1551
+ for kv in ["key" , "value" ]:
1552
+ custom_io [f"past_{ kv } .{ i } { suffix } " ] = kv_cache_dtype
1553
+
1554
+ qpc_path = self ._compile (
1555
+ onnx_path ,
1556
+ compile_dir ,
1557
+ compile_only = True ,
1558
+ retained_state = True ,
1559
+ specializations = specializations ,
1560
+ convert_to_fp16 = True ,
1561
+ mxfp6_matmul = mxfp6_matmul ,
1562
+ custom_io = custom_io ,
1563
+ mdp_ts_num_devices = num_devices ,
1564
+ num_speculative_tokens = num_speculative_tokens ,
1565
+ aic_num_cores = num_cores ,
1566
+ mxint8_kv_cache = mxint8_kv_cache ,
1567
+ ** compiler_options ,
1568
+ )
1607
1569
return qpc_path
1608
1570
1609
1571
# FIXME: Update this method to match with transformers AutoModelForCausalLM.generate
@@ -1747,8 +1709,6 @@ def compile(
1747
1709
mxfp6_matmul : bool = False ,
1748
1710
mxint8_kv_cache : bool = False ,
1749
1711
num_speculative_tokens : Optional [int ] = None ,
1750
- enable_qnn : bool = False ,
1751
- qnn_config : Optional [str ] = None ,
1752
1712
** compiler_options ,
1753
1713
) -> str :
1754
1714
"""
@@ -1790,9 +1750,6 @@ def compile(
1790
1750
if num_speculative_tokens :
1791
1751
logger .warning ("Speculative decoding is not yet enabled for AutoModelForSpeechSeq2Seq" )
1792
1752
1793
- if enable_qnn or qnn_config :
1794
- logger .warning ("QNN compile is not yet enabled for AutoModelForSpeechSeq2Seq" )
1795
-
1796
1753
return self ._compile (
1797
1754
onnx_path ,
1798
1755
compile_dir ,
0 commit comments