@@ -598,21 +598,17 @@ def compile(
598
598
mxfp6_matmul : bool = False ,
599
599
mxint8_kv_cache : bool = False ,
600
600
num_speculative_tokens : Optional [int ] = None ,
601
- enable_qnn : bool = False ,
602
- qnn_config : Optional [str ] = None ,
603
601
** compiler_options ,
604
602
) -> str :
605
603
if (
606
604
any (
607
605
param is not None
608
- for param in [full_batch_size , kv_cache_batch_size , num_speculative_tokens , qnn_config ]
606
+ for param in [full_batch_size , kv_cache_batch_size , num_speculative_tokens ]
609
607
)
610
- or enable_qnn
611
608
):
612
609
raise ValueError (
613
- f"Expected 'full_batch_size', 'kv_cache_batch_size', 'num_speculative_tokens', and 'qnn_config' to be None, and 'enable_qnn' to be False but got: "
610
+ f"Expected 'full_batch_size', 'kv_cache_batch_size', 'num_speculative_tokens' to be None but got: "
614
611
f"full_batch_size={ full_batch_size } , kv_cache_batch_size={ kv_cache_batch_size } , num_speculative_tokens={ num_speculative_tokens } , "
615
- f"enable_qnn={ enable_qnn } , qnn_config={ qnn_config } "
616
612
)
617
613
618
614
output_names = self .model .get_output_names (kv_offload = True )
@@ -651,6 +647,7 @@ def compile(
651
647
mdp_ts_num_devices = num_devices ,
652
648
aic_num_cores = num_cores ,
653
649
custom_io = custom_io_vision ,
650
+ mxint8_kv_cache = mxint8_kv_cache ,
654
651
** compiler_options ,
655
652
)
656
653
@@ -675,6 +672,7 @@ def compile(
675
672
mdp_ts_num_devices = num_devices ,
676
673
aic_num_cores = num_cores ,
677
674
custom_io = custom_io_lang ,
675
+ mxint8_kv_cache = mxint8_kv_cache ,
678
676
** compiler_options ,
679
677
)
680
678
return self .qpc_path
@@ -915,21 +913,17 @@ def compile(
915
913
mxfp6_matmul : bool = False ,
916
914
mxint8_kv_cache : bool = False ,
917
915
num_speculative_tokens : Optional [int ] = None ,
918
- enable_qnn : bool = False ,
919
- qnn_config : Optional [str ] = None ,
920
916
** compiler_options ,
921
917
) -> str :
922
918
if (
923
919
any (
924
920
param is not None
925
- for param in [full_batch_size , kv_cache_batch_size , num_speculative_tokens , qnn_config ]
921
+ for param in [full_batch_size , kv_cache_batch_size , num_speculative_tokens ]
926
922
)
927
- or enable_qnn
928
923
):
929
924
raise ValueError (
930
- f"Expected 'full_batch_size', 'kv_cache_batch_size', 'num_speculative_tokens', and 'qnn_config' to be None, and 'enable_qnn' to be False but got: "
925
+ f"Expected 'full_batch_size', 'kv_cache_batch_size', 'num_speculative_tokens' to be None but got: "
931
926
f"full_batch_size={ full_batch_size } , kv_cache_batch_size={ kv_cache_batch_size } , num_speculative_tokens={ num_speculative_tokens } , "
932
- f"enable_qnn={ enable_qnn } , qnn_config={ qnn_config } "
933
927
)
934
928
935
929
output_names = self .model .get_output_names ()
@@ -967,6 +961,7 @@ def compile(
967
961
custom_io = custom_io ,
968
962
mdp_ts_num_devices = num_devices ,
969
963
aic_num_cores = num_cores ,
964
+ mxint8_kv_cache = mxint8_kv_cache ,
970
965
** compiler_options ,
971
966
)
972
967
return self .qpc_path
@@ -1476,8 +1471,6 @@ def compile(
1476
1471
mxfp6_matmul : bool = False ,
1477
1472
mxint8_kv_cache : bool = False ,
1478
1473
num_speculative_tokens : Optional [int ] = None ,
1479
- enable_qnn : bool = False ,
1480
- qnn_config : Optional [str ] = None ,
1481
1474
** compiler_options ,
1482
1475
) -> str :
1483
1476
"""
@@ -1499,8 +1492,6 @@ def compile(
1499
1492
:num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model.
1500
1493
:mos (int, optional): Effort level to reduce on-chip memory. Defaults to -1, meaning no effort. ``Defaults to -1``.
1501
1494
:aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``.
1502
- :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.``
1503
- :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.``
1504
1495
1505
1496
Returns:
1506
1497
:str: Path of the compiled ``qpc`` package.
@@ -1562,48 +1553,29 @@ def compile(
1562
1553
decode_specialization .update ({"num_logits_to_keep" : num_speculative_tokens + 1 }) if self .is_tlm else ...
1563
1554
specializations .append (decode_specialization )
1564
1555
1565
- if enable_qnn :
1566
- if compiler_options :
1567
- logger .warning ("Extra arguments to QNN compilation are supported via qnn_config.json only" )
1568
-
1569
- qpc_path = self ._qnn_compile (
1570
- onnx_path ,
1571
- compile_dir ,
1572
- specializations = specializations ,
1573
- prefill_seq_len = prefill_seq_len ,
1574
- ctx_len = ctx_len ,
1575
- batch_size = batch_size ,
1576
- full_batch_size = full_batch_size ,
1577
- mdp_ts_num_devices = num_devices ,
1578
- num_cores = num_cores ,
1579
- mxfp6_matmul = mxfp6_matmul ,
1580
- mxint8_kv_cache = mxint8_kv_cache ,
1581
- qnn_config = qnn_config ,
1582
- kv_cache_batch_size = kv_cache_batch_size ,
1583
- )
1584
- else :
1585
- # Custom IO
1586
- custom_io = {}
1587
- kv_cache_dtype = "mxint8" if mxint8_kv_cache else "float16"
1588
- for suffix in ["" , "_RetainedState" ]:
1589
- for i in range (self .num_layers ):
1590
- for kv in ["key" , "value" ]:
1591
- custom_io [f"past_{ kv } .{ i } { suffix } " ] = kv_cache_dtype
1592
-
1593
- qpc_path = self ._compile (
1594
- onnx_path ,
1595
- compile_dir ,
1596
- compile_only = True ,
1597
- retained_state = True ,
1598
- specializations = specializations ,
1599
- convert_to_fp16 = True ,
1600
- mxfp6_matmul = mxfp6_matmul ,
1601
- custom_io = custom_io ,
1602
- mdp_ts_num_devices = num_devices ,
1603
- num_speculative_tokens = num_speculative_tokens ,
1604
- aic_num_cores = num_cores ,
1605
- ** compiler_options ,
1606
- )
1556
+ # Custom IO
1557
+ custom_io = {}
1558
+ kv_cache_dtype = "mxint8" if mxint8_kv_cache else "float16"
1559
+ for suffix in ["" , "_RetainedState" ]:
1560
+ for i in range (self .num_layers ):
1561
+ for kv in ["key" , "value" ]:
1562
+ custom_io [f"past_{ kv } .{ i } { suffix } " ] = kv_cache_dtype
1563
+
1564
+ qpc_path = self ._compile (
1565
+ onnx_path ,
1566
+ compile_dir ,
1567
+ compile_only = True ,
1568
+ retained_state = True ,
1569
+ specializations = specializations ,
1570
+ convert_to_fp16 = True ,
1571
+ mxfp6_matmul = mxfp6_matmul ,
1572
+ custom_io = custom_io ,
1573
+ mdp_ts_num_devices = num_devices ,
1574
+ num_speculative_tokens = num_speculative_tokens ,
1575
+ aic_num_cores = num_cores ,
1576
+ mxint8_kv_cache = mxint8_kv_cache ,
1577
+ ** compiler_options ,
1578
+ )
1607
1579
return qpc_path
1608
1580
1609
1581
# FIXME: Update this method to match with transformers AutoModelForCausalLM.generate
@@ -1747,8 +1719,6 @@ def compile(
1747
1719
mxfp6_matmul : bool = False ,
1748
1720
mxint8_kv_cache : bool = False ,
1749
1721
num_speculative_tokens : Optional [int ] = None ,
1750
- enable_qnn : bool = False ,
1751
- qnn_config : Optional [str ] = None ,
1752
1722
** compiler_options ,
1753
1723
) -> str :
1754
1724
"""
@@ -1790,9 +1760,6 @@ def compile(
1790
1760
if num_speculative_tokens :
1791
1761
logger .warning ("Speculative decoding is not yet enabled for AutoModelForSpeechSeq2Seq" )
1792
1762
1793
- if enable_qnn or qnn_config :
1794
- logger .warning ("QNN compile is not yet enabled for AutoModelForSpeechSeq2Seq" )
1795
-
1796
1763
return self ._compile (
1797
1764
onnx_path ,
1798
1765
compile_dir ,
0 commit comments