-
Notifications
You must be signed in to change notification settings - Fork 45
QNN Compilation path Support in QEFFBaseModel class. #335
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -98,7 +98,11 @@ def compile(self, *args, **kwargs) -> Path: | |
:num_cores (int): Number of cores to utilize in each device ``Defaults to 16``. | ||
:mxfp6_matmul (bool): Use MXFP6 to compress weights for MatMul nodes to run faster on device. ``Defaults to False``. | ||
:mxint8_kv_cache (bool): Use MXINT8 to compress KV-cache on device to access and update KV-cache faster. ``Defaults to False``. | ||
:compiler_options: Pass any compiler option as input. Any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below: | ||
:compiler_options: Pass any compiler option as input. | ||
Following flag can be passed in compiler_options to enable QNN Compilation path. | ||
:enable_qnn (bool): Enables QNN Compilation. ``Defaults to False. if not passed.`` | ||
:qnn_config (str): Path of QNN Config parameters file. ``Defaults to None. if not passed`` | ||
for QAIC compilation path, any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below: | ||
- aic_num_cores=16 -> -aic-num-cores=16 | ||
- convert_to_fp16=True -> -convert-to-fp16 | ||
|
||
|
@@ -217,10 +221,13 @@ def _compile( | |
onnx_path: Optional[str] = None, | ||
compile_dir: Optional[str] = None, | ||
*, | ||
mxint8_kv_cache: bool = False, | ||
specializations: Optional[List[Dict[str, int]]] = None, | ||
custom_io: Optional[Dict[str, str]] = None, | ||
mdp_ts_num_devices: int = 1, | ||
num_speculative_tokens: Optional[int] = None, | ||
enable_qnn: Optional[bool] = False, | ||
qnn_config: Optional[str] = None, | ||
**compiler_options, | ||
) -> str: | ||
""" | ||
|
@@ -229,14 +236,31 @@ def _compile( | |
Args: | ||
:onnx_path (str): Onnx file to compile | ||
:compile_dir (str): Directory path to compile the qpc. A suffix is added to the directory path to avoid reusing same qpc for different parameters. | ||
:mxint8_kv_cache (bool, optional): Whether to use ``mxint8`` compression for KV cache. ``Defaults to False``. | ||
:specializations (list): List of specializations to compile for | ||
:custom_io (dict): Custom IO to specify the input and outputs in different formats than default | ||
:mdp_ts_num_devices (int): Number of devices to partition to use Multi-Device Partitioning with tensor-slicing. | ||
:num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model. | ||
:enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.`` | ||
:qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.`` | ||
:compiler_options: Pass any compiler option as input. Any flag that is supported by `qaic-exec` can be passed. Params are converted to flags as below: | ||
- aic_num_cores=16 -> -aic-num-cores=16 | ||
- convert_to_fp16=True -> -convert-to-fp16 | ||
|
||
""" | ||
if enable_qnn: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I suggest to have two different python file There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agree. Lets take it as a separate change post merging this PR. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @quic-amitraj |
||
return self._qnn_compile( | ||
onnx_path, | ||
compile_dir, | ||
specializations=specializations, | ||
custom_io=custom_io, | ||
mdp_ts_num_devices=mdp_ts_num_devices, | ||
num_cores=compiler_options.get("aic_num_cores", 16), | ||
mxfp6_matmul=compiler_options.get("mxfp6_matmul", False), | ||
mxint8_kv_cache=mxint8_kv_cache, | ||
qnn_config=qnn_config, | ||
) | ||
|
||
if onnx_path is None and self.onnx_path is None: | ||
self.export() | ||
|
||
|
@@ -346,35 +370,27 @@ def _qnn_compile( | |
onnx_path: Optional[str] = None, | ||
compile_dir: Optional[str] = None, | ||
*, | ||
custom_io: Optional[Dict[str, str]] = None, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. After addressing then above comment there will be no need of this method to be here. All the preprocessing to compilation should be done inside the their specific file. This way we will have clean code and easier to understand. |
||
specializations: Optional[List[Dict[str, int]]] = None, | ||
prefill_seq_len: int = 32, | ||
ctx_len: int = 128, | ||
batch_size: int = 1, | ||
full_batch_size: Optional[int] = None, | ||
mdp_ts_num_devices: int = 1, | ||
num_cores: int = 16, | ||
mxfp6_matmul: bool = False, | ||
mxint8_kv_cache: bool = False, | ||
qnn_config: Optional[str] = None, | ||
kv_cache_batch_size: Optional[int] = None, | ||
) -> str: | ||
""" | ||
Interface for QNN compiler | ||
|
||
Args: | ||
:onnx_path (str): Onnx file to compile | ||
:compile_dir (str): Directory path to compile the qpc. A suffix is added to the directory path to avoid reusing same qpc for different parameters. | ||
:custom_io (dict): Custom IO to specify the input and outputs in different formats than default | ||
:specializations (list): List of specializations to compile for | ||
:prefill_seq_len (int, optional): The length of the Prefill prompt should be less that ``prefill_seq_len``. ``Defaults to 32``. | ||
:ctx_len (int, optional): Maximum ``ctx`` that the compiled model can remember. ``Defaults to 128``. | ||
:batch_size (int, optional): Batch size. ``Defaults to 1``. | ||
:full_batch_size (int, optional): Continuous batching batch size. | ||
:mdp_ts_num_devices (int): Number of devices to partition to use Multi-Device Partitioning with tensor-slicing. | ||
:num_cores (int): Number of cores used to compile the model. | ||
:mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to True``. | ||
:mxint8_kv_cache (bool, optional): Whether to use ``mxint8`` compression for KV cache. ``Defaults to False``. | ||
:qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.`` | ||
:kv_cache_batch_size (int): kv_cache_batch_size for Prefix Caching. ``Defaults to None.`` | ||
""" | ||
if onnx_path is None and self.onnx_path is None: | ||
self.export() | ||
|
@@ -390,6 +406,9 @@ def _qnn_compile( | |
if specializations is not None: | ||
compile_hash.update(to_hashable(specializations)) | ||
|
||
if custom_io is not None: | ||
compile_hash.update(to_hashable(custom_io)) | ||
|
||
if qnn_config is not None: | ||
qnn_config_values = load_json(qnn_config) | ||
compile_hash.update(to_hashable(qnn_config_values)) | ||
|
@@ -426,15 +445,12 @@ def _qnn_compile( | |
qpc_base_path=compile_dir, | ||
num_cores=num_cores, | ||
device_group=list(range(mdp_ts_num_devices)), | ||
batch_size=batch_size, | ||
prompt_len=prefill_seq_len, | ||
ctx_len=ctx_len, | ||
mxfp6=mxfp6_matmul, | ||
mxint8=mxint8_kv_cache, | ||
full_batch_size=full_batch_size, | ||
qnn_config=qnn_config, | ||
qnn_binary_dir=qpc_path, | ||
kv_cache_batch_size=kv_cache_batch_size, | ||
specializations=specializations, | ||
custom_io=custom_io, | ||
) | ||
|
||
self.qpc_path = qpc_path | ||
|
Uh oh!
There was an error while loading. Please reload this page.