diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py index a9fd8d663..119ec599f 100644 --- a/QEfficient/base/modeling_qeff.py +++ b/QEfficient/base/modeling_qeff.py @@ -24,7 +24,6 @@ from QEfficient.compile.qnn_compiler import compile as qnn_compile from QEfficient.generation.cloud_infer import QAICInferenceSession from QEfficient.utils import constants, dump_qconfig -from QEfficient.utils._utils import load_json from QEfficient.utils.cache import QEFF_HOME, to_hashable logger = logging.getLogger(__name__) @@ -98,7 +97,11 @@ def compile(self, *args, **kwargs) -> Path: :num_cores (int): Number of cores to utilize in each device ``Defaults to 16``. :mxfp6_matmul (bool): Use MXFP6 to compress weights for MatMul nodes to run faster on device. ``Defaults to False``. :mxint8_kv_cache (bool): Use MXINT8 to compress KV-cache on device to access and update KV-cache faster. ``Defaults to False``. - :compiler_options: Pass any compiler option as input. Any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below: + :compiler_options: Pass any compiler option as input. + The following flags can be passed in compiler_options to enable the QNN compilation path. + :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False if not passed.`` + :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None if not passed.`` + For the QAIC compilation path, any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below: - aic_num_cores=16 -> -aic-num-cores=16 - convert_to_fp16=True -> -convert-to-fp16 @@ -217,10 +220,13 @@ def _compile( onnx_path: Optional[str] = None, compile_dir: Optional[str] = None, *, + mxint8_kv_cache: bool = False, specializations: Optional[List[Dict[str, int]]] = None, custom_io: Optional[Dict[str, str]] = None, mdp_ts_num_devices: int = 1, num_speculative_tokens: Optional[int] = None, + enable_qnn: Optional[bool] = False, + qnn_config: Optional[str] = None, **compiler_options, ) -> str: """ @@ -229,10 +235,13 @@ def _compile( Args: :onnx_path (str): Onnx file to compile :compile_dir (str): Directory path to compile the qpc. A suffix is added to the directory path to avoid reusing same qpc for different parameters. + :mxint8_kv_cache (bool, optional): Whether to use ``mxint8`` compression for KV cache. ``Defaults to False``. :specializations (list): List of specializations to compile for :custom_io (dict): Custom IO to specify the input and outputs in different formats than default :mdp_ts_num_devices (int): Number of devices to partition to use Multi-Device Partitioning with tensor-slicing. :num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model. + :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.`` + :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.`` :compiler_options: Pass any compiler option as input. Any flag that is supported by `qaic-exec` can be passed. 
Params are converted to flags as below: - aic_num_cores=16 -> -aic-num-cores=16 - convert_to_fp16=True -> -convert-to-fp16 @@ -245,6 +254,22 @@ def _compile( qpc_path = compile_dir / "qpc" if not onnx_path.is_file(): raise FileNotFoundError(f"ONNX file not found at: {onnx_path}") + + if enable_qnn: + self.qpc_path = qnn_compile( + onnx_path=onnx_path, + qpc_base_path=compile_dir, + specializations=specializations, + custom_io=custom_io, + device_group=list(range(mdp_ts_num_devices)), + num_cores=compiler_options.get("aic_num_cores", 16), + mxfp6=compiler_options.get("mxfp6_matmul", False), + mxint8=mxint8_kv_cache, + qnn_config=qnn_config, + ) + + return self.qpc_path + command = constants.COMPILER + [f"-m={onnx_path}"] if mdp_ts_json_path := compiler_options.pop("mdp_ts_json_path", None): mdp_ts_num_devices = None @@ -339,104 +364,3 @@ def _compile( self.qpc_path = qpc_path return qpc_path - - @dump_qconfig - def _qnn_compile( - self, - onnx_path: Optional[str] = None, - compile_dir: Optional[str] = None, - *, - specializations: Optional[List[Dict[str, int]]] = None, - prefill_seq_len: int = 32, - ctx_len: int = 128, - batch_size: int = 1, - full_batch_size: Optional[int] = None, - mdp_ts_num_devices: int = 1, - num_cores: int = 16, - mxfp6_matmul: bool = False, - mxint8_kv_cache: bool = False, - qnn_config: Optional[str] = None, - kv_cache_batch_size: Optional[int] = None, - ) -> str: - """ - Interface for QNN compiler - - Args: - :onnx_path (str): Onnx file to compile - :compile_dir (str): Directory path to compile the qpc. A suffix is added to the directory path to avoid reusing same qpc for different parameters. - :specializations (list): List of specializations to compile for - :prefill_seq_len (int, optional): The length of the Prefill prompt should be less that ``prefill_seq_len``. ``Defaults to 32``. - :ctx_len (int, optional): Maximum ``ctx`` that the compiled model can remember. ``Defaults to 128``. - :batch_size (int, optional): Batch size. ``Defaults to 1``. - :full_batch_size (int, optional): Continuous batching batch size. - :mdp_ts_num_devices (int): Number of devices to partition to use Multi-Device Partitioning with tensor-slicing. - :num_cores (int): Number of cores used to compile the model. - :mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to True``. - :mxint8_kv_cache (bool, optional): Whether to use ``mxint8`` compression for KV cache. ``Defaults to False``. - :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.`` - :kv_cache_batch_size (int): kv_cache_batch_size for Prefix Caching. 
``Defaults to None.`` - """ - if onnx_path is None and self.onnx_path is None: - self.export() - - onnx_path = Path(onnx_path or self.onnx_path) - compile_dir = Path(compile_dir or onnx_path.parent) - qpc_path = compile_dir / "qpc" - if not onnx_path.is_file(): - raise FileNotFoundError(f"ONNX file not found at: {onnx_path}") - - compile_hash = hashlib.sha256(to_hashable("qnn")) - - if specializations is not None: - compile_hash.update(to_hashable(specializations)) - - if qnn_config is not None: - qnn_config_values = load_json(qnn_config) - compile_hash.update(to_hashable(qnn_config_values)) - - if mdp_ts_num_devices > 1: - compile_hash.update(to_hashable({"mdp_ts_num_devices": mdp_ts_num_devices})) - - compile_hash.update(to_hashable({"num_cores": num_cores})) - compile_hash.update(to_hashable({"mxfp6_matmul": mxfp6_matmul})) - compile_hash.update(to_hashable({"mxint8_kv_cache": mxint8_kv_cache})) - - # Check if already compiled - compile_hash = compile_hash.hexdigest()[:16] - qpc_path = qpc_path.with_name(qpc_path.name + "-" + compile_hash) - if qpc_path.is_dir(): - if (qpc_path / "programqpc.bin").is_file(): - self.qpc_path = qpc_path - return qpc_path - # Probably compilation failure last time, delete directory to start over - shutil.rmtree(qpc_path) - - # Write specializations.json file - if specializations is not None: - specializations_json = compile_dir / "specializations.json" - with open(specializations_json, "w") as fp: - json.dump( - {"specializations": [{k: str(v) for k, v in spec.items()} for spec in specializations]}, - fp, - indent=4, - ) - - qnn_compile( - onnx_path=onnx_path, - qpc_base_path=compile_dir, - num_cores=num_cores, - device_group=list(range(mdp_ts_num_devices)), - batch_size=batch_size, - prompt_len=prefill_seq_len, - ctx_len=ctx_len, - mxfp6=mxfp6_matmul, - mxint8=mxint8_kv_cache, - full_batch_size=full_batch_size, - qnn_config=qnn_config, - qnn_binary_dir=qpc_path, - kv_cache_batch_size=kv_cache_batch_size, - ) - - self.qpc_path = qpc_path - - return qpc_path diff --git a/QEfficient/compile/compile_helper.py b/QEfficient/compile/compile_helper.py index ae86b493a..5ce22bed9 100644 --- a/QEfficient/compile/compile_helper.py +++ b/QEfficient/compile/compile_helper.py @@ -13,6 +13,7 @@ from typing import List, Optional, Tuple from QEfficient.compile.qnn_compiler import compile as qnn_compile +from QEfficient.utils._utils import load_json, load_yaml from QEfficient.utils.logging_utils import logger @@ -180,36 +181,35 @@ def compile( full_batch_size=full_batch_size, ) + # Select the customIO config based on the mx flag. + custom_io_file_name = "custom_io_int8.yaml" if mxint8 else "custom_io_fp16.yaml" + + if custom_io_file_path is None: + custom_io_file_path = os.path.join(os.path.dirname(onnx_path), custom_io_file_name) + + if not os.path.isfile(custom_io_file_path): + raise FileNotFoundError( + f"Custom IO file {custom_io_file_name} is not present at the expected path {custom_io_file_path}. 
Please pass the correct file path or rerun infer/export API" + ) + if enable_qnn: qpc_path = qnn_compile( onnx_path=onnx_path, qpc_base_path=qpc_path, + qnn_binary_dir=os.path.join(qpc_path, "qpcs"), num_cores=num_cores, - batch_size=batch_size, - prompt_len=prompt_len, - ctx_len=ctx_len, mxfp6=mxfp6, mxint8=mxint8, allow_mxint8_mdp_io=allow_mxint8_mdp_io, aic_enable_depth_first=aic_enable_depth_first, mos=mos, device_group=device_group, - full_batch_size=full_batch_size, qnn_config=qnn_config, + specializations=(load_json(specialization_json_path))["specializations"], + custom_io=load_yaml(custom_io_file_path), ) logger.info(f"QNN Compiled QPC files can be found here: {qpc_path}") else: - # Select the customIO config based on the mx flag. - custom_io_file_name = "custom_io_int8.yaml" if mxint8 else "custom_io_fp16.yaml" - - if custom_io_file_path is None: - custom_io_file_path = os.path.join(os.path.dirname(onnx_path), custom_io_file_name) - - if not os.path.isfile(custom_io_file_path): - raise FileNotFoundError( - f"Custom IO file {custom_io_file_name} is not present at the expected path {custom_io_file_path}. Please pass the correct file path or rerun infer/export API" - ) - _, qpc_path = compile_kv_model_on_cloud_ai_100( onnx_path=onnx_path, specializations_json=specialization_json_path, diff --git a/QEfficient/compile/qnn_compiler.py b/QEfficient/compile/qnn_compiler.py index e6758c826..e4b8f854b 100644 --- a/QEfficient/compile/qnn_compiler.py +++ b/QEfficient/compile/qnn_compiler.py @@ -5,13 +5,19 @@ # # ----------------------------------------------------------------------------- +import hashlib +import json import os import shutil -from typing import List, Optional +from typing import Dict, List, Optional from QEfficient.utils._utils import create_json, execute_command, load_json +from QEfficient.utils.cache import to_hashable from QEfficient.utils.constants import QnnConstants -from QEfficient.utils.generate_qnn_network_specialization_config import fetch_nodes_info, generate_data_format_config +from QEfficient.utils.generate_qnn_network_specialization_config import ( + generate_data_format_config, + generate_qnn_specialization, +) from QEfficient.utils.logging_utils import logger @@ -31,15 +37,13 @@ def __init__( device_group: Optional[List[int]] = None, compiler_enable_depth_first: bool = False, compiler_max_out_channel_split: int = -1, - batch_size: int = 1, - prompt_len: int = 32, - ctx_len: int = 128, compiler_mxfp6_matmul_weights: bool = True, qnn_target: str = QnnConstants.TARGET, qnn_config_path: Optional[str] = None, qnn_binary_dir: Optional[str] = None, mxint8: Optional[bool] = False, compiler_mxint8_mdp_io: Optional[bool] = False, + prefill_only: Optional[bool] = False, **kwargs, ) -> None: self.onnx_path = onnx_path @@ -48,9 +52,6 @@ def __init__( self.device_group = device_group self.compiler_enable_depth_first = compiler_enable_depth_first self.compiler_max_out_channel_split = compiler_max_out_channel_split - self.batch_size = batch_size - self.prompt_len = prompt_len - self.ctx_len = ctx_len self.compiler_mxfp6_matmul_weights = compiler_mxfp6_matmul_weights self.qnn_config_path = qnn_config_path self.qnn_binary_dir = qnn_binary_dir @@ -59,6 +60,7 @@ def __init__( self.custom_io_path = custom_io_path self.dlc_model_path = os.path.join(qpc_base_path, f"{QnnConstants.MODEL_NAME}.dlc") self.qnn_target = qnn_target + self.prefill_only = prefill_only self.qnn_sdk_path = os.getenv(QnnConstants.QNN_SDK_PATH_ENV_VAR_NAME) if not self.qnn_sdk_path: raise EnvironmentError( @@ 
-141,7 +143,7 @@ def create_qnn_compile_backend_json(self) -> str: "compiler_hardware_version": QnnConstants.COMPILER_HARDWARE_VERSION, "compiler_convert_to_FP16": QnnConstants.COMPILER_CONVERT_TO_FP16, "compiler_retained_state": QnnConstants.COMPILER_RETAINED_STATE, - "graph_names": QnnConstants.GRAPH_NAMES, + "graph_names": QnnConstants.GRAPH_NAMES_PREFILL_ONLY if self.prefill_only else QnnConstants.GRAPH_NAMES, "compiler_enable_depth_first": self.compiler_enable_depth_first, "compiler_mxfp6_matmul_weights": self.compiler_mxfp6_matmul_weights, "compiler_num_of_cores": self.num_cores, @@ -327,16 +329,13 @@ def compile( device_group: Optional[List[int]] = None, aic_enable_depth_first: bool = False, mos: int = -1, - batch_size: int = 1, - prompt_len: int = 32, - ctx_len: int = 128, mxfp6: bool = True, mxint8: bool = False, allow_mxint8_mdp_io: Optional[bool] = False, - full_batch_size=None, qnn_config: Optional[str] = None, qnn_binary_dir: Optional[str] = None, - kv_cache_batch_size: Optional[int] = None, + custom_io: Optional[Dict[str, str]] = None, + specializations: Optional[List[Dict[str, int]]] = None, **kwargs, ) -> str: """ @@ -352,16 +351,13 @@ def compile( :device_group (List[int]): Used for finding the number of devices to compile for. :aic_enable_depth_first (bool): Enables ``DFS`` with default memory size. ``Defaults to False.`` :mos (int): Effort level to reduce the on-chip memory. ``Defaults to -1.`` - :batch_size (int): Batch size to compile the model for. ``Defaults to 1.`` - :full_batch_size (int): Set full batch size to enable continuous batching mode. ``Default to None`` - :prompt_len (int): Prompt length for the model to compile. ``Defaults to 32`` - :ctx_len (int): Maximum context length to compile the model. ``Defaults to 128`` :mxfp6 (bool): Enable compilation for ``MXFP6`` precision. ``Defaults to True.`` - :allow_mxint8_mdp_io (bool): Allows MXINT8 compression of MDP IO traffic ``Defaults to False.`` :mxint8 (bool): Compress Present/Past KV to ``MXINT8`` using ``CustomIO`` config. ``Defaults to False.`` + :allow_mxint8_mdp_io (bool): Allows MXINT8 compression of MDP IO traffic ``Defaults to False.`` :qnn_config (str): Path to ``qnn_config.json`` file (formatted as a string). ``Defaults to None.`` :qnn_binary_dir (str): Path for saving qnn binaries. - :kv_cache_batch_size (int): kv_cache_batch_size for Prefix Caching. ``Defaults to None.`` + :custom_io (dict): Custom IO to specify the input and outputs in different formats than default + :specializations (list): List of specializations to compile for Returns: :str: Path to compiled ``qpc`` package. @@ -377,16 +373,11 @@ def compile( # TODO To make custom_io_config.yaml configurable as not all models need it. custom_io_file_path = os.path.join(qpc_base_path, "custom_io_config.yaml") - kv_precision = "uint8" if mxint8 else "float16" - fetch_nodes_info( + generate_qnn_specialization( onnx_graph_path=onnx_path, - batch_size=batch_size, - sequence_length=prompt_len, - context_length=ctx_len, + specializations=specializations, + custom_io=custom_io, file_path=custom_io_file_path, - full_batch_size=full_batch_size, - kv_precision=kv_precision, - kv_cache_batch_size=kv_cache_batch_size, ) if not os.path.isfile(custom_io_file_path): @@ -394,6 +385,49 @@ def compile( f"file {custom_io_file_path} needs to exist in the qpc_base_path for Compilation. 
Please rerun infer/compile Api" ) + prefill_only = True if len(specializations) == 1 else False + + if qnn_binary_dir is None: + compile_hash = hashlib.sha256(to_hashable("qnn")) + + if specializations is not None: + compile_hash.update(to_hashable(specializations)) + + if custom_io is not None: + compile_hash.update(to_hashable(custom_io)) + + if qnn_config is not None: + qnn_config_values = load_json(qnn_config) + compile_hash.update(to_hashable(qnn_config_values)) + + if device_group is not None: + compile_hash.update(to_hashable({"device_group": device_group})) + + compile_hash.update(to_hashable({"num_cores": num_cores})) + compile_hash.update(to_hashable({"mxfp6": mxfp6})) + compile_hash.update(to_hashable({"mxint8": mxint8})) + + # Check if already compiled + compile_hash = compile_hash.hexdigest()[:16] + + qnn_binary_dir = qpc_base_path / "qpc" + qnn_binary_dir = qnn_binary_dir.with_name(qnn_binary_dir.name + "-" + compile_hash) + if qnn_binary_dir.is_dir(): + if (qnn_binary_dir / "programqpc.bin").is_file(): + return qnn_binary_dir + # Probably compilation failure last time, delete directory to start over + shutil.rmtree(qnn_binary_dir) + + # Write specializations.json file + if specializations is not None: + specializations_json = qpc_base_path / "specializations.json" + with open(specializations_json, "w") as fp: + json.dump( + {"specializations": [{k: str(v) for k, v in spec.items()} for spec in specializations]}, + fp, + indent=4, + ) + qnn_obj = QNN( onnx_path=onnx_path, qpc_base_path=qpc_base_path, @@ -403,13 +437,11 @@ def compile( custom_io_path=custom_io_file_path, compiler_enable_depth_first=aic_enable_depth_first, compiler_max_out_channel_split=mos, - batch_size=batch_size, - prompt_len=prompt_len, - ctx_len=ctx_len, compiler_mxfp6_matmul_weights=mxfp6, qnn_binary_dir=qnn_binary_dir, mxint8=mxint8, compiler_mxint8_mdp_io=allow_mxint8_mdp_io, + prefill_only=prefill_only, ) compiled_binary_path = qnn_obj.compile() diff --git a/QEfficient/peft/auto.py b/QEfficient/peft/auto.py index deb64fae1..12c814ce6 100644 --- a/QEfficient/peft/auto.py +++ b/QEfficient/peft/auto.py @@ -251,6 +251,7 @@ def compile( custom_io=custom_io, mdp_ts_num_devices=num_devices, aic_num_cores=num_cores, + mxint8_kv_cache=mxint8_kv_cache, **compiler_options, ) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 0182c4ef1..04f54047d 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -603,23 +603,14 @@ def compile( mxfp6_matmul: bool = False, mxint8_kv_cache: bool = False, num_speculative_tokens: Optional[int] = None, - enable_qnn: bool = False, - qnn_config: Optional[str] = None, skip_vision: Optional[bool] = False, skip_lang: Optional[bool] = False, **compiler_options, ) -> str: - if ( - any( - param is not None - for param in [full_batch_size, kv_cache_batch_size, num_speculative_tokens, qnn_config] - ) - or enable_qnn - ): + if any(param is not None for param in [full_batch_size, kv_cache_batch_size, num_speculative_tokens]): raise ValueError( - f"Expected 'full_batch_size', 'kv_cache_batch_size', 'num_speculative_tokens', and 'qnn_config' to be None, and 'enable_qnn' to be False but got: " + f"Expected 'full_batch_size', 'kv_cache_batch_size', 'num_speculative_tokens' to be None but got: " f"full_batch_size={full_batch_size}, kv_cache_batch_size={kv_cache_batch_size}, num_speculative_tokens={num_speculative_tokens}, " - f"enable_qnn={enable_qnn}, 
qnn_config={qnn_config}" ) if skip_lang and skip_vision: @@ -662,6 +653,7 @@ def compile( mdp_ts_num_devices=num_devices, aic_num_cores=num_cores, custom_io=custom_io_vision, + mxint8_kv_cache=mxint8_kv_cache, **compiler_options, ) @@ -687,6 +679,7 @@ def compile( mdp_ts_num_devices=num_devices, aic_num_cores=num_cores, custom_io=custom_io_lang, + mxint8_kv_cache=mxint8_kv_cache, **compiler_options, ) return self.qpc_path @@ -930,21 +923,12 @@ def compile( mxfp6_matmul: bool = False, mxint8_kv_cache: bool = False, num_speculative_tokens: Optional[int] = None, - enable_qnn: bool = False, - qnn_config: Optional[str] = None, **compiler_options, ) -> str: - if ( - any( - param is not None - for param in [full_batch_size, kv_cache_batch_size, num_speculative_tokens, qnn_config] - ) - or enable_qnn - ): + if any(param is not None for param in [full_batch_size, kv_cache_batch_size, num_speculative_tokens]): raise ValueError( - f"Expected 'full_batch_size', 'kv_cache_batch_size', 'num_speculative_tokens', and 'qnn_config' to be None, and 'enable_qnn' to be False but got: " + f"Expected 'full_batch_size', 'kv_cache_batch_size', 'num_speculative_tokens' to be None but got: " f"full_batch_size={full_batch_size}, kv_cache_batch_size={kv_cache_batch_size}, num_speculative_tokens={num_speculative_tokens}, " - f"enable_qnn={enable_qnn}, qnn_config={qnn_config}" ) output_names = self.model.get_output_names() @@ -982,6 +966,7 @@ def compile( custom_io=custom_io, mdp_ts_num_devices=num_devices, aic_num_cores=num_cores, + mxint8_kv_cache=mxint8_kv_cache, **compiler_options, ) return self.qpc_path @@ -1540,8 +1525,6 @@ def compile( mxfp6_matmul: bool = False, mxint8_kv_cache: bool = False, num_speculative_tokens: Optional[int] = None, - enable_qnn: bool = False, - qnn_config: Optional[str] = None, prefill_only: Optional[bool] = None, **compiler_options, ) -> str: @@ -1564,10 +1547,14 @@ def compile( :num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model. :mos (int, optional): Effort level to reduce on-chip memory. Defaults to -1, meaning no effort. ``Defaults to -1``. :aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``. - :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.`` - :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.`` :prefill_only (bool): if ``True`` compile for prefill only and if ``False`` compile for decode only. Defaults to None, which compiles for both ``prefill`` and ``decode``. - :compiler_options (dict, optional): Any other options that the `qaic-exec` takes. ``Defaults to None``. + :compiler_options (dict, optional): Pass any compiler option as input. ``Defaults to None``. + The following flags can be passed in compiler_options to enable the QNN compilation path. + :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False if not passed.`` + :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None if not passed.`` + For the QAIC compilation path, any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below: + - aic_num_cores=16 -> -aic-num-cores=16 + - convert_to_fp16=True -> -convert-to-fp16 Returns: :str: Path of the compiled ``qpc`` package. @@ -1616,48 +1603,29 @@ def compile( specializations.append(decode_spec) # --- Compilation --- - if enable_qnn: - if compiler_options: - logger.warning("Extra arguments to QNN compilation are ignored. 
Use `qnn_config.json`.") - - qpc_path = self._qnn_compile( - onnx_path=onnx_path, - compile_dir=compile_dir, - specializations=specializations, - prefill_seq_len=prefill_seq_len, - ctx_len=ctx_len, - batch_size=batch_size, - full_batch_size=full_batch_size, - mdp_ts_num_devices=num_devices, - num_cores=num_cores, - mxfp6_matmul=mxfp6_matmul, - mxint8_kv_cache=mxint8_kv_cache, - qnn_config=qnn_config, - kv_cache_batch_size=kv_cache_batch_size, - ) - else: - kv_cache_dtype = "mxint8" if mxint8_kv_cache else "float16" - custom_io = {} + kv_cache_dtype = "mxint8" if mxint8_kv_cache else "float16" + custom_io = {} - for suffix in ["", "_RetainedState"]: - for i in range(self.num_layers): - for kv in ["key", "value"]: - custom_io[f"past_{kv}.{i}{suffix}"] = kv_cache_dtype + for suffix in ["", "_RetainedState"]: + for i in range(self.num_layers): + for kv in ["key", "value"]: + custom_io[f"past_{kv}.{i}{suffix}"] = kv_cache_dtype - qpc_path = self._compile( - onnx_path=onnx_path, - compile_dir=compile_dir, - compile_only=True, - retained_state=True, - specializations=specializations, - convert_to_fp16=True, - mxfp6_matmul=mxfp6_matmul, - custom_io=custom_io, - mdp_ts_num_devices=num_devices, - num_speculative_tokens=num_speculative_tokens, - aic_num_cores=num_cores, - **compiler_options, - ) + qpc_path = self._compile( + onnx_path=onnx_path, + compile_dir=compile_dir, + compile_only=True, + retained_state=True, + specializations=specializations, + convert_to_fp16=True, + mxfp6_matmul=mxfp6_matmul, + custom_io=custom_io, + mdp_ts_num_devices=num_devices, + num_speculative_tokens=num_speculative_tokens, + aic_num_cores=num_cores, + mxint8_kv_cache=mxint8_kv_cache, + **compiler_options, + ) return qpc_path @@ -1802,8 +1770,6 @@ def compile( mxfp6_matmul: bool = False, mxint8_kv_cache: bool = False, num_speculative_tokens: Optional[int] = None, - enable_qnn: bool = False, - qnn_config: Optional[str] = None, **compiler_options, ) -> str: """ @@ -1845,9 +1811,6 @@ def compile( if num_speculative_tokens: logger.warning("Speculative decoding is not yet enabled for AutoModelForSpeechSeq2Seq") - if enable_qnn or qnn_config: - logger.warning("QNN compile is not yet enabled for AutoModelForSpeechSeq2Seq") - return self._compile( onnx_path, compile_dir, diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py index dd4ddd0cf..ea09e97d7 100644 --- a/QEfficient/utils/_utils.py +++ b/QEfficient/utils/_utils.py @@ -393,6 +393,26 @@ def execute_command(process: str, command: str, output_file_path: Optional[str] print(f"Failed to create {stderr_path}: {e}") +def load_yaml(file_path: str) -> Dict[Any, Any]: + """ + Opens the given YAML file, load and return the Dict. + + ``Mandatory`` Args: + :file_path (str): YAML File to be opened. + + Return: + Dict Object from the given file. + + """ + try: + # Load the YAML config file + with open(file_path, "r") as file: + config_data = yaml.safe_load(file) + except Exception as e: + raise ValueError(f"Failed to load YAML object from {file_path}: {e}") + return config_data + + def load_json(file_path: str) -> Dict[Any, Any]: """ Opens the given JSON file, load and return the JSON object. 
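# Illustrative sketch (not part of the patch): how the new `load_yaml` helper added above is expected
# to be used together with `load_json` to feed the QNN path in compile_helper.compile(). The file
# paths below are hypothetical placeholders.
from QEfficient.utils._utils import load_json, load_yaml

specializations = load_json("qpc_dir/specializations.json")["specializations"]  # list of dicts, e.g. [{"batch_size": "1", "seq_len": "32", "ctx_len": "128"}]
custom_io = load_yaml("onnx_dir/custom_io_fp16.yaml")  # datatype overrides generated at export time
# Both objects are then forwarded to QEfficient.compile.qnn_compiler.compile() as
# specializations=... and custom_io=..., matching the compile_helper.py hunk above.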
diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index 1180b35d0..b1ff9701e 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -144,6 +144,7 @@ class QnnConstants: COMPILER_STATS_BATCH_SIZE = 1 COMPILER_TIME_PASSES = False GRAPH_NAMES = [f"{MODEL_NAME}_configuration_1", f"{MODEL_NAME}_configuration_2"] + GRAPH_NAMES_PREFILL_ONLY = [f"{MODEL_NAME}"] # qnn_config JSON file supported Keys CONVERTER_ARGS_EXTENSION_STR = "converter_args_extension" @@ -170,3 +171,14 @@ class QnnConstants: "--dlc_path ", "--config_file ", ] + + QNN_SAMPLE_CONFIG = { + "converter_args_extension": "--onnx_defer_loading", + "context_binary_generator_args_extension": "--log_level debug", + "qnn_compilation_backend": { + "compiler_enable_depth_first": True, + "compiler_printDDRStats": False, + "compiler_printPerfMetrics": False, + }, + "SKIP_QNN_CONVERTER_STEP": False, + } diff --git a/QEfficient/utils/generate_qnn_network_specialization_config.py b/QEfficient/utils/generate_qnn_network_specialization_config.py index fe72918dc..1a437af78 100644 --- a/QEfficient/utils/generate_qnn_network_specialization_config.py +++ b/QEfficient/utils/generate_qnn_network_specialization_config.py @@ -6,7 +6,7 @@ # ----------------------------------------------------------------------------- import json -from typing import Optional +from typing import Dict, List, Optional import onnx import yaml @@ -17,15 +17,11 @@ """ -def fetch_nodes_info( +def generate_qnn_specialization( onnx_graph_path: str, - batch_size: int, - sequence_length: int, - context_length: int, + specializations: List[Dict[str, int]], + custom_io: Optional[Dict[str, str]] = None, file_path: str = "custom_io_config.yaml", - full_batch_size: Optional[int] = None, - kv_precision: Optional[str] = "float16", - kv_cache_batch_size: Optional[int] = None, ) -> None: """ Generates network specialization config custom IO file for converter stage in QNN compilation. @@ -34,134 +30,113 @@ def fetch_nodes_info( ``Mandatory`` Args: :onnx_graph_path (str): Generated ``ONNX`` Model Path. - :batch_size (int): Batch size to compile the model for. - :sequence_length (int): Sequence length for the model to compile. - :context_length (int): Maximum context length to compile the model. + :specializations (List[Dict[str, int]]): List of specializations containing compilation parameter values. ``Optional`` Args: + :custom_io (Dict[str, str]): Custom IO containing overriding datatype information for onnx graph nodes used in compilation. :file_path (str): File path to save the generated custom IO config. ``Defaults to custom_io_config.yaml.`` - :full_batch_size (int): Set full batch size to enable continuous batching mode. ``Default to None`` - :kv_precision (str): Sets kv precision for compilation. ``Defaults to float16.`` - :kv_cache_batch_size (int): kv_cache_batch_size for Prefix Caching. ``Defaults to None.`` """ - # Load the ONNX model + # Load the ONNX model graph onnx_model = onnx.load(onnx_graph_path) - input_nodes = [] input_nodes_info = [] final_dict = {} - output_nodes = [] output_nodes_info = [] + + # Populating input graph nodes information. for node in onnx_model.graph.input: - input_nodes.append(node.name) input_info = {} + + # Assigning the data type as per the ONNX graph input. 
input_info["DataType"] = str(helper.tensor_dtype_to_np_dtype(node.type.tensor_type.elem_type)) - if "past_key" in node.name or "past_value" in node.name: - input_info["DataType"] = kv_precision - if "batch_index" in node.name: - if full_batch_size: - input_info["Shape"] = f"(1, 1), ({full_batch_size}, 1)" + # Overriding the data type according to custom_io (if provided). + if custom_io is not None and node.name in custom_io: + input_info["DataType"] = "uint8" if custom_io[node.name] == "mxint8" else custom_io[node.name] + + # Create the shapes list for the input node. + shapes = [] + for input_shape in node.type.tensor_type.shape.dim: + if input_shape.HasField("dim_value"): + shape = input_shape.dim_value + elif input_shape.HasField("dim_param"): + shape = input_shape.dim_param else: - raise AttributeError( - "ERROR: Full batch size is required for populating batch_index in custom_io_config.yaml" - ) - else: - shapes = [] - for input_shape in node.type.tensor_type.shape.dim: - if input_shape.HasField("dim_value"): - shape = input_shape.dim_value - elif input_shape.HasField("dim_param"): - shape = input_shape.dim_param + raise AttributeError(f"ERROR: {input_shape} Shape not Found") + shapes.append(shape) + + # Fill shape values for nodes whose rank is not 2, for example past_key / past_value nodes. + if len(shapes) != 2: + shape_list = [] + for input_shape in shapes: + # If the shape entry is a parameter string, its value is taken from the specializations. + if isinstance(input_shape, str): + if input_shape in specializations[0]: + shape_list.append(int(specializations[0][input_shape])) + else: + raise AttributeError(f"ERROR: {input_shape} is required in specializations") + # If the shape entry already holds a value, that value is used as is. else: - shape = "shape_not_found" - shapes.append(shape) - - if ( - ("batch_size" in shapes or "full_batch_size" in shapes) - and ("ctx_len" in shapes or "max_context_len" in shapes) - and len(shapes) >= 3 - ): - shapeList = [] - for shape in shapes: - if isinstance(shape, str): - if "full_batch_size" in shape: - if ("past_key" in node.name or "past_value" in node.name) and kv_cache_batch_size: - shapeList.append(kv_cache_batch_size) - elif full_batch_size: - shapeList.append(full_batch_size) - else: - raise AttributeError( - "ERROR: Full batch size is required to generate custom_io_config.yaml" - ) - elif "batch_size" in shape: - shapeList.append(batch_size) - elif shape in ["ctx_len", "max_context_len"]: - shapeList.append(context_length) + shape_list.append(input_shape) + + # The calculated shape is assigned to the input node. + input_info["Shape"] = str(shape_list).replace("[", "(").replace("]", ")") + # For nodes whose rank is 2, for example input_ids, position_ids, etc. 
+ else: + shape_list = [] + for input_shape in shapes: + if isinstance(input_shape, str): + if input_shape in specializations[0]: + shape_list.append(int(specializations[0][input_shape])) else: - shapeList.append(shape) - shape = str(shapeList).replace("[", "(").replace("]", ")") - elif "batch_size" in shapes and ("seq_len" in shapes or "prompt_len" in shapes): - shape_1 = ( - str( - [ - batch_size if isinstance(shape, str) and "batch_size" in shape else sequence_length - for shape in shapes - ] - ) - .replace("[", "(") - .replace("]", ")") - ) - if full_batch_size: - shape_2 = ( - str( - [ - full_batch_size if isinstance(shape, str) and "batch_size" in shape else 1 - for shape in shapes - ] - ) - .replace("[", "(") - .replace("]", ")") - ) + raise AttributeError(f"ERROR: {input_shape} is required in specializations") else: - shape_2 = ( - str([batch_size if isinstance(shape, str) and "batch_size" in shape else 1 for shape in shapes]) - .replace("[", "(") - .replace("]", ")") - ) - shape = shape_1 + "," + shape_2 - elif ("batch_size" in shapes or "full_batch_size" in shapes) and ( - "ctx_len" in shapes or "max_context_len" in shapes - ): - shape = ( - str( - [ - batch_size if isinstance(shape, str) and "batch_size" in shape else context_length - for shape in shapes - ] - ) - .replace("[", "(") - .replace("]", ")") + shape_list.append(input_shape) + # If the specializations list contains more than one set of parameters, the first set is used for the prefill graph and the second for the decode graph. + if len(specializations) > 1: + prefill_shape_list = shape_list + decode_shape_list = [] + for input_shape in shapes: + if isinstance(input_shape, str): + if input_shape in specializations[1]: + decode_shape_list.append(int(specializations[1][input_shape])) + else: + raise AttributeError(f"ERROR: {input_shape} is required in specializations") + else: + decode_shape_list.append(input_shape) + + input_info["Shape"] = ( + str(prefill_shape_list).replace("[", "(").replace("]", ")") + + ", " + + str(decode_shape_list).replace("[", "(").replace("]", ")") ) - input_info["Shape"] = shape + + # If the specializations list contains only one set of parameters, that set is used for the decode graph information. + else: + input_info["Shape"] = str(shape_list).replace("[", "(").replace("]", ")") + + # Finally, the input node entry is created with its name and desired model parameters {DataType, Shape}. input_nodes_info.append({"Name": node.name, "Desired Model Parameters": input_info}) # Prepare output tensor configuration for output in onnx_model.graph.output: - output_nodes.append(output.name) output_info = {} + + # Assigning the data type as per the ONNX graph output. output_info["DataType"] = str(helper.tensor_dtype_to_np_dtype(output.type.tensor_type.elem_type)) - if "past_key" in output.name or "past_value" in output.name: - output_info["DataType"] = kv_precision - elif "logits" in output.name: - output_info["DataType"] = "float32" + + # Overriding the data type according to custom_io (if provided). 
+ if custom_io is not None and output.name in custom_io: + output_info["DataType"] = "uint8" if custom_io[output.name] == "mxint8" else custom_io[output.name] + + # Finally, output node is created with its name, and desired model parameters {DataType} output_nodes_info.append({"Name": output.name, "Desired Model Parameters": output_info}) - # Combine input and output configurations + # Combining input and output configurations final_dict = {"Input Tensor Configuration": input_nodes_info, "Output Tensor Configuration": output_nodes_info} - # Save the configuration to a YAML file + # Saving the configuration to a YAML file try: with open(file_path, "w") as yaml_file: yaml.dump(final_dict, yaml_file, default_flow_style=False, sort_keys=False) diff --git a/scripts/Jenkinsfile b/scripts/Jenkinsfile index 24113f9c8..26278c359 100644 --- a/scripts/Jenkinsfile +++ b/scripts/Jenkinsfile @@ -117,7 +117,7 @@ pipeline { } stage('QNN Non-CLI Tests') { steps { - timeout(time: 60, unit: 'MINUTES') { + timeout(time: 200, unit: 'MINUTES') { sh ''' sudo docker exec ${BUILD_TAG} bash -c " source /qnn_sdk/bin/envsetup.sh && diff --git a/tests/qnn_tests/test_causal_lm_models_qnn.py b/tests/qnn_tests/test_causal_lm_models_qnn.py deleted file mode 100644 index 65acab157..000000000 --- a/tests/qnn_tests/test_causal_lm_models_qnn.py +++ /dev/null @@ -1,176 +0,0 @@ -# ----------------------------------------------------------------------------- -# -# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. -# SPDX-License-Identifier: BSD-3-Clause -# -# ----------------------------------------------------------------------------- - -import os - -import numpy as np -import pytest -from transformers import AutoModelForCausalLM - -from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM -from QEfficient.transformers.quantizers.auto import replace_transformers_quantizers -from QEfficient.utils import hf_download -from QEfficient.utils._utils import load_hf_tokenizer -from QEfficient.utils.constants import Constants -from QEfficient.utils.device_utils import get_available_device_id -from QEfficient.utils.run_utils import ApiRunner - -test_models = [ - "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "gpt2", -] - - -def load_causal_lm_model(model_config): - """ - Function to load model from huggingface and transform to KV model - -------- - - :model_config: Dict - - :return model_hf, params - """ - model_path = hf_download( - repo_id=model_config["model_name"], - ignore_patterns=["*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.h5", "*.msgpack"], - ) - model_hf = AutoModelForCausalLM.from_pretrained( - model_path, - use_cache=True, - num_hidden_layers=model_config["n_layer"], - attn_implementation="eager", - low_cpu_mem_usage=False, - ) # Run models for single layers only - params = sum(p.numel() for p in model_hf.parameters()) - model_hf.eval() - return model_hf, params - - -def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( - model_name: str, - prompt_len: int = Constants.PROMPT_LEN, - ctx_len: int = Constants.CTX_LEN, - n_layer: int = 1, -): - """ - Validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. - ``Mandatory`` Args: - :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` - :prompt_len (int): Prompt length for the model to compile. - :ctx_len (int): Maximum context length to compile the model. - :n_layers (int): Number of layers for the Model. 
- """ - replace_transformers_quantizers() - model_config = {"model_name": model_name} - model_config["n_layer"] = n_layer - - model_hf, _ = load_causal_lm_model(model_config) - - tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=model_name) - config = model_hf.config - batch_size = len(Constants.INPUT_STR) - api_runner = ApiRunner( - batch_size, - tokenizer, - config, - Constants.INPUT_STR, - Constants.PROMPT_LEN, - Constants.CTX_LEN, - ) - - pytorch_hf_tokens = api_runner.run_hf_model_on_pytorch(model_hf) - - qeff_model = QEFFAutoModelForCausalLM(model_hf) - - pytorch_kv_tokens = api_runner.run_kv_model_on_pytorch(qeff_model.model) - - assert (pytorch_hf_tokens == pytorch_kv_tokens).all(), ( - "Tokens don't match for HF PyTorch model output and KV PyTorch model output" - ) - - onnx_model_path = qeff_model.export() - ort_tokens = api_runner.run_kv_model_on_ort(onnx_model_path) - - assert (pytorch_kv_tokens == ort_tokens).all(), "Tokens don't match for ONNXRT output and PyTorch output." - - if not get_available_device_id(): - pytest.skip("No available devices to run model on Cloud AI 100") - - qpc_path = qeff_model.compile( - prefill_seq_len=prompt_len, - ctx_len=ctx_len, - num_cores=14, - mxfp6=False, - aic_enable_depth_first=False, - enable_qnn=True, - ) - assert os.path.isfile(os.path.join(os.path.dirname(qpc_path), "qconfig.json")) - exec_info = qeff_model.generate(tokenizer, prompts=Constants.INPUT_STR) - cloud_ai_100_tokens = exec_info.generated_ids[0] # Because we always run for single input and single batch size - gen_len = ort_tokens.shape[-1] - assert (ort_tokens == cloud_ai_100_tokens[:, :gen_len]).all(), ( - "Tokens don't match for ONNXRT output and Cloud AI 100 output." - ) - - # testing for CB models - model_hf, _ = load_causal_lm_model(model_config) - full_batch_size = 4 - fbs_prompts = Constants.INPUT_STR * 4 - api_runner = ApiRunner( - batch_size, - tokenizer, - config, - fbs_prompts, - Constants.PROMPT_LEN, - Constants.CTX_LEN, - full_batch_size, - ) - - pytorch_hf_tokens = api_runner.run_hf_model_on_pytorch_CB(model_hf) - pytorch_hf_tokens = np.vstack(pytorch_hf_tokens) - - qeff_model = QEFFAutoModelForCausalLM(model_hf, continuous_batching=True) - onnx_model_path = qeff_model.export() - - if not get_available_device_id(): - pytest.skip("No available devices to run model on Cloud AI 100") - - qpc_path = qeff_model.compile( - prefill_seq_len=prompt_len, - ctx_len=ctx_len, - num_cores=14, - mxfp6=False, - aic_enable_depth_first=False, - full_batch_size=full_batch_size, - enable_qnn=True, - ) - assert os.path.isfile(os.path.join(os.path.dirname(qpc_path), "qconfig.json")) - exec_info_fbs = qeff_model.generate(tokenizer, prompts=fbs_prompts) - - assert all( - [ - all(pt_token[:24] == cloud_token[:24]) - for pt_token, cloud_token in zip(pytorch_hf_tokens, exec_info_fbs.generated_ids) - ] - ), "Tokens don't match for HF PyTorch model output and Cloud AI 100 output." - - -@pytest.mark.on_qaic -@pytest.mark.qnn -@pytest.mark.parametrize("model_name", test_models) -def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name): - """ - Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. 
- ``Mandatory`` Args: - :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` - """ - if model_name == "microsoft/Phi-3-mini-4k-instruct": - n_layer = 2 # test only 2 layer models - else: - n_layer = 1 - - check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, n_layer=n_layer) diff --git a/tests/transformers/models/test_causal_lm_models.py b/tests/transformers/models/test_causal_lm_models.py index 21db8946d..67eec2e50 100644 --- a/tests/transformers/models/test_causal_lm_models.py +++ b/tests/transformers/models/test_causal_lm_models.py @@ -16,12 +16,12 @@ from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM from QEfficient.transformers.quantizers.auto import replace_transformers_quantizers from QEfficient.utils import hf_download -from QEfficient.utils._utils import load_hf_tokenizer -from QEfficient.utils.constants import Constants +from QEfficient.utils._utils import create_json, load_hf_tokenizer +from QEfficient.utils.constants import Constants, QnnConstants from QEfficient.utils.device_utils import get_available_device_id from QEfficient.utils.run_utils import ApiRunner -test_models = [ +test_models_qaic = [ "TinyLlama/TinyLlama-1.1B-Chat-v1.0", "gpt2", "Salesforce/codegen-350M-mono", @@ -46,6 +46,13 @@ "ibm-granite/granite-guardian-3.1-2b", ] +test_models_qnn = [ + "mistralai/Mixtral-8x7B-Instruct-v0.1", + "meta-llama/Llama-3.2-1B", + "unsloth/gemma-2b", + "ibm-granite/granite-guardian-3.1-2b", +] + spd_test_models = [ "TinyLlama/TinyLlama-1.1B-Chat-v1.0", ] @@ -83,6 +90,8 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( n_layer: int = 1, num_speculative_tokens: Optional[int] = None, prefill_only: Optional[bool] = None, + enable_qnn: Optional[bool] = False, + qnn_config: Optional[str] = None, ): """ Validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. @@ -138,6 +147,8 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( aic_enable_depth_first=False, num_speculative_tokens=num_speculative_tokens, prefill_only=prefill_only, + enable_qnn=enable_qnn, + qnn_config=qnn_config, ) exec_info = qeff_model.generate(tokenizer, prompts=Constants.INPUT_STR) cloud_ai_100_tokens = exec_info.generated_ids[0][ @@ -186,6 +197,8 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( aic_enable_depth_first=False, full_batch_size=full_batch_size, num_speculative_tokens=num_speculative_tokens, + enable_qnn=enable_qnn, + qnn_config=qnn_config, ) exec_info_fbs = qeff_model.generate(tokenizer, prompts=fbs_prompts) @@ -229,7 +242,7 @@ def test_causal_lm_export_with_deprecated_api(model_name): @pytest.mark.on_qaic -@pytest.mark.parametrize("model_name", test_models) +@pytest.mark.parametrize("model_name", test_models_qaic) def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name): """ Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. 
@@ -244,6 +257,29 @@ def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name): check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, n_layer=n_layer) +@pytest.mark.on_qaic +@pytest.mark.qnn +@pytest.mark.parametrize("model_name", test_models_qnn) +def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_qnn(model_name): + """ + QNN Compilation Test + Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. + ``Mandatory`` Args: + :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` + """ + if model_name == "microsoft/Phi-3-mini-4k-instruct": + n_layer = 2 # test only 2 layer models + else: + n_layer = 1 + + qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json") + create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG) + + check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( + model_name=model_name, n_layer=n_layer, enable_qnn=True, qnn_config=qnn_config_json_path + ) + + @pytest.mark.skip() # remove when the SDK 1.20.0 issue solved for compiling this model @pytest.mark.on_qaic @pytest.mark.parametrize("model_name", spd_test_models) @@ -275,6 +311,23 @@ def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_pl1(): check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, prompt_len=prompt_len) +@pytest.mark.on_qaic +@pytest.mark.qnn +def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_pl1_qnn(): + """ + Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model for a prompt length of 1, both with and without continuous batching. + """ + model_name = "gpt2" + prompt_len = 1 + + qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json") + create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG) + + check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( + model_name=model_name, prompt_len=prompt_len, enable_qnn=True, qnn_config=qnn_config_json_path + ) + + @pytest.mark.on_qaic def test_prefiill_only_pytorch_vs_kv_vs_ort_vs_ai100(): model_name = "gpt2" @@ -282,3 +335,21 @@ def test_prefiill_only_pytorch_vs_kv_vs_ort_vs_ai100(): check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, n_layer=n_layer, prefill_only=True) check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, n_layer=n_layer, prefill_only=False) + + +@pytest.mark.on_qaic +@pytest.mark.qnn +def test_prefiill_only_pytorch_vs_kv_vs_ort_vs_ai100_qnn(): + model_name = "gpt2" + n_layer = 1 + + qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json") + create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG) + + check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( + model_name, n_layer=n_layer, prefill_only=True, enable_qnn=True, qnn_config=qnn_config_json_path + ) + + check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( + model_name, n_layer=n_layer, prefill_only=False, enable_qnn=True, qnn_config=qnn_config_json_path + ) diff --git a/tests/transformers/models/test_embedding_models.py b/tests/transformers/models/test_embedding_models.py index e681f5093..22f4bd580 100644 --- a/tests/transformers/models/test_embedding_models.py +++ b/tests/transformers/models/test_embedding_models.py @@ -6,6 +6,7 @@ # ----------------------------------------------------------------------------- import os +from typing import Optional import numpy as np import onnxruntime as ort @@ -13,7 +14,8 @@ from transformers import AutoModel, AutoTokenizer from QEfficient.transformers.models.modeling_auto import QEFFAutoModel 
-from QEfficient.utils.constants import Constants +from QEfficient.utils._utils import create_json +from QEfficient.utils.constants import Constants, QnnConstants embed_test_models = [ # model_name, architecture @@ -27,6 +29,8 @@ def check_embed_pytorch_vs_ort_vs_ai100( model_name: str, seq_len: int = Constants.CTX_LEN, n_layer: int = 1, + enable_qnn: Optional[bool] = False, + qnn_config: Optional[str] = None, ): # Prepare input tokenizer = AutoTokenizer.from_pretrained(model_name) @@ -71,6 +75,8 @@ def check_embed_pytorch_vs_ort_vs_ai100( qeff_model.compile( num_cores=14, + enable_qnn=enable_qnn, + qnn_config=qnn_config, ) ai100_output = qeff_model.generate(inputs=inputs) @@ -88,3 +94,19 @@ def test_embed_model_pytorch_vs_onnx_vs_ai100(model_name): Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output. """ check_embed_pytorch_vs_ort_vs_ai100(model_name=model_name, seq_len=32, n_layer=1) + + +@pytest.mark.on_qaic +@pytest.mark.qnn +@pytest.mark.parametrize("model_name", embed_test_models) +def test_embed_model_pytorch_vs_onnx_vs_ai100_qnn(model_name): + """ + QNN Compilation path test. + Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output. + """ + qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json") + create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG) + + check_embed_pytorch_vs_ort_vs_ai100( + model_name=model_name, seq_len=32, n_layer=1, enable_qnn=True, qnn_config=qnn_config_json_path + ) diff --git a/tests/transformers/models/test_prefix_caching.py b/tests/transformers/models/test_prefix_caching.py index 467aa174b..8e0c061b8 100644 --- a/tests/transformers/models/test_prefix_caching.py +++ b/tests/transformers/models/test_prefix_caching.py @@ -14,6 +14,7 @@ from QEfficient.generation.text_generation_inference import TextGeneration from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM from QEfficient.utils._utils import create_json +from QEfficient.utils.constants import QnnConstants test_models = ["gpt2"] @@ -39,18 +40,8 @@ def test_simple_prefix_caching(model_name): @pytest.mark.parametrize("model_name", test_models) def test_simple_prefix_caching_qnn(model_name): qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name, continuous_batching=True) - qnn_config = { - "converter_args_extension": "", - "context_binary_generator_args_extension": "--log_level debug", - "qnn_compilation_backend": { - "compiler_enable_depth_first": True, - "compiler_printDDRStats": False, - "compiler_printPerfMetrics": False, - }, - "SKIP_QNN_CONVERTER_STEP": False, - } qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json") - create_json(qnn_config_json_path, qnn_config) + create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG) qeff_model.compile( prefill_seq_len=128, diff --git a/tests/transformers/models/test_speech_seq2seq_models.py b/tests/transformers/models/test_speech_seq2seq_models.py index 17d82bde5..b486e0850 100644 --- a/tests/transformers/models/test_speech_seq2seq_models.py +++ b/tests/transformers/models/test_speech_seq2seq_models.py @@ -7,7 +7,7 @@ import os from importlib import reload -from typing import List +from typing import List, Optional import numpy as np import onnx @@ -21,8 +21,8 @@ from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForSpeechSeq2Seq from QEfficient.transformers.quantizers.auto import replace_transformers_quantizers from QEfficient.utils import get_padding_shape_from_config, hf_download -from 
QEfficient.utils._utils import load_hf_processor -from QEfficient.utils.constants import Constants +from QEfficient.utils._utils import create_json, load_hf_processor +from QEfficient.utils.constants import Constants, QnnConstants from QEfficient.utils.device_utils import get_available_device_id test_models = [ @@ -292,6 +292,8 @@ def check_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100( model_name: str, ctx_len: int = Constants.CTX_LEN, n_layer: int = 1, + enable_qnn: Optional[bool] = False, + qnn_config: Optional[str] = None, ): """ Validate the PyTorch model, the PyTorch model after KV changes, ONNX model and the Cloud AI 100 model @@ -337,6 +339,8 @@ def check_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100( ctx_len=ctx_len, num_cores=16, batch_size=batch_size, + enable_qnn=enable_qnn, + qnn_config=qnn_config, ) exec_info = qeff_model.generate( @@ -358,3 +362,22 @@ def test_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100(model_name): :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` """ check_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, n_layer=4) + + +@pytest.mark.on_qaic +@pytest.mark.qnn +@pytest.mark.skip(reason="Whisper is currently not supported on QNN") +@pytest.mark.parametrize("model_name", test_models) +def test_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100_qnn(model_name): + """ + QNN Compilation path test. + Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. + ``Mandatory`` Args: + :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` + """ + qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json") + create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG) + + check_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100( + model_name=model_name, n_layer=4, enable_qnn=True, qnn_config=qnn_config_json_path + )
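# Illustrative usage sketch (not part of the patch): with this change, QNN compilation is requested
# through compiler_options instead of dedicated compile() arguments, mirroring the updated tests above.
# The model card and paths below are examples only.
import os

from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM
from QEfficient.utils._utils import create_json
from QEfficient.utils.constants import QnnConstants

qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json")
create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG)  # sample config added in constants.py above

model = QEFFAutoModelForCausalLM.from_pretrained("gpt2")
model.compile(
    prefill_seq_len=32,
    ctx_len=128,
    num_cores=16,
    mxfp6_matmul=False,
    enable_qnn=True,  # routed through **compiler_options into QEFFBaseModel._compile()
    qnn_config=qnn_config_json_path,
)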