diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py index a9fd8d663..119ec599f 100644 --- a/QEfficient/base/modeling_qeff.py +++ b/QEfficient/base/modeling_qeff.py @@ -24,7 +24,6 @@ from QEfficient.compile.qnn_compiler import compile as qnn_compile from QEfficient.generation.cloud_infer import QAICInferenceSession from QEfficient.utils import constants, dump_qconfig -from QEfficient.utils._utils import load_json from QEfficient.utils.cache import QEFF_HOME, to_hashable logger = logging.getLogger(__name__) @@ -98,7 +97,11 @@ def compile(self, *args, **kwargs) -> Path: :num_cores (int): Number of cores to utilize in each device ``Defaults to 16``. :mxfp6_matmul (bool): Use MXFP6 to compress weights for MatMul nodes to run faster on device. ``Defaults to False``. :mxint8_kv_cache (bool): Use MXINT8 to compress KV-cache on device to access and update KV-cache faster. ``Defaults to False``. - :compiler_options: Pass any compiler option as input. Any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below: + :compiler_options: Pass any compiler option as input. + The following flags can be passed in compiler_options to enable the QNN compilation path. + :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False if not passed.`` + :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None if not passed.`` + For the QAIC compilation path, any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below: - aic_num_cores=16 -> -aic-num-cores=16 - convert_to_fp16=True -> -convert-to-fp16 @@ -217,10 +220,13 @@ def _compile( onnx_path: Optional[str] = None, compile_dir: Optional[str] = None, *, + mxint8_kv_cache: bool = False, specializations: Optional[List[Dict[str, int]]] = None, custom_io: Optional[Dict[str, str]] = None, mdp_ts_num_devices: int = 1, num_speculative_tokens: Optional[int] = None, + enable_qnn: Optional[bool] = False, + qnn_config: Optional[str] = None, **compiler_options, ) -> str: """ @@ -229,10 +235,13 @@ def _compile( Args: :onnx_path (str): Onnx file to compile :compile_dir (str): Directory path to compile the qpc. A suffix is added to the directory path to avoid reusing same qpc for different parameters. + :mxint8_kv_cache (bool, optional): Whether to use ``mxint8`` compression for KV cache. ``Defaults to False``. :specializations (list): List of specializations to compile for :custom_io (dict): Custom IO to specify the input and outputs in different formats than default :mdp_ts_num_devices (int): Number of devices to partition to use Multi-Device Partitioning with tensor-slicing. :num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model. + :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.`` + :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.`` :compiler_options: Pass any compiler option as input. Any flag that is supported by `qaic-exec` can be passed. 
Params are converted to flags as below: - aic_num_cores=16 -> -aic-num-cores=16 - convert_to_fp16=True -> -convert-to-fp16 @@ -245,6 +254,22 @@ def _compile( qpc_path = compile_dir / "qpc" if not onnx_path.is_file(): raise FileNotFoundError(f"ONNX file not found at: {onnx_path}") + + if enable_qnn: + self.qpc_path = qnn_compile( + onnx_path=onnx_path, + qpc_base_path=compile_dir, + specializations=specializations, + custom_io=custom_io, + device_group=list(range(mdp_ts_num_devices)), + num_cores=compiler_options.get("aic_num_cores", 16), + mxfp6=compiler_options.get("mxfp6_matmul", False), + mxint8=mxint8_kv_cache, + qnn_config=qnn_config, + ) + + return self.qpc_path + command = constants.COMPILER + [f"-m={onnx_path}"] if mdp_ts_json_path := compiler_options.pop("mdp_ts_json_path", None): mdp_ts_num_devices = None @@ -339,104 +364,3 @@ def _compile( self.qpc_path = qpc_path return qpc_path - - @dump_qconfig - def _qnn_compile( - self, - onnx_path: Optional[str] = None, - compile_dir: Optional[str] = None, - *, - specializations: Optional[List[Dict[str, int]]] = None, - prefill_seq_len: int = 32, - ctx_len: int = 128, - batch_size: int = 1, - full_batch_size: Optional[int] = None, - mdp_ts_num_devices: int = 1, - num_cores: int = 16, - mxfp6_matmul: bool = False, - mxint8_kv_cache: bool = False, - qnn_config: Optional[str] = None, - kv_cache_batch_size: Optional[int] = None, - ) -> str: - """ - Interface for QNN compiler - - Args: - :onnx_path (str): Onnx file to compile - :compile_dir (str): Directory path to compile the qpc. A suffix is added to the directory path to avoid reusing same qpc for different parameters. - :specializations (list): List of specializations to compile for - :prefill_seq_len (int, optional): The length of the Prefill prompt should be less that ``prefill_seq_len``. ``Defaults to 32``. - :ctx_len (int, optional): Maximum ``ctx`` that the compiled model can remember. ``Defaults to 128``. - :batch_size (int, optional): Batch size. ``Defaults to 1``. - :full_batch_size (int, optional): Continuous batching batch size. - :mdp_ts_num_devices (int): Number of devices to partition to use Multi-Device Partitioning with tensor-slicing. - :num_cores (int): Number of cores used to compile the model. - :mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to True``. - :mxint8_kv_cache (bool, optional): Whether to use ``mxint8`` compression for KV cache. ``Defaults to False``. - :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.`` - :kv_cache_batch_size (int): kv_cache_batch_size for Prefix Caching. 
``Defaults to None.`` - """ - if onnx_path is None and self.onnx_path is None: - self.export() - - onnx_path = Path(onnx_path or self.onnx_path) - compile_dir = Path(compile_dir or onnx_path.parent) - qpc_path = compile_dir / "qpc" - if not onnx_path.is_file(): - raise FileNotFoundError(f"ONNX file not found at: {onnx_path}") - - compile_hash = hashlib.sha256(to_hashable("qnn")) - - if specializations is not None: - compile_hash.update(to_hashable(specializations)) - - if qnn_config is not None: - qnn_config_values = load_json(qnn_config) - compile_hash.update(to_hashable(qnn_config_values)) - - if mdp_ts_num_devices > 1: - compile_hash.update(to_hashable({"mdp_ts_num_devices": mdp_ts_num_devices})) - - compile_hash.update(to_hashable({"num_cores": num_cores})) - compile_hash.update(to_hashable({"mxfp6_matmul": mxfp6_matmul})) - compile_hash.update(to_hashable({"mxint8_kv_cache": mxint8_kv_cache})) - - # Check if already compiled - compile_hash = compile_hash.hexdigest()[:16] - qpc_path = qpc_path.with_name(qpc_path.name + "-" + compile_hash) - if qpc_path.is_dir(): - if (qpc_path / "programqpc.bin").is_file(): - self.qpc_path = qpc_path - return qpc_path - # Probably compilation failure last time, delete directory to start over - shutil.rmtree(qpc_path) - - # Write specializations.json file - if specializations is not None: - specializations_json = compile_dir / "specializations.json" - with open(specializations_json, "w") as fp: - json.dump( - {"specializations": [{k: str(v) for k, v in spec.items()} for spec in specializations]}, - fp, - indent=4, - ) - - qnn_compile( - onnx_path=onnx_path, - qpc_base_path=compile_dir, - num_cores=num_cores, - device_group=list(range(mdp_ts_num_devices)), - batch_size=batch_size, - prompt_len=prefill_seq_len, - ctx_len=ctx_len, - mxfp6=mxfp6_matmul, - mxint8=mxint8_kv_cache, - full_batch_size=full_batch_size, - qnn_config=qnn_config, - qnn_binary_dir=qpc_path, - kv_cache_batch_size=kv_cache_batch_size, - ) - - self.qpc_path = qpc_path - - return qpc_path diff --git a/QEfficient/compile/compile_helper.py b/QEfficient/compile/compile_helper.py index ae86b493a..5ce22bed9 100644 --- a/QEfficient/compile/compile_helper.py +++ b/QEfficient/compile/compile_helper.py @@ -13,6 +13,7 @@ from typing import List, Optional, Tuple from QEfficient.compile.qnn_compiler import compile as qnn_compile +from QEfficient.utils._utils import load_json, load_yaml from QEfficient.utils.logging_utils import logger @@ -180,36 +181,35 @@ def compile( full_batch_size=full_batch_size, ) + # Select the customIO config based on the mx flag. + custom_io_file_name = "custom_io_int8.yaml" if mxint8 else "custom_io_fp16.yaml" + + if custom_io_file_path is None: + custom_io_file_path = os.path.join(os.path.dirname(onnx_path), custom_io_file_name) + + if not os.path.isfile(custom_io_file_path): + raise FileNotFoundError( + f"Custom IO file {custom_io_file_name} is not present at the expected path {custom_io_file_path}. 
Please pass the correct file path or rerun infer/export API" + ) + if enable_qnn: qpc_path = qnn_compile( onnx_path=onnx_path, qpc_base_path=qpc_path, + qnn_binary_dir=os.path.join(qpc_path, "qpcs"), num_cores=num_cores, - batch_size=batch_size, - prompt_len=prompt_len, - ctx_len=ctx_len, mxfp6=mxfp6, mxint8=mxint8, allow_mxint8_mdp_io=allow_mxint8_mdp_io, aic_enable_depth_first=aic_enable_depth_first, mos=mos, device_group=device_group, - full_batch_size=full_batch_size, qnn_config=qnn_config, + specializations=(load_json(specialization_json_path))["specializations"], + custom_io=load_yaml(custom_io_file_path), ) logger.info(f"QNN Compiled QPC files can be found here: {qpc_path}") else: - # Select the customIO config based on the mx flag. - custom_io_file_name = "custom_io_int8.yaml" if mxint8 else "custom_io_fp16.yaml" - - if custom_io_file_path is None: - custom_io_file_path = os.path.join(os.path.dirname(onnx_path), custom_io_file_name) - - if not os.path.isfile(custom_io_file_path): - raise FileNotFoundError( - f"Custom IO file {custom_io_file_name} is not present at the expected path {custom_io_file_path}. Please pass the correct file path or rerun infer/export API" - ) - _, qpc_path = compile_kv_model_on_cloud_ai_100( onnx_path=onnx_path, specializations_json=specialization_json_path, diff --git a/QEfficient/compile/qnn_compiler.py b/QEfficient/compile/qnn_compiler.py index e6758c826..e4b8f854b 100644 --- a/QEfficient/compile/qnn_compiler.py +++ b/QEfficient/compile/qnn_compiler.py @@ -5,13 +5,19 @@ # # ----------------------------------------------------------------------------- +import hashlib +import json import os import shutil -from typing import List, Optional +from typing import Dict, List, Optional from QEfficient.utils._utils import create_json, execute_command, load_json +from QEfficient.utils.cache import to_hashable from QEfficient.utils.constants import QnnConstants -from QEfficient.utils.generate_qnn_network_specialization_config import fetch_nodes_info, generate_data_format_config +from QEfficient.utils.generate_qnn_network_specialization_config import ( + generate_data_format_config, + generate_qnn_specialization, +) from QEfficient.utils.logging_utils import logger @@ -31,15 +37,13 @@ def __init__( device_group: Optional[List[int]] = None, compiler_enable_depth_first: bool = False, compiler_max_out_channel_split: int = -1, - batch_size: int = 1, - prompt_len: int = 32, - ctx_len: int = 128, compiler_mxfp6_matmul_weights: bool = True, qnn_target: str = QnnConstants.TARGET, qnn_config_path: Optional[str] = None, qnn_binary_dir: Optional[str] = None, mxint8: Optional[bool] = False, compiler_mxint8_mdp_io: Optional[bool] = False, + prefill_only: Optional[bool] = False, **kwargs, ) -> None: self.onnx_path = onnx_path @@ -48,9 +52,6 @@ def __init__( self.device_group = device_group self.compiler_enable_depth_first = compiler_enable_depth_first self.compiler_max_out_channel_split = compiler_max_out_channel_split - self.batch_size = batch_size - self.prompt_len = prompt_len - self.ctx_len = ctx_len self.compiler_mxfp6_matmul_weights = compiler_mxfp6_matmul_weights self.qnn_config_path = qnn_config_path self.qnn_binary_dir = qnn_binary_dir @@ -59,6 +60,7 @@ def __init__( self.custom_io_path = custom_io_path self.dlc_model_path = os.path.join(qpc_base_path, f"{QnnConstants.MODEL_NAME}.dlc") self.qnn_target = qnn_target + self.prefill_only = prefill_only self.qnn_sdk_path = os.getenv(QnnConstants.QNN_SDK_PATH_ENV_VAR_NAME) if not self.qnn_sdk_path: raise EnvironmentError( @@ 
-141,7 +143,7 @@ def create_qnn_compile_backend_json(self) -> str: "compiler_hardware_version": QnnConstants.COMPILER_HARDWARE_VERSION, "compiler_convert_to_FP16": QnnConstants.COMPILER_CONVERT_TO_FP16, "compiler_retained_state": QnnConstants.COMPILER_RETAINED_STATE, - "graph_names": QnnConstants.GRAPH_NAMES, + "graph_names": QnnConstants.GRAPH_NAMES_PREFILL_ONLY if self.prefill_only else QnnConstants.GRAPH_NAMES, "compiler_enable_depth_first": self.compiler_enable_depth_first, "compiler_mxfp6_matmul_weights": self.compiler_mxfp6_matmul_weights, "compiler_num_of_cores": self.num_cores, @@ -327,16 +329,13 @@ def compile( device_group: Optional[List[int]] = None, aic_enable_depth_first: bool = False, mos: int = -1, - batch_size: int = 1, - prompt_len: int = 32, - ctx_len: int = 128, mxfp6: bool = True, mxint8: bool = False, allow_mxint8_mdp_io: Optional[bool] = False, - full_batch_size=None, qnn_config: Optional[str] = None, qnn_binary_dir: Optional[str] = None, - kv_cache_batch_size: Optional[int] = None, + custom_io: Optional[Dict[str, str]] = None, + specializations: Optional[List[Dict[str, int]]] = None, **kwargs, ) -> str: """ @@ -352,16 +351,13 @@ def compile( :device_group (List[int]): Used for finding the number of devices to compile for. :aic_enable_depth_first (bool): Enables ``DFS`` with default memory size. ``Defaults to False.`` :mos (int): Effort level to reduce the on-chip memory. ``Defaults to -1.`` - :batch_size (int): Batch size to compile the model for. ``Defaults to 1.`` - :full_batch_size (int): Set full batch size to enable continuous batching mode. ``Default to None`` - :prompt_len (int): Prompt length for the model to compile. ``Defaults to 32`` - :ctx_len (int): Maximum context length to compile the model. ``Defaults to 128`` :mxfp6 (bool): Enable compilation for ``MXFP6`` precision. ``Defaults to True.`` - :allow_mxint8_mdp_io (bool): Allows MXINT8 compression of MDP IO traffic ``Defaults to False.`` :mxint8 (bool): Compress Present/Past KV to ``MXINT8`` using ``CustomIO`` config. ``Defaults to False.`` + :allow_mxint8_mdp_io (bool): Allows MXINT8 compression of MDP IO traffic ``Defaults to False.`` :qnn_config (str): Path to ``qnn_config.json`` file (formatted as a string). ``Defaults to None.`` :qnn_binary_dir (str): Path for saving qnn binaries. - :kv_cache_batch_size (int): kv_cache_batch_size for Prefix Caching. ``Defaults to None.`` + :custom_io (dict): Custom IO to specify the input and outputs in different formats than default + :specializations (list): List of specializations to compile for Returns: :str: Path to compiled ``qpc`` package. @@ -377,16 +373,11 @@ def compile( # TODO To make custom_io_config.yaml configurable as not all models need it. custom_io_file_path = os.path.join(qpc_base_path, "custom_io_config.yaml") - kv_precision = "uint8" if mxint8 else "float16" - fetch_nodes_info( + generate_qnn_specialization( onnx_graph_path=onnx_path, - batch_size=batch_size, - sequence_length=prompt_len, - context_length=ctx_len, + specializations=specializations, + custom_io=custom_io, file_path=custom_io_file_path, - full_batch_size=full_batch_size, - kv_precision=kv_precision, - kv_cache_batch_size=kv_cache_batch_size, ) if not os.path.isfile(custom_io_file_path): @@ -394,6 +385,49 @@ def compile( f"file {custom_io_file_path} needs to exist in the qpc_base_path for Compilation. 
Please rerun infer/compile Api" ) + prefill_only = True if len(specializations) == 1 else False + + if qnn_binary_dir is None: + compile_hash = hashlib.sha256(to_hashable("qnn")) + + if specializations is not None: + compile_hash.update(to_hashable(specializations)) + + if custom_io is not None: + compile_hash.update(to_hashable(custom_io)) + + if qnn_config is not None: + qnn_config_values = load_json(qnn_config) + compile_hash.update(to_hashable(qnn_config_values)) + + if device_group is not None: + compile_hash.update(to_hashable({"device_group": device_group})) + + compile_hash.update(to_hashable({"num_cores": num_cores})) + compile_hash.update(to_hashable({"mxfp6": mxfp6})) + compile_hash.update(to_hashable({"mxint8": mxint8})) + + # Check if already compiled + compile_hash = compile_hash.hexdigest()[:16] + + qnn_binary_dir = qpc_base_path / "qpc" + qnn_binary_dir = qnn_binary_dir.with_name(qnn_binary_dir.name + "-" + compile_hash) + if qnn_binary_dir.is_dir(): + if (qnn_binary_dir / "programqpc.bin").is_file(): + return qnn_binary_dir + # Probably compilation failure last time, delete directory to start over + shutil.rmtree(qnn_binary_dir) + + # Write specializations.json file + if specializations is not None: + specializations_json = qpc_base_path / "specializations.json" + with open(specializations_json, "w") as fp: + json.dump( + {"specializations": [{k: str(v) for k, v in spec.items()} for spec in specializations]}, + fp, + indent=4, + ) + qnn_obj = QNN( onnx_path=onnx_path, qpc_base_path=qpc_base_path, @@ -403,13 +437,11 @@ def compile( custom_io_path=custom_io_file_path, compiler_enable_depth_first=aic_enable_depth_first, compiler_max_out_channel_split=mos, - batch_size=batch_size, - prompt_len=prompt_len, - ctx_len=ctx_len, compiler_mxfp6_matmul_weights=mxfp6, qnn_binary_dir=qnn_binary_dir, mxint8=mxint8, compiler_mxint8_mdp_io=allow_mxint8_mdp_io, + prefill_only=prefill_only, ) compiled_binary_path = qnn_obj.compile() diff --git a/QEfficient/peft/auto.py b/QEfficient/peft/auto.py index deb64fae1..12c814ce6 100644 --- a/QEfficient/peft/auto.py +++ b/QEfficient/peft/auto.py @@ -251,6 +251,7 @@ def compile( custom_io=custom_io, mdp_ts_num_devices=num_devices, aic_num_cores=num_cores, + mxint8_kv_cache=mxint8_kv_cache, **compiler_options, ) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 0182c4ef1..04f54047d 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -603,23 +603,14 @@ def compile( mxfp6_matmul: bool = False, mxint8_kv_cache: bool = False, num_speculative_tokens: Optional[int] = None, - enable_qnn: bool = False, - qnn_config: Optional[str] = None, skip_vision: Optional[bool] = False, skip_lang: Optional[bool] = False, **compiler_options, ) -> str: - if ( - any( - param is not None - for param in [full_batch_size, kv_cache_batch_size, num_speculative_tokens, qnn_config] - ) - or enable_qnn - ): + if any(param is not None for param in [full_batch_size, kv_cache_batch_size, num_speculative_tokens]): raise ValueError( - f"Expected 'full_batch_size', 'kv_cache_batch_size', 'num_speculative_tokens', and 'qnn_config' to be None, and 'enable_qnn' to be False but got: " + f"Expected 'full_batch_size', 'kv_cache_batch_size', 'num_speculative_tokens' to be None but got: " f"full_batch_size={full_batch_size}, kv_cache_batch_size={kv_cache_batch_size}, num_speculative_tokens={num_speculative_tokens}, " - f"enable_qnn={enable_qnn}, 
qnn_config={qnn_config}" ) if skip_lang and skip_vision: @@ -662,6 +653,7 @@ def compile( mdp_ts_num_devices=num_devices, aic_num_cores=num_cores, custom_io=custom_io_vision, + mxint8_kv_cache=mxint8_kv_cache, **compiler_options, ) @@ -687,6 +679,7 @@ def compile( mdp_ts_num_devices=num_devices, aic_num_cores=num_cores, custom_io=custom_io_lang, + mxint8_kv_cache=mxint8_kv_cache, **compiler_options, ) return self.qpc_path @@ -930,21 +923,12 @@ def compile( mxfp6_matmul: bool = False, mxint8_kv_cache: bool = False, num_speculative_tokens: Optional[int] = None, - enable_qnn: bool = False, - qnn_config: Optional[str] = None, **compiler_options, ) -> str: - if ( - any( - param is not None - for param in [full_batch_size, kv_cache_batch_size, num_speculative_tokens, qnn_config] - ) - or enable_qnn - ): + if any(param is not None for param in [full_batch_size, kv_cache_batch_size, num_speculative_tokens]): raise ValueError( - f"Expected 'full_batch_size', 'kv_cache_batch_size', 'num_speculative_tokens', and 'qnn_config' to be None, and 'enable_qnn' to be False but got: " + f"Expected 'full_batch_size', 'kv_cache_batch_size', 'num_speculative_tokens' to be None but got: " f"full_batch_size={full_batch_size}, kv_cache_batch_size={kv_cache_batch_size}, num_speculative_tokens={num_speculative_tokens}, " - f"enable_qnn={enable_qnn}, qnn_config={qnn_config}" ) output_names = self.model.get_output_names() @@ -982,6 +966,7 @@ def compile( custom_io=custom_io, mdp_ts_num_devices=num_devices, aic_num_cores=num_cores, + mxint8_kv_cache=mxint8_kv_cache, **compiler_options, ) return self.qpc_path @@ -1540,8 +1525,6 @@ def compile( mxfp6_matmul: bool = False, mxint8_kv_cache: bool = False, num_speculative_tokens: Optional[int] = None, - enable_qnn: bool = False, - qnn_config: Optional[str] = None, prefill_only: Optional[bool] = None, **compiler_options, ) -> str: @@ -1564,10 +1547,14 @@ def compile( :num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model. :mos (int, optional): Effort level to reduce on-chip memory. Defaults to -1, meaning no effort. ``Defaults to -1``. :aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``. - :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.`` - :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.`` :prefill_only (bool): if ``True`` compile for prefill only and if ``False`` compile for decode only. Defaults to None, which compiles for both ``prefill`` and ``decode``. - :compiler_options (dict, optional): Any other options that the `qaic-exec` takes. ``Defaults to None``. + :compiler_options (dict, optional): Pass any compiler option as input. ``Defaults to None``. + The following flags can be passed in compiler_options to enable the QNN compilation path. + :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False if not passed.`` + :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None if not passed.`` + For the QAIC compilation path, any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below: + - aic_num_cores=16 -> -aic-num-cores=16 + - convert_to_fp16=True -> -convert-to-fp16 Returns: :str: Path of the compiled ``qpc`` package. @@ -1616,48 +1603,29 @@ def compile( specializations.append(decode_spec) # --- Compilation --- - if enable_qnn: - if compiler_options: - logger.warning("Extra arguments to QNN compilation are ignored. 
Use `qnn_config.json`.") - - qpc_path = self._qnn_compile( - onnx_path=onnx_path, - compile_dir=compile_dir, - specializations=specializations, - prefill_seq_len=prefill_seq_len, - ctx_len=ctx_len, - batch_size=batch_size, - full_batch_size=full_batch_size, - mdp_ts_num_devices=num_devices, - num_cores=num_cores, - mxfp6_matmul=mxfp6_matmul, - mxint8_kv_cache=mxint8_kv_cache, - qnn_config=qnn_config, - kv_cache_batch_size=kv_cache_batch_size, - ) - else: - kv_cache_dtype = "mxint8" if mxint8_kv_cache else "float16" - custom_io = {} + kv_cache_dtype = "mxint8" if mxint8_kv_cache else "float16" + custom_io = {} - for suffix in ["", "_RetainedState"]: - for i in range(self.num_layers): - for kv in ["key", "value"]: - custom_io[f"past_{kv}.{i}{suffix}"] = kv_cache_dtype + for suffix in ["", "_RetainedState"]: + for i in range(self.num_layers): + for kv in ["key", "value"]: + custom_io[f"past_{kv}.{i}{suffix}"] = kv_cache_dtype - qpc_path = self._compile( - onnx_path=onnx_path, - compile_dir=compile_dir, - compile_only=True, - retained_state=True, - specializations=specializations, - convert_to_fp16=True, - mxfp6_matmul=mxfp6_matmul, - custom_io=custom_io, - mdp_ts_num_devices=num_devices, - num_speculative_tokens=num_speculative_tokens, - aic_num_cores=num_cores, - **compiler_options, - ) + qpc_path = self._compile( + onnx_path=onnx_path, + compile_dir=compile_dir, + compile_only=True, + retained_state=True, + specializations=specializations, + convert_to_fp16=True, + mxfp6_matmul=mxfp6_matmul, + custom_io=custom_io, + mdp_ts_num_devices=num_devices, + num_speculative_tokens=num_speculative_tokens, + aic_num_cores=num_cores, + mxint8_kv_cache=mxint8_kv_cache, + **compiler_options, + ) return qpc_path @@ -1802,8 +1770,6 @@ def compile( mxfp6_matmul: bool = False, mxint8_kv_cache: bool = False, num_speculative_tokens: Optional[int] = None, - enable_qnn: bool = False, - qnn_config: Optional[str] = None, **compiler_options, ) -> str: """ @@ -1845,9 +1811,6 @@ def compile( if num_speculative_tokens: logger.warning("Speculative decoding is not yet enabled for AutoModelForSpeechSeq2Seq") - if enable_qnn or qnn_config: - logger.warning("QNN compile is not yet enabled for AutoModelForSpeechSeq2Seq") - return self._compile( onnx_path, compile_dir, diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py index dd4ddd0cf..ea09e97d7 100644 --- a/QEfficient/utils/_utils.py +++ b/QEfficient/utils/_utils.py @@ -393,6 +393,26 @@ def execute_command(process: str, command: str, output_file_path: Optional[str] print(f"Failed to create {stderr_path}: {e}") +def load_yaml(file_path: str) -> Dict[Any, Any]: + """ + Opens the given YAML file, load and return the Dict. + + ``Mandatory`` Args: + :file_path (str): YAML File to be opened. + + Return: + Dict Object from the given file. + + """ + try: + # Load the YAML config file + with open(file_path, "r") as file: + config_data = yaml.safe_load(file) + except Exception as e: + raise ValueError(f"Failed to load YAML object from {file_path}: {e}") + return config_data + + def load_json(file_path: str) -> Dict[Any, Any]: """ Opens the given JSON file, load and return the JSON object. 
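# Illustrative sketch (not part of the patch): how the new `load_yaml` helper added above is expected
# to be used together with `load_json` to feed the QNN path in compile_helper.compile(). The file
# paths below are hypothetical placeholders.
from QEfficient.utils._utils import load_json, load_yaml

specializations = load_json("qpc_dir/specializations.json")["specializations"]  # list of dicts, e.g. [{"batch_size": "1", "seq_len": "32", "ctx_len": "128"}]
custom_io = load_yaml("onnx_dir/custom_io_fp16.yaml")  # datatype overrides generated at export time
# Both objects are then forwarded to QEfficient.compile.qnn_compiler.compile() as
# specializations=... and custom_io=..., matching the compile_helper.py hunk above.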
diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index 1180b35d0..b1ff9701e 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -144,6 +144,7 @@ class QnnConstants: COMPILER_STATS_BATCH_SIZE = 1 COMPILER_TIME_PASSES = False GRAPH_NAMES = [f"{MODEL_NAME}_configuration_1", f"{MODEL_NAME}_configuration_2"] + GRAPH_NAMES_PREFILL_ONLY = [f"{MODEL_NAME}"] # qnn_config JSON file supported Keys CONVERTER_ARGS_EXTENSION_STR = "converter_args_extension" @@ -170,3 +171,14 @@ class QnnConstants: "--dlc_path ", "--config_file ", ] + + QNN_SAMPLE_CONFIG = { + "converter_args_extension": "--onnx_defer_loading", + "context_binary_generator_args_extension": "--log_level debug", + "qnn_compilation_backend": { + "compiler_enable_depth_first": True, + "compiler_printDDRStats": False, + "compiler_printPerfMetrics": False, + }, + "SKIP_QNN_CONVERTER_STEP": False, + } diff --git a/QEfficient/utils/generate_qnn_network_specialization_config.py b/QEfficient/utils/generate_qnn_network_specialization_config.py index fe72918dc..1a437af78 100644 --- a/QEfficient/utils/generate_qnn_network_specialization_config.py +++ b/QEfficient/utils/generate_qnn_network_specialization_config.py @@ -6,7 +6,7 @@ # ----------------------------------------------------------------------------- import json -from typing import Optional +from typing import Dict, List, Optional import onnx import yaml @@ -17,15 +17,11 @@ """ -def fetch_nodes_info( +def generate_qnn_specialization( onnx_graph_path: str, - batch_size: int, - sequence_length: int, - context_length: int, + specializations: List[Dict[str, int]], + custom_io: Optional[Dict[str, str]] = None, file_path: str = "custom_io_config.yaml", - full_batch_size: Optional[int] = None, - kv_precision: Optional[str] = "float16", - kv_cache_batch_size: Optional[int] = None, ) -> None: """ Generates network specialization config custom IO file for converter stage in QNN compilation. @@ -34,134 +30,113 @@ def fetch_nodes_info( ``Mandatory`` Args: :onnx_graph_path (str): Generated ``ONNX`` Model Path. - :batch_size (int): Batch size to compile the model for. - :sequence_length (int): Sequence length for the model to compile. - :context_length (int): Maximum context length to compile the model. + :specializations (List[Dict[str, int]]): List of specializations containing compilation parameter values. ``Optional`` Args: + :custom_io (Dict[str, str]): Custom IO containing overriding datatype information for onnx graph nodes used in compilation. :file_path (str): File path to save the generated custom IO config. ``Defaults to custom_io_config.yaml.`` - :full_batch_size (int): Set full batch size to enable continuous batching mode. ``Default to None`` - :kv_precision (str): Sets kv precision for compilation. ``Defaults to float16.`` - :kv_cache_batch_size (int): kv_cache_batch_size for Prefix Caching. ``Defaults to None.`` """ - # Load the ONNX model + # Load the ONNX model graph onnx_model = onnx.load(onnx_graph_path) - input_nodes = [] input_nodes_info = [] final_dict = {} - output_nodes = [] output_nodes_info = [] + + # Populating input graph nodes information. for node in onnx_model.graph.input: - input_nodes.append(node.name) input_info = {} + + # Assigning the data type as per the ONNX graph input. 
input_info["DataType"] = str(helper.tensor_dtype_to_np_dtype(node.type.tensor_type.elem_type)) - if "past_key" in node.name or "past_value" in node.name: - input_info["DataType"] = kv_precision - if "batch_index" in node.name: - if full_batch_size: - input_info["Shape"] = f"(1, 1), ({full_batch_size}, 1)" + # Overriding the data type according to custom_io (if provided). + if custom_io is not None and node.name in custom_io: + input_info["DataType"] = "uint8" if custom_io[node.name] == "mxint8" else custom_io[node.name] + + # Create the shapes list for the input node. + shapes = [] + for input_shape in node.type.tensor_type.shape.dim: + if input_shape.HasField("dim_value"): + shape = input_shape.dim_value + elif input_shape.HasField("dim_param"): + shape = input_shape.dim_param else: - raise AttributeError( - "ERROR: Full batch size is required for populating batch_index in custom_io_config.yaml" - ) - else: - shapes = [] - for input_shape in node.type.tensor_type.shape.dim: - if input_shape.HasField("dim_value"): - shape = input_shape.dim_value - elif input_shape.HasField("dim_param"): - shape = input_shape.dim_param + raise AttributeError(f"ERROR: {input_shape} Shape not Found") + shapes.append(shape) + + # Fill shape values for nodes whose rank is not 2, for example past_key / past_value nodes. + if len(shapes) != 2: + shape_list = [] + for input_shape in shapes: + # If the shape entry is a parameter string, its value is taken from the specializations. + if isinstance(input_shape, str): + if input_shape in specializations[0]: + shape_list.append(int(specializations[0][input_shape])) + else: + raise AttributeError(f"ERROR: {input_shape} is required in specializations") + # If the shape entry already holds a value, that value is used as is. else: - shape = "shape_not_found" - shapes.append(shape) - - if ( - ("batch_size" in shapes or "full_batch_size" in shapes) - and ("ctx_len" in shapes or "max_context_len" in shapes) - and len(shapes) >= 3 - ): - shapeList = [] - for shape in shapes: - if isinstance(shape, str): - if "full_batch_size" in shape: - if ("past_key" in node.name or "past_value" in node.name) and kv_cache_batch_size: - shapeList.append(kv_cache_batch_size) - elif full_batch_size: - shapeList.append(full_batch_size) - else: - raise AttributeError( - "ERROR: Full batch size is required to generate custom_io_config.yaml" - ) - elif "batch_size" in shape: - shapeList.append(batch_size) - elif shape in ["ctx_len", "max_context_len"]: - shapeList.append(context_length) + shape_list.append(input_shape) + + # The calculated shape is assigned to the input node. + input_info["Shape"] = str(shape_list).replace("[", "(").replace("]", ")") + # For nodes whose rank is 2, for example input_ids, position_ids, etc. 
+ else: + shape_list = [] + for input_shape in shapes: + if isinstance(input_shape, str): + if input_shape in specializations[0]: + shape_list.append(int(specializations[0][input_shape])) else: - shapeList.append(shape) - shape = str(shapeList).replace("[", "(").replace("]", ")") - elif "batch_size" in shapes and ("seq_len" in shapes or "prompt_len" in shapes): - shape_1 = ( - str( - [ - batch_size if isinstance(shape, str) and "batch_size" in shape else sequence_length - for shape in shapes - ] - ) - .replace("[", "(") - .replace("]", ")") - ) - if full_batch_size: - shape_2 = ( - str( - [ - full_batch_size if isinstance(shape, str) and "batch_size" in shape else 1 - for shape in shapes - ] - ) - .replace("[", "(") - .replace("]", ")") - ) + raise AttributeError(f"ERROR: {input_shape} is required in specializations") else: - shape_2 = ( - str([batch_size if isinstance(shape, str) and "batch_size" in shape else 1 for shape in shapes]) - .replace("[", "(") - .replace("]", ")") - ) - shape = shape_1 + "," + shape_2 - elif ("batch_size" in shapes or "full_batch_size" in shapes) and ( - "ctx_len" in shapes or "max_context_len" in shapes - ): - shape = ( - str( - [ - batch_size if isinstance(shape, str) and "batch_size" in shape else context_length - for shape in shapes - ] - ) - .replace("[", "(") - .replace("]", ")") + shape_list.append(input_shape) + # If the specializations list contains more than one set of parameters, the first set is used for the prefill graph and the second for the decode graph. + if len(specializations) > 1: + prefill_shape_list = shape_list + decode_shape_list = [] + for input_shape in shapes: + if isinstance(input_shape, str): + if input_shape in specializations[1]: + decode_shape_list.append(int(specializations[1][input_shape])) + else: + raise AttributeError(f"ERROR: {input_shape} is required in specializations") + else: + decode_shape_list.append(input_shape) + + input_info["Shape"] = ( + str(prefill_shape_list).replace("[", "(").replace("]", ")") + + ", " + + str(decode_shape_list).replace("[", "(").replace("]", ")") ) - input_info["Shape"] = shape + + # If the specializations list contains only one set of parameters, that set is used for the decode graph information. + else: + input_info["Shape"] = str(shape_list).replace("[", "(").replace("]", ")") + + # Finally, the input node entry is created with its name and desired model parameters {DataType, Shape}. input_nodes_info.append({"Name": node.name, "Desired Model Parameters": input_info}) # Prepare output tensor configuration for output in onnx_model.graph.output: - output_nodes.append(output.name) output_info = {} + + # Assigning the data type as per the ONNX graph output. output_info["DataType"] = str(helper.tensor_dtype_to_np_dtype(output.type.tensor_type.elem_type)) - if "past_key" in output.name or "past_value" in output.name: - output_info["DataType"] = kv_precision - elif "logits" in output.name: - output_info["DataType"] = "float32" + + # Overriding the data type according to custom_io (if provided). 
+ if custom_io is not None and output.name in custom_io: + output_info["DataType"] = "uint8" if custom_io[output.name] == "mxint8" else custom_io[output.name] + + # Finally, output node is created with its name, and desired model parameters {DataType} output_nodes_info.append({"Name": output.name, "Desired Model Parameters": output_info}) - # Combine input and output configurations + # Combining input and output configurations final_dict = {"Input Tensor Configuration": input_nodes_info, "Output Tensor Configuration": output_nodes_info} - # Save the configuration to a YAML file + # Saving the configuration to a YAML file try: with open(file_path, "w") as yaml_file: yaml.dump(final_dict, yaml_file, default_flow_style=False, sort_keys=False) diff --git a/scripts/Jenkinsfile b/scripts/Jenkinsfile index 24113f9c8..26278c359 100644 --- a/scripts/Jenkinsfile +++ b/scripts/Jenkinsfile @@ -117,7 +117,7 @@ pipeline { } stage('QNN Non-CLI Tests') { steps { - timeout(time: 60, unit: 'MINUTES') { + timeout(time: 200, unit: 'MINUTES') { sh ''' sudo docker exec ${BUILD_TAG} bash -c " source /qnn_sdk/bin/envsetup.sh && diff --git a/tests/qnn_tests/test_causal_lm_models_qnn.py b/tests/qnn_tests/test_causal_lm_models_qnn.py deleted file mode 100644 index 65acab157..000000000 --- a/tests/qnn_tests/test_causal_lm_models_qnn.py +++ /dev/null @@ -1,176 +0,0 @@ -# ----------------------------------------------------------------------------- -# -# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. -# SPDX-License-Identifier: BSD-3-Clause -# -# ----------------------------------------------------------------------------- - -import os - -import numpy as np -import pytest -from transformers import AutoModelForCausalLM - -from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM -from QEfficient.transformers.quantizers.auto import replace_transformers_quantizers -from QEfficient.utils import hf_download -from QEfficient.utils._utils import load_hf_tokenizer -from QEfficient.utils.constants import Constants -from QEfficient.utils.device_utils import get_available_device_id -from QEfficient.utils.run_utils import ApiRunner - -test_models = [ - "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "gpt2", -] - - -def load_causal_lm_model(model_config): - """ - Function to load model from huggingface and transform to KV model - -------- - - :model_config: Dict - - :return model_hf, params - """ - model_path = hf_download( - repo_id=model_config["model_name"], - ignore_patterns=["*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.h5", "*.msgpack"], - ) - model_hf = AutoModelForCausalLM.from_pretrained( - model_path, - use_cache=True, - num_hidden_layers=model_config["n_layer"], - attn_implementation="eager", - low_cpu_mem_usage=False, - ) # Run models for single layers only - params = sum(p.numel() for p in model_hf.parameters()) - model_hf.eval() - return model_hf, params - - -def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( - model_name: str, - prompt_len: int = Constants.PROMPT_LEN, - ctx_len: int = Constants.CTX_LEN, - n_layer: int = 1, -): - """ - Validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. - ``Mandatory`` Args: - :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` - :prompt_len (int): Prompt length for the model to compile. - :ctx_len (int): Maximum context length to compile the model. - :n_layers (int): Number of layers for the Model. 
- """ - replace_transformers_quantizers() - model_config = {"model_name": model_name} - model_config["n_layer"] = n_layer - - model_hf, _ = load_causal_lm_model(model_config) - - tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=model_name) - config = model_hf.config - batch_size = len(Constants.INPUT_STR) - api_runner = ApiRunner( - batch_size, - tokenizer, - config, - Constants.INPUT_STR, - Constants.PROMPT_LEN, - Constants.CTX_LEN, - ) - - pytorch_hf_tokens = api_runner.run_hf_model_on_pytorch(model_hf) - - qeff_model = QEFFAutoModelForCausalLM(model_hf) - - pytorch_kv_tokens = api_runner.run_kv_model_on_pytorch(qeff_model.model) - - assert (pytorch_hf_tokens == pytorch_kv_tokens).all(), ( - "Tokens don't match for HF PyTorch model output and KV PyTorch model output" - ) - - onnx_model_path = qeff_model.export() - ort_tokens = api_runner.run_kv_model_on_ort(onnx_model_path) - - assert (pytorch_kv_tokens == ort_tokens).all(), "Tokens don't match for ONNXRT output and PyTorch output." - - if not get_available_device_id(): - pytest.skip("No available devices to run model on Cloud AI 100") - - qpc_path = qeff_model.compile( - prefill_seq_len=prompt_len, - ctx_len=ctx_len, - num_cores=14, - mxfp6=False, - aic_enable_depth_first=False, - enable_qnn=True, - ) - assert os.path.isfile(os.path.join(os.path.dirname(qpc_path), "qconfig.json")) - exec_info = qeff_model.generate(tokenizer, prompts=Constants.INPUT_STR) - cloud_ai_100_tokens = exec_info.generated_ids[0] # Because we always run for single input and single batch size - gen_len = ort_tokens.shape[-1] - assert (ort_tokens == cloud_ai_100_tokens[:, :gen_len]).all(), ( - "Tokens don't match for ONNXRT output and Cloud AI 100 output." - ) - - # testing for CB models - model_hf, _ = load_causal_lm_model(model_config) - full_batch_size = 4 - fbs_prompts = Constants.INPUT_STR * 4 - api_runner = ApiRunner( - batch_size, - tokenizer, - config, - fbs_prompts, - Constants.PROMPT_LEN, - Constants.CTX_LEN, - full_batch_size, - ) - - pytorch_hf_tokens = api_runner.run_hf_model_on_pytorch_CB(model_hf) - pytorch_hf_tokens = np.vstack(pytorch_hf_tokens) - - qeff_model = QEFFAutoModelForCausalLM(model_hf, continuous_batching=True) - onnx_model_path = qeff_model.export() - - if not get_available_device_id(): - pytest.skip("No available devices to run model on Cloud AI 100") - - qpc_path = qeff_model.compile( - prefill_seq_len=prompt_len, - ctx_len=ctx_len, - num_cores=14, - mxfp6=False, - aic_enable_depth_first=False, - full_batch_size=full_batch_size, - enable_qnn=True, - ) - assert os.path.isfile(os.path.join(os.path.dirname(qpc_path), "qconfig.json")) - exec_info_fbs = qeff_model.generate(tokenizer, prompts=fbs_prompts) - - assert all( - [ - all(pt_token[:24] == cloud_token[:24]) - for pt_token, cloud_token in zip(pytorch_hf_tokens, exec_info_fbs.generated_ids) - ] - ), "Tokens don't match for HF PyTorch model output and Cloud AI 100 output." - - -@pytest.mark.on_qaic -@pytest.mark.qnn -@pytest.mark.parametrize("model_name", test_models) -def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name): - """ - Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. 
- ``Mandatory`` Args: - :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` - """ - if model_name == "microsoft/Phi-3-mini-4k-instruct": - n_layer = 2 # test only 2 layer models - else: - n_layer = 1 - - check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, n_layer=n_layer) diff --git a/tests/transformers/models/test_causal_lm_models.py b/tests/transformers/models/test_causal_lm_models.py index 21db8946d..67eec2e50 100644 --- a/tests/transformers/models/test_causal_lm_models.py +++ b/tests/transformers/models/test_causal_lm_models.py @@ -16,12 +16,12 @@ from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM from QEfficient.transformers.quantizers.auto import replace_transformers_quantizers from QEfficient.utils import hf_download -from QEfficient.utils._utils import load_hf_tokenizer -from QEfficient.utils.constants import Constants +from QEfficient.utils._utils import create_json, load_hf_tokenizer +from QEfficient.utils.constants import Constants, QnnConstants from QEfficient.utils.device_utils import get_available_device_id from QEfficient.utils.run_utils import ApiRunner -test_models = [ +test_models_qaic = [ "TinyLlama/TinyLlama-1.1B-Chat-v1.0", "gpt2", "Salesforce/codegen-350M-mono", @@ -46,6 +46,13 @@ "ibm-granite/granite-guardian-3.1-2b", ] +test_models_qnn = [ + "mistralai/Mixtral-8x7B-Instruct-v0.1", + "meta-llama/Llama-3.2-1B", + "unsloth/gemma-2b", + "ibm-granite/granite-guardian-3.1-2b", +] + spd_test_models = [ "TinyLlama/TinyLlama-1.1B-Chat-v1.0", ] @@ -83,6 +90,8 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( n_layer: int = 1, num_speculative_tokens: Optional[int] = None, prefill_only: Optional[bool] = None, + enable_qnn: Optional[bool] = False, + qnn_config: Optional[str] = None, ): """ Validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. @@ -138,6 +147,8 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( aic_enable_depth_first=False, num_speculative_tokens=num_speculative_tokens, prefill_only=prefill_only, + enable_qnn=enable_qnn, + qnn_config=qnn_config, ) exec_info = qeff_model.generate(tokenizer, prompts=Constants.INPUT_STR) cloud_ai_100_tokens = exec_info.generated_ids[0][ @@ -186,6 +197,8 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( aic_enable_depth_first=False, full_batch_size=full_batch_size, num_speculative_tokens=num_speculative_tokens, + enable_qnn=enable_qnn, + qnn_config=qnn_config, ) exec_info_fbs = qeff_model.generate(tokenizer, prompts=fbs_prompts) @@ -229,7 +242,7 @@ def test_causal_lm_export_with_deprecated_api(model_name): @pytest.mark.on_qaic -@pytest.mark.parametrize("model_name", test_models) +@pytest.mark.parametrize("model_name", test_models_qaic) def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name): """ Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. 
@@ -244,6 +257,29 @@ def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name): check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, n_layer=n_layer) +@pytest.mark.on_qaic +@pytest.mark.qnn +@pytest.mark.parametrize("model_name", test_models_qnn) +def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_qnn(model_name): + """ + QNN Compilation Test + Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. + ``Mandatory`` Args: + :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` + """ + if model_name == "microsoft/Phi-3-mini-4k-instruct": + n_layer = 2 # test only 2 layer models + else: + n_layer = 1 + + qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json") + create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG) + + check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( + model_name=model_name, n_layer=n_layer, enable_qnn=True, qnn_config=qnn_config_json_path + ) + + @pytest.mark.skip() # remove when the SDK 1.20.0 issue solved for compiling this model @pytest.mark.on_qaic @pytest.mark.parametrize("model_name", spd_test_models) @@ -275,6 +311,23 @@ def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_pl1(): check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, prompt_len=prompt_len) +@pytest.mark.on_qaic +@pytest.mark.qnn +def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_pl1_qnn(): + """ + Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model for a prompt length of 1, both with and without continuous batching. + """ + model_name = "gpt2" + prompt_len = 1 + + qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json") + create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG) + + check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( + model_name=model_name, prompt_len=prompt_len, enable_qnn=True, qnn_config=qnn_config_json_path + ) + + @pytest.mark.on_qaic def test_prefiill_only_pytorch_vs_kv_vs_ort_vs_ai100(): model_name = "gpt2" @@ -282,3 +335,21 @@ def test_prefiill_only_pytorch_vs_kv_vs_ort_vs_ai100(): check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, n_layer=n_layer, prefill_only=True) check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, n_layer=n_layer, prefill_only=False) + + +@pytest.mark.on_qaic +@pytest.mark.qnn +def test_prefiill_only_pytorch_vs_kv_vs_ort_vs_ai100_qnn(): + model_name = "gpt2" + n_layer = 1 + + qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json") + create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG) + + check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( + model_name, n_layer=n_layer, prefill_only=True, enable_qnn=True, qnn_config=qnn_config_json_path + ) + + check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( + model_name, n_layer=n_layer, prefill_only=False, enable_qnn=True, qnn_config=qnn_config_json_path + ) diff --git a/tests/transformers/models/test_embedding_models.py b/tests/transformers/models/test_embedding_models.py index e681f5093..22f4bd580 100644 --- a/tests/transformers/models/test_embedding_models.py +++ b/tests/transformers/models/test_embedding_models.py @@ -6,6 +6,7 @@ # ----------------------------------------------------------------------------- import os +from typing import Optional import numpy as np import onnxruntime as ort @@ -13,7 +14,8 @@ from transformers import AutoModel, AutoTokenizer from QEfficient.transformers.models.modeling_auto import QEFFAutoModel 
-from QEfficient.utils.constants import Constants +from QEfficient.utils._utils import create_json +from QEfficient.utils.constants import Constants, QnnConstants embed_test_models = [ # model_name, architecture @@ -27,6 +29,8 @@ def check_embed_pytorch_vs_ort_vs_ai100( model_name: str, seq_len: int = Constants.CTX_LEN, n_layer: int = 1, + enable_qnn: Optional[bool] = False, + qnn_config: Optional[str] = None, ): # Prepare input tokenizer = AutoTokenizer.from_pretrained(model_name) @@ -71,6 +75,8 @@ def check_embed_pytorch_vs_ort_vs_ai100( qeff_model.compile( num_cores=14, + enable_qnn=enable_qnn, + qnn_config=qnn_config, ) ai100_output = qeff_model.generate(inputs=inputs) @@ -88,3 +94,19 @@ def test_embed_model_pytorch_vs_onnx_vs_ai100(model_name): Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output. """ check_embed_pytorch_vs_ort_vs_ai100(model_name=model_name, seq_len=32, n_layer=1) + + +@pytest.mark.on_qaic +@pytest.mark.qnn +@pytest.mark.parametrize("model_name", embed_test_models) +def test_embed_model_pytorch_vs_onnx_vs_ai100_qnn(model_name): + """ + QNN Compilation path test. + Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output. + """ + qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json") + create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG) + + check_embed_pytorch_vs_ort_vs_ai100( + model_name=model_name, seq_len=32, n_layer=1, enable_qnn=True, qnn_config=qnn_config_json_path + ) diff --git a/tests/transformers/models/test_prefix_caching.py b/tests/transformers/models/test_prefix_caching.py index 467aa174b..8e0c061b8 100644 --- a/tests/transformers/models/test_prefix_caching.py +++ b/tests/transformers/models/test_prefix_caching.py @@ -14,6 +14,7 @@ from QEfficient.generation.text_generation_inference import TextGeneration from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM from QEfficient.utils._utils import create_json +from QEfficient.utils.constants import QnnConstants test_models = ["gpt2"] @@ -39,18 +40,8 @@ def test_simple_prefix_caching(model_name): @pytest.mark.parametrize("model_name", test_models) def test_simple_prefix_caching_qnn(model_name): qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name, continuous_batching=True) - qnn_config = { - "converter_args_extension": "", - "context_binary_generator_args_extension": "--log_level debug", - "qnn_compilation_backend": { - "compiler_enable_depth_first": True, - "compiler_printDDRStats": False, - "compiler_printPerfMetrics": False, - }, - "SKIP_QNN_CONVERTER_STEP": False, - } qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json") - create_json(qnn_config_json_path, qnn_config) + create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG) qeff_model.compile( prefill_seq_len=128, diff --git a/tests/transformers/models/test_speech_seq2seq_models.py b/tests/transformers/models/test_speech_seq2seq_models.py index 17d82bde5..b486e0850 100644 --- a/tests/transformers/models/test_speech_seq2seq_models.py +++ b/tests/transformers/models/test_speech_seq2seq_models.py @@ -7,7 +7,7 @@ import os from importlib import reload -from typing import List +from typing import List, Optional import numpy as np import onnx @@ -21,8 +21,8 @@ from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForSpeechSeq2Seq from QEfficient.transformers.quantizers.auto import replace_transformers_quantizers from QEfficient.utils import get_padding_shape_from_config, hf_download -from 
QEfficient.utils._utils import load_hf_processor -from QEfficient.utils.constants import Constants +from QEfficient.utils._utils import create_json, load_hf_processor +from QEfficient.utils.constants import Constants, QnnConstants from QEfficient.utils.device_utils import get_available_device_id test_models = [ @@ -292,6 +292,8 @@ def check_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100( model_name: str, ctx_len: int = Constants.CTX_LEN, n_layer: int = 1, + enable_qnn: Optional[bool] = False, + qnn_config: Optional[str] = None, ): """ Validate the PyTorch model, the PyTorch model after KV changes, ONNX model and the Cloud AI 100 model @@ -337,6 +339,8 @@ def check_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100( ctx_len=ctx_len, num_cores=16, batch_size=batch_size, + enable_qnn=enable_qnn, + qnn_config=qnn_config, ) exec_info = qeff_model.generate( @@ -358,3 +362,22 @@ def test_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100(model_name): :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` """ check_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, n_layer=4) + + +@pytest.mark.on_qaic +@pytest.mark.qnn +@pytest.mark.skip(reason="Whisper is currently not supported on QNN") +@pytest.mark.parametrize("model_name", test_models) +def test_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100_qnn(model_name): + """ + QNN Compilation path test. + Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. + ``Mandatory`` Args: + :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` + """ + qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json") + create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG) + + check_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100( + model_name=model_name, n_layer=4, enable_qnn=True, qnn_config=qnn_config_json_path + )
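# Illustrative usage sketch (not part of the patch): with this change, QNN compilation is requested
# through compiler_options instead of dedicated compile() arguments, mirroring the updated tests above.
# The model card and paths below are examples only.
import os

from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM
from QEfficient.utils._utils import create_json
from QEfficient.utils.constants import QnnConstants

qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json")
create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG)  # sample config added in constants.py above

model = QEFFAutoModelForCausalLM.from_pretrained("gpt2")
model.compile(
    prefill_seq_len=32,
    ctx_len=128,
    num_cores=16,
    mxfp6_matmul=False,
    enable_qnn=True,  # routed through **compiler_options into QEFFBaseModel._compile()
    qnn_config=qnn_config_json_path,
)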