diff --git a/backends/qualcomm/_passes/decompose_einsum.py b/backends/qualcomm/_passes/decompose_einsum.py
new file mode 100644
index 0000000000..c1924838b3
--- /dev/null
+++ b/backends/qualcomm/_passes/decompose_einsum.py
@@ -0,0 +1,65 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from executorch.exir.pass_base import ExportPass, PassResult
+from torch.fx.experimental.proxy_tensor import make_fx
+
+
+class DecomposeEinsum(ExportPass):
+    """
+    Decompose einsum so that quantization annotation works properly.
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
+        graph = graph_module.graph
+        for node in graph.nodes:
+            if node.target == torch.ops.aten.einsum.default:
+                decomposed_module = make_fx(
+                    node.target,
+                    tracing_mode="fake",
+                )(node.args[0], [arg.meta["val"] for arg in node.args[1]])
+
+                with graph.inserting_before(node):
+                    # remap maps original node values to new node values,
+                    # which ensures that references to nodes are correctly updated in the new graph
+                    remap = {}
+                    # Unlike other nodes, einsum's args[0] is the einsum equation,
+                    # while its input nodes are stored in args[1]
+                    for i, arg in enumerate(node.args[1]):
+                        remap[f"arg1_{i+1}"] = arg
+
+                    for decomposed_node in decomposed_module.graph.nodes:
+                        # This is the args[0] equation string, which is no longer required after decomposition
+                        if "arg0" in decomposed_node.name:
+                            continue
+
+                        # no need to copy the existing 'output'
+                        if decomposed_node.op == "output":
+                            for user in node.users.copy():
+                                # remap
+                                user.replace_input_with(
+                                    node,
+                                    remap[decomposed_node.args[0][0]],
+                                )
+                        # no need to copy existing placeholders
+                        elif decomposed_node.op == "placeholder":
+                            # replace node map key from string to graph node
+                            remap[decomposed_node] = remap.pop(decomposed_node.name)
+                        else:
+                            remap[decomposed_node] = graph.node_copy(
+                                decomposed_node,
+                                arg_transform=lambda x, remap=remap: remap[x],
+                            )
+
+                graph.erase_node(node)
+
+        graph.eliminate_dead_code()
+        graph_module.recompile()
+        return PassResult(graph_module, True)
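For reference, the decomposition this pass replays can be reproduced standalone. A minimal sketch with illustrative shapes; it also shows the arg0_1/arg1_N placeholder naming that the remap keys above depend on:

    import torch
    from torch.fx.experimental.proxy_tensor import make_fx

    # Trace einsum exactly as the pass does; make_fx lowers the composite
    # op into primitive ops (permute/mul/sum or bmm, depending on the equation).
    decomposed = make_fx(torch.ops.aten.einsum.default, tracing_mode="fake")(
        "i,j->ij", [torch.randn(5), torch.randn(4)]
    )
    # Placeholders come out as arg0_1 (the equation string) and arg1_1, arg1_2
    # (the operands), which is why the pass keys its remap on f"arg1_{i+1}".
    print([n.name for n in decomposed.graph.nodes if n.op == "placeholder"])
    decomposed.graph.print_tabular()
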
diff --git a/backends/qualcomm/_passes/insert_requantize.py b/backends/qualcomm/_passes/insert_requantize.py
index 417d3b85b0..5291edeb9f 100644
--- a/backends/qualcomm/_passes/insert_requantize.py
+++ b/backends/qualcomm/_passes/insert_requantize.py
@@ -28,6 +28,7 @@ class InsertRequantize(ExportPass):
     # we don't use the 2nd output, 2nd output is an integer, etc.
     multi_output_op_ignore_set = {
         exir_ops.edge.aten._native_batch_norm_legit_no_training.default,
+        exir_ops.edge.aten.topk.default,
     }
 
     def __init__(
diff --git a/backends/qualcomm/_passes/layout_transform.py b/backends/qualcomm/_passes/layout_transform.py
index bdee2c8196..829c11fda4 100644
--- a/backends/qualcomm/_passes/layout_transform.py
+++ b/backends/qualcomm/_passes/layout_transform.py
@@ -65,6 +65,7 @@ class LayoutTransform(ExportPass):
         exir_ops.edge.aten.sqrt.default,
         exir_ops.edge.aten.sub.Tensor,
         exir_ops.edge.aten.sum.dim_IntList,
+        exir_ops.edge.aten.topk.default,
         exir_ops.edge.aten._to_copy.default,
         exir_ops.edge.aten.split_with_sizes.default,
         *q_ops,
diff --git a/backends/qualcomm/builders/__init__.py b/backends/qualcomm/builders/__init__.py
index 79c02e2207..74fd58a3ec 100644
--- a/backends/qualcomm/builders/__init__.py
+++ b/backends/qualcomm/builders/__init__.py
@@ -53,6 +53,7 @@
     op_sum_int_list,
     op_tanh,
     op_to,
+    op_topk,
    op_transpose,
     op_unsqueeze,
     op_upsample_bilinear2d,
@@ -107,6 +108,7 @@
     op_sub,
     op_sum_int_list,
     op_tanh,
+    op_topk,
     op_to,
     op_transpose,
     op_unsqueeze,
diff --git a/backends/qualcomm/builders/op_avg_pool2d.py b/backends/qualcomm/builders/op_avg_pool2d.py
index 2f7e773b4f..5ad3fc36c9 100644
--- a/backends/qualcomm/builders/op_avg_pool2d.py
+++ b/backends/qualcomm/builders/op_avg_pool2d.py
@@ -3,6 +3,7 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
+import warnings
 from typing import cast, Dict, List
 
 import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper
@@ -85,7 +86,10 @@ def define_node(
         if len(node.args) > 6:
             divisor_override = cast(int, node.args[6])
             if divisor_override != pooling_region:
-                print("Not support divisor_override which is not equal to pooling region.")
+                warnings.warn(
+                    "[QNN Delegate Op Builder]: divisor_override that is not equal to the pooling region is not supported.",
+                    stacklevel=1,
+                )
                 return
 
         avg_pool2d_op = PyQnnWrapper.PyQnnOpWrapper(
diff --git a/backends/qualcomm/builders/op_cat.py b/backends/qualcomm/builders/op_cat.py
index bb68b24289..cf18690498 100644
--- a/backends/qualcomm/builders/op_cat.py
+++ b/backends/qualcomm/builders/op_cat.py
@@ -3,6 +3,7 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
+import warnings
 from typing import cast, Dict, List
 
 import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper
@@ -43,8 +44,9 @@ def define_node(
         )
 
         if len(list_of_tensors) != len(list_of_tensor_wrappers):
-            print(
-                "The number or input tensors is not equal to the number of input tensor wrappers."
+            warnings.warn(
+                "[QNN Delegate Op Builder]: The number of input tensors is not equal to the number of input tensor wrappers.",
+                stacklevel=1,
             )
             return
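These builders (and the ones that follow) share one fallback convention: emit a warning tagged "[QNN Delegate Op Builder]" and return None, so define_node builds no op wrapper and the node falls back to CPU instead of aborting the lowering. A condensed, runnable sketch of the pattern, using a hypothetical function name and a stand-in return value:

    import warnings

    def build_or_fallback(divisor_override, pooling_region):
        # Mirrors the builders above: warn with the delegate tag and return
        # None when a backend constraint is not met.
        if divisor_override != pooling_region:
            warnings.warn(
                "[QNN Delegate Op Builder]: divisor_override that is not equal "
                "to the pooling region is not supported.",
                stacklevel=1,
            )
            return None
        return "op_wrapper"  # stand-in for the PyQnnOpWrapper built on success

    assert build_or_fallback(9, 9) == "op_wrapper"
    assert build_or_fallback(4, 9) is None  # warns, builds nothing
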
diff --git a/backends/qualcomm/builders/op_conv2d.py b/backends/qualcomm/builders/op_conv2d.py
index b6e70c374e..30207a0392 100644
--- a/backends/qualcomm/builders/op_conv2d.py
+++ b/backends/qualcomm/builders/op_conv2d.py
@@ -4,6 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import warnings
 from typing import cast, Dict, List
 
 import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper
@@ -189,12 +190,18 @@ def _define_conv1d(
 
         # args[6] = transposed
         if cast(bool, node.args[6]):
-            print("Currently, No support for transposed convolution")
+            warnings.warn(
+                "[QNN Delegate Op Builder]: Transposed convolution is not currently supported.",
+                stacklevel=1,
+            )
             return
 
         # args[7] = output padding
         if not all(out_pad == 0 for out_pad in cast(List[int], node.args[7])):
-            print("QNN does not support output padding")
+            warnings.warn(
+                "[QNN Delegate Op Builder]: QNN does not support output padding.",
+                stacklevel=1,
+            )
             return
 
         stride_shape = [len(stride)]
diff --git a/backends/qualcomm/builders/op_expand.py b/backends/qualcomm/builders/op_expand.py
index dec352fef7..3f5c266cdd 100644
--- a/backends/qualcomm/builders/op_expand.py
+++ b/backends/qualcomm/builders/op_expand.py
@@ -3,6 +3,7 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
+import warnings
 from typing import cast, Dict, List
 
 import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper
@@ -52,8 +53,9 @@ def define_node(
         output_dims = len(output_tensor.size())
 
         if input_dims < output_dims:
-            print(
-                f"The rank of input tensor: {input_dims} is less than the rank of output tensor: {output_dims}."
+            warnings.warn(
+                f"[QNN Delegate Op Builder]: The rank of the input tensor ({input_dims}) is less than the rank of the output tensor ({output_dims}).",
+                stacklevel=1,
             )
             return
 
diff --git a/backends/qualcomm/builders/op_layer_norm.py b/backends/qualcomm/builders/op_layer_norm.py
index 18f5b76310..635e12d2ee 100644
--- a/backends/qualcomm/builders/op_layer_norm.py
+++ b/backends/qualcomm/builders/op_layer_norm.py
@@ -4,6 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import warnings
 from typing import Dict
 
 import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper
@@ -44,7 +45,10 @@ def define_node(
             len(normalized_shapes) != 1
             and normalized_shapes[0] != input_tensor.shape[-1]
         ):
-            print("Only supports normalization with last input dimension")
+            warnings.warn(
+                "[QNN Delegate Op Builder]: Normalization is only supported over the last input dimension.",
+                stacklevel=1,
+            )
             return
         axis = [len(input_tensor.shape) - 1]
         axis_shape = [len(axis)]
diff --git a/backends/qualcomm/builders/op_linear.py b/backends/qualcomm/builders/op_linear.py
index 17afb21c6d..e4f16d4473 100644
--- a/backends/qualcomm/builders/op_linear.py
+++ b/backends/qualcomm/builders/op_linear.py
@@ -4,6 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import warnings
 from typing import Dict
 
 import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper
@@ -70,8 +71,9 @@ def define_node(
 
             # TODO remove this when qnn sdk support
             if QCOM_SCALES in bias_node.meta.get(QCOM_QUANT_ATTRS, {}):
-                print(
-                    f"[WARNING] Fallback linear bias, {bias_node}. per channel bias quantization is not support yet."
+                warnings.warn(
+                    f"[QNN Delegate Op Builder]: Falling back on linear bias {bias_node}; per-channel bias quantization is not supported yet.",
+                    stacklevel=1,
                 )
             bias_tensor = get_parameter(bias_node, self.edge_program)
             bias_tensor_wrapper = self.define_tensor(
diff --git a/backends/qualcomm/builders/op_max_pool2d.py b/backends/qualcomm/builders/op_max_pool2d.py
index 586556621b..27f14889bf 100644
--- a/backends/qualcomm/builders/op_max_pool2d.py
+++ b/backends/qualcomm/builders/op_max_pool2d.py
@@ -3,6 +3,7 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
+import warnings
 from typing import cast, Dict, List
 
 import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper
@@ -42,8 +43,9 @@ def define_node(
             if user.target.__name__ == "getitem":
                 getitem_index = user.args[1]
                 if getitem_index != 0:
-                    print(
-                        f"Expected second argument of getitem node for {node.target.__name__ } to be 0, got {getitem_index}"
+                    warnings.warn(
+                        f"[QNN Delegate Op Builder]: Expected the second argument of the getitem node for {node.target.__name__} to be 0, got {getitem_index}",
+                        stacklevel=1,
                     )
                     return
 
@@ -78,8 +80,9 @@ def define_node(
         if len(node.args) > 4:
             dilation = cast(List[int], node.args[4])
             if not (dilation == 1 or dilation == [1, 1]):
-                print(
-                    f"Not support dilation argument for max pool2d, but got {dilation}"
+                warnings.warn(
+                    f"[QNN Delegate Op Builder]: The dilation argument of max_pool2d is not supported; got {dilation}",
+                    stacklevel=1,
                 )
                 return
 
diff --git a/backends/qualcomm/builders/op_rms_norm.py b/backends/qualcomm/builders/op_rms_norm.py
index e99b1f47ba..3a5101b12d 100644
--- a/backends/qualcomm/builders/op_rms_norm.py
+++ b/backends/qualcomm/builders/op_rms_norm.py
@@ -4,6 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import warnings
 from typing import Dict
 
 import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper
@@ -47,7 +48,10 @@ def define_node(
             len(normalized_shapes) != 1
             and normalized_shapes[0] != input_tensor.shape[-1]
         ):
-            print("Only supports normalization with last input dimension")
+            warnings.warn(
+                "[QNN Delegate Op Builder]: Normalization is only supported over the last input dimension.",
+                stacklevel=1,
+            )
             return
         axes = [node.args[0].meta["val"].dim() - 1]
         axes_shape = [len(axes)]
diff --git a/backends/qualcomm/builders/op_topk.py b/backends/qualcomm/builders/op_topk.py
new file mode 100644
index 0000000000..84c29925f2
--- /dev/null
+++ b/backends/qualcomm/builders/op_topk.py
@@ -0,0 +1,107 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import warnings
+from typing import cast, Dict
+
+import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper
+
+import numpy as np
+import torch
+from executorch.backends.qualcomm.utils.constants import QCOM_AXIS_ORDER, QCOM_DATA, QCOM_QUANT_ATTRS
+
+from .node_visitor import NodeVisitor, register_node_visitor
+from .qnn_constants import OpTopK, QNN_OP_PACKAGE_NAME_QTI_AISW
+
+
+@register_node_visitor
+class TopK(NodeVisitor):
+    target = ["aten.topk.default"]
+
+    def __init__(self, *args) -> None:
+        super().__init__(*args)
+
+    def define_node(
+        self,
+        node: torch.fx.Node,
+        nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper],
+    ) -> PyQnnWrapper.PyQnnOpWrapper:
+
+        input_node = node.args[0]
+        input_tensor = self.get_tensor(input_node, node)
+        input_tensor_wrapper = self.define_tensor(
+            input_node,
+            input_tensor,
+            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+            nodes_to_wrappers,
+            is_input_tensor=True,
+        )
+
+        k = cast(int, node.args[1])
+
+        if len(node.args) > 2:
+            dim = cast(int, node.args[2])
+            if dim < 0:
+                dim = dim % len(input_tensor.shape)
+            if QCOM_AXIS_ORDER in node.meta:
+                dim = node.meta[QCOM_AXIS_ORDER].index(dim)
+            if dim != len(input_tensor.shape) - 1:
+                warnings.warn(
+                    "[QNN Delegate Op Builder]: QNN currently only supports TopK along the last (channel) dimension.",
+                    stacklevel=1,
+                )
+                return
+
+        topk_input_tensors = [input_tensor_wrapper]
+
+        output_val_tensor = self.get_tensor(node, node, 0)
+        output_idx_tensor = self.get_tensor(node, node, 1).to(torch.int32)
+
+        # QNN constraint: TopK's first output must have the same quant config as its input
+        node.meta[QCOM_QUANT_ATTRS] = input_node.meta.get(QCOM_QUANT_ATTRS)
+        output_val_tensor_wrapper = self.define_tensor(
+            node,
+            output_val_tensor,
+            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+            nodes_to_wrappers,
+            is_input_tensor=False,
+        )
+
+        # TopK's second output holds indices; do not quantize it
+        node.meta.pop(QCOM_QUANT_ATTRS, None)
+        output_index_tensor_wrapper = self.define_tensor(
+            node,
+            output_idx_tensor,
+            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+            nodes_to_wrappers,
+            is_input_tensor=False,
+            wrapper_idx=1,
+        )
+        topk_output_tensors = [output_val_tensor_wrapper, output_index_tensor_wrapper]
+
+        topk_op = PyQnnWrapper.PyQnnOpWrapper(
+            node.name,
+            QNN_OP_PACKAGE_NAME_QTI_AISW,
+            OpTopK.op_name,
+        )
+        topk_op.AddInputTensors(topk_input_tensors)
+        topk_op.AddOutputTensors(topk_output_tensors)
+
+        topk_op.AddScalarParam(
+            OpTopK.param_k,
+            PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32,
+            {QCOM_DATA: np.uint32(k)},
+        )
+
+        # As of QNN 2.26, the QNN HTP backend only allows this value to be 1; any other value fails op validation
+        if len(node.args) > 3:
+            largest = cast(bool, node.args[3])
+            topk_op.AddScalarParam(
+                OpTopK.param_largest,
+                PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_BOOL_8,
+                {QCOM_DATA: largest},
+            )
+
+        return topk_op
diff --git a/backends/qualcomm/builders/qnn_constants.py b/backends/qualcomm/builders/qnn_constants.py
index 9c589c7678..307ad4a0c4 100644
--- a/backends/qualcomm/builders/qnn_constants.py
+++ b/backends/qualcomm/builders/qnn_constants.py
@@ -352,6 +352,13 @@ class OpTile:
     param_multiples: str = "multiples"
 
 
+@dataclass(init=False, frozen=True)
+class OpTopK:
+    op_name: str = "TopK"
+    param_k: str = "k"
+    param_largest: str = "largest"
+
+
 @dataclass(init=False, frozen=True)
 class OpTranspose:
     op_name: str = "Transpose"
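The dim handling in op_topk.py can be checked in isolation. A sketch with a hypothetical helper, assuming QCOM_AXIS_ORDER holds the NCHW-to-NHWC permutation produced by the layout transform, e.g. (0, 2, 3, 1):

    def remap_topk_dim(dim, rank, axis_order=None):
        # Normalize a negative dim, then translate it through the layout
        # permutation, mirroring the checks in op_topk.py.
        if dim < 0:
            dim = dim % rank
        if axis_order is not None:
            dim = axis_order.index(dim)
        return dim

    # For an NCHW tensor rewritten to NHWC, channel dim 1 becomes the last
    # axis, which is the only placement QNN accepts for TopK:
    assert remap_topk_dim(1, 4, axis_order=(0, 2, 3, 1)) == 3
    assert remap_topk_dim(-3, 4, axis_order=(0, 2, 3, 1)) == 3
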
diff --git a/backends/qualcomm/quantizer/quantizer.py b/backends/qualcomm/quantizer/quantizer.py
index ce8fab5e6f..3c8faee9c0 100644
--- a/backends/qualcomm/quantizer/quantizer.py
+++ b/backends/qualcomm/quantizer/quantizer.py
@@ -7,6 +7,7 @@
 from typing import Callable, Dict, Optional, Sequence, Set
 
 import torch
+from executorch.backends.qualcomm._passes.decompose_einsum import DecomposeEinsum
 from executorch.backends.qualcomm._passes.decompose_silu import DecomposeSilu
 from executorch.backends.qualcomm._passes.recompose_pixel_unshuffle import (
     RecomposePixelUnshuffle,
@@ -190,6 +191,7 @@ def transform_for_annotation(self, model: GraphModule) -> GraphModule:
         model = RecomposePixelUnshuffle(quantization_capture=True)(model).graph_module
         model = DecomposeScaledDotProductAttention()(model).graph_module
         model = DecomposeSilu()(model).graph_module
+        model = DecomposeEinsum()(model).graph_module
         model = ReplaceInfBuffer()(model).graph_module
         return model
 
diff --git a/backends/qualcomm/quantizer/utils.py b/backends/qualcomm/quantizer/utils.py
index 62407decce..d1ea35fa19 100644
--- a/backends/qualcomm/quantizer/utils.py
+++ b/backends/qualcomm/quantizer/utils.py
@@ -415,8 +415,8 @@ def _is_annotated(nodes: List[Node]):
     return annotated
 
 
-def _is_input_float_tensor(node: Node):
-    """Check if the input is not a float tensor, so that we can skip quantization for the node
+def _is_float_tensor(node: Node):
+    """Check if the node's tensor is a float tensor, so that we can skip quantization for the node
     since observers only works with float Tensors
     """
     if (
@@ -474,7 +474,7 @@ def annotate_single_in_single_out(
         assert isinstance(input_act, Node)
         input_qspec_map[input_act] = quantization_config.input_activation
 
-    if _is_input_float_tensor(node):
+    if _is_float_tensor(node):
         node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation(
             input_qspec_map=input_qspec_map,
             output_qspec=quantization_config.output_activation,
@@ -482,22 +482,30 @@
     )
 
 
+@register_annotator([torch.ops.aten.topk.default])
+def annotate_topk(node: Node, quantization_config: QuantizationConfig) -> None:
+    if _is_annotated([node]):
+        return
+    # We can use single_in_single_out since we don't want to quantize the indices output
+    annotate_single_in_single_out(node, quantization_config)
+
+
 def annotate_binary(node: Node, quantization_config: QuantizationConfig) -> None:
     if _is_annotated([node]):
         return
 
     input_act_qspec = quantization_config.input_activation
     output_act_qspec = (
-        quantization_config.output_activation if _is_input_float_tensor(node) else None
+        quantization_config.output_activation if _is_float_tensor(node) else None
     )
 
     input_qspec_map = {}
     input_act0 = node.args[0]
-    if _is_input_float_tensor(input_act0):
+    if _is_float_tensor(input_act0):
         input_qspec_map[input_act0] = input_act_qspec
 
     input_act1 = node.args[1]
-    if _is_input_float_tensor(input_act1):
+    if _is_float_tensor(input_act1):
         input_qspec_map[input_act1] = input_act_qspec
 
     node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation(
@@ -579,7 +587,7 @@ def _derive_div_qparams_fn(
     )
     input_qspec_map = {}
     input_act0 = node.args[0]
-    if _is_input_float_tensor(input_act0):
+    if _is_float_tensor(input_act0):
         input_qspec_map[input_act0] = input_act_qspec
 
     node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation(
@@ -864,7 +872,7 @@ def annotate_sigmoid(node: Node, quantization_config: QuantizationConfig) -> Non
         qscheme=torch.torch.per_tensor_affine,
     )
 
-    if _is_input_float_tensor(node):
+    if _is_float_tensor(node):
         node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation(
             input_qspec_map=input_qspec_map,
             output_qspec=out_act_quantization_spec,
@@ -1143,8 +1151,12 @@ def annotate_batch_norm(node: Node, quantization_config: QuantizationConfig) ->
 
 @register_annotator([operator.getitem])
 def annotate_getitem(node: Node, quantization_config: QuantizationConfig) -> None:
-    _annotate_output_qspec(node, quantization_config.output_activation)
-    _mark_nodes_as_annotated([node])
+    if _is_annotated([node]):
+        return
+
+    if _is_float_tensor(node):
+        _annotate_output_qspec(node, quantization_config.output_activation)
+        _mark_nodes_as_annotated([node])
 
 
 @register_annotator([torch.ops.aten.layer_norm.default])
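To see what DecomposeEinsum buys the quantizer: after transform_for_annotation runs it, no aten.einsum.default nodes remain, so the per-op annotators above can reach every compute op. A sketch, assuming torch.export leaves einsum un-decomposed at capture time:

    import torch
    from executorch.backends.qualcomm._passes.decompose_einsum import DecomposeEinsum

    class OuterProduct(torch.nn.Module):
        def forward(self, i, j):
            return torch.einsum("i,j->ij", i, j)

    gm = torch.export.export(OuterProduct(), (torch.randn(5), torch.randn(4))).module()
    gm = DecomposeEinsum()(gm).graph_module
    # The composite einsum has been replaced by its primitive decomposition.
    assert all(n.target != torch.ops.aten.einsum.default for n in gm.graph.nodes)
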
diff --git a/backends/qualcomm/tests/models.py b/backends/qualcomm/tests/models.py
index ee3d6cf93a..0b3553fd59 100644
--- a/backends/qualcomm/tests/models.py
+++ b/backends/qualcomm/tests/models.py
@@ -416,6 +416,17 @@ def forward(self, x):
         return torch.sum(self.first(x), dim=(2, 3), keepdim=False)
 
 
+class Conv2dTopK(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.conv = torch.nn.Conv2d(3, 16, 3)
+
+    def forward(self, x):
+        x = self.conv(x)
+        topk_values, topk_indices = torch.topk(x, 5, dim=1)
+        return topk_values
+
+
 class Div(torch.nn.Module):
     def __init__(self):
         super().__init__()
@@ -440,6 +451,30 @@ def forward(self, x):
         return x / 10
 
 
+class EinsumBilinear(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, bn, anm, bm):
+        return torch.einsum("bn,anm,bm->ba", bn, anm, bm)
+
+
+class EinsumOuterProduct(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, i, j):
+        return torch.einsum("i,j->ij", i, j)
+
+
+class EinsumOuterProductRelu(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, i, j):
+        return torch.relu(torch.einsum("i,j->ij", i, j))
+
+
 class Embedding(torch.nn.Module):
     def __init__(self):
         super().__init__()
@@ -978,6 +1013,16 @@ def forward(self, x):
         return torch.tanh(x)
 
 
+class TopKandIndex(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.idx_source = torch.rand(10, 3)
+
+    def forward(self, x):
+        a, b = torch.topk(x, 3)
+        return a + self.idx_source[b]
+
+
 class Unbind(torch.nn.Module):
     def __init__(self):
         super().__init__()
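For orientation, the eager shapes these new test models exercise (sizes match the unit tests below):

    import torch

    bn, anm, bm = torch.randn(2, 5), torch.randn(3, 5, 4), torch.randn(2, 4)
    # EinsumBilinear: b=2, n=5, a=3, m=4 -> output (b, a)
    assert torch.einsum("bn,anm,bm->ba", bn, anm, bm).shape == (2, 3)

    # TopKandIndex: torch.topk defaults to the last dimension
    values, indices = torch.topk(torch.randn(3, 10), 3)
    assert values.shape == indices.shape == (3, 3)
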
diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
index bb90e0eb58..2e1bd0eff3 100644
--- a/backends/qualcomm/tests/test_qnn_delegate.py
+++ b/backends/qualcomm/tests/test_qnn_delegate.py
@@ -37,6 +37,11 @@
     skip_annotation,
 )
 
+from executorch.examples.models.llama2.llama_transformer import (
+    ModelArgs,
+    MOEFeedForward,
+)
+
 from executorch.examples.qualcomm.utils import setup_common_args_and_variables
 
 from executorch.backends.qualcomm.tests.models import *  # noqa: F403
@@ -140,6 +145,28 @@ def test_qnn_backend_conv_transpose2d(self):
             with self.subTest(i=i):
                 self.lower_module_and_test_output(module, sample_input)
 
+    def test_qnn_backend_einsum_outer_product(self):
+        module = EinsumOuterProduct()  # noqa: F405
+        x = torch.randn(5)
+        y = torch.randn(4)
+        sample_input = (
+            x,
+            y,
+        )
+        self.lower_module_and_test_output(module, sample_input)
+
+    def test_qnn_backend_einsum_bilinear(self):
+        module = EinsumBilinear()  # noqa: F405
+        bn = torch.randn(2, 5)
+        anm = torch.randn(3, 5, 4)
+        bm = torch.randn(2, 4)
+        sample_input = (
+            bn,
+            anm,
+            bm,
+        )
+        self.lower_module_and_test_output(module, sample_input)
+
     def test_qnn_backend_element_wise_add(self):
         test_comb = [
             {
@@ -546,6 +573,34 @@ def test_qnn_backend_conv2d_sum_reduce_dim(self):
         sample_input = (torch.randn([1, 1, 3, 3]),)
         self.lower_module_and_test_output(module, sample_input)
 
+    def test_qnn_backend_conv2d_topk(self):
+        module = Conv2dTopK()  # noqa: F405
+        sample_input = (torch.randn(1, 3, 32, 32),)
+        self.lower_module_and_test_output(module, sample_input)
+
+    def test_qnn_backend_einsum_outer_product_relu(self):
+        module = EinsumOuterProductRelu()  # noqa: F405
+        x = torch.randn(5)
+        y = torch.randn(4)
+        sample_input = (
+            x,
+            y,
+        )
+        self.lower_module_and_test_output(module, sample_input)
+
+    @unittest.skip("Fails because of bad accuracy")
+    def test_qnn_backend_moe_feed_forward(self):
+        args = ModelArgs()
+        args.dim = 32
+        args.n_heads = 8
+        args.n_layers = 2
+        self.head_dim = args.dim // args.n_heads
+        module = MOEFeedForward(args)  # noqa: F405
+        sample_input = (
+            torch.randint(low=0, high=100, size=(1, 32), dtype=torch.float32),
+        )
+        self.lower_module_and_test_output(module, sample_input)
+
     def test_qnn_backend_pixel_unshuffle_math_equivalent(self):
         module = PixelUnshuffleMathEquivalent(2)  # noqa: F405
         sample_input = (torch.rand(2, 2, 6, 6),)
@@ -561,6 +616,11 @@ def test_qnn_backend_simple_model(self):
         sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28))
         self.lower_module_and_test_output(module, sample_input)
 
+    def test_qnn_backend_topk_and_index(self):
+        module = TopKandIndex()  # noqa: F405
+        sample_input = (torch.randn(3, 10),)
+        self.lower_module_and_test_output(module, sample_input)
+
     def test_qnn_backend_view_permute_matmul(self):
         module = ViewPermuteMatMul()  # noqa: F405
         torch.manual_seed(8)
@@ -749,6 +809,30 @@ def test_qnn_backend_conv_transpose2d(self):
             module = self.get_qdq_module(module, sample_input)
             self.lower_module_and_test_output(module, sample_input)
 
+    def test_qnn_backend_einsum_outer_product(self):
+        module = EinsumOuterProduct()  # noqa: F405
+        x = torch.randn(5)
+        y = torch.randn(4)
+        sample_input = (
+            x,
+            y,
+        )
+        module = self.get_qdq_module(module, sample_input)
+        self.lower_module_and_test_output(module, sample_input)
+
+    def test_qnn_backend_einsum_bilinear(self):
+        module = EinsumBilinear()  # noqa: F405
+        bn = torch.randn(2, 5)
+        anm = torch.randn(3, 5, 4)
+        bm = torch.randn(2, 4)
+        sample_input = (
+            bn,
+            anm,
+            bm,
+        )
+        module = self.get_qdq_module(module, sample_input)
+        self.lower_module_and_test_output(module, sample_input)
+
     def test_qnn_backend_element_wise_add(self):
         test_comb = [
             {
@@ -1211,6 +1295,37 @@ def test_qnn_backend_conv2d_sum_reduce_dim(self):
         module = self.get_qdq_module(module, sample_input)
         self.lower_module_and_test_output(module, sample_input)
 
+    def test_qnn_backend_conv2d_topk(self):
+        module = Conv2dTopK()  # noqa: F405
+        sample_input = (torch.randn(1, 3, 32, 32),)
+        module = self.get_qdq_module(module, sample_input)
+        self.lower_module_and_test_output(module, sample_input)
+
+    def test_qnn_backend_einsum_outer_product_relu(self):
+        module = EinsumOuterProductRelu()  # noqa: F405
+        x = torch.randn(5)
+        y = torch.randn(4)
+        sample_input = (
+            x,
+            y,
+        )
+        module = self.get_qdq_module(module, sample_input)
+        self.lower_module_and_test_output(module, sample_input)
+
+    @unittest.skip("UT passed before QNN 2.26; segfaults during the partitioner")
+    def test_qnn_backend_moe_feed_forward(self):
+        args = ModelArgs()
+        args.dim = 32
+        args.n_heads = 8
+        args.n_layers = 2
+        self.head_dim = args.dim // args.n_heads
+        module = MOEFeedForward(args)  # noqa: F405
+        sample_input = (
+            torch.randint(low=0, high=100, size=(1, 32), dtype=torch.float32),
+        )
+        module = self.get_qdq_module(module, sample_input)
+        self.lower_module_and_test_output(module, sample_input)
+
     def test_qnn_backend_pixel_unshuffle_math_equivalent(self):
         module = PixelUnshuffleMathEquivalent(2)  # noqa: F405
         sample_input = (torch.rand(2, 2, 6, 6),)
@@ -1229,6 +1344,12 @@ def test_qnn_backend_simple_model(self):
         module = self.get_qdq_module(module, sample_input)
         self.lower_module_and_test_output(module, sample_input)
 
+    def test_qnn_backend_topk_and_index(self):
+        module = TopKandIndex()  # noqa: F405
+        sample_input = (torch.randn(3, 10),)
+        module = self.get_qdq_module(module, sample_input)
+        self.lower_module_and_test_output(module, sample_input)
+
     def test_qnn_backend_view_permute_matmul(self):
         module = ViewPermuteMatMul()  # noqa: F405
         torch.manual_seed(8)
diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py
index b8230abdc2..c58da42e84 100644
--- a/backends/qualcomm/tests/utils.py
+++ b/backends/qualcomm/tests/utils.py
@@ -166,7 +166,7 @@ def _save_model_and_expected_output(
         ref_outputs = []
         if isinstance(ref_output, collections.OrderedDict):
             ref_outputs.append(ref_output["out"].detach())
-        elif isinstance(ref_output, tuple):
+        elif isinstance(ref_output, (list, tuple)):
             for output in ref_output:
                 ref_outputs.append(output.detach())
         else:
diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py
index 7da1ccb4ed..d93f7fcb4b 100644
--- a/backends/qualcomm/utils/utils.py
+++ b/backends/qualcomm/utils/utils.py
@@ -52,6 +52,7 @@
 from executorch.backends.qualcomm.builders.qnn_constants import OpContextLoader
 from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import (
     _soc_info_table,
+    HtpArch,
     QcomChipset,
     QnnExecuTorchBackendOptions,
     QnnExecuTorchBackendType,
@@ -854,6 +855,16 @@ def generate_qnn_executorch_compiler_spec(
     ]
 
 
+def get_soc_to_arch_map():
+    return {
+        "SSG2115P": HtpArch.V73,
+        "SM8650": HtpArch.V75,
+        "SM8550": HtpArch.V73,
+        "SM8475": HtpArch.V69,
+        "SM8450": HtpArch.V69,
+    }
+
+
 def get_soc_to_chipset_map():
     return {
         "SSG2115P": QcomChipset.SSG2115P,
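A usage sketch for the new helper, mirroring how examples/qualcomm/utils.py consumes it below:

    from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import HtpArch
    from executorch.backends.qualcomm.utils.utils import get_soc_to_arch_map

    # Resolve a device's SoC model string to its HTP architecture version.
    assert get_soc_to_arch_map()["SM8650"] == HtpArch.V75
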
diff --git a/backends/vulkan/partitioner/supported_ops.py b/backends/vulkan/partitioner/supported_ops.py
index 7013a06880..4d0858953b 100644
--- a/backends/vulkan/partitioner/supported_ops.py
+++ b/backends/vulkan/partitioner/supported_ops.py
@@ -45,6 +45,12 @@ def __contains__(self, op):
 
 PRIM_OPS = [
     operator.getitem,
+    # Quantization-related ops will be fused via graph passes
+    exir_ops.edge.quantized_decomposed.quantize_per_channel.default,
+    exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
+    exir_ops.edge.quantized_decomposed.quantize_per_tensor.tensor,
+    exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
+    exir_ops.edge.quantized_decomposed.dequantize_per_tensor.tensor,
     exir_ops.edge.quantized_decomposed.dequantize_per_channel.default,
 ]
 
diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/ContentView.swift b/examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/ContentView.swift
index 00fc4f6f54..69ece27f67 100644
--- a/examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/ContentView.swift
+++ b/examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/ContentView.swift
@@ -176,6 +176,7 @@ struct ContentView: View {
         .padding([.leading, .trailing, .bottom], 10)
         .sheet(isPresented: $isImagePickerPresented, onDismiss: addSelectedImageMessage) {
           ImagePicker(selectedImage: $selectedImage, sourceType: imagePickerSourceType)
+            .id(imagePickerSourceType.rawValue)
         }
       }
       .navigationBarTitle(title, displayMode: .inline)
diff --git a/examples/qualcomm/utils.py b/examples/qualcomm/utils.py
index 2e6cb348b0..f10babc5bb 100755
--- a/examples/qualcomm/utils.py
+++ b/examples/qualcomm/utils.py
@@ -30,7 +30,7 @@
     capture_program,
     generate_htp_compiler_spec,
     generate_qnn_executorch_compiler_spec,
-    get_soc_to_chipset_map,
+    get_soc_to_arch_map,
 )
 from executorch.exir import EdgeCompileConfig, EdgeProgramManager, to_edge
 from executorch.exir.backend.backend_api import to_backend
@@ -83,7 +83,7 @@ def __init__(
         self.dump_intermediate_outputs = dump_intermediate_outputs
         self.debug_output_path = f"{self.workspace}/debug_output.bin"
         self.output_folder = f"{self.workspace}/outputs"
-        self.soc_model = get_soc_to_chipset_map()[soc_model]
+        self.soc_model = get_soc_to_arch_map()[soc_model]
         self.error_only = error_only
         self.shared_buffer = shared_buffer
         self.runner = runner