From 57e192b0391746a26e8598d30f02c96a8a34bb4c Mon Sep 17 00:00:00 2001
From: haowhsu-quic <quic_haowhsu@quicinc.com>
Date: Sun, 3 Mar 2024 15:37:16 -0800
Subject: [PATCH] Qualcomm AI Engine Direct - support embedding op (#2057)

Summary:
- support embedding op with int32 index input
- make mobilebert / llama2 be fully delegated
- add requantize passes for mixed precision
- bug fixes

Pull Request resolved: https://github.com/pytorch/executorch/pull/2057

Reviewed By: dbort

Differential Revision: D54348816

Pulled By: cccclai

fbshipit-source-id: ec3c8e87cc879d6f642859231255d5094d78349f
---
 backends/qualcomm/builders/__init__.py        |  2 +
 backends/qualcomm/builders/node_visitor.py    |  7 +-
 backends/qualcomm/builders/op_embedding.py    | 74 +++++++++++++++++++
 backends/qualcomm/partition/common_defs.py    |  1 -
 .../passes/annotate_and_quant_scalar.py       |  6 +-
 backends/qualcomm/passes/insert_io_qdq.py     | 14 +++-
 backends/qualcomm/passes/insert_requantize.py | 57 ++++++++++++++
 backends/qualcomm/passes/layout_transform.py  |  7 ++
 .../passes/recompose_pixel_shuffle.py         | 46 ++++++++++++
 backends/qualcomm/qnn_preprocess.py           |  2 +
 backends/qualcomm/quantizer/utils.py          | 16 +++-
 backends/qualcomm/tests/test_qnn_delegate.py  | 33 +++++----
 backends/qualcomm/tests/utils.py              | 10 ++-
 backends/qualcomm/utils/utils.py              |  8 ++
 examples/qualcomm/scripts/dummy_llama2.py     |  8 +-
 .../qualcomm/scripts/mobilebert_fine_tune.py  | 18 ++++-
 16 files changed, 279 insertions(+), 30 deletions(-)
 create mode 100644 backends/qualcomm/builders/op_embedding.py
 create mode 100644 backends/qualcomm/passes/insert_requantize.py
 create mode 100644 backends/qualcomm/passes/recompose_pixel_shuffle.py

diff --git a/backends/qualcomm/builders/__init__.py b/backends/qualcomm/builders/__init__.py
index 5d2a08f7c7..b63a5583b1 100644
--- a/backends/qualcomm/builders/__init__.py
+++ b/backends/qualcomm/builders/__init__.py
@@ -18,6 +18,7 @@
     op_depth_to_space,
     op_dequantize,
     op_div,
+    op_embedding,
     op_expand,
     op_gelu,
     op_hardswish,
@@ -62,6 +63,7 @@
     op_depth_to_space,
     op_dequantize,
     op_div,
+    op_embedding,
     op_expand,
     op_gelu,
     op_hardswish,
diff --git a/backends/qualcomm/builders/node_visitor.py b/backends/qualcomm/builders/node_visitor.py
index 6e22e7864e..f4ccb6a7a4 100644
--- a/backends/qualcomm/builders/node_visitor.py
+++ b/backends/qualcomm/builders/node_visitor.py
@@ -90,8 +90,13 @@ def get_quant_encoding_conf(self, node: torch.fx.Node) -> Tuple[Any, Dict]:
                 {},
             )
 
-        quant_attrs = node.meta["quant_attrs"]
+        quant_attrs = (
+            node.meta["requantize"]["dq_attrs"]
+            if "requantize" in node.meta
+            else node.meta["quant_attrs"]
+        )
         encoding = quant_attrs["encoding"]
+
         quant_config = {}
         if encoding in PER_CHANNEL_ENCODING_MAPPING:
             scales = quant_attrs["scales"]
diff --git a/backends/qualcomm/builders/op_embedding.py b/backends/qualcomm/builders/op_embedding.py
new file mode 100644
index 0000000000..60d3a3906c
--- /dev/null
+++ b/backends/qualcomm/builders/op_embedding.py
@@ -0,0 +1,74 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+from typing import Dict
+
+import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper
+
+import numpy as np
+import torch
+
+from .node_visitor import NodeVisitor, register_node_visitor
+from .qnn_constants import OpGather, QNN_OP_PACKAGE_NAME_QTI_AISW
+from .utils import get_parameter
+
+
+@register_node_visitor
+class Embedding(NodeVisitor):
+    target = "aten.embedding.default"
+
+    def __init__(self, *args) -> None:
+        super().__init__(*args)
+
+    def define_node(
+        self,
+        node: torch.fx.Node,
+        nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper],
+    ) -> PyQnnWrapper.PyQnnOpWrapper:
+        weight_node = node.args[0]
+        weight_tensor = get_parameter(weight_node, self.edge_program)
+        weight_tensor_wrapper = self.define_tensor(
+            weight_node,
+            weight_tensor,
+            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC,
+            nodes_to_wrappers,
+        )
+
+        indices_node = node.args[1]
+        indices_tensor = self.get_tensor(indices_node, node)
+        indices_tensor_wrapper = self.define_scalar(
+            indices_node,
+            indices_tensor,
+            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+            nodes_to_wrappers,
+        )
+
+        gather_input_tensors = [weight_tensor_wrapper, indices_tensor_wrapper]
+
+        output_tensor = self.get_tensor(node, node)
+        output_tensor_wrapper = self.define_tensor(
+            node,
+            output_tensor,
+            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+            nodes_to_wrappers,
+        )
+        gather_output_tensors = [output_tensor_wrapper]
+
+        gather_op = PyQnnWrapper.PyQnnOpWrapper(
+            node.name,
+            QNN_OP_PACKAGE_NAME_QTI_AISW,
+            OpGather.op_name,
+        )
+        gather_op.AddInputTensors(gather_input_tensors)
+        gather_op.AddOutputTensors(gather_output_tensors)
+
+        # For now, default axis is zero.
+        gather_op.AddScalarParam(
+            OpGather.param_axis,
+            PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_INT_32,
+            {"data": np.int32(0)},
+        )
+
+        return gather_op
diff --git a/backends/qualcomm/partition/common_defs.py b/backends/qualcomm/partition/common_defs.py
index e7a10a1dd7..517dcdadb4 100644
--- a/backends/qualcomm/partition/common_defs.py
+++ b/backends/qualcomm/partition/common_defs.py
@@ -12,7 +12,6 @@
     exir_ops.edge.aten.arange.start_step,
     exir_ops.edge.aten.index.Tensor,
     exir_ops.edge.aten.full.default,
-    exir_ops.edge.aten.embedding.default,
 ]
 
 allow_list_operator = [
diff --git a/backends/qualcomm/passes/annotate_and_quant_scalar.py b/backends/qualcomm/passes/annotate_and_quant_scalar.py
index e9a9a33a7b..52ab47a9c2 100644
--- a/backends/qualcomm/passes/annotate_and_quant_scalar.py
+++ b/backends/qualcomm/passes/annotate_and_quant_scalar.py
@@ -98,7 +98,11 @@ def _traverse_binary_node(self, graph_module: torch.fx.GraphModule):
                 q_node = dq_node.args[0]
                 q_node_attrs = get_quant_attrs(graph_module, q_node)
 
-                scalar_node = [n for n in output.args if n != dq_node][0]
+                scalar_nodes = [n for n in output.args if n != dq_node]
+                if len(scalar_nodes) == 0:
+                    continue
+
+                scalar_node = scalar_nodes[0]
                 source_scalar_node = self._get_source_scalar_node(scalar_node)
                 # we'll abandon cast op here, since the constant scalar will
                 # be pre-loaded into QNN context binary
diff --git a/backends/qualcomm/passes/insert_io_qdq.py b/backends/qualcomm/passes/insert_io_qdq.py
index a7be118c41..e1dd55a916 100644
--- a/backends/qualcomm/passes/insert_io_qdq.py
+++ b/backends/qualcomm/passes/insert_io_qdq.py
@@ -49,8 +49,13 @@ def _insert_node(
         graph_module: torch.fx.GraphModule,
         node: torch.fx.node,
         target: torch.fx.node.Target,
+        quant_attrs: Dict = None,
     ) -> torch.fx.node:
-        quant_attrs = node.meta.get("quant_attrs")
+        # check if there has a specified quant_attrs
+        # if not, use the existent info. from current node
+        if quant_attrs is None:
+            quant_attrs = node.meta.get("quant_attrs")
+
         inserted_node = graph_module.graph.create_node(
             "call_function",
             target,
@@ -69,13 +74,16 @@ def _insert_quant_node(
         graph_module: torch.fx.GraphModule,
         node: torch.fx.node,
         target: torch.fx.node.Target,
-    ) -> None:
+        quant_attrs: Dict = None,
+    ) -> torch.fx.Node:
         with graph_module.graph.inserting_after(node):
             users = list(node.users.keys())
-            inserted_node = self._insert_node(graph_module, node, target)
+            inserted_node = self._insert_node(graph_module, node, target, quant_attrs)
             for user in users:
                 user.replace_input_with(node, inserted_node)
 
+        return inserted_node
+
     def _insert_dequant_node(
         self,
         graph_module: torch.fx.GraphModule,
diff --git a/backends/qualcomm/passes/insert_requantize.py b/backends/qualcomm/passes/insert_requantize.py
new file mode 100644
index 0000000000..bc45bd568e
--- /dev/null
+++ b/backends/qualcomm/passes/insert_requantize.py
@@ -0,0 +1,57 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+from executorch.backends.qualcomm.passes.insert_io_qdq import InsertIOQDQ
+from executorch.exir.dialects._ops import ops as exir_ops
+
+
+class InsertRequantize(InsertIOQDQ):
+    """
+    This pass inserts dq/q nodes for non-arithmetic operators which have
+    different quantization specs in input and activation
+    """
+
+    def __init__(
+        self,
+        edge_program: torch.export.ExportedProgram,
+        insert_requantize: bool = False,
+    ):
+        super().__init__(edge_program)
+        # add non-arithmetic operators here if condition met
+        self.op_map = {
+            exir_ops.edge.aten.permute_copy.default: self._single_io_annotation,
+        }
+        self.insert_requantize = insert_requantize
+
+    def _single_io_annotation(self, gm: torch.fx.GraphModule, n: torch.fx.node) -> None:
+        in_q_attr = n.args[0].meta.get("quant_attrs")
+        out_q_attr = n.meta["quant_attrs"]
+        if in_q_attr is not None and in_q_attr["dtype"] != out_q_attr["dtype"]:
+            if self.insert_requantize:
+                dq_attr = n.meta["requantize"]["dq_attrs"]
+                q_attr = n.meta["requantize"]["q_attrs"]
+                # insert dq with given quantization attribute in input node
+                dq = self._insert_quant_node(gm, n, dq_attr["encoding"], dq_attr)
+                dq.meta["quant_attrs"] = dq_attr
+                # insert q with given quantization attribute in current node
+                q = self._insert_quant_node(gm, dq, q_attr["encoding"], q_attr)
+                q.meta["quant_attrs"] = q_attr
+            else:
+                dq_attr = in_q_attr.copy()
+                dq_attr["encoding"] = self.q_dq_map[out_q_attr["encoding"]]
+                q_attr = out_q_attr.copy()
+                n.meta["requantize"] = {"dq_attrs": dq_attr, "q_attrs": q_attr}
+
+    def _insert(self, graph_module: torch.fx.GraphModule) -> torch.fx.GraphModule:
+        for n in graph_module.graph.nodes:
+            if (
+                n.op == "call_function"
+                and n.meta.get("quant_attrs")
+                and n.target in self.op_map
+            ):
+                self.op_map[n.target](graph_module, n)
diff --git a/backends/qualcomm/passes/layout_transform.py b/backends/qualcomm/passes/layout_transform.py
index b851001404..8c86f1919a 100644
--- a/backends/qualcomm/passes/layout_transform.py
+++ b/backends/qualcomm/passes/layout_transform.py
@@ -13,6 +13,8 @@
 from executorch.exir.pass_base import ExportPass, PassResult
 from executorch.exir.sym_util import eval_shape
 
+from .utils import dq_ops, q_ops
+
 
 class LayoutTransform(ExportPass):
     """
@@ -50,6 +52,8 @@ class LayoutTransform(ExportPass):
         exir_ops.edge.aten.bmm.default,
         exir_ops.edge.aten.full.default,
         exir_ops.edge.aten.gelu.default,
+        *q_ops,
+        *dq_ops,
         _operator.getitem,
     }
 
@@ -77,6 +81,7 @@ def __init__(
         super(LayoutTransform, self).__init__()
         self.edge_program = edge_program
         self.insert_permute = insert_permute
+        self.qdq_opset = {*q_ops, *dq_ops}
 
     def mark_as_transformed(self, node: torch.fx.Node) -> None:
         if isinstance(node.meta["val"], (tuple, list)):
@@ -108,6 +113,8 @@ def is_layout_agnostic(self, node: torch.fx.Node) -> bool:
             # if dimemsion is not kept, we'll have no clue how to do layout transform
             if len(node.args) < 3 or not node.args[2]:
                 return False
+        if node.target in self.qdq_opset:
+            return "requantize" in node.meta
         return node.target in self.layout_agnostic_ops
 
     def is_edge_condition(self, node):
diff --git a/backends/qualcomm/passes/recompose_pixel_shuffle.py b/backends/qualcomm/passes/recompose_pixel_shuffle.py
new file mode 100644
index 0000000000..9eec6bfa26
--- /dev/null
+++ b/backends/qualcomm/passes/recompose_pixel_shuffle.py
@@ -0,0 +1,46 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import torch
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass, PassResult
+from torch.fx.passes.utils.source_matcher_utils import get_source_partitions
+
+
+class RecomposePixelShuffle(ExportPass):
+    """
+    Merge decomposed operators back to one super node.
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def call(self, graph_module: torch.fx.GraphModule):
+        graph = graph_module.graph
+        # decomposed core aten ops
+        partitions = get_source_partitions(graph, [torch.nn.PixelShuffle])
+        for _, src_partitions in partitions.items():
+            for src_partition in src_partitions:
+                input_node = src_partition.input_nodes[0]
+                output_node = src_partition.output_nodes[0]
+                with graph.inserting_after(input_node):
+                    h_in_shape = input_node.meta["val"].shape[2]
+                    h_out_shape = output_node.meta["val"].shape[2]
+                    upscale_factor = h_out_shape / h_in_shape
+
+                    pixel_shuffle_node = graph.create_node(
+                        "call_function",
+                        exir_ops.edge.aten.pixel_shuffle.default,
+                        (input_node, int(upscale_factor)),
+                    )
+                    users = output_node.users.copy()
+                    for user in users:
+                        user.replace_input_with(output_node, pixel_shuffle_node)
+                    # copy metadata
+                    pixel_shuffle_node.meta = output_node.meta
+
+        graph.eliminate_dead_code()
+        graph_module.recompile()
+        return PassResult(graph_module, True)
diff --git a/backends/qualcomm/qnn_preprocess.py b/backends/qualcomm/qnn_preprocess.py
index df025a1d3f..f6e7918bbd 100644
--- a/backends/qualcomm/qnn_preprocess.py
+++ b/backends/qualcomm/qnn_preprocess.py
@@ -12,6 +12,7 @@
 
 from executorch.backends.qualcomm.passes.convert_to_linear import ConvertToLinear
 from executorch.backends.qualcomm.passes.insert_io_qdq import InsertIOQDQ
+from executorch.backends.qualcomm.passes.insert_requantize import InsertRequantize
 from executorch.backends.qualcomm.passes.layout_transform import LayoutTransform
 from executorch.backends.qualcomm.utils.utils import generate_qnn_executorch_option
 from executorch.exir.backend.backend_details import (
@@ -44,6 +45,7 @@ def preprocess(
             passes=[
                 ConvertToLinear(),
                 InsertIOQDQ(edge_program),
+                InsertRequantize(edge_program, insert_requantize=True),
                 LayoutTransform(edge_program, insert_permute=True),
             ]
         )
diff --git a/backends/qualcomm/quantizer/utils.py b/backends/qualcomm/quantizer/utils.py
index be33fcfcd4..809b7298eb 100644
--- a/backends/qualcomm/quantizer/utils.py
+++ b/backends/qualcomm/quantizer/utils.py
@@ -104,7 +104,7 @@ def annotate_single_in_single_out(
     input_qspec_map[input_act] = quantization_config.input_activation
 
     node_tensor = node.meta.get("val")
-    if torch.is_tensor(node_tensor) and node_tensor.dtype == torch.int64:
+    if torch.is_tensor(node_tensor) and node_tensor.dtype != torch.float32:
         return
 
     node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation(
@@ -356,6 +356,20 @@ def annotate_transpose(node: Node, quantization_config: QuantizationConfig) -> N
         annotate_single_in_single_out(node, quantization_config)
 
 
+@register_annotator([torch.ops.aten.embedding.default])
+def annotate_embedding(node: Node, quantization_config: QuantizationConfig) -> None:
+    weight = node.args[0]
+
+    input_qspec_map = {}
+    input_qspec_map[weight] = quantization_config.input_activation
+
+    node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation(
+        input_qspec_map=input_qspec_map,
+        output_qspec=SharedQuantizationSpec((weight, node)),
+        _annotated=True,
+    )
+
+
 @register_annotator([torch.ops.aten.expand.default])
 def annotate_expand(node: Node, quantization_config: QuantizationConfig) -> None:
     annotate_in_out_obs_sharing_op(node, quantization_config)
diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
index 47173dd875..4a410eb72c 100644
--- a/backends/qualcomm/tests/test_qnn_delegate.py
+++ b/backends/qualcomm/tests/test_qnn_delegate.py
@@ -117,12 +117,13 @@ def test_qnn_backend_element_wise_ceil(self):
         self.lower_module_and_test_output(module, sample_input)
 
     def test_qnn_backend_element_wise_div(self):
+        eps = 1e-03
         test_comb = [
             {
                 "module": [Div()],  # noqa: F405
                 "sample_inputs": [
-                    (torch.randn(2, 5, 1, 3), torch.randn(2, 5, 1, 3)),
-                    (torch.randn([2, 5, 1, 3]), torch.randn([4, 1])),
+                    (torch.randn(2, 5, 1, 3), eps + torch.randn(2, 5, 1, 3)),
+                    (torch.randn([2, 5, 1, 3]), eps + torch.randn([4, 1])),
                 ],
             },
             {
@@ -197,11 +198,9 @@ def test_qnn_backend_element_wise_sub(self):
                         self.lower_module_and_test_output(module, sample_input)
                         index += 1
 
-    @unittest.expectedFailure
     def test_qnn_backend_embedding(self):
         module = Embedding()  # noqa: F405
-        # QNN does not support int64 datatype
-        sample_input = (torch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 9]]),)
+        sample_input = (torch.Tensor([[1, 2, 4, 5], [4, 3, 2, 9]]).to(torch.int32),)
         self.lower_module_and_test_output(module, sample_input)
 
     def test_qnn_backend_expand_copy(self):
@@ -549,12 +548,13 @@ def test_qnn_backend_element_wise_ceil(self):
         self.lower_module_and_test_output(module, sample_input)
 
     def test_qnn_backend_element_wise_div(self):
+        eps = 1e-03
         test_comb = [
             {
                 "module": [Div()],  # noqa: F405
                 "sample_inputs": [
-                    (torch.randn(2, 5, 1, 3), torch.randn(2, 5, 1, 3)),
-                    (torch.randn([2, 5, 1, 3]), torch.randn([4, 1])),
+                    (torch.randn(2, 5, 1, 3), eps + torch.randn(2, 5, 1, 3)),
+                    (torch.randn([2, 5, 1, 3]), eps + torch.randn([4, 1])),
                 ],
             },
             {
@@ -633,11 +633,9 @@ def test_qnn_backend_element_wise_sub(self):
                         self.lower_module_and_test_output(module, sample_input)
                         index += 1
 
-    @unittest.expectedFailure
     def test_qnn_backend_embedding(self):
         module = Embedding()  # noqa: F405
-        # QNN does not support int64 datatype
-        sample_input = (torch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 9]]),)
+        sample_input = (torch.Tensor([[1, 2, 4, 5], [4, 3, 2, 9]]).to(torch.int32),)
         module = self.get_qdq_module(module, sample_input)
         self.lower_module_and_test_output(module, sample_input)
 
@@ -891,6 +889,9 @@ def test_qnn_backend_view_permute_matmul(self):
         sample_input = (torch.randn([1, 8, 512]), torch.randn([1, 2, 8, 256]))
         module = self.get_qdq_module(module, sample_input)
         self.lower_module_and_test_output(module, sample_input)
+        # check if requantization work
+        module = self.get_qdq_module(module, sample_input, use_16bit_quant=True)
+        self.lower_module_and_test_output(module, sample_input)
 
     def test_qnn_backend_example_models(self):
         instances = [
@@ -1258,12 +1259,11 @@ def test_ptq_dummy_llama2(self):
             self.device,
             "--model",
             self.model,
-            "--ptq",
-            self.ptq,
             "--ip",
             self.ip,
             "--port",
             str(self.port),
+            "--ptq",
         ]
         if self.host:
             cmds.extend(["--host", self.host])
@@ -1307,9 +1307,15 @@ def test_mobilebert(self):
             msg = json.loads(conn.recv())
             cpu, htp = msg["CPU"], msg["HTP"]
             for k, v in cpu.items():
-                self.assertLessEqual(abs(v[0] - htp[k][0]), 1)
+                self.assertLessEqual(abs(v[0] - htp[k][0]), 2)
 
+    @unittest.expectedFailure
     def test_ptq_mobilebert(self):
+        # TODO: 2 approaches to resolve accuracy issue
+        # 1. fallback embedding layers:
+        #    - skip annotation in quantizer (need PR to provide helper funciton)
+        #    - skip operators in partitioner (use existent "skip_node_op_set")
+        # 2. investigate different quantization configurations / mechanisms
         if not self.required_envs([self.pretrained_weight]):
             self.skipTest("missing required envs")
 
@@ -1370,6 +1376,7 @@ def setup_environment():
         "-p",
         "--pretrained_weight",
         help="Location for pretrained weighting",
+        default="",
         type=str,
     )
     parser.add_argument(
diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py
index c6a1d4c831..2342d129b9 100644
--- a/backends/qualcomm/tests/utils.py
+++ b/backends/qualcomm/tests/utils.py
@@ -15,7 +15,10 @@
 from executorch import exir
 from executorch.backends.qualcomm.partition.qnn_partitioner import QnnPartitioner
 from executorch.backends.qualcomm.qnn_preprocess import QnnBackend
-from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer
+from executorch.backends.qualcomm.quantizer.quantizer import (
+    get_default_16bit_qnn_ptq_config,
+    QnnQuantizer,
+)
 from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import (
     QcomChipset,
 )
@@ -167,6 +170,7 @@ def get_qdq_module(
         inputs: Tuple[torch.Tensor],
         is_conv_per_channel: Optional[bool] = True,
         custom_quant_annotations: Tuple[Callable] = (),
+        use_16bit_quant: Optional[bool] = False,
     ) -> torch.fx.GraphModule:
         m = torch._export.capture_pre_autograd_graph(module, inputs)
 
@@ -174,6 +178,10 @@ def get_qdq_module(
         quantizer.add_custom_quant_annotations(custom_quant_annotations)
         quantizer.set_per_channel_quant(is_conv_per_channel)
 
+        if use_16bit_quant:
+            quantizer.add_16bit_quant_ops(quantizer.SUPPORTED_OPS)
+            quantizer.set_bit16_op_quant_config(get_default_16bit_qnn_ptq_config())
+
         prepared = prepare_pt2e(m, quantizer)
         prepared(*inputs)
         quantized_module = convert_pt2e(prepared)
diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py
index 945c8f0ab0..64b4c9c02a 100644
--- a/backends/qualcomm/utils/utils.py
+++ b/backends/qualcomm/utils/utils.py
@@ -27,7 +27,11 @@
 from executorch.backends.qualcomm.passes.convert_to_linear import ConvertToLinear
 from executorch.backends.qualcomm.passes.fold_qdq import FoldQDQ
 from executorch.backends.qualcomm.passes.i64_to_i32 import I64toI32
+from executorch.backends.qualcomm.passes.insert_requantize import InsertRequantize
 from executorch.backends.qualcomm.passes.layout_transform import LayoutTransform
+from executorch.backends.qualcomm.passes.recompose_pixel_shuffle import (
+    RecomposePixelShuffle,
+)
 from executorch.backends.qualcomm.passes.remove_clone import RemoveClone
 from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import (
     _soc_info_table,
@@ -60,6 +64,8 @@ def capture_program(
     module: torch.nn.Module,
     inputs: Tuple[torch.Tensor],
 ) -> exir.ExirExportedProgram:
+    # TODO: should switch to torch.export.export & custom deomposition
+    #       to reduce maintaining effort.
     exir_exported_program = exir.capture(
         module,
         inputs,
@@ -76,6 +82,7 @@ def capture_program(
     edge_program = ex_prog.exported_program
     graph_module = edge_program.graph_module
     RemoveClone()(graph_module)
+    RecomposePixelShuffle()(graph_module)
     ConvertToLinear()(graph_module)
     ConvertHardsigmoid()(graph_module)
     ConvertHardswish()(graph_module)
@@ -86,6 +93,7 @@ def capture_program(
     AnnotateAndQuantScalar(edge_program)(graph_module)
     AnnotateDecomposed(edge_program)(graph_module)
     FoldQDQ()(graph_module)
+    InsertRequantize(edge_program)(graph_module)
     LayoutTransform(edge_program)(graph_module)
     return ex_prog
 
diff --git a/examples/qualcomm/scripts/dummy_llama2.py b/examples/qualcomm/scripts/dummy_llama2.py
index ba278f11c7..94e2e323c1 100755
--- a/examples/qualcomm/scripts/dummy_llama2.py
+++ b/examples/qualcomm/scripts/dummy_llama2.py
@@ -21,21 +21,19 @@
 
 
 def create_device_inputs(example_inputs, use_kv_cache):
-    inputs = None
+    inputs = [inp.to(torch.int32) for inp in example_inputs]
     input_list = ""
     if use_kv_cache:
-        inputs = (example_inputs,)
         for i, d in enumerate(inputs[0]):
             if type(d) == list:
                 d = torch.stack(d)
             d.numpy().tofile(f"{args.artifact}/input_0_0.raw")
             input_list = f"input_0_{i}.raw "
     else:
-        inputs = example_inputs
         inputs[0].numpy().tofile(f"{args.artifact}/input_0_0.raw")
         input_list = "input_0_0.raw"
     input_list += "\n"
-    return inputs, input_list
+    return tuple(inputs), input_list
 
 
 if __name__ == "__main__":
@@ -94,7 +92,7 @@ def create_device_inputs(example_inputs, use_kv_cache):
     use_fp16 = False if args.ptq else True
     build_executorch_binary(
         instance.get_eager_model().eval(),
-        instance.get_example_inputs(),
+        inputs,
         args.model,
         f"{args.artifact}/{pte_filename}",
         inputs,
diff --git a/examples/qualcomm/scripts/mobilebert_fine_tune.py b/examples/qualcomm/scripts/mobilebert_fine_tune.py
index 807550d11c..d241d5da3e 100755
--- a/examples/qualcomm/scripts/mobilebert_fine_tune.py
+++ b/examples/qualcomm/scripts/mobilebert_fine_tune.py
@@ -26,9 +26,9 @@ def evaluate(model, data_val):
     predictions, true_vals = [], []
     for data in data_val:
         inputs = {
-            "input_ids": data[0],
-            "attention_mask": data[1],
-            "labels": data[2],
+            "input_ids": data[0].to(torch.long),
+            "attention_mask": data[1].to(torch.long),
+            "labels": data[2].to(torch.long),
         }
         logits = model(**inputs)[1].detach().numpy()
         label_ids = inputs["labels"].numpy()
@@ -58,8 +58,18 @@ def accuracy_per_class(preds, goldens, labels):
 def get_dataset(data_val):
     # prepare input data
     inputs, input_list = [], ""
+    # max_position_embeddings defaults to 512
+    position_ids = torch.arange(512).expand((1, -1)).to(torch.int32)
     for index, data in enumerate(data_val):
-        inputs.append(tuple(data[:2]))
+        data = [d.to(torch.int32) for d in data]
+        # input_ids, attention_mask, token_type_ids, position_ids
+        inputs.append(
+            (
+                *data[:2],
+                torch.zeros(data[0].size(), dtype=torch.int32),
+                position_ids[:, : data[0].shape[1]],
+            )
+        )
         input_text = " ".join(
             [f"input_{index}_{i}.raw" for i in range(len(inputs[-1]))]
         )