From 949554731acf1221312175a2105e656779cb62a2 Mon Sep 17 00:00:00 2001 From: Fateme Hosseini Date: Fri, 24 Mar 2023 11:27:48 -0500 Subject: [PATCH] [QNN] Implement quantized avg_pool2d * qnn.avg_pool2d is integrated into Relay and integration tests are added * FQ2I is modified to pick up qnn.avg_pool2d * Canonicalization and layout conversion for qnn.avg_pool2d are implemented * Dynamic PoolArea computation is implemented for both qnn and slice_op avg_pool2d to exclude padding area in averaging calculation when count_include_pad=False Co-authored-by: Fateme Hosseini Co-authored-by: Jyotsna Verma Co-authored-by: Anirudh Sundar Co-authored-by: Venkat Rasagna Reddy Komatireddy --- python/tvm/relay/qnn/op/_qnn.py | 3 + python/tvm/relay/qnn/op/layout_conversions.py | 35 ++ python/tvm/relay/qnn/op/qnn.py | 66 +++ python/tvm/relay/qnn/strategy/generic.py | 39 ++ python/tvm/relay/qnn/strategy/hexagon.py | 24 + .../transform/fake_quantization_to_integer.py | 57 +-- python/tvm/topi/hexagon/compute_poolarea.py | 143 ++++++ python/tvm/topi/hexagon/qnn/__init__.py | 2 +- python/tvm/topi/hexagon/qnn/avg_pool2d.py | 408 ++++++++++++---- python/tvm/topi/hexagon/slice_ops/__init__.py | 2 +- .../tvm/topi/hexagon/slice_ops/avg_pool2d.py | 224 ++++++--- python/tvm/topi/hexagon/utils.py | 65 ++- src/relay/op/nn/pooling.cc | 38 +- src/relay/op/nn/pooling.h | 4 +- src/relay/op/nn/pooling_common.h | 78 +++ src/relay/qnn/op/avg_pool2d.cc | 222 +++++++++ .../test_hexagon/test_qnn_op_integration.py | 456 ++++++++++++++++++ .../topi/slice_op/test_avg_pool2d_slice.py | 412 ++++++++-------- 18 files changed, 1837 insertions(+), 441 deletions(-) create mode 100644 python/tvm/topi/hexagon/compute_poolarea.py create mode 100644 src/relay/op/nn/pooling_common.h create mode 100644 src/relay/qnn/op/avg_pool2d.cc create mode 100644 tests/python/contrib/test_hexagon/test_qnn_op_integration.py diff --git a/python/tvm/relay/qnn/op/_qnn.py b/python/tvm/relay/qnn/op/_qnn.py index 2ad2445cd3651..f036e6cf840d2 100644 --- a/python/tvm/relay/qnn/op/_qnn.py +++ b/python/tvm/relay/qnn/op/_qnn.py @@ -167,3 +167,6 @@ def alter_op_layout_qnn_dense(attrs, inputs, tinfos, out_type): # qnn.batch_matmul register_strategy("qnn.batch_matmul", strategy.qnn_batch_matmul_strategy) register_pattern("qnn.batch_matmul", OpPattern.OUT_ELEMWISE_FUSABLE) + +# qnn.avg_pool2d +register_strategy("qnn.avg_pool2d", strategy.qnn_avg_pool2d_strategy) diff --git a/python/tvm/relay/qnn/op/layout_conversions.py b/python/tvm/relay/qnn/op/layout_conversions.py index 668cafb8ae342..587993603139f 100644 --- a/python/tvm/relay/qnn/op/layout_conversions.py +++ b/python/tvm/relay/qnn/op/layout_conversions.py @@ -126,3 +126,38 @@ def convert_qnn_conv2d_transpose(attrs, inputs, tinfos, desired_layouts): return relay.qnn.op.conv2d_transpose(*inputs, **new_attrs) raise ValueError(f"Layout {desired_data_layout} is not yet supported") + + +@reg.register_convert_op_layout("qnn.avg_pool2d") +def convert_qnn_avg_pool2d(attrs, inputs, tinfos, desired_layouts): + """Convert Layout pass registration for QNN avg_pool2d op. + + Parameters + ---------- + attrs : tvm.ir.Attrs + Attributes of current avg_pool2d + inputs : list of tvm.relay.Expr + The args of the Relay expr to be legalized + tinfos : list of types + List of input and output types + desired_layouts : list of layout strings + List of layouts defining our desired + layout for the data input. 
+ + Returns + ------- + result : tvm.relay.Expr + The transformed expr + """ + # pylint: disable=import-outside-toplevel + from tvm import relay + + assert len(desired_layouts) == 1, "A desired layout is expected for qnn.avg_pool2d's input" + desired_data_layout = desired_layouts[0] + if desired_data_layout == "NCHW" or desired_data_layout == "NHWC": + new_attrs = dict(attrs) + new_attrs["layout"] = str(desired_data_layout) + new_attrs["out_layout"] = str(desired_data_layout) + return relay.qnn.op.avg_pool2d(*inputs, **new_attrs) + + raise ValueError(f"Layout {desired_data_layout} is not yet supported") diff --git a/python/tvm/relay/qnn/op/qnn.py b/python/tvm/relay/qnn/op/qnn.py index 0e73a6889fcd2..eb64b56e829d4 100644 --- a/python/tvm/relay/qnn/op/qnn.py +++ b/python/tvm/relay/qnn/op/qnn.py @@ -1249,3 +1249,69 @@ def leaky_relu(x, alpha, input_scale, input_zero_point, output_scale, output_zer def softmax(x, scale, zero_point, output_scale, output_zero_point, axis=-1): return _make.softmax(x, axis, scale, zero_point, output_scale, output_zero_point) + + +def avg_pool2d( + data, + input_scale, + input_zero_point, + output_scale, + output_zero_point, + pool_size, + strides, + padding, + dilation, + ceil_mode=False, + count_include_pad=True, + layout="NHWC", + out_layout="", +): + + """Quantized avg_pool2d + + Parameters + ---------- + data : relay.Expr + The quantized input tensor. + input_scale: float + The scale of the input quantized expr. + input_zero_point: int + The zero point of input quantized expr. + output_scale: flaot + The scale of the output quantized expr. + output_zero_point: int + The zero point of output quantized expr. + pool_size : relay.Expr + The pool_size + strides : relay.Expr + The strides + padding : relay.Expr + The padding size + dilation : relay.Expr + The dilation size + ceil_mode : bool, optional + Whether to use ceil or floor for calculating the output shape + count_include_pad : bool, optional + Determines if padding should be taken into account in the computation + layout: string, optinal + out_layout: string, optional + Returns + ------- + result : relay.Expr + The computed result. + """ + return _make.avg_pool2d( + data, + input_scale, + input_zero_point, + output_scale, + output_zero_point, + pool_size, + strides, + padding, + dilation, + ceil_mode, + count_include_pad, + layout, + out_layout, + ) diff --git a/python/tvm/relay/qnn/strategy/generic.py b/python/tvm/relay/qnn/strategy/generic.py index 3ebf8edd36654..4c5884ffdc15b 100644 --- a/python/tvm/relay/qnn/strategy/generic.py +++ b/python/tvm/relay/qnn/strategy/generic.py @@ -157,6 +157,36 @@ def wrapper(_attrs, inputs, out_type): return wrapper +def wrap_compute_qnn_avg_pool2d(topi_compute): + """Wrap qnn.avg_pool2d topi compute""" + + def wrapper(attrs, inputs, out_type): + kernel = attrs.pool_size + strides = attrs.strides + padding = attrs.padding + dilation = attrs.dilation + count_include_pad = attrs.count_include_pad + oshape = out_type.shape + odtype = out_type.dtype + args = [ + inputs[0], + kernel, + strides, + padding, + dilation, + count_include_pad, + oshape, + odtype, + inputs[1], + inputs[2], + inputs[3], + inputs[4], + ] + return [topi_compute(*args)] + + return wrapper + + def wrap_topi_concatenate(topi_compute): """Wrap TOPI compute which use qnn.concatenate attrs""" @@ -280,3 +310,12 @@ def qnn_batch_matmul_strategy(attrs, inputs, out_type, target): "qnn.batch_matmul is currently only supported with Hexagon. 
" "Please run QNN Canonicalize pass to decompose this op into supported ops." ) + + +@override_native_generic_func("qnn_avg_pool2d_strategy") +def qnn_avg_pool2d_strategy(attrs, inputs, out_type, target): + """qnn.avg_pool2d generic strategy""" + raise RuntimeError( + "qnn.avg_pool2d is currently only supported with Hexagon. " + "Please run QNN Canonicalize pass to decompose this op into supported ops." + ) diff --git a/python/tvm/relay/qnn/strategy/hexagon.py b/python/tvm/relay/qnn/strategy/hexagon.py index d17b0da6cf0a4..3edbce34e30f9 100644 --- a/python/tvm/relay/qnn/strategy/hexagon.py +++ b/python/tvm/relay/qnn/strategy/hexagon.py @@ -201,3 +201,27 @@ def qnn_batch_matmul_strategy_hexagon(attrs, inputs, out_type, target): name="qnn_batch_matmul.hexagon", ) return strategy + + +@qnn_avg_pool2d_strategy.register(["hexagon"]) +def qnn_avg_pool2d_strategy_hexagon(attrs, inputs, out_type, target): + """qnn.avg_pool2d strategy for Hexagon""" + data_layout = attrs.layout + if data_layout == "NHWC": + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_qnn_avg_pool2d(topi.hexagon.qnn.qnn_avg_pool2d_wrapper_compute_NHWC), + wrap_topi_schedule(topi.hexagon.qnn.schedule_qnn_avg_pool2d), + name="qnn_avg_pool2d.hexagon", + ) + return strategy + elif data_layout == "NCHW": + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_qnn_avg_pool2d(topi.hexagon.qnn.qnn_avg_pool2d_wrapper_compute_NCHW), + wrap_topi_schedule(topi.hexagon.qnn.schedule_qnn_avg_pool2d), + name="qnn_avg_pool2d.hexagon", + ) + return strategy + else: + raise RuntimeError("Unsupported strategy for qnn.avg_pool2d") diff --git a/python/tvm/relay/transform/fake_quantization_to_integer.py b/python/tvm/relay/transform/fake_quantization_to_integer.py index 4c9a3f7cd09cf..5e289b0c9380f 100644 --- a/python/tvm/relay/transform/fake_quantization_to_integer.py +++ b/python/tvm/relay/transform/fake_quantization_to_integer.py @@ -111,6 +111,32 @@ def identity(expr, type_map): register_unary_identity("image.resize2d") +@register_fake_quantization_to_integer("nn.avg_pool2d") +def avgpool2d(expr, type_map): + """Rewrite an avgpool op""" + attrs = {**expr.attrs} + arg = expr.args[0] + t = type_map[arg] + out_t = type_map[expr] + + out = relay.qnn.op.avg_pool2d( + arg, + t.scale, + t.zero_point, + out_t.scale, + out_t.zero_point, + attrs["pool_size"], + attrs["strides"], + attrs["padding"], + attrs["dilation"], + attrs["ceil_mode"], + attrs["count_include_pad"], + attrs["layout"], + ) + + return [out, TensorAffineType(out_t.scale, out_t.zero_point, out_t.dtype, out_t.axis)] + + @register_fake_quantization_to_integer("nn.adaptive_avg_pool1d") def adaptive_avgpool1d(expr, type_map): """Rewrite an adaptive avgpool op""" @@ -138,37 +164,6 @@ def adaptive_avgpool1d(expr, type_map): return [out, TensorAffineType(out_t.scale, out_t.zero_point, "int32", out_t.axis)] -@register_fake_quantization_to_integer("nn.avg_pool2d") -def avgpool2d(expr, type_map): - """Rewrite a avgpool op""" - arg = expr.args[0] - t = type_map[arg] - out_t = type_map[expr] - # Cast (or requantize) to int32. 
- if not ( - approx_equal(t.scale, out_t.scale) - and approx_equal(t.zero_point, out_t.zero_point) - and tvm.ir.structural_equal(t.dtype, out_t.dtype) - ): - arg = relay.qnn.op.requantize( - arg, - t.scale, - t.zero_point, - out_t.scale, - out_t.zero_point, - out_dtype="int32", - axis=t.axis, - ) - else: - arg = relay.op.cast(arg, "int32") - out = relay.op.nn.avg_pool2d(arg, **expr.attrs) - if out_t.dtype != "int32": - # Cast back to output dtype to preserve input dtype == output dtype for AvgPool2d. - out = relay.op.clip(out, a_min=np.iinfo(out_t.dtype).min, a_max=np.iinfo(out_t.dtype).max) - out = relay.op.cast(out, out_t.dtype) - return [out, TensorAffineType(out_t.scale, out_t.zero_point, out_t.dtype, out_t.axis)] - - @register_fake_quantization_to_integer("nn.global_avg_pool2d") def global_avgpool2d(expr, type_map): """Rewrite a global_avgpool op""" diff --git a/python/tvm/topi/hexagon/compute_poolarea.py b/python/tvm/topi/hexagon/compute_poolarea.py new file mode 100644 index 0000000000000..6ba50c4a96e16 --- /dev/null +++ b/python/tvm/topi/hexagon/compute_poolarea.py @@ -0,0 +1,143 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, unused-variable, unused-argument, too-many-locals + +"""Compute PoolArea size which is used to exclude the zero-padding elements in the averaging + calculation. +""" + +from tvm import te, tir + + +def compute_PoolArea(i, j, ih, iw, kh, kw, sh, sw, dh, dw, pad_top, pad_left): + """ + Parameters + ---------- + i,j: + index of output tensor along H and W axis + This is equal to the starting point of the sliding window for which the average is computed + ih, iw: + input data size along H and W axis + kh, kw: + Kernel size along H and W axis + sh, sw: + Stride size along H and W axis + dh, dw: + Dilation size along H and W axis + pad_top, pad_left: + Pad size on Top and left side of input data + + # PoolArea refers to the area of that portion of each sliding window which only includes + # the input data and not the padded area. + + # Motivation: The following example shows the location of the first sliding window (at i=0, j=0) + # on a 6*6 array, with kernel=[3,3] and padding=[1, 1, 1, 1]. + # The input data elements are shown with (X) and padding data with (0). + # As shown, the number of non-padding elements that should be used for computing + # the average of values inside this window is 4, while the windows area is 3*3=9. + # To compute the PoolArea, we have to move the top/left edge of the window down/right + # to exclude zero-padding elements. The edge adjustment can be formulated as + # top_edge = max(i , pad_top) + # left_edge= max(j , pad_left) + # Note that pad_top and pad_left represent point 0 of the input data along i and j direction. 
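+    # For the first window of this example (i = 0, j = 0, pad_top = pad_left = 1), these give
+    # top_edge = max(0, 1) = 1 and left_edge = max(0, 1) = 1, i.e. the top/left edges of the
+    # window move one row down and one column right to skip the padding.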
+ # In this example, bottom_edge and right_edge of the PoolArea do not need any adjustment, + # because there is no padding data on those side of the window. + # However, as we slide the window down and to the right, the window might go + # beyond the input data boundaries (ih and iw). In these cases, bottom/right edge should be + # moved up/left to be located inside the input data. + # This can be formulated as + # bottom_edge = min(i + kh, ih + pad_top) + # left_edge = min(j + kw, iw + pad_left) + # Having all the edges, + # PoolArea = (bottom_edge - top_edge) * (right_edge - left_edge) + + # _______ + # |0 0 0|0 0 0 0 0 0 0 0 0 0 0 0 0 + # | | _______ + # |0 X X|X X X X 0 |0 X X|X X X X 0 + # | | | | + # |0 X X|X X X X 0 ====> |0 X X|X X X X 0 + # |_____| |_____| + # 0 X X X X X X 0 0 X X X X X X 0 + # 0 X X X X X X 0 0 X X X X X X 0 + # 0 X X X X X X 0 0 X X X X X X 0 + # 0 X X X X X X 0 0 X X X X X X 0 + # 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + + + # The above equations are derived under the assumption of having default value (1) + # for stride and dilation. However, we need to expand them to support non-default + # stride and dilation values. + # Stride impacts the starting location of the sliding windows, so i and j should be + # replaced by (i * sh) and j by (j * sw) in the equations. + # Dilation changes the window size, making k kernel elements scattered into a d*(k - 1) + 1 + # window. + # Non-1 dilation means that, we need to divide the adjusted window size by the dilation value + # to find out how many kernel elements inside the sliding window are inside the input data + # boundaries: + # top_edge= max(i * sh , pad_top) + # left_edge= max(j * sw , pad_left) + # bottom_edge = min(i * sh + (kh - 1) * dh + 1, ih + pad_top) + # left_edge = min(j * sw + (kw - 1) * dw + 1, data_w + pad_left) + # PoolArea = ceil_div((bottom_edge - top_edge), dh) * ceil_div((right_edge - left_edge), dw) + # + # Finally, we need to address one corner case related to the non-default dilation: + # Consider the following example along W axis, where iw = 3, kw = 3 and dw = 2. + # The first figure on the left shows the sliding window of size 5 starting at index 0, + # and the first figure on the right shows the same example with sliding window at index 1. + # The second row of figures show the PoolArea after adjusting the edges + # (both left_edge - right_edge = 3) + # The third row of figures show the location of dialated kernel points(*). + # As shown, although the distance between left and right edge in both cases is 3 and + # dilation is 2 and ceil_div(3,2)=2, the right PoolArea only includes 1 kernel point. + + # Sliding Window: |0 0 X X X |0 0 |0 X X X 0| + # PoolArea(after edge adjustment): 0 0|X X X |0 0 0|X X X| 0 + # location of dilated kernel points: * 0|* X * |0 0 *|X * X| 0 + # PoolArea (dilated_point_aware): * 0|* X * |0 0 * X|* X| 0 + + # To address this issue, instead of moving the left_edge to bring it just inside the input + # data boundary, we should move the edge to the right untill we get to the first dilated kernel + # point inside the input data boundary. + # The third row of figures shows how this row adjustment can solve the problem. 
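+    # Numerically, for the window at index 1 above (pad_left = 2, sw = 1, dw = 2, kw = 3),
+    # the dilated kernel points fall on columns 1, 3 and 5, and only the point at column 3
+    # lies inside the input data. Moving the left edge from column 2 (the first data column)
+    # to column 3 (the first dilated kernel point inside the data) gives
+    # ceil_div(5 - 3, 2) = 1, matching the actual number of in-bounds kernel points.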
+ # So the problem is reduced to finding the the first dilated kernel point inside the data + # boundary.# For that, we can find the number of dialted points which are mapped to the padded + # area and find the location of the next one which should be inside the input data: + # num_of_prev_points = (pad_top - i * sh - 1) // dh + # next_point_index = i * sh + (num_prev_points + 1) * dh + # + # With that, Top_edge and left_edge can be reformulated as: + # if i*sh - pad_top < 0: + # top_edge = i * sh + ((pad_top - i * sh - 1) // dh + 1) * dh + # else: + # top_edge = i * sh + # + # if j * sw - pad_left < 0: + # left_edge = j * sw + ((pad_left - j * sw - 1) // dw + 1) * dw + # else: + # left_edge= j * sw + + """ + top_edge = tir.if_then_else( + tir.all(i * sh - pad_top < 0), i * sh + ((pad_top - i * sh - 1) // dh + 1) * dh, i * sh + ) + bottom_edge = te.min(i * sh + (kh - 1) * dh + 1, ih + pad_top) + left_edge = tir.if_then_else( + tir.all(j * sw - pad_left < 0), j * sw + ((pad_left - j * sw - 1) // dw + 1) * dw, j * sw + ) + right_edge = te.min(j * sw + (kw - 1) * dw + 1, iw + pad_left) + return -((bottom_edge - top_edge) // -dh) * -((right_edge - left_edge) // -dw) diff --git a/python/tvm/topi/hexagon/qnn/__init__.py b/python/tvm/topi/hexagon/qnn/__init__.py index ba7d64b6b56d6..f7c4502301c06 100644 --- a/python/tvm/topi/hexagon/qnn/__init__.py +++ b/python/tvm/topi/hexagon/qnn/__init__.py @@ -18,7 +18,7 @@ """ Computes and schedules for Hexagon quantized ops """ from .adaptive_avg_pool1d import * -from .avg_pool2d import qnn_avg_pool2d_compute, qnn_avg_pool2d_schedule +from .avg_pool2d import * from .conv2d_alter_op import * from .dense_alter_op import * from .dequantize import dequantize_compute, dequantize_schedule diff --git a/python/tvm/topi/hexagon/qnn/avg_pool2d.py b/python/tvm/topi/hexagon/qnn/avg_pool2d.py index 4aac15cbdc178..1370ad36e4687 100644 --- a/python/tvm/topi/hexagon/qnn/avg_pool2d.py +++ b/python/tvm/topi/hexagon/qnn/avg_pool2d.py @@ -16,64 +16,144 @@ # under the License. # pylint: disable=invalid-name, unused-variable, unused-argument, too-many-locals -""" Compute and schedule for quantized avg_pool2d op - -Please note the following assumptions made by the implementation: - -1) The input must be padded in advance to account for 'padding'. In addition, - both input and output must be padded as per the physical buffer layout. -2) The current implementation assumes 'count_include_pad' to be 'True'. It can be - modified to support 'False' case but the element count for the pooling window - must be pre-computed and provided as an input to reduce the run-time overhead. -3) 'padding' is ignored. It must be handled outside of the sliced op. -4) Please note that this implementation will not work if the output includes any - physical layout related padding as it can result into out-of-bound access - for the input. 
-""" +""" Compute and schedule for quantized avg_pool2d op """ +import tvm from tvm import te from tvm import tir -from ..utils import get_layout_transform_fn, get_fixed_point_value +from ..utils import ( + get_layout_transform_fn, + get_fixed_point_value, + is_scalar, + get_const_int_value, + get_const_float_value, +) +from ...utils import get_const_tuple +from ...nn.utils import get_pad_tuple +from ...nn.pad import pad +from ..compute_poolarea import compute_PoolArea + + +def saturate(x: te.Tensor, dtype: str): + """Saturate value for the specified data type""" + return te.max(te.min_value(dtype), te.min(x, te.max_value(dtype))) -def validate_out_shape(out_shape: list, in_shape: list, kernel: list, stride: list, dilation: list): - """Validate output shape""" - _, oh, ow, _ = out_shape - _, ih, iw, _ = in_shape +def qnn_avg_pool2d_NCHW( + data: te.Tensor, + kernel: list, + stride: list, + padding: list, + dilation: list, + count_include_pad: bool, + oshape: list, + odtype: str, + # quantization params: + input_scale: float, + input_zero_point: int, + output_scale: float, + output_zero_point: int, +): + """Compute for quantized avg_pool2d""" kh, kw = kernel + rh = te.reduce_axis((0, kh), name="rh") + rw = te.reduce_axis((0, kw), name="rw") + + if odtype == "uint8": + temp_dtype = "uint16" + elif odtype == "int8": + temp_dtype = "int16" + else: + raise RuntimeError(f"Unsupported output dtype, {odtype}'") + sh, sw = stride dh, dw = dilation - if ih < (oh - 1) * sh + dh * (kh - 1) + 1: - raise RuntimeError("Output height is too large") - if iw < (ow - 1) * sw + dw * (kw - 1) + 1: - raise RuntimeError("Output width is too large") + scale = input_scale / output_scale + scale_fixed_point, rsh = get_fixed_point_value(scale, "int16") + corr = (output_zero_point << rsh) - input_zero_point * scale_fixed_point -def saturate(x: te.Tensor, dtype: str): - """Saturate value for the specified data type""" - return te.max(te.min_value(dtype), te.min(x, te.max_value(dtype))) + dilated_kh = (kh - 1) * dh + 1 + dilated_kw = (kw - 1) * dw + 1 + + pad_top, pad_left, pad_down, pad_right = get_pad_tuple( + get_const_tuple(padding), (dilated_kh, dilated_kw) + ) + + # DOPAD + if pad_top != 0 or pad_down != 0 or pad_left != 0 or pad_right != 0: + pad_before = (0, 0, pad_top, pad_left) + pad_after = (0, 0, pad_down, pad_right) + data_pad = pad(data, pad_before, pad_after, pad_value=input_zero_point, name="data_pad") + else: + # By definition when True, zero-padding will be included in the averaging calculation + # This is equivalent to PoolArea = (kh * kw) + count_include_pad = True + data_pad = data + + Sum = te.compute( + oshape, + lambda b, c, h, w: te.sum( + data_pad[b, c, h * sh + dh * rh, w * sw + dw * rw].astype(temp_dtype), axis=[rh, rw] + ), + name="pool_sum", + ) + + if not count_include_pad: + # Compute PoolArea using unpadded input tensor + _, _, oh, ow = oshape + _, _, ih, iw = data.shape + PoolArea = te.compute( + (oh, ow), + lambda i, j: compute_PoolArea(i, j, ih, iw, kh, kw, sh, sw, dh, dw, pad_top, pad_left), + name="pool_area", + ) -def qnn_avg_pool2d_compute( + ScaleWithArea = te.compute( + (oh, ow), + lambda i, j: (scale_fixed_point // PoolArea[i, j]).astype("int32"), + name="scale_with_area", + ) + + Avg = te.compute( + oshape, + lambda b, c, h, w: saturate( + ((Sum[b, c, h, w] * ScaleWithArea[h, w]) + corr + (1 << (rsh - 1))) >> rsh, odtype + ).astype(odtype), + name="pool_avg", + ) + else: + ScaleWithArea = scale_fixed_point // (kh * kw) + Avg = te.compute( + oshape, + lambda b, c, h, w: 
saturate( + ((Sum[b, c, h, w] * ScaleWithArea) + corr + (1 << (rsh - 1))) >> rsh, odtype + ).astype(odtype), + name="pool_avg", + ) + return Avg + + +def qnn_avg_pool2d_NHWC( data: te.Tensor, kernel: list, stride: list, + padding: list, dilation: list, + count_include_pad: bool, oshape: list, odtype: str, # quantization params: - input_zero_point: int, input_scale: float, - output_zero_point: int, + input_zero_point: int, output_scale: float, + output_zero_point: int, ): """Compute for quantized avg_pool2d""" kh, kw = kernel rh = te.reduce_axis((0, kh), name="rh") rw = te.reduce_axis((0, kw), name="rw") - ob, oh, ow, oc = oshape - if isinstance(ob, int): - validate_out_shape(oshape, data.shape, kernel, stride, dilation) if odtype == "uint8": temp_dtype = "uint16" @@ -85,86 +165,217 @@ def qnn_avg_pool2d_compute( sh, sw = stride dh, dw = dilation - PoolArea = kh * kw - scale = input_scale / output_scale scale_fixed_point, rsh = get_fixed_point_value(scale, "int16") - scale_with_area = scale_fixed_point // PoolArea corr = (output_zero_point << rsh) - input_zero_point * scale_fixed_point + dilated_kh = (kh - 1) * dh + 1 + dilated_kw = (kw - 1) * dw + 1 + # Compute Area + + pad_top, pad_left, pad_down, pad_right = get_pad_tuple( + get_const_tuple(padding), (dilated_kh, dilated_kw) + ) + # DOPAD + if pad_top != 0 or pad_down != 0 or pad_left != 0 or pad_right != 0: + pad_before = (0, pad_top, pad_left, 0) + pad_after = (0, pad_down, pad_right, 0) + data_pad = pad(data, pad_before, pad_after, pad_value=input_zero_point, name="data_pad") + else: + # By definition when True, zero-padding will be included in the averaging calculation + # This is equivalent to PoolArea = (kh * kw) + count_include_pad = True + data_pad = data + Sum = te.compute( oshape, lambda b, h, w, c: te.sum( - data[b, h * sh + dh * rh, w * sw + dw * rw, c].astype(temp_dtype), axis=[rh, rw] + data_pad[b, h * sh + dh * rh, w * sw + dw * rw, c].astype(temp_dtype), axis=[rh, rw] ), - name="sum", + name="pool_sum", ) - Avg = te.compute( - oshape, - lambda b, h, w, c: saturate( - ((Sum[b, h, w, c] * scale_with_area) + corr) >> rsh, odtype - ).astype(odtype), - name="avg", - ) + if not count_include_pad: + # Compute PoolArea using unpadded input tensor + _, oh, ow, _ = oshape + _, ih, iw, _ = data.shape + + PoolArea = te.compute( + (oh, ow), + lambda i, j: compute_PoolArea(i, j, ih, iw, kh, kw, sh, sw, dh, dw, pad_top, pad_left), + name="pool_area", + ) + + ScaleWithArea = te.compute( + (oh, ow), + lambda i, j: tir.if_then_else( + tir.all(PoolArea[i, j] > 0), + (scale_fixed_point // PoolArea[i, j]).astype("int32"), + 0, + ), + name="scale_with_area", + ) + + Avg = te.compute( + oshape, + lambda b, h, w, c: saturate( + ((Sum[b, h, w, c] * ScaleWithArea[h, w]) + corr + (1 << (rsh - 1))) >> rsh, odtype + ).astype(odtype), + name="pool_avg", + ) + else: + ScaleWithArea = scale_fixed_point // (kh * kw) + Avg = te.compute( + oshape, + lambda b, h, w, c: saturate( + ((Sum[b, h, w, c] * ScaleWithArea) + corr + (1 << (rsh - 1))) >> rsh, odtype + ).astype(odtype), + name="pool_avg", + ) + return Avg -def schedule_nhwc_8h8w32c(outs: te.Tensor, ins: te.Tensor, output_layout: str, input_layout: str): - """Schedule for input and output layout nhwc-8h8w32c""" +def qnn_avg_pool2d_wrapper_compute_NCHW( + data: te.Tensor, + kernel: list, + stride: list, + padding: list, + dilation: list, + count_include_pad: bool, + oshape: list, + odtype: str, + # quantization params: + input_scale: float, + input_zero_point: int, + output_scale: float, + 
output_zero_point: int, +): + """Extract qnn params""" + if ( + is_scalar(input_scale) + and is_scalar(output_scale) + and is_scalar(input_zero_point) + and is_scalar(output_zero_point) + ): + iscale = get_const_float_value(input_scale) + oscale = get_const_float_value(output_scale) + izero_point = get_const_int_value(input_zero_point) + ozero_point = get_const_int_value(output_zero_point) + return qnn_avg_pool2d_NCHW( + data, + kernel, + stride, + padding, + dilation, + count_include_pad, + oshape, + odtype, + iscale, + izero_point, + oscale, + ozero_point, + ) + else: + raise RuntimeError("quantization parameters should be scalar tensors") + + +def qnn_avg_pool2d_wrapper_compute_NHWC( + data: te.Tensor, + kernel: list, + stride: list, + padding: list, + dilation: list, + count_include_pad: bool, + oshape: list, + odtype: str, + # quantization params: + input_scale: float, + input_zero_point: int, + output_scale: float, + output_zero_point: int, +): + """Extract qnn params""" + if ( + is_scalar(input_scale) + and is_scalar(output_scale) + and is_scalar(input_zero_point) + and is_scalar(output_zero_point) + ): + iscale = get_const_float_value(input_scale) + oscale = get_const_float_value(output_scale) + izero_point = get_const_int_value(input_zero_point) + ozero_point = get_const_int_value(output_zero_point) + return qnn_avg_pool2d_NHWC( + data, + kernel, + stride, + padding, + dilation, + count_include_pad, + oshape, + odtype, + iscale, + izero_point, + oscale, + ozero_point, + ) + else: + raise RuntimeError("quantization parameters should be scalar tensors") + + +def schedule_qnn_avg_pool2d(outs): + """Schedule for qnn.avg_pool2d + Parameters + ---------- + outs: Array of Tensor + The computation graph description of qnn.avg_pool2d + in the format of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. + """ + outs = [outs] if isinstance(outs, tvm.te.tensor.Tensor) else outs + s = tvm.te.create_schedule([x.op for x in outs]) + tvm.te.schedule.AutoInlineInjective(s) + return s + + +def schedule_8h8w32c(outs: te.Tensor, ins: te.Tensor, output_layout: str, input_layout: str): + """Schedule for input and output layout 8h8w32c""" + func = te.create_prim_func([ins, outs]) s = tir.Schedule(func) - Sum = s.get_block("sum") - Avg = s.get_block("avg") - + Sum = s.get_block("pool_sum") + Avg = s.get_block("pool_avg") + mem_scope = "global.vtcm" + sum_read = s.cache_read(Sum, 0, mem_scope) + avg_read = s.cache_read(Avg, 0, mem_scope) + avg_write = s.cache_write(Avg, 0, mem_scope) input_transform_fn = get_layout_transform_fn(input_layout) output_transform_fn = get_layout_transform_fn(output_layout) - s.transform_layout(Sum, ("read", 0), input_transform_fn) - s.transform_layout(Avg, ("write", 0), output_transform_fn) - - # Schedule 'Avg' - # Split and reorder the axes to iterate over the output tensor chunks. - # Each chunk consists for 2048 bytes with 32 channels being the fastest - # changing axis, followed by 8 width and then 8 height. - # The width is split by a factor of 4 and then fused with 32 channels - # to provide full vector length of data for the output tensor chunks. - # NOTE: These schedules are a work in progress and may require - # adjustments in future as some of the missing features for 2-d tensors - # become available. 
- n, h, w, c = s.get_loops(Avg) - ho, hi = s.split(h, [None, 8]) - wo, wi = s.split(w, [None, 8]) - wio, wii = s.split(wi, [None, 4]) - co, ci = s.split(c, [None, 32]) - s.reorder(n, ho, wo, co, hi, wio, wii, ci) - wii_ci = s.fuse(wii, ci) - s.vectorize(wii_ci) - - # Schedule 'Sum' - s.compute_at(Sum, wio) - Sum_axis = s.get_loops(Sum) - # Compute for 'Sum' includes reduction along height and width. The axes - # are being reordered so that 4 width and 32 channels become the - # inner-most loops which then can be fused and vectorized. However, - # vectorization of the 2-d tensors doesn't work when reduction is - # involved and requires codegen support that is yet to be added. - s.reorder(Sum_axis[-2], Sum_axis[-1], Sum_axis[-4], Sum_axis[-3]) - ci_wii = s.fuse(Sum_axis[-4], Sum_axis[-3]) - # s.vectorize(ci_wii) # Doesn't work + s.transform_layout(Sum, ("read", 0), input_transform_fn, pad_value=0) + s.transform_layout(Avg, ("read", 0), input_transform_fn, pad_value=0) + s.transform_layout(Avg, ("write", 0), output_transform_fn, pad_value=0) return s -def schedule_n11c_2048c(outs: te.Tensor, ins: te.Tensor, output_layout: str, input_layout: str): - """Schedule for output layout: n11c-2048c, input layout: nhwc-8h8w32c""" +def schedule_2048c(outs: te.Tensor, ins: te.Tensor, output_layout: str, input_layout: str): + """Schedule for output layout: 2048c, input layout: 8h8w32c""" func = te.create_prim_func([ins, outs]) s = tir.Schedule(func) - Sum = s.get_block("sum") - Avg = s.get_block("avg") + Sum = s.get_block("pool_sum") + Avg = s.get_block("pool_avg") + mem_scope = "global.vtcm" + sum_read = s.cache_read(Sum, 0, mem_scope) + avg_write = s.cache_write(Avg, 0, mem_scope) input_transform_fn = get_layout_transform_fn(input_layout) output_transform_fn = get_layout_transform_fn(output_layout) - s.transform_layout(Sum, ("read", 0), input_transform_fn) - s.transform_layout(Avg, ("write", 0), output_transform_fn) + s.transform_layout(Sum, ("read", 0), input_transform_fn, pad_value=0) + s.transform_layout(Avg, ("write", 0), output_transform_fn, pad_value=0) # Schedule 'Avg' # Split and reorder the axes to iterate over the output tensor chunks. @@ -173,7 +384,13 @@ def schedule_n11c_2048c(outs: te.Tensor, ins: te.Tensor, output_layout: str, inp # NOTE: These schedules are a work in progress and may require # adjustments in future as some of the missing features for 2-d tensors # become available. - n, h, w, c = s.get_loops(Avg) + + if output_layout == "n11c-2048c-2d": + _, _, _, c = s.get_loops(Avg) + else: + _, c, _, _ = s.get_loops(Avg) + + # n, h, w, c = s.get_loops(Avg) co, ci = s.split(c, [None, 2048]) cio, cii = s.split(ci, [None, 128]) s.vectorize(cii) @@ -191,15 +408,10 @@ def schedule_n11c_2048c(outs: te.Tensor, ins: te.Tensor, output_layout: str, inp def qnn_avg_pool2d_schedule(outs: te.Tensor, ins: te.Tensor, output_layout: str, input_layout: str): - """Quantized avg_pool2d schedule + """Quantized avg_pool2d schedule""" + if output_layout == "nhwc-8h8w32c-2d" or output_layout == "nchw-8h8w32c-2d": + return schedule_8h8w32c(outs, ins, output_layout, input_layout) + if output_layout == "n11c-2048c-2d" or output_layout == "nc11-2048c-2d": + return schedule_2048c(outs, ins, output_layout, input_layout) - NOTE: This schedule assumes that both input and output tensors are in the form of - 2d discontiguous buffer and data is already arranged as per the input and output layout - respectively. 
- - """ - if output_layout == "nhwc-8h8w32c-2d": - return schedule_nhwc_8h8w32c(outs, ins, output_layout, input_layout) - if output_layout == "n11c-2048c-2d": - return schedule_n11c_2048c(outs, ins, output_layout, input_layout) raise RuntimeError(f"Unexpected layout '{output_layout}'") diff --git a/python/tvm/topi/hexagon/slice_ops/__init__.py b/python/tvm/topi/hexagon/slice_ops/__init__.py index 46ae0c53200f8..b38dd5ecb3c1e 100644 --- a/python/tvm/topi/hexagon/slice_ops/__init__.py +++ b/python/tvm/topi/hexagon/slice_ops/__init__.py @@ -17,7 +17,7 @@ """ Computes and Schedules for Hexagon slice ops. """ -from .avg_pool2d import avg_pool2d_compute, avg_pool2d_schedule +from .avg_pool2d import avg_pool2d_NHWC, avg_pool2d_NCHW, avg_pool2d_schedule from .max_pool2d import max_pool2d_compute, max_pool2d_STIR_schedule from .add_subtract_multiply import * from .argmax import argmax_compute, argmax_schedule diff --git a/python/tvm/topi/hexagon/slice_ops/avg_pool2d.py b/python/tvm/topi/hexagon/slice_ops/avg_pool2d.py index bf6d57b8f7f84..9f78cfce28380 100644 --- a/python/tvm/topi/hexagon/slice_ops/avg_pool2d.py +++ b/python/tvm/topi/hexagon/slice_ops/avg_pool2d.py @@ -16,118 +16,206 @@ # under the License. # pylint: disable=invalid-name, unused-variable, unused-argument, too-many-locals, pointless-exception-statement -""" Compute and schedule for avg_pool2d slice op - -Please note the following assumptions made by the implementation: - -1) The input must be padded in advance to account for 'padding'. In addition, - both input and output must be padded as per the physical buffer layout. -2) The current implementation assumes 'count_include_pad' to be 'True'. It can be - modified to support 'False' case but the element count for the pooling window - must be pre-computed and provided as an input to reduce the run-time overhead. -3) 'padding' is ignored. It must be handled outside of the sliced op. -4) Please note that this implementation will not work if the output includes any - physical layout related padding as it can result into out-of-bound access - for the input. 
-""" +""" Compute and schedule for avg_pool2d slice op """ from tvm import te from tvm import tir from ..utils import get_layout_transform_fn +from ...utils import get_const_tuple +from ...nn.utils import get_pad_tuple +from ...nn.pad import pad +from ..compute_poolarea import compute_PoolArea -def validate_out_shape(out_shape, in_shape, kernel, stride, dilation): - """Validate output shape""" - _, oh, ow, _ = out_shape - _, ih, iw, _ = in_shape +def avg_pool2d_NCHW( + data, kernel, stride, padding, dilation, count_include_pad, oshape, odtype="float16" +): + """avg_pool2d compute""" + if odtype != "float16": + raise RuntimeError(f"Unsupported output dtype '{odtype}'") kh, kw = kernel + rh = te.reduce_axis((0, kh), name="rh") + rw = te.reduce_axis((0, kw), name="rw") sh, sw = stride dh, dw = dilation - if ih < (oh - 1) * sh + dh * (kh - 1) + 1: - raise RuntimeError("Output height is too large") - if iw < (ow - 1) * sw + dw * (kw - 1) + 1: - raise RuntimeError("Output width is too large") + dilated_kh = (kh - 1) * dh + 1 + dilated_kw = (kw - 1) * dw + 1 + + pad_top, pad_left, pad_down, pad_right = get_pad_tuple( + get_const_tuple(padding), (dilated_kh, dilated_kw) + ) + + # DOPAD -def avg_pool2d_compute(A, kernel, stride, dilation, oshape, odtype="float16"): + if pad_top != 0 or pad_down != 0 or pad_left != 0 or pad_right != 0: + pad_before = (0, 0, pad_top, pad_left) + pad_after = (0, 0, pad_down, pad_right) + data_pad = pad(data, pad_before, pad_after, name="data_pad") + else: + # By definition when True, zero-padding will be included in the averaging calculation + # This is equivalent to PoolArea = (kh * kw) + count_include_pad = True + data_pad = data + + Sum = te.compute( + oshape, + lambda b, c, h, w: te.sum( + data_pad[b, c, h * sh + dh * rh, w * sw + dw * rw].astype("float32"), axis=[rh, rw] + ), + name="pool_sum", + ) + + if not count_include_pad: + # Compute PoolArea using unpadded input tensor + _, _, oh, ow = oshape + _, _, ih, iw = data.shape + + PoolArea = te.compute( + (oh, ow), + lambda i, j: compute_PoolArea(i, j, ih, iw, kh, kw, sh, sw, dh, dw, pad_top, pad_left), + name="pool_area", + ) + + InvArea = te.compute( + (oh, ow), + lambda i, j: tir.if_then_else( + tir.all(PoolArea[i, j] > 0), (float(1) / PoolArea[i, j]), 0 + ), + name="inverse_area", + ) + + Avg = te.compute( + oshape, + lambda b, c, h, w: (Sum[b, c, h, w] * InvArea[h, w]).astype(odtype), + name="pool_avg", + ) + else: + InvArea = float(1) / (kh * kw) + Avg = te.compute( + oshape, lambda b, c, h, w: (Sum[b, c, h, w] * InvArea).astype(odtype), name="pool_avg" + ) + + return Avg + + +def avg_pool2d_NHWC( + data, kernel, stride, padding, dilation, count_include_pad, oshape, odtype="float16" +): """avg_pool2d compute""" if odtype != "float16": - RuntimeError(f"Unsupported output dtype '{odtype}'") + raise RuntimeError(f"Unsupported output dtype '{odtype}'") kh, kw = kernel rh = te.reduce_axis((0, kh), name="rh") rw = te.reduce_axis((0, kw), name="rw") - ob, oh, ow, oc = oshape - if isinstance(ob, int): - validate_out_shape(oshape, A.shape, kernel, stride, dilation) sh, sw = stride dh, dw = dilation InvArea = float(1) / (kh * kw) + dilated_kh = (kh - 1) * dh + 1 + dilated_kw = (kw - 1) * dw + 1 + + pad_top, pad_left, pad_down, pad_right = get_pad_tuple( + get_const_tuple(padding), (dilated_kh, dilated_kw) + ) + + # DOPAD + if pad_top != 0 or pad_down != 0 or pad_left != 0 or pad_right != 0: + pad_before = (0, pad_top, pad_left, 0) + pad_after = (0, pad_down, pad_right, 0) + data_pad = pad(data, pad_before, pad_after, 
name="data_pad") + else: + # By definition when True, zero-padding will be included in the averaging calculation + # This is equivalent to PoolArea = (kh * kw) + count_include_pad = True + data_pad = data + Sum = te.compute( oshape, lambda b, h, w, c: te.sum( - A[b, h * sh + dh * rh, w * sw + dw * rw, c].astype("float32"), axis=[rh, rw] + data_pad[b, h * sh + dh * rh, w * sw + dw * rw, c].astype("float32"), axis=[rh, rw] ), - name="sum", - ) - Avg = te.compute( - oshape, lambda b, h, w, c: (Sum[b, h, w, c] * InvArea).astype(A.dtype), name="avg" + name="pool_sum", ) + + if not count_include_pad: + # Compute PoolArea using unpadded input tensor + _, oh, ow, _ = oshape + _, ih, iw, _ = data.shape + + PoolArea = te.compute( + (oh, ow), + lambda i, j: compute_PoolArea(i, j, ih, iw, kh, kw, sh, sw, dh, dw, pad_top, pad_left), + name="pool_area", + ) + + InvArea = te.compute( + (oh, ow), + lambda i, j: tir.if_then_else( + tir.all(PoolArea[i, j] > 0), (float(1) / PoolArea[i, j]), 0 + ), + name="inverse_area", + ) + + Avg = te.compute( + oshape, + lambda b, h, w, c: (Sum[b, h, w, c] * InvArea[h, w]).astype(odtype), + name="pool_avg", + ) + else: + InvArea = float(1) / (kh * kw) + Avg = te.compute( + oshape, lambda b, h, w, c: (Sum[b, h, w, c] * InvArea).astype(odtype), name="pool_avg" + ) + return Avg -def schedule_nhwc_8h2w32c2w(outs, ins, output_layout: str, input_layout: str): - """Schedule for input and output layout nhwc-8h2w32c2w""" +def schedule_8h2w32c2w(outs, ins, output_layout: str, input_layout: str): + """Schedule for input and output layout 8h2w32c2w""" func = te.create_prim_func([ins, outs]) + print(func) s = tir.Schedule(func) - Sum = s.get_block("sum") - Avg = s.get_block("avg") + Sum = s.get_block("pool_sum") + Avg = s.get_block("pool_avg") + mem_scope = "global.vtcm" + sum_read = s.cache_read(Sum, 0, mem_scope) + avg_write = s.cache_write(Avg, 0, mem_scope) input_transform_fn = get_layout_transform_fn(input_layout) output_transform_fn = get_layout_transform_fn(output_layout) - s.transform_layout(Sum, ("read", 0), input_transform_fn) - s.transform_layout(Avg, ("write", 0), output_transform_fn) - - # Schedule 'Avg' - n, h, w, c = s.get_loops(Avg) - ho, hi = s.split(h, [None, 8]) - wo, wi = s.split(w, [None, 4]) - wio, wii = s.split(wi, [None, 2]) - co, ci = s.split(c, [None, 32]) - s.reorder(n, ho, wo, co, hi, wio, ci, wii) - ci_wii = s.fuse(ci, wii) - s.vectorize(ci_wii) - - # Schedule 'Sum' - s.compute_at(Sum, wio) - Sum_axis = s.get_loops(Sum) - s.reorder(Sum_axis[-2], Sum_axis[-1], Sum_axis[-4], Sum_axis[-3]) - ci_wii = s.fuse(Sum_axis[-4], Sum_axis[-3]) - # s.vectorize(ci_wii) # Doesn't work + s.transform_layout(Sum, ("read", 0), input_transform_fn, pad_value=0.0) + s.transform_layout(Avg, ("write", 0), output_transform_fn, pad_value=0.0) return s -def schedule_n11c_1024c(outs, ins, output_layout: str, input_layout: str): - """Schedule for output layout: n11c-1024c, input layout: nhwc-8h2w32c2w""" +def schedule_1024c(outs, ins, output_layout: str, input_layout: str): + """Schedule for output layout: 1024c, input layout: 8h2w32c2w""" func = te.create_prim_func([ins, outs]) s = tir.Schedule(func) - Sum = s.get_block("sum") - Avg = s.get_block("avg") + Sum = s.get_block("pool_sum") + Avg = s.get_block("pool_avg") + mem_scope = "global.vtcm" + sum_read = s.cache_read(Sum, 0, mem_scope) + avg_write = s.cache_write(Avg, 0, mem_scope) input_transform_fn = get_layout_transform_fn(input_layout) output_transform_fn = get_layout_transform_fn(output_layout) - s.transform_layout(Sum, 
("read", 0), input_transform_fn) - s.transform_layout(Avg, ("write", 0), output_transform_fn) + s.transform_layout(Sum, ("read", 0), input_transform_fn, pad_value=0.0) + s.transform_layout(Avg, ("write", 0), output_transform_fn, pad_value=0.0) # Schedule 'Avg' - n, h, w, c = s.get_loops(Avg) - co, ci = s.split(c, [None, 1024]) + if output_layout == "n11c-1024c-2d": + n, h, w, c = s.get_loops(Avg) + else: + n, c, h, w = s.get_loops(Avg) + _, ci = s.split(c, [None, 1024]) cio, cii = s.split(ci, [None, 64]) s.vectorize(cii) # Schedule 'Sum' - s.compute_at(Sum, cio) + # s.compute_at(Sum, cio) Sum_axis = s.get_loops(Sum) s.reorder(Sum_axis[-2], Sum_axis[-1], Sum_axis[-3]) # s.vectorize(Sum_axis[-3]) # Doesn't work @@ -136,8 +224,8 @@ def schedule_n11c_1024c(outs, ins, output_layout: str, input_layout: str): def avg_pool2d_schedule(outs, ins, output_layout: str, input_layout: str): """avg_pool2d schedule""" - if output_layout == "nhwc-8h2w32c2w-2d": - return schedule_nhwc_8h2w32c2w(outs, ins, output_layout, input_layout) - if output_layout == "n11c-1024c-2d": - return schedule_n11c_1024c(outs, ins, output_layout, input_layout) + if output_layout == "nhwc-8h2w32c2w-2d" or output_layout == "nchw-8h2w32c2w-2d": + return schedule_8h2w32c2w(outs, ins, output_layout, input_layout) + if output_layout == "n11c-1024c-2d" or output_layout == "nc11-1024c-2d": + return schedule_1024c(outs, ins, output_layout, input_layout) raise RuntimeError(f"Unexpected layout '{output_layout}'") diff --git a/python/tvm/topi/hexagon/utils.py b/python/tvm/topi/hexagon/utils.py index f017aaebbdeb4..4bec64e9dbceb 100644 --- a/python/tvm/topi/hexagon/utils.py +++ b/python/tvm/topi/hexagon/utils.py @@ -24,10 +24,30 @@ from typing import Dict, Tuple, Union import tvm -from tvm import IRModule, te +from tvm import IRModule, te, tir from tvm.tir import IndexMap, PrimFunc +def is_scalar(expr): + if isinstance(expr, te.Tensor): + return expr.ndim == 0 and (isinstance(expr.op.body[0], (tir.FloatImm, tir.IntImm))) + return isinstance(expr, (tir.FloatImm, tir.IntImm)) + + +def get_const_int_value(expr): + if isinstance(expr, te.Tensor): + assert isinstance(expr.op.body[0], tir.IntImm) + return expr.op.body[0].value + return get_const_int(expr) + + +def get_const_float_value(expr): + if isinstance(expr, te.Tensor): + assert isinstance(expr.op.body[0], tir.FloatImm) + return expr.op.body[0].value + return get_const_float(expr) + + def n11c_1024c_2d(n, h, w, c): """Return index map for n11c_1024 2d layout""" return [n, h, w, c // 1024, IndexMap.AXIS_SEPARATOR, c % 1024] @@ -38,6 +58,11 @@ def n11c_1024c_1d(n, h, w, c): return [n, h, w, c // 1024, c % 1024] +def nc11_1024c_2d(n, c, h, w): + """Return index map for nc11_1024 2d layout""" + return [n, c // 1024, IndexMap.AXIS_SEPARATOR, c % 1024, h, w] + + def nhwc_8h2w32c2w_2d(n, h, w, c): """Return index map for nhwc_8h2w32c2w 2d layout""" return [n, h // 8, w // 4, c // 32, IndexMap.AXIS_SEPARATOR, h % 8, (w % 4) // 2, c % 32, w % 2] @@ -48,6 +73,11 @@ def nhwc_8h2w32c2w_1d(n, h, w, c): return [n, h // 8, w // 4, c // 32, h % 8, (w % 4) // 2, c % 32, w % 2] +def nchw_8h2w32c2w_2d(n, c, h, w): + """Return index map for nchw_8h2w32c2w 2d layout""" + return [n, c // 32, h // 8, w // 4, IndexMap.AXIS_SEPARATOR, h % 8, (w % 4) // 2, c % 32, w % 2] + + def nhw_32h16w_2d(n, h, w): """Return index map for nhw_32h16w 2d layout""" return [n, h // 32, w // 16, IndexMap.AXIS_SEPARATOR, h % 32, w % 16] @@ -88,6 +118,11 @@ def nc_2048c_2d(n, c): return [n, c // 2048, IndexMap.AXIS_SEPARATOR, c % 2048] 
+def nc11_2048c_2d(n, c, h, w): + """Return index map for nc11_2048c 2d layout""" + return [n, c // 2048, IndexMap.AXIS_SEPARATOR, h, w, c % 2048] + + def nc_1024c_1d(n, c): """Return index map for nc_1024c 1d layout""" return [n, c // 1024, c % 1024] @@ -123,11 +158,25 @@ def nhwc_8h8w32c_2d(n, h, w, c): return [n, h // 8, w // 8, c // 32, IndexMap.AXIS_SEPARATOR, h % 8, w % 8, c % 32] +def nhwc_8h8w32c_1d(n, h, w, c): + """Return index map for nhwc_8h8w32c 1d layout""" + return [n, h // 8, w // 8, c // 32, h % 8, w % 8, c % 32] + + +def nchw_8h8w32c_2d(n, c, h, w): + return [n, c // 32, h // 8, w // 8, IndexMap.AXIS_SEPARATOR, h % 8, w % 8, c % 32] + + def n11c_2048c_2d(n, h, w, c): """Return index map for n11c_2048c 2d layout""" return [n, h, w, c // 2048, IndexMap.AXIS_SEPARATOR, c % 2048] +def n11c_2048c_1d(n, h, w, c): + """Return index map for n11c_2048c 1 layout""" + return [n, h, w, c // 2048, c % 2048] + + def iohw_16i32o2i_1d(height, width, in_channel, out_channel): return [ in_channel // 32, @@ -163,12 +212,16 @@ def get_layout_transform_fn(layout): return nhwc_8h2w32c2w_2d if layout == "nhwc-8h2w32c2w-1d": return nhwc_8h2w32c2w_1d + if layout == "nchw-8h2w32c2w-2d": + return nchw_8h2w32c2w_2d if layout == "n11c-1024c-2d": return n11c_1024c_2d if layout == "n11c-1024c-1d": return n11c_1024c_1d if layout == "nhwc-1024c-2d": return nhwc_1024c_2d + if layout == "nc11-1024c-2d": + return nc11_1024c_2d if layout == "nc-1024-2d": return nc_1024_2d if layout == "nhw-32h16w-2d": @@ -201,16 +254,26 @@ def get_layout_transform_fn(layout): return nc_2048c_2d if layout == "nhwc-8h8w32c-2d": return nhwc_8h8w32c_2d + if layout == "nhwc-8h8w32c-1d": + return nhwc_8h8w32c_1d + if layout == "nchw-8h8w32c-2d": + return nchw_8h8w32c_2d if layout == "n11c-2048c-2d": return n11c_2048c_2d + if layout == "n11c-2048c-1d": + return n11c_2048c_1d if layout == "ohwi32o-1d": return ohwi32o_1d + if layout == "nc11-2048c-2d": + return nc11_2048c_2d if layout == "ncw-32c64w-2d": return ncw_32c64w_2d if layout == "nchw-32c8h8w-2d": return nchw_32c8h8w_2d if layout == "nchw-32c8h4w-2d": return nchw_32c8h4w_2d + if layout == "nchw-8h8w32c-2d": + return nchw_8h8w32c_2d raise RuntimeError(f"Unexpected layout '{layout}'") diff --git a/src/relay/op/nn/pooling.cc b/src/relay/op/nn/pooling.cc index 2a8c2440519ee..1cfbab6e661ee 100644 --- a/src/relay/op/nn/pooling.cc +++ b/src/relay/op/nn/pooling.cc @@ -32,6 +32,7 @@ #include #include "../../transforms/infer_layout_utils.h" +#include "pooling_common.h" namespace tvm { namespace relay { @@ -40,43 +41,6 @@ namespace relay { TVM_REGISTER_NODE_TYPE(MaxPool2DAttrs); TVM_REGISTER_NODE_TYPE(AvgPool2DAttrs); -template -InferCorrectLayoutOutput PoolInferCorrectLayout(const Attrs& attrs, - const Array& new_in_layouts, - const Array& old_in_layouts, - const Array& old_in_types) { - const auto* attrs_ptr = attrs.as(); - ICHECK(attrs_ptr); - ObjectPtr params = make_object(*attrs_ptr); - - if (params->out_layout != "") { - // when users specify the out_layout of pooling, follow user's preference - ICHECK_EQ(params->layout, params->out_layout) - << "Pooling input/output layouts mismatch: " << params->layout << " vs. 
" - << params->out_layout; - } else if (new_in_layouts.defined()) { - // the pooling is using an inferred layout (i.e., new_in_layouts[0]) given by relay caller - ICHECK_EQ(new_in_layouts.size(), 1); - params->layout = new_in_layouts[0].name(); - } - - return InferCorrectLayoutOutput({params->layout}, {params->layout}, Attrs(params)); -} - -IndexExpr calculate_pool_dimension(IndexExpr in_dimension, IndexExpr pad_amount, - IndexExpr pool_size, IndexExpr dilation, IndexExpr stride_size, - bool ceil_mode) { - IndexExpr numerator = in_dimension + pad_amount - ((pool_size - 1) * dilation + 1); - IndexExpr denominator = stride_size; - - // Emulate the behavior of running ceil on numerator / denominator rather than floor - if (ceil_mode) { - numerator += denominator - 1; - } - - return numerator / denominator + 1; -} - template bool Pool2DRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { diff --git a/src/relay/op/nn/pooling.h b/src/relay/op/nn/pooling.h index 32ae464101ab1..123cfcd075705 100644 --- a/src/relay/op/nn/pooling.h +++ b/src/relay/op/nn/pooling.h @@ -18,8 +18,8 @@ */ /*! - * \file src/relay/op/nn/convolution.h - * \brief Properties def of convlution operator for sharing. + * \file src/relay/op/nn/pooling.h + * \brief utilities for creating pool ops */ #ifndef TVM_RELAY_OP_NN_POOLING_H_ #define TVM_RELAY_OP_NN_POOLING_H_ diff --git a/src/relay/op/nn/pooling_common.h b/src/relay/op/nn/pooling_common.h new file mode 100644 index 0000000000000..9e30cadddd583 --- /dev/null +++ b/src/relay/op/nn/pooling_common.h @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/op/nn/pooling_common.h + * \brief Properties def of pooling operator for sharing. 
+ */ +#ifndef TVM_RELAY_OP_NN_POOLING_COMMON_H_ +#define TVM_RELAY_OP_NN_POOLING_COMMON_H_ + +#include +#include +#include + +#include +#include +#include + +#include "../op_common.h" + +namespace tvm { +namespace relay { + +inline IndexExpr calculate_pool_dimension(IndexExpr in_dimension, IndexExpr pad_amount, + IndexExpr pool_size, IndexExpr dilation, + IndexExpr stride_size, bool ceil_mode) { + IndexExpr numerator = in_dimension + pad_amount - ((pool_size - 1) * dilation + 1); + IndexExpr denominator = stride_size; + + // Emulate the behavior of running ceil on numerator / denominator rather than floor + if (ceil_mode) { + numerator += denominator - 1; + } + + return numerator / denominator + 1; +} + +template +InferCorrectLayoutOutput PoolInferCorrectLayout(const Attrs& attrs, + const Array& new_in_layouts, + const Array& old_in_layouts, + const Array& old_in_types) { + const auto* attrs_ptr = attrs.as(); + ICHECK(attrs_ptr); + ObjectPtr params = make_object(*attrs_ptr); + + if (params->out_layout != "") { + // when users specify the out_layout of pooling, follow user's preference + ICHECK_EQ(params->layout, params->out_layout) + << "Pooling input/output layouts mismatch: " << params->layout << " vs. " + << params->out_layout; + } else if (new_in_layouts.defined()) { + // the pooling is using an inferred layout (i.e., new_in_layouts[0]) given by relay caller + // ICHECK_EQ(new_in_layouts.size(), 1); + params->layout = new_in_layouts[0].name(); + } + + return InferCorrectLayoutOutput({params->layout}, {params->layout}, Attrs(params)); +} +} // namespace relay +} // namespace tvm +#endif // TVM_RELAY_OP_NN_POOLING_COMMON_H_ diff --git a/src/relay/qnn/op/avg_pool2d.cc b/src/relay/qnn/op/avg_pool2d.cc new file mode 100644 index 0000000000000..b0ba9ff028fe7 --- /dev/null +++ b/src/relay/qnn/op/avg_pool2d.cc @@ -0,0 +1,222 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/qnn/op/Avg_pool2d.cc + * \brief Property def of qnn Avg_pool2d operator. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include "../../op/nn/nn.h" +#include "../../op/nn/pooling.h" +#include "../../op/nn/pooling_common.h" +#include "../../op/tensor/transform.h" +#include "../../transforms/infer_layout_utils.h" +#include "../../transforms/pattern_utils.h" +#include "../utils.h" +#include "op_common.h" + +namespace tvm { +namespace relay { +namespace qnn { + +// relay.op.qnn.avg_pool2d +bool QnnAvgPool2DRel(const Array& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { + // Expected Types: data, input_zero_point, input_scale, output_zero_point, output_scale + // out_type + + ICHECK_EQ(types.size(), 6); + + const auto* data = types[0].as(); + if (data == nullptr) return false; + ICHECK(data->dtype == DataType::Int(8) || data->dtype == DataType::UInt(8)) + << "Expected quantized avg_pool2d type(int8, uint8) for input but was " << data->dtype; + + const auto* param = attrs.as(); + ICHECK(param != nullptr) << "AvgPool2DAttrs cannot be nullptr."; + + // Check the types of scale and zero points. + for (size_t i = 1; i < 5; ++i) { + if (types[i].as()) { + return false; + } + } + + ICHECK(IsScalarType(types[1], DataType::Float(32))); // input_scale + ICHECK(IsScalarType(types[2], DataType::Int(32))); // input_zero_point + ICHECK(IsScalarType(types[3], DataType::Float(32))); // output_scale + ICHECK(IsScalarType(types[4], DataType::Int(32))); // output_zero_point + + // Find the output shape and data type + const auto dshape = data->shape; + ICHECK_GE(dshape.size(), 2U) + << "Pool2D only support input >= 2-D: input must have height and width"; + + // Check input and output layout + Layout layout(param->layout); + // The Layout is always NHWC + ICHECK(layout.Contains(LayoutAxis::Get('H')) && layout.Contains(LayoutAxis::Get('W')) && + !layout.Contains(LayoutAxis::Get('h')) && !layout.Contains(LayoutAxis::Get('w'))) + << "Invalid input layout " << layout + << ". qnn_avg_pool2d inut layout must have H and W, which cannot be split"; + + // Find the output shape and data type + const auto hidx = layout.IndexOf(LayoutAxis::Get('H')); + const auto widx = layout.IndexOf(LayoutAxis::Get('W')); + + IndexExpr pad_h, pad_w; + if (param->padding.size() == 1) { + pad_h = param->padding[0] * 2; + pad_w = param->padding[0] * 2; + } else if (param->padding.size() == 2) { + // (top, left) + pad_h = param->padding[0] * 2; + pad_w = param->padding[1] * 2; + } else if (param->padding.size() == 4) { + // (top, left, bottom, right) + pad_h = param->padding[0] + param->padding[2]; + pad_w = param->padding[1] + param->padding[3]; + } else { + return false; + } + + std::vector oshape(dshape.begin(), dshape.end()); + if (dshape[hidx].as()) { + oshape[hidx] = dshape[hidx]; + } else { + oshape[hidx] = + calculate_pool_dimension(dshape[hidx], pad_h, param->pool_size[0], param->dilation[0], + param->strides[0], param->ceil_mode); + } + if (dshape[widx].as()) { + oshape[widx] = dshape[widx]; + } else { + oshape[widx] = + calculate_pool_dimension(dshape[widx], pad_w, param->pool_size[1], param->dilation[1], + param->strides[1], param->ceil_mode); + } + + // assign output type + reporter->Assign(types[5], TensorType(oshape, data->dtype)); + return true; +} + +InferCorrectLayoutOutput QnnAvgPoolInferCorrectLayout(const Attrs& attrs, + const Array& new_in_layouts, + const Array& old_in_layouts, + const Array& old_in_types) { + // Use Relay AvgPool2D Infer correct layout. 
+  auto avgpool_new_layouts =
+      PoolInferCorrectLayout<AvgPool2DAttrs>(attrs, new_in_layouts, old_in_layouts, old_in_types);
+
+  // Fill the layouts of remaining input tensors - scales and zero points. The layouts of these
+  // tensors can be treated as channel layout.
+  Layout channel_layout = Layout("C");
+  Array<Layout> input_layouts = {avgpool_new_layouts->input_layouts[0], channel_layout,
+                                 channel_layout, channel_layout, channel_layout};
+  Array<Layout> output_layouts = avgpool_new_layouts->output_layouts;
+  return InferCorrectLayoutOutput(input_layouts, output_layouts, attrs);
+}
+
+/*
+ * \brief Forward rewrite the qnn avg_pool2d op.
+ * \param attrs The QNN avg_pool2d attrs.
+ * \param new_args The new mutated args to the call node.
+ * \param arg_types The types of input and output.
+ * \return The sequence of Relay ops for the qnn avg_pool2d op.
+ * \note Lowering of the qnn.avg_pool2d operator
+
+ * Quantized avg_pool2d takes one quantized input tensor and returns another
+ * quantized tensor. Since the input qnn params can be different from the output
+ * qnn params, we first requantize the input tensor with the output qnn params and
+ * cast the result into Int32. Then we call relay.nn.avg_pool2d on that requantized
+ * input. Finally, the result is cast into the quantized output data type.
+
+ * Note: The RequantizeOrUpcast function only performs requantization if the input
+ * and output qnn params are different; otherwise it only casts to Int32.
+ */
+
+Expr QnnAvgPoolCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
+                            const Array<tvm::relay::Type>& arg_types) {
+  ICHECK_EQ(new_args.size(), 5);
+  Expr input_data = new_args[0];
+  Expr input_scale = new_args[1];
+  Expr input_zero_point = new_args[2];
+  Expr output_scale = new_args[3];
+  Expr output_zero_point = new_args[4];
+  const auto in_shape = get_shape(arg_types[0]);
+  const auto* avgpool_attrs = attrs.as<AvgPool2DAttrs>();
+  auto requantized_input = RequantizeOrUpcast(input_data, input_scale, input_zero_point,
+                                              output_scale, output_zero_point, in_shape);
+  Expr nnAvg = AvgPool2D(requantized_input, avgpool_attrs->pool_size, avgpool_attrs->strides,
+                         avgpool_attrs->dilation, avgpool_attrs->padding, avgpool_attrs->layout,
+                         avgpool_attrs->out_layout, avgpool_attrs->ceil_mode,
+                         avgpool_attrs->count_include_pad);
+  const auto* data = arg_types[5].as<TensorTypeNode>();
+  return Cast(nnAvg, data->dtype);
+}
+
+// Positional relay function to create the quantized avg_pool2d operator, used by the frontend FFI.
+Expr MakeQuantizedAvgPool2D(Expr data, Expr input_scale, Expr input_zero_point, Expr output_scale,
+                            Expr output_zero_point, Array<IndexExpr> pool_size,
+                            Array<IndexExpr> strides, Array<IndexExpr> padding,
+                            Array<IndexExpr> dilation, bool ceil_mode, bool count_include_pad,
+                            String layout, String output_layout) {
+  auto attrs = make_object<AvgPool2DAttrs>();
+  attrs->pool_size = std::move(pool_size);
+  attrs->strides = std::move(strides);
+  attrs->padding = std::move(padding);
+  attrs->dilation = std::move(dilation);
+  attrs->layout = std::move(layout);
+  attrs->out_layout = std::move(output_layout);
+  attrs->ceil_mode = ceil_mode;
+  attrs->count_include_pad = count_include_pad;
+  static const Op& op = Op::Get("qnn.avg_pool2d");
+  return Call(op, {data, input_scale, input_zero_point, output_scale, output_zero_point},
+              Attrs(attrs), {});
+}
+
+RELAY_REGISTER_OP("qnn.avg_pool2d")
+    .describe("Quantized avg_pool2d operator.")
+    .set_attrs_type<AvgPool2DAttrs>()
+    .set_num_inputs(5)
+    .add_argument("data", "Quantized Tensor", "The input data.")
+    .add_argument("input_scale", "Tensor", "The quantization scale of the input tensor.")
+    .add_argument("input_zero_point", "Tensor", "The quantization zero_point of the input tensor.")
+    .add_argument("output_scale", "Tensor", "The quantization scale of the output tensor.")
+    .add_argument("output_zero_point", "Tensor",
+                  "The quantization zero_point of the output tensor.")
+    .set_support_level(11)
+    .add_type_rel("QnnAvgPool2D", QnnAvgPool2DRel)
+    .set_attr<TOpPattern>("TOpPattern", kOutEWiseFusable)
+    .set_attr<FInferCorrectLayout>("FInferCorrectLayout", QnnAvgPoolInferCorrectLayout)
+    .set_attr<FTVMLegalize>("FTVMQnnCanonicalize", QnnAvgPoolCanonicalize);
+
+TVM_REGISTER_GLOBAL("relay.qnn.op._make.avg_pool2d").set_body_typed(MakeQuantizedAvgPool2D);
+
+}  // namespace qnn
+}  // namespace relay
+}  // namespace tvm
diff --git a/tests/python/contrib/test_hexagon/test_qnn_op_integration.py b/tests/python/contrib/test_hexagon/test_qnn_op_integration.py
new file mode 100644
index 0000000000000..d62dd0c04673c
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/test_qnn_op_integration.py
@@ -0,0 +1,456 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name,missing-function-docstring,redefined-outer-name
+
+"""Test Relay-integrated qnn ops.
+There are two types of tests for qnn ops in this file: one verifies the
+correctness of the Relay integration, and the other verifies that the fake
+quantization to integer pass picks up the qnn op.
+The former is only executed when qnn canonicalization is disabled.
+The latter is executed both with and without canonicalization.
+"""
+# TODO: We might want to distribute these test cases into other test files such as
+# test_wo_qnn_canonicalization and test_pass_fake_quantization_to_integer in the future.
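For reference, the lowering that QnnAvgPoolCanonicalize performs (described in the note above) can be sketched at the Python level roughly as follows. This is an illustrative sketch only: the shape, scales, and zero points are made up, and relay.qnn.op.requantize stands in for the C++ RequantizeOrUpcast helper, which additionally skips the requantize step and only upcasts when the input and output qnn params already match.

from tvm import relay

data = relay.var("data", shape=(1, 12, 12, 32), dtype="uint8")
in_scale, in_zp = relay.const(0.02), relay.const(0)
out_scale, out_zp = relay.const(0.04), relay.const(0)

# 1. Requantize the input into the output quantization domain, upcast to int32.
requantized = relay.qnn.op.requantize(
    data, in_scale, in_zp, out_scale, out_zp, out_dtype="int32"
)
# 2. Run the regular average pool on the requantized tensor.
pooled = relay.nn.avg_pool2d(
    requantized,
    pool_size=(3, 3),
    strides=(1, 1),
    padding=(0, 0, 0, 0),
    layout="NHWC",
    count_include_pad=True,
)
# 3. Cast the result back to the quantized output dtype.
lowered = relay.cast(pooled, "uint8")

The tests below compare this canonicalized path (QnnCanonicalize enabled) against the direct Hexagon lowering of qnn.avg_pool2d (QnnCanonicalize disabled).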
+ +import numpy as np + +import tvm.testing +import tvm.topi.testing +from tvm import relay +from tvm.contrib.hexagon.session import Session +from tvm.relay.backend import Executor, Runtime +from tvm.contrib.hexagon import allocate_hexagon_array +from .infrastructure import quantize_np + +from .pytest_util import get_multitest_ids, create_populated_numpy_ndarray, TensorContentRandom + + +def compile_for_target(mod, target="hexagon", disable_canonicalization=False): + runtime = Runtime("cpp") + executor = Executor("graph", {"link-params": True}) + if target == "hexagon": + target_hexagon = tvm.target.hexagon("v68") + target = tvm.target.Target(target_hexagon, host=target_hexagon) + print("Trying relay.build for ...", target) + dis_passes = [] + if disable_canonicalization: + dis_passes = ["QnnCanonicalize"] + with tvm.transform.PassContext(opt_level=3, disabled_pass=dis_passes): + lib = relay.build(mod, target=target, runtime=runtime, executor=executor) + print(lib.function_metadata) + print("Finished relay.build for...", target) + elif target == "llvm": + target = tvm.target.Target("llvm") + print("Trying relay.build for ...", target) + with tvm.transform.PassContext(opt_level=3): + lib = relay.build(mod, target=target, runtime=runtime, executor=executor) + print(lib.function_metadata) + print("Finished relay.build for...", target) + return lib + + +def run_model_on_hexagon(hexagon_session, mod, inputs, params=None, disable_canonicalization=True): + hexagon_lowered = compile_for_target(mod, "hexagon", disable_canonicalization) + graph_mod = hexagon_session.get_executor_from_factory(hexagon_lowered) + if params is None: + params = {} + graph_mod.set_input(**params) + graph_mod.run(**inputs) + return graph_mod.get_output(0).numpy() + + +def run_model_on_llvm(mod, inputs, params=None): + llvm_lowered = compile_for_target(mod, "llvm") + llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(llvm_lowered["default"](tvm.cpu(0))) + if params is None: + params = {} + llvm_graph_mod.set_input(**params) + llvm_graph_mod.run(**inputs) + return llvm_graph_mod.get_output(0).numpy() + + +def compare_fq_to_int(hexagon_session, expr, input_np_quant, params=None): + working_scope = "global" + inputs_llvm = {"data": input_np_quant} + input_arr = allocate_hexagon_array( + hexagon_session.device, data=input_np_quant, mem_scope=working_scope + ) + inputs_hex = {"data": input_arr} + mod = tvm.IRModule.from_expr(expr) + mod = tvm.relay.transform.InferType()(mod) + mod_int = tvm.relay.transform.FakeQuantizationToInteger()(mod) + assert not tvm.ir.structural_equal(mod, mod_int) + + ref_out_llvm = run_model_on_llvm(mod, inputs_llvm, params) + + # Compare the Hexagon and LLVM results with and without the qnn canonicalization + print("Comparing Hexagon and LLVM reusults (canonicalization disabled)...") + hexagon_output_fq_wo_qnn_can = run_model_on_hexagon( + hexagon_session, mod_int, inputs_hex, params, True + ) + tvm.testing.assert_allclose(ref_out_llvm, hexagon_output_fq_wo_qnn_can, rtol=0, atol=2) + print("Comparing Hexagon and LLVM reusults (canonicalization enabled)...") + hexagon_output_fq_w_qnn_can = run_model_on_hexagon( + hexagon_session, mod_int, inputs_hex, params, False + ) + assert np.all( + np.abs(ref_out_llvm.astype("int32") - hexagon_output_fq_w_qnn_can.astype("int32")) <= 1 + ) + + +@tvm.testing.fixture +def input_np(input_shape, idtype, input_tensor_populator): + if idtype in ("int8", "uint8"): + idtype = "float32" # Use "float32" input which will be quantized later + return 
create_populated_numpy_ndarray(input_shape, idtype, input_tensor_populator) + + +@tvm.testing.fixture +def transformed_expected_output_np(expected_output_np, odtype): + scale = None + zero_point = None + if odtype in ("int8", "uint8"): + quant_arr, scale, zero_point = quantize_np(expected_output_np, odtype) + else: + quant_arr = expected_output_np + return quant_arr, scale, zero_point + + +@tvm.testing.fixture +def transformed_input_np(input_np, idtype): + scale = None + zero_point = None + if idtype in ("int8", "uint8"): + quant_arr, scale, zero_point = quantize_np(input_np, idtype) + else: + quant_arr = input_np + return quant_arr, scale, zero_point + + +input_layout = tvm.testing.parameter("nhwc") +output_layout = tvm.testing.parameter("nhwc") + + +class TestQnnAvgPool2d: + """QNN AvgPool2d test class.""" + + _param_descs = [ + "in_shape", # input_shape + "layout", # NHWC or NCHW + "kernel", # kernel + "stride", # stride + "dil", # dilation + "pad", # padding + "ceil", # ceil_mode + "cnt_padded", # count_include_pad + None, # input_tensor_populator + ] + + _multitest_params = [ + ( + [1, 12, 12, 32], + "NHWC", + [3, 3], + [1, 1], + [2, 3], + [1, 2, 3, 4], + False, + False, + TensorContentRandom(), + ), + ( + [1, 18, 18, 32], # output shape: [1, 16, 16, 32] + "NCHW", + [3, 3], + [2, 2], + [2, 1], + [1, 2, 3, 4], + False, + True, + TensorContentRandom(), + ), + ] + + _param_ids = get_multitest_ids(_multitest_params, _param_descs) + idtype, odtype = tvm.testing.parameters(("uint8", "uint8")) + + ( + input_shape, + layout, + kernel, + stride, + dilation, + padding, + ceil_mode, + count_include_pad, + input_tensor_populator, + ) = tvm.testing.parameters(*_multitest_params, ids=_param_ids) + + @tvm.testing.fixture + def expected_output_np( + self, input_np, kernel, stride, dilation, padding, ceil_mode, count_include_pad, layout + ): + pad_before = padding[:2] + pad_after = padding[2:] + ref_np = tvm.topi.testing.poolnd_python( + input_np, + kernel, + stride, + dilation, + pad_before, + pad_after, + "avg", # pool_type + count_include_pad, + ceil_mode, + layout=layout, + ) + + return ref_np + + @tvm.testing.requires_hexagon + def test_integrated_qnn_avg_pool2d( + self, + idtype, + input_shape, + kernel, + stride, + dilation, + padding, + ceil_mode, + count_include_pad, + layout, + transformed_input_np, + transformed_expected_output_np, + hexagon_session: Session, + ): + working_scope = "global" + + if idtype in ("uint8"): + input_np_quant, input_scale, input_zero_point = transformed_input_np + golden_out_np, output_scale, output_zero_point = transformed_expected_output_np + else: + raise RuntimeError(f"Unsupport input dtype '{idtype}'") + + input_arr = allocate_hexagon_array( + hexagon_session.device, data=input_np_quant, mem_scope=working_scope + ) + inputs_hex = {"data": input_arr} + + def gen_relay_expr_qnn(dtype): + data = relay.var("data", shape=input_shape, dtype=dtype) + qnn_avg_pool = relay.qnn.op.avg_pool2d( + data, + input_scale=relay.const(input_scale), + input_zero_point=relay.const(input_zero_point), + output_scale=relay.const(output_scale), + output_zero_point=relay.const(output_zero_point), + pool_size=kernel, + strides=stride, + dilation=dilation, + padding=padding, + ceil_mode=ceil_mode, + count_include_pad=count_include_pad, + layout=layout, + ) + + return qnn_avg_pool + + op_hex = gen_relay_expr_qnn(idtype) + mod = tvm.IRModule.from_expr(op_hex) + mod = relay.transform.InferType()(mod) + hexagon_out = run_model_on_hexagon(hexagon_session, mod, inputs_hex) + 
np.testing.assert_allclose(hexagon_out, golden_out_np, rtol=0, atol=2) + + @tvm.testing.requires_hexagon + def test_fake_quantize_avg_pool2d( + self, + idtype, + input_shape, + kernel, + stride, + dilation, + padding, + layout, + ceil_mode, + count_include_pad, + transformed_input_np, + transformed_expected_output_np, + hexagon_session: Session, + ): + if idtype in ("uint8"): + input_np_quant, input_scale, input_zero_point = transformed_input_np + _, output_scale, output_zero_point = transformed_expected_output_np + else: + raise RuntimeError(f"Unsupport input dtype '{idtype}'") + + def gen_relay_expr(dtype): + data = relay.var("data", shape=input_shape, dtype=dtype) + data_deq = relay.qnn.op.dequantize( + data, relay.const(input_scale), relay.const(input_zero_point) + ) + op = relay.op.nn.avg_pool2d( + data=data_deq, + pool_size=kernel, + strides=stride, + dilation=dilation, + padding=padding, + layout=layout, + ceil_mode=ceil_mode, + count_include_pad=count_include_pad, + ) + out_quant = relay.qnn.op.quantize( + op, relay.const(output_scale), relay.const(output_zero_point), out_dtype=dtype + ) + return out_quant + + op_llvm = gen_relay_expr(idtype) + compare_fq_to_int(hexagon_session, op_llvm, input_np_quant) + + +class TestQnnQuantize: + """QNN Quantize test class.""" + + _param_descs = ["in_shape", None] # input_shape # input_tensor_populator + + _multitest_params = [ + ([1, 8, 8, 32], TensorContentRandom()), + ([1, 10, 10, 32], TensorContentRandom()), + ([1, 12, 12, 128], TensorContentRandom()), + ] + + _param_ids = get_multitest_ids(_multitest_params, _param_descs) + + (input_shape, input_tensor_populator) = tvm.testing.parameters( + *_multitest_params, ids=_param_ids + ) + + idtype, odtype = tvm.testing.parameters(("float32", "int8"), ("float32", "uint8")) + + @tvm.testing.fixture + def expected_output_np(self, input_np): + # The expected output is of the same shape as input. + # The only computation applied on the input is quanization. 
+ # Since transform_expected_output quantizes the data, + # here, we return the orignal input array in float + return input_np + + @tvm.testing.requires_hexagon + def test_integrated_qnn_quantize( + self, + idtype, + odtype, + input_shape, + input_np, + transformed_input_np, + transformed_expected_output_np, + hexagon_session: Session, + ): + working_scope = "global" + if odtype in ("int8", "uint8"): + golden_out_np, output_scale, output_zero_point = transformed_expected_output_np + else: + raise RuntimeError(f"Unsupport output dtype '{odtype}'") + + input_arr = allocate_hexagon_array( + hexagon_session.device, data=input_np, mem_scope=working_scope + ) + inputs_hex = {"data": input_arr} + + def gen_relay_expr_qnn(dtype): + data = relay.var("data", shape=input_shape, dtype=dtype) + qnn_quantize = relay.qnn.op.quantize( + data, + output_scale=relay.const(output_scale), + output_zero_point=relay.const(output_zero_point), + axis=-1, + out_dtype=odtype, + ) + return qnn_quantize + + op_hex = gen_relay_expr_qnn(idtype) + mod = tvm.IRModule.from_expr(op_hex) + mod = relay.transform.InferType()(mod) + hexagon_out = run_model_on_hexagon(hexagon_session, mod, inputs_hex) + np.testing.assert_allclose(hexagon_out, golden_out_np, rtol=0, atol=1) + + +class TestQnnDequantize: + """QNN Dequantize test class.""" + + _param_descs = ["in_shape", None] # input_shape # input_tensor_populator + + _multitest_params = [ + ([1, 12, 32, 128], TensorContentRandom()), + ([1, 10, 10, 32], TensorContentRandom()), + ([1, 6, 6, 2048], TensorContentRandom()), + ([1, 1000], TensorContentRandom()), + ] + + _param_ids = get_multitest_ids(_multitest_params, _param_descs) + + (input_shape, input_tensor_populator) = tvm.testing.parameters( + *_multitest_params, ids=_param_ids + ) + + idtype, odtype = tvm.testing.parameters(("int8", "float32"), ("uint8", "float32")) + + @tvm.testing.fixture + def expected_output_np(self, input_np, idtype): + quant_np, scale, zero_point = quantize_np(input_np, idtype) + ref_np = (scale * (quant_np.astype("int32") - zero_point)).astype("float32") + return ref_np + + @tvm.testing.requires_hexagon + def test_integrated_qnn_dequantize( + self, + idtype, + odtype, + input_shape, + transformed_input_np, + transformed_expected_output_np, + hexagon_session: Session, + ): + working_scope = "global" + if odtype in ("float32"): + input_np_quant, input_scale, input_zero_point = transformed_input_np + golden_out_np, _, _ = transformed_expected_output_np + else: + raise RuntimeError(f"Unsupport odtype '{odtype}'") + + input_arr = allocate_hexagon_array( + hexagon_session.device, data=input_np_quant, mem_scope=working_scope + ) + inputs_hex = {"data": input_arr} + + def gen_relay_expr_qnn(dtype): + data = relay.var("data", shape=input_shape, dtype=dtype) + qnn_quantize = relay.qnn.op.dequantize( + data, + input_scale=relay.const(input_scale), + input_zero_point=relay.const(input_zero_point), + ) + return qnn_quantize + + op_hex = gen_relay_expr_qnn(idtype) + mod = tvm.IRModule.from_expr(op_hex) + mod = relay.transform.InferType()(mod) + hexagon_out = run_model_on_hexagon(hexagon_session, mod, inputs_hex) + np.testing.assert_allclose(hexagon_out, golden_out_np, rtol=1e-3, atol=1e-3) + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/python/contrib/test_hexagon/topi/slice_op/test_avg_pool2d_slice.py b/tests/python/contrib/test_hexagon/topi/slice_op/test_avg_pool2d_slice.py index 0eedfdbf8da1d..712d5b303eeb7 100644 --- 
a/tests/python/contrib/test_hexagon/topi/slice_op/test_avg_pool2d_slice.py +++ b/tests/python/contrib/test_hexagon/topi/slice_op/test_avg_pool2d_slice.py @@ -20,74 +20,70 @@ from tvm import te import tvm.testing +from tvm.topi.testing import poolnd_python from tvm.contrib.hexagon.session import Session import tvm.topi.hexagon.slice_ops as sl import tvm.topi.hexagon.qnn as qn from tvm.contrib.hexagon import allocate_hexagon_array -from ...infrastructure import ( - transform_numpy, - quantize_np, - get_hexagon_target, -) -from ...pytest_util import ( - get_multitest_ids, - create_populated_numpy_ndarray, - TensorContentRandom, -) +import pytest +from ...infrastructure import transform_numpy, quantize_np, get_hexagon_target +from ...pytest_util import get_multitest_ids, create_populated_numpy_ndarray, TensorContentRandom -input_layout = tvm.testing.parameter( - "nhwc-8h2w32c2w-2d", -) -dtype = tvm.testing.parameter("float16", "uint8") +dtype = tvm.testing.parameter("uint8", "float16") @tvm.testing.fixture -def output_layout(output_shape, dtype): - o_b, o_h, o_w, o_c = output_shape - if dtype == "float16": - if o_h == 1 and o_w == 1: - return "n11c-1024c-2d" +def output_layout(output_shape, op_layout, dtype): + if op_layout == "NHWC": + o_b, o_h, o_w, o_c = output_shape + if dtype == "float16": + if o_h == 1 and o_w == 1: + return "n11c-1024c-2d" + else: + return "nhwc-8h2w32c2w-2d" + elif dtype == "int8" or "uint8": + if o_h == 1 and o_w == 1: + return "n11c-2048c-2d" + else: + return "nhwc-8h8w32c-2d" else: - assert o_h % 8 == 0 and o_w % 4 == 0, "Invalid output shape" - return "nhwc-8h2w32c2w-2d" - elif dtype == "int8" or "uint8": - if o_h == 1 and o_w == 1: - return "n11c-2048c-2d" + raise RuntimeError(f"Unsupported data type '{dtype}'") + + elif op_layout == "NCHW": + o_b, o_c, o_h, o_w = output_shape + if dtype == "float16": + if o_h == 1 and o_w == 1: + return "nc11-1024c-2d" + else: + return "nchw-8h2w32c2w-2d" + elif dtype == "int8" or "uint8": + if o_h == 1 and o_w == 1: + return "nc11-2048c-2d" + else: + return "nchw-8h8w32c-2d" else: - assert o_h % 8 == 0 and o_w % 8 == 0, "Invalid output shape" - return "nhwc-8h8w32c-2d" + raise RuntimeError(f"Unsupported data type '{dtype}'") else: - raise RuntimeError(f"Unsupported data type '{dtype}'") + raise RuntimeError(f"Unsupported layout for qnn.avg_pool2d '{op_layout}'") @tvm.testing.fixture -def input_np(input_shape, dtype: str, input_tensor_populator): - if dtype == "uint8": - dtype = "float32" # Use "float32" input which will be quantized later - return create_populated_numpy_ndarray(input_shape, dtype, input_tensor_populator) - - -@tvm.testing.fixture -def transformed_expected_output_np(expected_output_np, output_layout, dtype): +def input_layout(op_layout, dtype): + in_layout = op_layout.lower() if dtype == "float16": - return transform_numpy(expected_output_np, "nhwc", output_layout) - elif dtype in ("uint8", "int8"): - quant_arr, scale, zero_point = quantize_np(expected_output_np, dtype) - return [transform_numpy(quant_arr, "nhwc", output_layout), scale, zero_point] + return in_layout + "-8h2w32c2w-2d" + elif dtype == "int8" or "uint8": + return in_layout + "-8h8w32c-2d" else: raise RuntimeError(f"Unsupported data type '{dtype}'") @tvm.testing.fixture -def transformed_input_np_padded(input_np_padded, input_layout, dtype): - if dtype == "float16": - return transform_numpy(input_np_padded, "nhwc", input_layout) - elif dtype in ("uint8", "int8"): - quant_arr, scale, zero_point = quantize_np(input_np_padded, dtype) - return 
[transform_numpy(quant_arr, "nhwc", input_layout), scale, zero_point] - else: - raise RuntimeError(f"Unsupported data type '{dtype}'") +def input_np(input_shape, dtype: str, input_tensor_populator): + if dtype == "uint8": + dtype = "float32" # Use "float32" input which will be quantized later + return create_populated_numpy_ndarray(input_shape, dtype, input_tensor_populator) class TestAvgPool2dSlice: @@ -99,110 +95,143 @@ class TestAvgPool2dSlice: "pad", # padding "ceil", # ceil_mode "cnt_padded", # count_include_pad + "op_layout", # input output 4D layout None, # input_tensor_populator ] - _multitest_params = [ ( - [1, 8, 8, 32], + [1, 7, 11, 32], + [3, 3], + [3, 2], + [2, 3], + [1, 2, 3, 4], + False, + False, + "NHWC", + TensorContentRandom(), + ), + ( + [1, 1, 1, 2048], + [4, 4], + [2, 2], + [2, 3], + [0, 2, 1, 4], + False, + False, + "NHWC", + TensorContentRandom(), + ), + # Test default stride,dilation, and padding with different layouts + ( + [1, 10, 10, 32], [3, 3], [1, 1], [1, 1], [0, 0, 0, 0], False, True, + "NHWC", TensorContentRandom(), ), ( - [1, 16, 16, 32], + [1, 12, 12, 32], [3, 3], [1, 1], [1, 1], [0, 0, 0, 0], False, True, + "NHWC", TensorContentRandom(), ), ( - [1, 8, 8, 32], - [8, 8], + [1, 32, 14, 14], + [3, 3], [1, 1], [1, 1], [0, 0, 0, 0], False, True, + "NCHW", TensorContentRandom(), ), - # Test non-one stride and dilation ( - [1, 8, 8, 32], - [3, 3], - [2, 3], + [1, 32, 15, 15], + [8, 8], + [1, 1], [1, 1], [0, 0, 0, 0], False, True, + "NCHW", TensorContentRandom(), ), + # Test non-one stride and dilation with different layouts ( - [1, 8, 8, 32], + [1, 18, 24, 32], [3, 3], - [2, 2], + [2, 3], [2, 2], [0, 0, 0, 0], False, True, + "NHWC", TensorContentRandom(), ), ( - [1, 8, 8, 32], - [3, 3], + [1, 32, 18, 18], + [5, 5], [2, 2], [2, 3], [0, 0, 0, 0], False, True, + "NCHW", TensorContentRandom(), ), - # Test non-zero padding + # Test non-zero padding with count include and exclude pad and different layouts ( - [1, 8, 8, 32], + [1, 6, 6, 32], [3, 3], [1, 1], [1, 1], [1, 1, 1, 1], False, - True, + False, + "NHWC", TensorContentRandom(), ), ( [1, 8, 8, 32], [3, 3], - [1, 1], - [1, 1], - [1, 2, 3, 4], + [1, 2], + [2, 3], + [2, 2, 3, 3], False, - True, + False, + "NHWC", TensorContentRandom(), ), ( - [1, 8, 8, 32], + [1, 32, 6, 6], [3, 3], [1, 1], [1, 1], [1, 2, 3, 4], False, - True, + False, + "NCHW", TensorContentRandom(), ), ( - [1, 8, 8, 32], + [1, 32, 15, 22], [3, 3], [3, 2], [2, 3], [1, 2, 3, 4], False, - True, + False, + "NCHW", TensorContentRandom(), ), # Test n11c-1024c-2d layout which will require input and output to have different layout @@ -214,6 +243,7 @@ class TestAvgPool2dSlice: [0, 0, 0, 0], False, True, + "NHWC", TensorContentRandom(), ), ( @@ -221,9 +251,21 @@ class TestAvgPool2dSlice: [6, 6], [1, 1], [1, 1], - [0, 0, 0, 0], + [2, 2, 2, 2], False, - True, + False, + "NHWC", + TensorContentRandom(), + ), + ( + [1, 1, 1, 2048], + [4, 4], + [2, 2], + [2, 3], + [0, 2, 1, 4], + False, + False, + "NHWC", TensorContentRandom(), ), ( @@ -234,23 +276,24 @@ class TestAvgPool2dSlice: [0, 0, 0, 0], False, True, + "NHWC", TensorContentRandom(), ), ( - [1, 1, 1, 2048], + [1, 2048, 1, 1], [4, 4], [2, 2], [2, 3], [0, 0, 0, 0], False, True, + "NCHW", TensorContentRandom(), ), ] _param_ids = get_multitest_ids(_multitest_params, _param_descs) - # NOTE: input_layout is always assumed to be "nhwc-8h2w32c2w-2d" ( output_shape, kernel, @@ -259,23 +302,17 @@ class TestAvgPool2dSlice: padding, ceil_mode, count_include_pad, + op_layout, input_tensor_populator, ) = 
tvm.testing.parameters(*_multitest_params, ids=_param_ids) @tvm.testing.fixture def expected_output_np( - self, - input_np, - kernel, - stride, - dilation, - padding, - ceil_mode, - count_include_pad, + self, input_np, kernel, stride, dilation, padding, ceil_mode, count_include_pad, op_layout ): pad_before = padding[:2] pad_after = padding[2:] - ref_np = tvm.topi.testing.poolnd_python( + ref_np = poolnd_python( input_np, kernel, stride, @@ -285,150 +322,129 @@ def expected_output_np( "avg", # pool_type count_include_pad, False, # ceil_mode, - layout="NHWC", + layout=op_layout, ) return ref_np @tvm.testing.fixture - def input_shape(self, output_shape, kernel, padding, stride, dilation, output_layout): + def input_shape( + self, output_shape, kernel, padding, stride, dilation, op_layout, output_layout + ): # Input shape without any padding; 'ceil' is being ignored from calculation: - o_b, o_h, o_w, o_c = output_shape + if op_layout == "NHWC": + o_b, o_h, o_w, o_c = output_shape + else: + o_b, o_c, o_h, o_w = output_shape d_h, d_w = dilation s_h, s_w = stride k_h, k_w = kernel pad_before_h, pad_before_w = padding[:2] pad_after_h, pad_after_w = padding[2:] - if output_layout == "n11c-1024c-2d": - assert ( - pad_before_w == 0 and pad_after_w == 0 and pad_before_h == 0 and pad_after_h == 0 - ), "Padding must be zero for n11c-1024c-2d layout" + if ( + output_layout == "n11c-2048c-2d" + or output_layout == "nc11-2048c-2d" + or output_layout == "n11c-1024c-2d" + or output_layout == "nc11-1024c-2d" + ): assert o_h == 1 and o_w == 1, "Output height and width must be 1" in_h = (o_h - 1) * s_h + d_h * (k_h - 1) + 1 - pad_before_h - pad_after_h in_w = (o_w - 1) * s_w + d_w * (k_w - 1) + 1 - pad_before_w - pad_after_w - return [o_b, in_h, in_w, o_c] - - @tvm.testing.fixture - def input_shape_padded(self, input_shape, padding, output_layout, dtype): - # Input shape is adjusted to account for 'padding'. Also, due to the physical - # layout of the buffer, height and width are adjusted so that they are a - # multiple of the buffer size dictated by the layout. - # NOTE: For float16, the input layout is always assumed to be nhwc-8h2w32c2w-2d and - # for int8/uint8, it's nhwc-8h8w32c-2d. - # For both nhwc-8h2w32c2w-2d and nhwc-8h8w32c-2d, the height should be a multiple - # of 8. However, the width should be a multiple of 4 for the first case and 8 for - # the second case. 
- - height_mult = 8 - if dtype == "float16": - width_mult = 4 # input layout : nhwc-8h2w32c2w-2d - elif dtype in ("uint8", "int8"): - width_mult = 8 # input layout : nhwc-8h8w32c-2d + if op_layout == "NHWC": + return [o_b, in_h, in_w, o_c] else: - raise RuntimeError(f"Unsupport dtype '{dtype}'") - - pad_before_h, pad_before_w = padding[:2] - pad_after_h, pad_after_w = padding[2:] - padded_input_height = ( - (input_shape[1] + pad_before_h + pad_after_h + height_mult - 1) // height_mult - ) * height_mult - padded_input_width = ( - (input_shape[2] + pad_before_w + pad_after_w + width_mult - 1) // width_mult - ) * width_mult - return [input_shape[0], padded_input_height, padded_input_width, input_shape[3]] - - @tvm.testing.fixture - def input_np_padded(self, input_np, input_shape, input_shape_padded, padding): - pad_before_h, pad_before_w = padding[:2] - pad_after_h = input_shape_padded[1] - input_shape[1] - pad_before_h - pad_after_w = input_shape_padded[2] - input_shape[2] - pad_before_w - input_padded = np.pad( - input_np, - ((0, 0), (pad_before_h, pad_after_h), (pad_before_w, pad_after_w), (0, 0)), - "constant", - ) - return input_padded + return [o_b, o_c, in_h, in_w] @tvm.testing.fixture def schedule_args( self, - stride, kernel, - dtype, + stride, + padding, dilation, - input_layout, + count_include_pad, output_layout, output_shape, - input_shape_padded, - transformed_input_np_padded, - transformed_expected_output_np, + input_np, + input_shape, + input_layout, + expected_output_np, + dtype, + op_layout, ): - """ - Construct schedule args based on dtype - """ - A = te.placeholder(input_shape_padded, name="A", dtype=dtype) - + """Construct schedule args based on dtype""" + A = te.placeholder(input_shape, name="A", dtype=dtype) if dtype == "float16": - M = sl.avg_pool2d_compute(A, kernel, stride, dilation, output_shape) + if op_layout == "NHWC": + M = sl.avg_pool2d_NHWC( + A, kernel, stride, padding, dilation, count_include_pad, output_shape + ) + elif op_layout == "NCHW": + M = sl.avg_pool2d_NCHW( + A, kernel, stride, padding, dilation, count_include_pad, output_shape + ) + else: + raise RuntimeError(f"Unsupported layout for slice_op.avg_pool2d '{op_layout}'") tir_schedule = sl.avg_pool2d_schedule(M, A, output_layout, input_layout) elif dtype in ("uint8", "int8"): - in_data, in_scale, in_zero_point = transformed_input_np_padded - _, out_scale, out_zero_point = transformed_expected_output_np - M = qn.qnn_avg_pool2d_compute( - A, - kernel, - stride, - dilation, - output_shape, - dtype, - in_zero_point, - in_scale, - out_zero_point, - out_scale, - ) + _, in_scale, in_zero_point = quantize_np(input_np, dtype) + _, out_scale, out_zero_point = quantize_np(expected_output_np, dtype) + if op_layout == "NHWC": + M = qn.qnn_avg_pool2d_NHWC( + A, + kernel, + stride, + padding, + dilation, + count_include_pad, + output_shape, + dtype, + in_scale, + in_zero_point, + out_scale, + out_zero_point, + ) + elif op_layout == "NCHW": + M = qn.qnn_avg_pool2d_NCHW( + A, + kernel, + stride, + padding, + dilation, + count_include_pad, + output_shape, + dtype, + in_scale, + in_zero_point, + out_scale, + out_zero_point, + ) + else: + raise RuntimeError(f"Unsupported layout for qnn.avg_pool2d '{op_layout}'") + tir_schedule = qn.qnn_avg_pool2d_schedule(M, A, output_layout, input_layout) return [tir_schedule.mod, [A, M]] @tvm.testing.requires_hexagon def test_avg_pool2d_slice( - self, - dtype, - output_layout, - output_shape, - transformed_input_np_padded, - transformed_expected_output_np, - schedule_args, - 
hexagon_session: Session, + self, dtype, input_np, expected_output_np, schedule_args, hexagon_session: Session ): - in_data = transformed_input_np_padded - + print("schedule_args : ", schedule_args) with tvm.transform.PassContext(opt_level=3): - func = tvm.build( - *schedule_args, - get_hexagon_target("v69"), - name="avg_pool2d", - ) - - input_axis_separator = [4] - if output_layout in ( - "nhwc-8h2w32c2w-2d", - "nhwc-8h8w32c-2d", - "n11c-1024c-2d", - "n11c-2048c-2d", - ): - output_axis_separator = [4] - else: - raise RuntimeError(f"Unexpected layout '{output_layout}'") + func = tvm.build(*schedule_args, get_hexagon_target("v69"), name="avg_pool2d") + + input_axis_separator = [] + output_axis_separator = [] if dtype == "float16": - in_data_np = transformed_input_np_padded - out_data_np = transformed_expected_output_np + in_data_np = input_np + out_data_np = expected_output_np elif dtype in ("uint8", "int8"): - in_data_np, _, _ = transformed_input_np_padded - out_data_np, _, _ = transformed_expected_output_np + in_data_np, _, _ = quantize_np(input_np, dtype) + out_data_np, _, _ = quantize_np(expected_output_np, dtype) else: raise RuntimeError(f"Unsupport dtype '{dtype}'") @@ -436,34 +452,26 @@ def test_avg_pool2d_slice( hexagon_session.device, data=in_data_np, axis_separators=input_axis_separator, - mem_scope="global.vtcm", + mem_scope="global.ddr", ) output_arr = allocate_hexagon_array( hexagon_session.device, out_data_np.shape, dtype, axis_separators=output_axis_separator, - mem_scope="global.vtcm", + mem_scope="global.ddr", ) mod = hexagon_session.load_module(func) mod(input_arr, output_arr) - b, h, w, c = output_shape - if output_layout == "nhwc-8h2w32c2w-2d": - output_np = output_arr.numpy().reshape([b, h // 8, w // 4, c // 32, 8, 2, 32, 2]) - elif output_layout == "nhwc-8h8w32c-2d": - output_np = output_arr.numpy().reshape([b, h // 8, w // 8, c // 32, 8, 8, 32]) - elif output_layout == "n11c-2048c-2d": - output_np = output_arr.numpy().reshape([b, 1, 1, c // 2048, 2048]) - elif output_layout == "n11c-1024c-2d": - output_np = output_arr.numpy().reshape([b, 1, 1, c // 1024, 1024]) - else: - raise RuntimeError(f"Unexpected layout '{output_layout}'") + + output_np = output_arr.numpy() if dtype == "float16": np.testing.assert_allclose(output_np, out_data_np, rtol=1e-3, atol=1e-3) else: - np.testing.assert_allclose(output_np, out_data_np, rtol=1, atol=1) + output_np = output_arr.numpy() + np.testing.assert_allclose(output_np, out_data_np, rtol=0, atol=2) if __name__ == "__main__":
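The integration tests above exercise qnn.avg_pool2d along two paths: building the qnn op directly, and letting the FakeQuantizationToInteger pass recover it from a dequantize -> nn.avg_pool2d -> quantize chain, as compare_fq_to_int does. A minimal sketch of that second path, with illustrative shape and quantization parameters, could look like this:

import tvm
from tvm import relay

x = relay.var("data", shape=(1, 8, 8, 32), dtype="uint8")
deq = relay.qnn.op.dequantize(x, relay.const(0.05), relay.const(0))
pool = relay.nn.avg_pool2d(deq, pool_size=(3, 3), layout="NHWC")
quant = relay.qnn.op.quantize(pool, relay.const(0.05), relay.const(0), out_dtype="uint8")

mod = relay.transform.InferType()(tvm.IRModule.from_expr(quant))
mod_int = relay.transform.FakeQuantizationToInteger()(mod)
# The pass is expected to replace the dequantize/avg_pool2d/quantize chain with a
# single qnn.avg_pool2d call, so the two modules should differ structurally.
assert not tvm.ir.structural_equal(mod, mod_int)

The slice-op tests in test_avg_pool2d_slice.py then cover the Hexagon TIR schedules for both NHWC and NCHW layouts, with and without count_include_pad.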