diff --git a/python/tvm/relay/qnn/op/_qnn.py b/python/tvm/relay/qnn/op/_qnn.py index 2ad2445cd365..f036e6cf840d 100644 --- a/python/tvm/relay/qnn/op/_qnn.py +++ b/python/tvm/relay/qnn/op/_qnn.py @@ -167,3 +167,6 @@ def alter_op_layout_qnn_dense(attrs, inputs, tinfos, out_type): # qnn.batch_matmul register_strategy("qnn.batch_matmul", strategy.qnn_batch_matmul_strategy) register_pattern("qnn.batch_matmul", OpPattern.OUT_ELEMWISE_FUSABLE) + +# qnn.avg_pool2d +register_strategy("qnn.avg_pool2d", strategy.qnn_avg_pool2d_strategy) diff --git a/python/tvm/relay/qnn/op/layout_conversions.py b/python/tvm/relay/qnn/op/layout_conversions.py index 668cafb8ae34..587993603139 100644 --- a/python/tvm/relay/qnn/op/layout_conversions.py +++ b/python/tvm/relay/qnn/op/layout_conversions.py @@ -126,3 +126,38 @@ def convert_qnn_conv2d_transpose(attrs, inputs, tinfos, desired_layouts): return relay.qnn.op.conv2d_transpose(*inputs, **new_attrs) raise ValueError(f"Layout {desired_data_layout} is not yet supported") + + +@reg.register_convert_op_layout("qnn.avg_pool2d") +def convert_qnn_avg_pool2d(attrs, inputs, tinfos, desired_layouts): + """Convert Layout pass registration for QNN avg_pool2d op. + + Parameters + ---------- + attrs : tvm.ir.Attrs + Attributes of current avg_pool2d + inputs : list of tvm.relay.Expr + The args of the Relay expr to be legalized + tinfos : list of types + List of input and output types + desired_layouts : list of layout strings + List of layouts defining our desired + layout for the data input. + + Returns + ------- + result : tvm.relay.Expr + The transformed expr + """ + # pylint: disable=import-outside-toplevel + from tvm import relay + + assert len(desired_layouts) == 1, "A desired layout is expected for qnn.avg_pool2d's input" + desired_data_layout = desired_layouts[0] + if desired_data_layout == "NCHW" or desired_data_layout == "NHWC": + new_attrs = dict(attrs) + new_attrs["layout"] = str(desired_data_layout) + new_attrs["out_layout"] = str(desired_data_layout) + return relay.qnn.op.avg_pool2d(*inputs, **new_attrs) + + raise ValueError(f"Layout {desired_data_layout} is not yet supported") diff --git a/python/tvm/relay/qnn/op/qnn.py b/python/tvm/relay/qnn/op/qnn.py index 0e73a6889fcd..eb64b56e829d 100644 --- a/python/tvm/relay/qnn/op/qnn.py +++ b/python/tvm/relay/qnn/op/qnn.py @@ -1249,3 +1249,69 @@ def leaky_relu(x, alpha, input_scale, input_zero_point, output_scale, output_zer def softmax(x, scale, zero_point, output_scale, output_zero_point, axis=-1): return _make.softmax(x, axis, scale, zero_point, output_scale, output_zero_point) + + +def avg_pool2d( + data, + input_scale, + input_zero_point, + output_scale, + output_zero_point, + pool_size, + strides, + padding, + dilation, + ceil_mode=False, + count_include_pad=True, + layout="NHWC", + out_layout="", +): + + """Quantized avg_pool2d + + Parameters + ---------- + data : relay.Expr + The quantized input tensor. + input_scale: float + The scale of the input quantized expr. + input_zero_point: int + The zero point of input quantized expr. + output_scale: flaot + The scale of the output quantized expr. + output_zero_point: int + The zero point of output quantized expr. 
+ pool_size : relay.Expr + The pool_size + strides : relay.Expr + The strides + padding : relay.Expr + The padding size + dilation : relay.Expr + The dilation size + ceil_mode : bool, optional + Whether to use ceil or floor for calculating the output shape + count_include_pad : bool, optional + Determines if padding should be taken into account in the computation + layout: string, optinal + out_layout: string, optional + Returns + ------- + result : relay.Expr + The computed result. + """ + return _make.avg_pool2d( + data, + input_scale, + input_zero_point, + output_scale, + output_zero_point, + pool_size, + strides, + padding, + dilation, + ceil_mode, + count_include_pad, + layout, + out_layout, + ) diff --git a/python/tvm/relay/qnn/strategy/generic.py b/python/tvm/relay/qnn/strategy/generic.py index 3ebf8edd3665..4c5884ffdc15 100644 --- a/python/tvm/relay/qnn/strategy/generic.py +++ b/python/tvm/relay/qnn/strategy/generic.py @@ -157,6 +157,36 @@ def wrapper(_attrs, inputs, out_type): return wrapper +def wrap_compute_qnn_avg_pool2d(topi_compute): + """Wrap qnn.avg_pool2d topi compute""" + + def wrapper(attrs, inputs, out_type): + kernel = attrs.pool_size + strides = attrs.strides + padding = attrs.padding + dilation = attrs.dilation + count_include_pad = attrs.count_include_pad + oshape = out_type.shape + odtype = out_type.dtype + args = [ + inputs[0], + kernel, + strides, + padding, + dilation, + count_include_pad, + oshape, + odtype, + inputs[1], + inputs[2], + inputs[3], + inputs[4], + ] + return [topi_compute(*args)] + + return wrapper + + def wrap_topi_concatenate(topi_compute): """Wrap TOPI compute which use qnn.concatenate attrs""" @@ -280,3 +310,12 @@ def qnn_batch_matmul_strategy(attrs, inputs, out_type, target): "qnn.batch_matmul is currently only supported with Hexagon. " "Please run QNN Canonicalize pass to decompose this op into supported ops." ) + + +@override_native_generic_func("qnn_avg_pool2d_strategy") +def qnn_avg_pool2d_strategy(attrs, inputs, out_type, target): + """qnn.avg_pool2d generic strategy""" + raise RuntimeError( + "qnn.avg_pool2d is currently only supported with Hexagon. " + "Please run QNN Canonicalize pass to decompose this op into supported ops." 
+ ) diff --git a/python/tvm/relay/qnn/strategy/hexagon.py b/python/tvm/relay/qnn/strategy/hexagon.py index d17b0da6cf0a..3edbce34e30f 100644 --- a/python/tvm/relay/qnn/strategy/hexagon.py +++ b/python/tvm/relay/qnn/strategy/hexagon.py @@ -201,3 +201,27 @@ def qnn_batch_matmul_strategy_hexagon(attrs, inputs, out_type, target): name="qnn_batch_matmul.hexagon", ) return strategy + + +@qnn_avg_pool2d_strategy.register(["hexagon"]) +def qnn_avg_pool2d_strategy_hexagon(attrs, inputs, out_type, target): + """qnn.avg_pool2d strategy for Hexagon""" + data_layout = attrs.layout + if data_layout == "NHWC": + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_qnn_avg_pool2d(topi.hexagon.qnn.qnn_avg_pool2d_wrapper_compute_NHWC), + wrap_topi_schedule(topi.hexagon.qnn.schedule_qnn_avg_pool2d), + name="qnn_avg_pool2d.hexagon", + ) + return strategy + elif data_layout == "NCHW": + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_qnn_avg_pool2d(topi.hexagon.qnn.qnn_avg_pool2d_wrapper_compute_NCHW), + wrap_topi_schedule(topi.hexagon.qnn.schedule_qnn_avg_pool2d), + name="qnn_avg_pool2d.hexagon", + ) + return strategy + else: + raise RuntimeError("Unsupported strategy for qnn.avg_pool2d") diff --git a/python/tvm/relay/transform/fake_quantization_to_integer.py b/python/tvm/relay/transform/fake_quantization_to_integer.py index 4c9a3f7cd09c..5e289b0c9380 100644 --- a/python/tvm/relay/transform/fake_quantization_to_integer.py +++ b/python/tvm/relay/transform/fake_quantization_to_integer.py @@ -111,6 +111,32 @@ def identity(expr, type_map): register_unary_identity("image.resize2d") +@register_fake_quantization_to_integer("nn.avg_pool2d") +def avgpool2d(expr, type_map): + """Rewrite an avgpool op""" + attrs = {**expr.attrs} + arg = expr.args[0] + t = type_map[arg] + out_t = type_map[expr] + + out = relay.qnn.op.avg_pool2d( + arg, + t.scale, + t.zero_point, + out_t.scale, + out_t.zero_point, + attrs["pool_size"], + attrs["strides"], + attrs["padding"], + attrs["dilation"], + attrs["ceil_mode"], + attrs["count_include_pad"], + attrs["layout"], + ) + + return [out, TensorAffineType(out_t.scale, out_t.zero_point, out_t.dtype, out_t.axis)] + + @register_fake_quantization_to_integer("nn.adaptive_avg_pool1d") def adaptive_avgpool1d(expr, type_map): """Rewrite an adaptive avgpool op""" @@ -138,37 +164,6 @@ def adaptive_avgpool1d(expr, type_map): return [out, TensorAffineType(out_t.scale, out_t.zero_point, "int32", out_t.axis)] -@register_fake_quantization_to_integer("nn.avg_pool2d") -def avgpool2d(expr, type_map): - """Rewrite a avgpool op""" - arg = expr.args[0] - t = type_map[arg] - out_t = type_map[expr] - # Cast (or requantize) to int32. - if not ( - approx_equal(t.scale, out_t.scale) - and approx_equal(t.zero_point, out_t.zero_point) - and tvm.ir.structural_equal(t.dtype, out_t.dtype) - ): - arg = relay.qnn.op.requantize( - arg, - t.scale, - t.zero_point, - out_t.scale, - out_t.zero_point, - out_dtype="int32", - axis=t.axis, - ) - else: - arg = relay.op.cast(arg, "int32") - out = relay.op.nn.avg_pool2d(arg, **expr.attrs) - if out_t.dtype != "int32": - # Cast back to output dtype to preserve input dtype == output dtype for AvgPool2d. 
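The avgpool2d rewrite registered above maps a fake-quantized nn.avg_pool2d directly onto relay.qnn.op.avg_pool2d. Below is a minimal usage sketch of the Python wrapper added in python/tvm/relay/qnn/op/qnn.py; it is illustrative only (it assumes this patch is applied, and the shape and quantization parameters are arbitrary values, not taken from the patch):

    import tvm
    from tvm import relay

    # NHWC uint8 input; scales and zero points must be scalar constants,
    # matching the checks in QnnAvgPool2DRel.
    data = relay.var("data", shape=(1, 16, 16, 32), dtype="uint8")
    pool = relay.qnn.op.avg_pool2d(
        data,
        input_scale=relay.const(0.02, "float32"),
        input_zero_point=relay.const(3, "int32"),
        output_scale=relay.const(0.02, "float32"),
        output_zero_point=relay.const(3, "int32"),
        pool_size=(3, 3),
        strides=(1, 1),
        padding=(1, 1, 1, 1),
        dilation=(1, 1),
        ceil_mode=False,
        count_include_pad=False,
        layout="NHWC",
    )
    mod = relay.transform.InferType()(tvm.IRModule.from_expr(relay.Function([data], pool)))
    print(mod)  # output keeps shape (1, 16, 16, 32) for pool 3x3, stride 1, padding 1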
- out = relay.op.clip(out, a_min=np.iinfo(out_t.dtype).min, a_max=np.iinfo(out_t.dtype).max) - out = relay.op.cast(out, out_t.dtype) - return [out, TensorAffineType(out_t.scale, out_t.zero_point, out_t.dtype, out_t.axis)] - - @register_fake_quantization_to_integer("nn.global_avg_pool2d") def global_avgpool2d(expr, type_map): """Rewrite a global_avgpool op""" diff --git a/python/tvm/topi/hexagon/compute_poolarea.py b/python/tvm/topi/hexagon/compute_poolarea.py new file mode 100644 index 000000000000..6ba50c4a96e1 --- /dev/null +++ b/python/tvm/topi/hexagon/compute_poolarea.py @@ -0,0 +1,143 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, unused-variable, unused-argument, too-many-locals + +"""Compute PoolArea size which is used to exclude the zero-padding elements in the averaging + calculation. +""" + +from tvm import te, tir + + +def compute_PoolArea(i, j, ih, iw, kh, kw, sh, sw, dh, dw, pad_top, pad_left): + """ + Parameters + ---------- + i,j: + index of output tensor along H and W axis + This is equal to the starting point of the sliding window for which the average is computed + ih, iw: + input data size along H and W axis + kh, kw: + Kernel size along H and W axis + sh, sw: + Stride size along H and W axis + dh, dw: + Dilation size along H and W axis + pad_top, pad_left: + Pad size on Top and left side of input data + + # PoolArea refers to the area of that portion of each sliding window which only includes + # the input data and not the padded area. + + # Motivation: The following example shows the location of the first sliding window (at i=0, j=0) + # on a 6*6 array, with kernel=[3,3] and padding=[1, 1, 1, 1]. + # The input data elements are shown with (X) and padding data with (0). + # As shown, the number of non-padding elements that should be used for computing + # the average of values inside this window is 4, while the windows area is 3*3=9. + # To compute the PoolArea, we have to move the top/left edge of the window down/right + # to exclude zero-padding elements. The edge adjustment can be formulated as + # top_edge = max(i , pad_top) + # left_edge= max(j , pad_left) + # Note that pad_top and pad_left represent point 0 of the input data along i and j direction. + # In this example, bottom_edge and right_edge of the PoolArea do not need any adjustment, + # because there is no padding data on those side of the window. + # However, as we slide the window down and to the right, the window might go + # beyond the input data boundaries (ih and iw). In these cases, bottom/right edge should be + # moved up/left to be located inside the input data. 
+    # This can be formulated as
+    #    bottom_edge = min(i + kh, ih + pad_top)
+    #    right_edge = min(j + kw, iw + pad_left)
+    # Having all the edges,
+    #    PoolArea = (bottom_edge - top_edge) * (right_edge - left_edge)
+
+    #     _______
+    #    |0 0 0|0 0 0 0 0                 0 0 0 0 0 0 0 0
+    #    |     |                           _______
+    #    |0 X X|X X X X 0                 |0 X X|X X X X 0
+    #    |     |                          |     |
+    #    |0 X X|X X X X 0     ====>       |0 X X|X X X X 0
+    #    |_____|                          |_____|
+    #     0 X X X X X X 0                  0 X X X X X X 0
+    #     0 X X X X X X 0                  0 X X X X X X 0
+    #     0 X X X X X X 0                  0 X X X X X X 0
+    #     0 X X X X X X 0                  0 X X X X X X 0
+    #     0 0 0 0 0 0 0 0                  0 0 0 0 0 0 0 0
+
+
+    # The above equations are derived under the assumption of having the default value (1)
+    # for stride and dilation. However, we need to expand them to support non-default
+    # stride and dilation values.
+    # Stride impacts the starting location of the sliding windows, so i and j should be
+    # replaced by (i * sh) and (j * sw), respectively, in the equations.
+    # Dilation changes the window size, scattering the k kernel elements over a d * (k - 1) + 1
+    # window.
+    # With a non-unit dilation, we also need to divide the adjusted window extent by the
+    # dilation value to find out how many kernel elements of the sliding window fall
+    # inside the input data boundaries:
+    #    top_edge = max(i * sh, pad_top)
+    #    left_edge = max(j * sw, pad_left)
+    #    bottom_edge = min(i * sh + (kh - 1) * dh + 1, ih + pad_top)
+    #    right_edge = min(j * sw + (kw - 1) * dw + 1, iw + pad_left)
+    #    PoolArea = ceil_div((bottom_edge - top_edge), dh) * ceil_div((right_edge - left_edge), dw)
+    #
+    # Finally, we need to address one corner case related to the non-default dilation:
+    # Consider the following example along the W axis, where iw = 3, kw = 3, and dw = 2.
+    # The first figure on the left shows the sliding window of size 5 starting at index 0,
+    # and the first figure on the right shows the same example with the sliding window at index 1.
+    # The second row of figures shows the PoolArea after adjusting the edges
+    # (in both cases right_edge - left_edge = 3).
+    # The third row of figures shows the location of the dilated kernel points (*).
+    # As shown, although the distance between the left and right edges is 3 in both cases,
+    # dilation is 2, and ceil_div(3, 2) = 2, the right-hand PoolArea only includes 1 kernel point.
+
+    # Sliding Window:                        |0 0 X X X |0 0        |0 X X X 0|
+    # PoolArea(after edge adjustment):        0 0|X X X |0 0        0|X X X| 0
+    # location of dilated kernel points:      * 0|* X * |0 0        *|X * X| 0
+    # PoolArea (dilated_point_aware):         * 0|* X * |0 0        * X|* X| 0
+
+    # To address this issue, instead of moving the left_edge to bring it just inside the input
+    # data boundary, we should move the edge to the right until we reach the first dilated kernel
+    # point inside the input data boundary.
+    # The fourth row of figures shows how this adjustment solves the problem.
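A small standalone check of the W-axis edge formulas above, using the values from the example in this comment block (iw = 3, kw = 3, dw = 2, sw = 1, pad_left = 2). This is plain Python for illustration only; the function names are ad hoc, and the "naive" variant is included just to show why the dilated-point-aware left edge is needed:

    from math import ceil

    def w_axis_pool_count(j, iw, kw, sw, dw, pad_left):
        # dilation-aware left edge: the first dilated kernel point inside the input data
        if j * sw - pad_left < 0:
            left_edge = j * sw + ((pad_left - j * sw - 1) // dw + 1) * dw
        else:
            left_edge = j * sw
        right_edge = min(j * sw + (kw - 1) * dw + 1, iw + pad_left)
        return ceil((right_edge - left_edge) / dw)

    def w_axis_pool_count_naive(j, iw, kw, sw, dw, pad_left):
        # edge adjustment without the dilated-point correction
        left_edge = max(j * sw, pad_left)
        right_edge = min(j * sw + (kw - 1) * dw + 1, iw + pad_left)
        return ceil((right_edge - left_edge) / dw)

    for j in (0, 1):
        print(j, w_axis_pool_count(j, 3, 3, 1, 2, 2), w_axis_pool_count_naive(j, 3, 3, 1, 2, 2))
    # prints "0 2 2" and "1 1 2": the naive edge over-counts the window at j = 1,
    # which is exactly the corner case illustrated above.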
+    # So the problem is reduced to finding the first dilated kernel point inside the data
+    # boundary. For that, we can find the number of dilated points which are mapped to the padded
+    # area and then the location of the next one, which should be inside the input data:
+    #    num_prev_points = (pad_top - i * sh - 1) // dh
+    #    next_point_index = i * sh + (num_prev_points + 1) * dh
+    #
+    # With that, top_edge and left_edge can be reformulated as:
+    #    if i * sh - pad_top < 0:
+    #        top_edge = i * sh + ((pad_top - i * sh - 1) // dh + 1) * dh
+    #    else:
+    #        top_edge = i * sh
+    #
+    #    if j * sw - pad_left < 0:
+    #        left_edge = j * sw + ((pad_left - j * sw - 1) // dw + 1) * dw
+    #    else:
+    #        left_edge = j * sw
+
+    """
+    top_edge = tir.if_then_else(
+        tir.all(i * sh - pad_top < 0), i * sh + ((pad_top - i * sh - 1) // dh + 1) * dh, i * sh
+    )
+    bottom_edge = te.min(i * sh + (kh - 1) * dh + 1, ih + pad_top)
+    left_edge = tir.if_then_else(
+        tir.all(j * sw - pad_left < 0), j * sw + ((pad_left - j * sw - 1) // dw + 1) * dw, j * sw
+    )
+    right_edge = te.min(j * sw + (kw - 1) * dw + 1, iw + pad_left)
+    return -((bottom_edge - top_edge) // -dh) * -((right_edge - left_edge) // -dw)
diff --git a/python/tvm/topi/hexagon/qnn/__init__.py b/python/tvm/topi/hexagon/qnn/__init__.py
index ba7d64b6b56d..f7c4502301c0 100644
--- a/python/tvm/topi/hexagon/qnn/__init__.py
+++ b/python/tvm/topi/hexagon/qnn/__init__.py
@@ -18,7 +18,7 @@
 """ Computes and schedules for Hexagon quantized ops """
 
 from .adaptive_avg_pool1d import *
-from .avg_pool2d import qnn_avg_pool2d_compute, qnn_avg_pool2d_schedule
+from .avg_pool2d import *
 from .conv2d_alter_op import *
 from .dense_alter_op import *
 from .dequantize import dequantize_compute, dequantize_schedule
diff --git a/python/tvm/topi/hexagon/qnn/avg_pool2d.py b/python/tvm/topi/hexagon/qnn/avg_pool2d.py
index 4aac15cbdc17..1370ad36e468 100644
--- a/python/tvm/topi/hexagon/qnn/avg_pool2d.py
+++ b/python/tvm/topi/hexagon/qnn/avg_pool2d.py
@@ -16,64 +16,144 @@
 # under the License.
 # pylint: disable=invalid-name, unused-variable, unused-argument, too-many-locals
 
-""" Compute and schedule for quantized avg_pool2d op
-
-Please note the following assumptions made by the implementation:
-
-1) The input must be padded in advance to account for 'padding'. In addition,
-   both input and output must be padded as per the physical buffer layout.
-2) The current implementation assumes 'count_include_pad' to be 'True'. It can be
-   modified to support 'False' case but the element count for the pooling window
-   must be pre-computed and provided as an input to reduce the run-time overhead.
-3) 'padding' is ignored. It must be handled outside of the sliced op.
-4) Please note that this implementation will not work if the output includes any
-   physical layout related padding as it can result into out-of-bound access
-   for the input.
-""" +""" Compute and schedule for quantized avg_pool2d op """ +import tvm from tvm import te from tvm import tir -from ..utils import get_layout_transform_fn, get_fixed_point_value +from ..utils import ( + get_layout_transform_fn, + get_fixed_point_value, + is_scalar, + get_const_int_value, + get_const_float_value, +) +from ...utils import get_const_tuple +from ...nn.utils import get_pad_tuple +from ...nn.pad import pad +from ..compute_poolarea import compute_PoolArea + + +def saturate(x: te.Tensor, dtype: str): + """Saturate value for the specified data type""" + return te.max(te.min_value(dtype), te.min(x, te.max_value(dtype))) -def validate_out_shape(out_shape: list, in_shape: list, kernel: list, stride: list, dilation: list): - """Validate output shape""" - _, oh, ow, _ = out_shape - _, ih, iw, _ = in_shape +def qnn_avg_pool2d_NCHW( + data: te.Tensor, + kernel: list, + stride: list, + padding: list, + dilation: list, + count_include_pad: bool, + oshape: list, + odtype: str, + # quantization params: + input_scale: float, + input_zero_point: int, + output_scale: float, + output_zero_point: int, +): + """Compute for quantized avg_pool2d""" kh, kw = kernel + rh = te.reduce_axis((0, kh), name="rh") + rw = te.reduce_axis((0, kw), name="rw") + + if odtype == "uint8": + temp_dtype = "uint16" + elif odtype == "int8": + temp_dtype = "int16" + else: + raise RuntimeError(f"Unsupported output dtype, {odtype}'") + sh, sw = stride dh, dw = dilation - if ih < (oh - 1) * sh + dh * (kh - 1) + 1: - raise RuntimeError("Output height is too large") - if iw < (ow - 1) * sw + dw * (kw - 1) + 1: - raise RuntimeError("Output width is too large") + scale = input_scale / output_scale + scale_fixed_point, rsh = get_fixed_point_value(scale, "int16") + corr = (output_zero_point << rsh) - input_zero_point * scale_fixed_point -def saturate(x: te.Tensor, dtype: str): - """Saturate value for the specified data type""" - return te.max(te.min_value(dtype), te.min(x, te.max_value(dtype))) + dilated_kh = (kh - 1) * dh + 1 + dilated_kw = (kw - 1) * dw + 1 + + pad_top, pad_left, pad_down, pad_right = get_pad_tuple( + get_const_tuple(padding), (dilated_kh, dilated_kw) + ) + + # DOPAD + if pad_top != 0 or pad_down != 0 or pad_left != 0 or pad_right != 0: + pad_before = (0, 0, pad_top, pad_left) + pad_after = (0, 0, pad_down, pad_right) + data_pad = pad(data, pad_before, pad_after, pad_value=input_zero_point, name="data_pad") + else: + # By definition when True, zero-padding will be included in the averaging calculation + # This is equivalent to PoolArea = (kh * kw) + count_include_pad = True + data_pad = data + + Sum = te.compute( + oshape, + lambda b, c, h, w: te.sum( + data_pad[b, c, h * sh + dh * rh, w * sw + dw * rw].astype(temp_dtype), axis=[rh, rw] + ), + name="pool_sum", + ) + + if not count_include_pad: + # Compute PoolArea using unpadded input tensor + _, _, oh, ow = oshape + _, _, ih, iw = data.shape + PoolArea = te.compute( + (oh, ow), + lambda i, j: compute_PoolArea(i, j, ih, iw, kh, kw, sh, sw, dh, dw, pad_top, pad_left), + name="pool_area", + ) -def qnn_avg_pool2d_compute( + ScaleWithArea = te.compute( + (oh, ow), + lambda i, j: (scale_fixed_point // PoolArea[i, j]).astype("int32"), + name="scale_with_area", + ) + + Avg = te.compute( + oshape, + lambda b, c, h, w: saturate( + ((Sum[b, c, h, w] * ScaleWithArea[h, w]) + corr + (1 << (rsh - 1))) >> rsh, odtype + ).astype(odtype), + name="pool_avg", + ) + else: + ScaleWithArea = scale_fixed_point // (kh * kw) + Avg = te.compute( + oshape, + lambda b, c, h, w: 
saturate( + ((Sum[b, c, h, w] * ScaleWithArea) + corr + (1 << (rsh - 1))) >> rsh, odtype + ).astype(odtype), + name="pool_avg", + ) + return Avg + + +def qnn_avg_pool2d_NHWC( data: te.Tensor, kernel: list, stride: list, + padding: list, dilation: list, + count_include_pad: bool, oshape: list, odtype: str, # quantization params: - input_zero_point: int, input_scale: float, - output_zero_point: int, + input_zero_point: int, output_scale: float, + output_zero_point: int, ): """Compute for quantized avg_pool2d""" kh, kw = kernel rh = te.reduce_axis((0, kh), name="rh") rw = te.reduce_axis((0, kw), name="rw") - ob, oh, ow, oc = oshape - if isinstance(ob, int): - validate_out_shape(oshape, data.shape, kernel, stride, dilation) if odtype == "uint8": temp_dtype = "uint16" @@ -85,86 +165,217 @@ def qnn_avg_pool2d_compute( sh, sw = stride dh, dw = dilation - PoolArea = kh * kw - scale = input_scale / output_scale scale_fixed_point, rsh = get_fixed_point_value(scale, "int16") - scale_with_area = scale_fixed_point // PoolArea corr = (output_zero_point << rsh) - input_zero_point * scale_fixed_point + dilated_kh = (kh - 1) * dh + 1 + dilated_kw = (kw - 1) * dw + 1 + # Compute Area + + pad_top, pad_left, pad_down, pad_right = get_pad_tuple( + get_const_tuple(padding), (dilated_kh, dilated_kw) + ) + # DOPAD + if pad_top != 0 or pad_down != 0 or pad_left != 0 or pad_right != 0: + pad_before = (0, pad_top, pad_left, 0) + pad_after = (0, pad_down, pad_right, 0) + data_pad = pad(data, pad_before, pad_after, pad_value=input_zero_point, name="data_pad") + else: + # By definition when True, zero-padding will be included in the averaging calculation + # This is equivalent to PoolArea = (kh * kw) + count_include_pad = True + data_pad = data + Sum = te.compute( oshape, lambda b, h, w, c: te.sum( - data[b, h * sh + dh * rh, w * sw + dw * rw, c].astype(temp_dtype), axis=[rh, rw] + data_pad[b, h * sh + dh * rh, w * sw + dw * rw, c].astype(temp_dtype), axis=[rh, rw] ), - name="sum", + name="pool_sum", ) - Avg = te.compute( - oshape, - lambda b, h, w, c: saturate( - ((Sum[b, h, w, c] * scale_with_area) + corr) >> rsh, odtype - ).astype(odtype), - name="avg", - ) + if not count_include_pad: + # Compute PoolArea using unpadded input tensor + _, oh, ow, _ = oshape + _, ih, iw, _ = data.shape + + PoolArea = te.compute( + (oh, ow), + lambda i, j: compute_PoolArea(i, j, ih, iw, kh, kw, sh, sw, dh, dw, pad_top, pad_left), + name="pool_area", + ) + + ScaleWithArea = te.compute( + (oh, ow), + lambda i, j: tir.if_then_else( + tir.all(PoolArea[i, j] > 0), + (scale_fixed_point // PoolArea[i, j]).astype("int32"), + 0, + ), + name="scale_with_area", + ) + + Avg = te.compute( + oshape, + lambda b, h, w, c: saturate( + ((Sum[b, h, w, c] * ScaleWithArea[h, w]) + corr + (1 << (rsh - 1))) >> rsh, odtype + ).astype(odtype), + name="pool_avg", + ) + else: + ScaleWithArea = scale_fixed_point // (kh * kw) + Avg = te.compute( + oshape, + lambda b, h, w, c: saturate( + ((Sum[b, h, w, c] * ScaleWithArea) + corr + (1 << (rsh - 1))) >> rsh, odtype + ).astype(odtype), + name="pool_avg", + ) + return Avg -def schedule_nhwc_8h8w32c(outs: te.Tensor, ins: te.Tensor, output_layout: str, input_layout: str): - """Schedule for input and output layout nhwc-8h8w32c""" +def qnn_avg_pool2d_wrapper_compute_NCHW( + data: te.Tensor, + kernel: list, + stride: list, + padding: list, + dilation: list, + count_include_pad: bool, + oshape: list, + odtype: str, + # quantization params: + input_scale: float, + input_zero_point: int, + output_scale: float, + 
output_zero_point: int, +): + """Extract qnn params""" + if ( + is_scalar(input_scale) + and is_scalar(output_scale) + and is_scalar(input_zero_point) + and is_scalar(output_zero_point) + ): + iscale = get_const_float_value(input_scale) + oscale = get_const_float_value(output_scale) + izero_point = get_const_int_value(input_zero_point) + ozero_point = get_const_int_value(output_zero_point) + return qnn_avg_pool2d_NCHW( + data, + kernel, + stride, + padding, + dilation, + count_include_pad, + oshape, + odtype, + iscale, + izero_point, + oscale, + ozero_point, + ) + else: + raise RuntimeError("quantization parameters should be scalar tensors") + + +def qnn_avg_pool2d_wrapper_compute_NHWC( + data: te.Tensor, + kernel: list, + stride: list, + padding: list, + dilation: list, + count_include_pad: bool, + oshape: list, + odtype: str, + # quantization params: + input_scale: float, + input_zero_point: int, + output_scale: float, + output_zero_point: int, +): + """Extract qnn params""" + if ( + is_scalar(input_scale) + and is_scalar(output_scale) + and is_scalar(input_zero_point) + and is_scalar(output_zero_point) + ): + iscale = get_const_float_value(input_scale) + oscale = get_const_float_value(output_scale) + izero_point = get_const_int_value(input_zero_point) + ozero_point = get_const_int_value(output_zero_point) + return qnn_avg_pool2d_NHWC( + data, + kernel, + stride, + padding, + dilation, + count_include_pad, + oshape, + odtype, + iscale, + izero_point, + oscale, + ozero_point, + ) + else: + raise RuntimeError("quantization parameters should be scalar tensors") + + +def schedule_qnn_avg_pool2d(outs): + """Schedule for qnn.avg_pool2d + Parameters + ---------- + outs: Array of Tensor + The computation graph description of qnn.avg_pool2d + in the format of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. + """ + outs = [outs] if isinstance(outs, tvm.te.tensor.Tensor) else outs + s = tvm.te.create_schedule([x.op for x in outs]) + tvm.te.schedule.AutoInlineInjective(s) + return s + + +def schedule_8h8w32c(outs: te.Tensor, ins: te.Tensor, output_layout: str, input_layout: str): + """Schedule for input and output layout 8h8w32c""" + func = te.create_prim_func([ins, outs]) s = tir.Schedule(func) - Sum = s.get_block("sum") - Avg = s.get_block("avg") - + Sum = s.get_block("pool_sum") + Avg = s.get_block("pool_avg") + mem_scope = "global.vtcm" + sum_read = s.cache_read(Sum, 0, mem_scope) + avg_read = s.cache_read(Avg, 0, mem_scope) + avg_write = s.cache_write(Avg, 0, mem_scope) input_transform_fn = get_layout_transform_fn(input_layout) output_transform_fn = get_layout_transform_fn(output_layout) - s.transform_layout(Sum, ("read", 0), input_transform_fn) - s.transform_layout(Avg, ("write", 0), output_transform_fn) - - # Schedule 'Avg' - # Split and reorder the axes to iterate over the output tensor chunks. - # Each chunk consists for 2048 bytes with 32 channels being the fastest - # changing axis, followed by 8 width and then 8 height. - # The width is split by a factor of 4 and then fused with 32 channels - # to provide full vector length of data for the output tensor chunks. - # NOTE: These schedules are a work in progress and may require - # adjustments in future as some of the missing features for 2-d tensors - # become available. 
- n, h, w, c = s.get_loops(Avg) - ho, hi = s.split(h, [None, 8]) - wo, wi = s.split(w, [None, 8]) - wio, wii = s.split(wi, [None, 4]) - co, ci = s.split(c, [None, 32]) - s.reorder(n, ho, wo, co, hi, wio, wii, ci) - wii_ci = s.fuse(wii, ci) - s.vectorize(wii_ci) - - # Schedule 'Sum' - s.compute_at(Sum, wio) - Sum_axis = s.get_loops(Sum) - # Compute for 'Sum' includes reduction along height and width. The axes - # are being reordered so that 4 width and 32 channels become the - # inner-most loops which then can be fused and vectorized. However, - # vectorization of the 2-d tensors doesn't work when reduction is - # involved and requires codegen support that is yet to be added. - s.reorder(Sum_axis[-2], Sum_axis[-1], Sum_axis[-4], Sum_axis[-3]) - ci_wii = s.fuse(Sum_axis[-4], Sum_axis[-3]) - # s.vectorize(ci_wii) # Doesn't work + s.transform_layout(Sum, ("read", 0), input_transform_fn, pad_value=0) + s.transform_layout(Avg, ("read", 0), input_transform_fn, pad_value=0) + s.transform_layout(Avg, ("write", 0), output_transform_fn, pad_value=0) return s -def schedule_n11c_2048c(outs: te.Tensor, ins: te.Tensor, output_layout: str, input_layout: str): - """Schedule for output layout: n11c-2048c, input layout: nhwc-8h8w32c""" +def schedule_2048c(outs: te.Tensor, ins: te.Tensor, output_layout: str, input_layout: str): + """Schedule for output layout: 2048c, input layout: 8h8w32c""" func = te.create_prim_func([ins, outs]) s = tir.Schedule(func) - Sum = s.get_block("sum") - Avg = s.get_block("avg") + Sum = s.get_block("pool_sum") + Avg = s.get_block("pool_avg") + mem_scope = "global.vtcm" + sum_read = s.cache_read(Sum, 0, mem_scope) + avg_write = s.cache_write(Avg, 0, mem_scope) input_transform_fn = get_layout_transform_fn(input_layout) output_transform_fn = get_layout_transform_fn(output_layout) - s.transform_layout(Sum, ("read", 0), input_transform_fn) - s.transform_layout(Avg, ("write", 0), output_transform_fn) + s.transform_layout(Sum, ("read", 0), input_transform_fn, pad_value=0) + s.transform_layout(Avg, ("write", 0), output_transform_fn, pad_value=0) # Schedule 'Avg' # Split and reorder the axes to iterate over the output tensor chunks. @@ -173,7 +384,13 @@ def schedule_n11c_2048c(outs: te.Tensor, ins: te.Tensor, output_layout: str, inp # NOTE: These schedules are a work in progress and may require # adjustments in future as some of the missing features for 2-d tensors # become available. - n, h, w, c = s.get_loops(Avg) + + if output_layout == "n11c-2048c-2d": + _, _, _, c = s.get_loops(Avg) + else: + _, c, _, _ = s.get_loops(Avg) + + # n, h, w, c = s.get_loops(Avg) co, ci = s.split(c, [None, 2048]) cio, cii = s.split(ci, [None, 128]) s.vectorize(cii) @@ -191,15 +408,10 @@ def schedule_n11c_2048c(outs: te.Tensor, ins: te.Tensor, output_layout: str, inp def qnn_avg_pool2d_schedule(outs: te.Tensor, ins: te.Tensor, output_layout: str, input_layout: str): - """Quantized avg_pool2d schedule + """Quantized avg_pool2d schedule""" + if output_layout == "nhwc-8h8w32c-2d" or output_layout == "nchw-8h8w32c-2d": + return schedule_8h8w32c(outs, ins, output_layout, input_layout) + if output_layout == "n11c-2048c-2d" or output_layout == "nc11-2048c-2d": + return schedule_2048c(outs, ins, output_layout, input_layout) - NOTE: This schedule assumes that both input and output tensors are in the form of - 2d discontiguous buffer and data is already arranged as per the input and output layout - respectively. 
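For reference, a pure-Python model of the fixed-point averaging used by qnn_avg_pool2d_NCHW/qnn_avg_pool2d_NHWC above (count_include_pad=True path). It is only a sketch: rsh is hard-coded here to stand in for get_fixed_point_value, which in the real compute chooses the shift so that the scale fits an int16, and the input values are arbitrary:

    import numpy as np

    in_scale, in_zp = 0.02, 3
    out_scale, out_zp = 0.04, 5
    kh = kw = 3
    window = np.random.randint(0, 256, size=(kh, kw)).astype(np.int64)  # one uint8 pooling window

    # float reference: dequantize, average, requantize
    real = (window - in_zp) * in_scale
    ref = int(np.clip(np.round(real.mean() / out_scale) + out_zp, 0, 255))

    # fixed-point path mirroring the TE compute
    scale = in_scale / out_scale
    rsh = 15                                   # assumed shift; stands in for get_fixed_point_value
    scale_fp = int(round(scale * (1 << rsh)))  # fixed-point representation of scale
    corr = (out_zp << rsh) - in_zp * scale_fp
    scale_with_area = scale_fp // (kh * kw)
    acc = int(window.sum())
    q = (acc * scale_with_area + corr + (1 << (rsh - 1))) >> rsh
    q = int(np.clip(q, 0, 255))                # saturate() for uint8

    print(ref, q)  # the two values agree to within one quantization step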
- - """ - if output_layout == "nhwc-8h8w32c-2d": - return schedule_nhwc_8h8w32c(outs, ins, output_layout, input_layout) - if output_layout == "n11c-2048c-2d": - return schedule_n11c_2048c(outs, ins, output_layout, input_layout) raise RuntimeError(f"Unexpected layout '{output_layout}'") diff --git a/python/tvm/topi/hexagon/qnn/nn.py b/python/tvm/topi/hexagon/qnn/nn.py index e60314b82757..286c33f53f42 100644 --- a/python/tvm/topi/hexagon/qnn/nn.py +++ b/python/tvm/topi/hexagon/qnn/nn.py @@ -22,8 +22,14 @@ import tvm from tvm import te, topi -from ..utils import saturate, get_fixed_point_value -from ...utils import get_const_tuple, get_const_int, get_const_float +from ..utils import ( + saturate, + is_scalar, + get_const_int_value, + get_const_float_value, + get_fixed_point_value, +) +from ...utils import get_const_tuple from ...nn.utils import get_pad_tuple from ...nn.pad import pad from ... import tag, nn @@ -38,27 +44,6 @@ def clip_cast(val, dtype): return te.max(tvm.te.min(val, const_max), const_min).astype(dtype) -# Return True if given expression is scalar constant value. -def is_scalar(expr): - if isinstance(expr, te.Tensor): - return expr.ndim == 0 and (isinstance(expr.op.body[0], (tvm.tir.FloatImm, tvm.tir.IntImm))) - return isinstance(expr, (tvm.tir.FloatImm, tvm.tir.IntImm)) - - -def get_const_int_value(expr): - if isinstance(expr, te.Tensor): - assert isinstance(expr.op.body[0], tvm.tir.IntImm) - return expr.op.body[0].value - return get_const_int(expr) - - -def get_const_float_value(expr): - if isinstance(expr, te.Tensor): - assert isinstance(expr.op.body[0], tvm.tir.FloatImm) - return expr.op.body[0].value - return get_const_float(expr) - - def get_qnn_param(param, indices, axis): # Account scalar and 1D quantization parameters: if len(param.shape) == 0: @@ -68,11 +53,7 @@ def get_qnn_param(param, indices, axis): return param[param_idx] -def subtract_zero_point( - tensor: te.Tensor, - zero_point: Union[te.Tensor, tvm.tir.IntImm], - name: str, -): +def subtract_zero_point(tensor: te.Tensor, zero_point: Union[te.Tensor, tvm.tir.IntImm], name: str): """ Subtract zero point from given tensor. If zero point is scalar constant and is equal to 0, then it can be optimized and return tensor as it is. @@ -590,10 +571,7 @@ def qnn_conv2d( # Conv2d inputs oshape, lambda n, oc, oh, ow: te.sum( data_pad[ - n, - ic, - oh * height_stride + kh * dilation_h, - ow * width_stride + kw * dilation_w, + n, ic, oh * height_stride + kh * dilation_h, ow * width_stride + kw * dilation_w ].astype("int32") * weight[oc, ic, kh, kw].astype("int32"), axis=[ic, kh, kw], @@ -777,10 +755,7 @@ def qnn_depthwise_conv2d( # Conv2d inputs oshape, lambda n, oc, oh, ow: te.sum( data_pad[ - n, - oc, - oh * height_stride + kh * dilation_h, - ow * width_stride + kw * dilation_w, + n, oc, oh * height_stride + kh * dilation_h, ow * width_stride + kw * dilation_w ].astype("int32") * te.subtract(weight[oc, 0, kh, kw], kernel_zero_point).astype("int32"), axis=[kh, kw], diff --git a/python/tvm/topi/hexagon/slice_ops/__init__.py b/python/tvm/topi/hexagon/slice_ops/__init__.py index 46ae0c53200f..b38dd5ecb3c1 100644 --- a/python/tvm/topi/hexagon/slice_ops/__init__.py +++ b/python/tvm/topi/hexagon/slice_ops/__init__.py @@ -17,7 +17,7 @@ """ Computes and Schedules for Hexagon slice ops. 
""" -from .avg_pool2d import avg_pool2d_compute, avg_pool2d_schedule +from .avg_pool2d import avg_pool2d_NHWC, avg_pool2d_NCHW, avg_pool2d_schedule from .max_pool2d import max_pool2d_compute, max_pool2d_STIR_schedule from .add_subtract_multiply import * from .argmax import argmax_compute, argmax_schedule diff --git a/python/tvm/topi/hexagon/slice_ops/avg_pool2d.py b/python/tvm/topi/hexagon/slice_ops/avg_pool2d.py index bf6d57b8f7f8..0c7b00e287c3 100644 --- a/python/tvm/topi/hexagon/slice_ops/avg_pool2d.py +++ b/python/tvm/topi/hexagon/slice_ops/avg_pool2d.py @@ -16,128 +16,214 @@ # under the License. # pylint: disable=invalid-name, unused-variable, unused-argument, too-many-locals, pointless-exception-statement -""" Compute and schedule for avg_pool2d slice op - -Please note the following assumptions made by the implementation: - -1) The input must be padded in advance to account for 'padding'. In addition, - both input and output must be padded as per the physical buffer layout. -2) The current implementation assumes 'count_include_pad' to be 'True'. It can be - modified to support 'False' case but the element count for the pooling window - must be pre-computed and provided as an input to reduce the run-time overhead. -3) 'padding' is ignored. It must be handled outside of the sliced op. -4) Please note that this implementation will not work if the output includes any - physical layout related padding as it can result into out-of-bound access - for the input. -""" +""" Compute and schedule for avg_pool2d slice op """ from tvm import te from tvm import tir from ..utils import get_layout_transform_fn +from ...utils import get_const_tuple +from ...nn.utils import get_pad_tuple +from ...nn.pad import pad +from ..compute_poolarea import compute_PoolArea -def validate_out_shape(out_shape, in_shape, kernel, stride, dilation): - """Validate output shape""" - _, oh, ow, _ = out_shape - _, ih, iw, _ = in_shape +def avg_pool2d_NCHW( + data, kernel, stride, padding, dilation, count_include_pad, oshape, odtype="float16" +): + """avg_pool2d compute""" + if odtype != "float16": + raise RuntimeError(f"Unsupported output dtype '{odtype}'") kh, kw = kernel + rh = te.reduce_axis((0, kh), name="rh") + rw = te.reduce_axis((0, kw), name="rw") sh, sw = stride dh, dw = dilation - if ih < (oh - 1) * sh + dh * (kh - 1) + 1: - raise RuntimeError("Output height is too large") - if iw < (ow - 1) * sw + dw * (kw - 1) + 1: - raise RuntimeError("Output width is too large") + dilated_kh = (kh - 1) * dh + 1 + dilated_kw = (kw - 1) * dw + 1 + + pad_top, pad_left, pad_down, pad_right = get_pad_tuple( + get_const_tuple(padding), (dilated_kh, dilated_kw) + ) + + # DOPAD -def avg_pool2d_compute(A, kernel, stride, dilation, oshape, odtype="float16"): + if pad_top != 0 or pad_down != 0 or pad_left != 0 or pad_right != 0: + pad_before = (0, 0, pad_top, pad_left) + pad_after = (0, 0, pad_down, pad_right) + data_pad = pad(data, pad_before, pad_after, name="data_pad") + else: + # By definition when True, zero-padding will be included in the averaging calculation + # This is equivalent to PoolArea = (kh * kw) + count_include_pad = True + data_pad = data + + Sum = te.compute( + oshape, + lambda b, c, h, w: te.sum( + data_pad[b, c, h * sh + dh * rh, w * sw + dw * rw].astype("float32"), axis=[rh, rw] + ), + name="pool_sum", + ) + + if not count_include_pad: + # Compute PoolArea using unpadded input tensor + _, _, oh, ow = oshape + _, _, ih, iw = data.shape + + PoolArea = te.compute( + (oh, ow), + lambda i, j: compute_PoolArea(i, j, ih, 
iw, kh, kw, sh, sw, dh, dw, pad_top, pad_left), + name="pool_area", + ) + + InvArea = te.compute( + (oh, ow), + lambda i, j: tir.if_then_else( + tir.all(PoolArea[i, j] > 0), (float(1) / PoolArea[i, j]), 0 + ), + name="inverse_area", + ) + + Avg = te.compute( + oshape, + lambda b, c, h, w: (Sum[b, c, h, w] * InvArea[h, w]).astype(odtype), + name="pool_avg", + ) + else: + InvArea = float(1) / (kh * kw) + Avg = te.compute( + oshape, lambda b, c, h, w: (Sum[b, c, h, w] * InvArea).astype(odtype), name="pool_avg" + ) + + return Avg + + +def avg_pool2d_NHWC( + data, kernel, stride, padding, dilation, count_include_pad, oshape, odtype="float16" +): """avg_pool2d compute""" if odtype != "float16": - RuntimeError(f"Unsupported output dtype '{odtype}'") + raise RuntimeError(f"Unsupported output dtype '{odtype}'") kh, kw = kernel rh = te.reduce_axis((0, kh), name="rh") rw = te.reduce_axis((0, kw), name="rw") - ob, oh, ow, oc = oshape - if isinstance(ob, int): - validate_out_shape(oshape, A.shape, kernel, stride, dilation) sh, sw = stride dh, dw = dilation InvArea = float(1) / (kh * kw) + dilated_kh = (kh - 1) * dh + 1 + dilated_kw = (kw - 1) * dw + 1 + + pad_top, pad_left, pad_down, pad_right = get_pad_tuple( + get_const_tuple(padding), (dilated_kh, dilated_kw) + ) + + # DOPAD + if pad_top != 0 or pad_down != 0 or pad_left != 0 or pad_right != 0: + pad_before = (0, pad_top, pad_left, 0) + pad_after = (0, pad_down, pad_right, 0) + data_pad = pad(data, pad_before, pad_after, name="data_pad") + else: + # By definition when True, zero-padding will be included in the averaging calculation + # This is equivalent to PoolArea = (kh * kw) + count_include_pad = True + data_pad = data + Sum = te.compute( oshape, lambda b, h, w, c: te.sum( - A[b, h * sh + dh * rh, w * sw + dw * rw, c].astype("float32"), axis=[rh, rw] + data_pad[b, h * sh + dh * rh, w * sw + dw * rw, c].astype("float32"), axis=[rh, rw] ), - name="sum", - ) - Avg = te.compute( - oshape, lambda b, h, w, c: (Sum[b, h, w, c] * InvArea).astype(A.dtype), name="avg" + name="pool_sum", ) + + if not count_include_pad: + # Compute PoolArea using unpadded input tensor + _, oh, ow, _ = oshape + _, ih, iw, _ = data.shape + + PoolArea = te.compute( + (oh, ow), + lambda i, j: compute_PoolArea(i, j, ih, iw, kh, kw, sh, sw, dh, dw, pad_top, pad_left), + name="pool_area", + ) + + InvArea = te.compute( + (oh, ow), + lambda i, j: tir.if_then_else( + tir.all(PoolArea[i, j] > 0), (float(1) / PoolArea[i, j]), 0 + ), + name="inverse_area", + ) + + Avg = te.compute( + oshape, + lambda b, h, w, c: (Sum[b, h, w, c] * InvArea[h, w]).astype(odtype), + name="pool_avg", + ) + else: + InvArea = float(1) / (kh * kw) + Avg = te.compute( + oshape, lambda b, h, w, c: (Sum[b, h, w, c] * InvArea).astype(odtype), name="pool_avg" + ) + return Avg -def schedule_nhwc_8h2w32c2w(outs, ins, output_layout: str, input_layout: str): - """Schedule for input and output layout nhwc-8h2w32c2w""" +def schedule_8h2w32c2w(outs, ins, output_layout: str, input_layout: str): + """Schedule for input and output layout 8h2w32c2w""" func = te.create_prim_func([ins, outs]) + print(func) s = tir.Schedule(func) - Sum = s.get_block("sum") - Avg = s.get_block("avg") + Sum = s.get_block("pool_sum") + Avg = s.get_block("pool_avg") + mem_scope = "global.vtcm" + sum_read = s.cache_read(Sum, 0, mem_scope) + avg_write = s.cache_write(Avg, 0, mem_scope) input_transform_fn = get_layout_transform_fn(input_layout) output_transform_fn = get_layout_transform_fn(output_layout) - s.transform_layout(Sum, ("read", 0), 
input_transform_fn) - s.transform_layout(Avg, ("write", 0), output_transform_fn) - - # Schedule 'Avg' - n, h, w, c = s.get_loops(Avg) - ho, hi = s.split(h, [None, 8]) - wo, wi = s.split(w, [None, 4]) - wio, wii = s.split(wi, [None, 2]) - co, ci = s.split(c, [None, 32]) - s.reorder(n, ho, wo, co, hi, wio, ci, wii) - ci_wii = s.fuse(ci, wii) - s.vectorize(ci_wii) - - # Schedule 'Sum' - s.compute_at(Sum, wio) - Sum_axis = s.get_loops(Sum) - s.reorder(Sum_axis[-2], Sum_axis[-1], Sum_axis[-4], Sum_axis[-3]) - ci_wii = s.fuse(Sum_axis[-4], Sum_axis[-3]) - # s.vectorize(ci_wii) # Doesn't work + s.transform_layout(Sum, ("read", 0), input_transform_fn, pad_value=0.0) + s.transform_layout(Avg, ("write", 0), output_transform_fn, pad_value=0.0) return s -def schedule_n11c_1024c(outs, ins, output_layout: str, input_layout: str): - """Schedule for output layout: n11c-1024c, input layout: nhwc-8h2w32c2w""" +def schedule_1024c(outs, ins, output_layout: str, input_layout: str): + """Schedule for output layout: 1024c, input layout: 8h2w32c2w""" func = te.create_prim_func([ins, outs]) s = tir.Schedule(func) - Sum = s.get_block("sum") - Avg = s.get_block("avg") + Sum = s.get_block("pool_sum") + Avg = s.get_block("pool_avg") + mem_scope = "global.vtcm" + sum_read = s.cache_read(Sum, 0, mem_scope) + avg_write = s.cache_write(Avg, 0, mem_scope) input_transform_fn = get_layout_transform_fn(input_layout) output_transform_fn = get_layout_transform_fn(output_layout) - s.transform_layout(Sum, ("read", 0), input_transform_fn) - s.transform_layout(Avg, ("write", 0), output_transform_fn) + s.transform_layout(Sum, ("read", 0), input_transform_fn, pad_value=0.0) + s.transform_layout(Avg, ("write", 0), output_transform_fn, pad_value=0.0) # Schedule 'Avg' - n, h, w, c = s.get_loops(Avg) - co, ci = s.split(c, [None, 1024]) + if output_layout == "n11c-1024c-2d": + n, h, w, c = s.get_loops(Avg) + else: + n, c, h, w = s.get_loops(Avg) + _, ci = s.split(c, [None, 1024]) cio, cii = s.split(ci, [None, 64]) s.vectorize(cii) # Schedule 'Sum' - s.compute_at(Sum, cio) Sum_axis = s.get_loops(Sum) s.reorder(Sum_axis[-2], Sum_axis[-1], Sum_axis[-3]) - # s.vectorize(Sum_axis[-3]) # Doesn't work return s def avg_pool2d_schedule(outs, ins, output_layout: str, input_layout: str): """avg_pool2d schedule""" - if output_layout == "nhwc-8h2w32c2w-2d": - return schedule_nhwc_8h2w32c2w(outs, ins, output_layout, input_layout) - if output_layout == "n11c-1024c-2d": - return schedule_n11c_1024c(outs, ins, output_layout, input_layout) + if output_layout == "nhwc-8h2w32c2w-2d" or output_layout == "nchw-8h2w32c2w-2d": + return schedule_8h2w32c2w(outs, ins, output_layout, input_layout) + if output_layout == "n11c-1024c-2d" or output_layout == "nc11-1024c-2d": + return schedule_1024c(outs, ins, output_layout, input_layout) raise RuntimeError(f"Unexpected layout '{output_layout}'") diff --git a/python/tvm/topi/hexagon/utils.py b/python/tvm/topi/hexagon/utils.py index f017aaebbdeb..aa1af5de43db 100644 --- a/python/tvm/topi/hexagon/utils.py +++ b/python/tvm/topi/hexagon/utils.py @@ -24,10 +24,30 @@ from typing import Dict, Tuple, Union import tvm -from tvm import IRModule, te +from tvm import IRModule, te, tir from tvm.tir import IndexMap, PrimFunc +def is_scalar(expr): + if isinstance(expr, te.Tensor): + return expr.ndim == 0 and (isinstance(expr.op.body[0], (tir.FloatImm, tir.IntImm))) + return isinstance(expr, (tir.FloatImm, tir.IntImm)) + + +def get_const_int_value(expr): + if isinstance(expr, te.Tensor): + assert isinstance(expr.op.body[0], tir.IntImm) 
+ return expr.op.body[0].value + return tvm.topi.utils.get_const_int(expr) + + +def get_const_float_value(expr): + if isinstance(expr, te.Tensor): + assert isinstance(expr.op.body[0], tir.FloatImm) + return expr.op.body[0].value + return tvm.topi.utils.get_const_float(expr) + + def n11c_1024c_2d(n, h, w, c): """Return index map for n11c_1024 2d layout""" return [n, h, w, c // 1024, IndexMap.AXIS_SEPARATOR, c % 1024] @@ -38,6 +58,11 @@ def n11c_1024c_1d(n, h, w, c): return [n, h, w, c // 1024, c % 1024] +def nc11_1024c_2d(n, c, h, w): + """Return index map for nc11_1024 2d layout""" + return [n, c // 1024, IndexMap.AXIS_SEPARATOR, c % 1024, h, w] + + def nhwc_8h2w32c2w_2d(n, h, w, c): """Return index map for nhwc_8h2w32c2w 2d layout""" return [n, h // 8, w // 4, c // 32, IndexMap.AXIS_SEPARATOR, h % 8, (w % 4) // 2, c % 32, w % 2] @@ -48,6 +73,11 @@ def nhwc_8h2w32c2w_1d(n, h, w, c): return [n, h // 8, w // 4, c // 32, h % 8, (w % 4) // 2, c % 32, w % 2] +def nchw_8h2w32c2w_2d(n, c, h, w): + """Return index map for nchw_8h2w32c2w 2d layout""" + return [n, c // 32, h // 8, w // 4, IndexMap.AXIS_SEPARATOR, h % 8, (w % 4) // 2, c % 32, w % 2] + + def nhw_32h16w_2d(n, h, w): """Return index map for nhw_32h16w 2d layout""" return [n, h // 32, w // 16, IndexMap.AXIS_SEPARATOR, h % 32, w % 16] @@ -88,6 +118,11 @@ def nc_2048c_2d(n, c): return [n, c // 2048, IndexMap.AXIS_SEPARATOR, c % 2048] +def nc11_2048c_2d(n, c, h, w): + """Return index map for nc11_2048c 2d layout""" + return [n, c // 2048, IndexMap.AXIS_SEPARATOR, h, w, c % 2048] + + def nc_1024c_1d(n, c): """Return index map for nc_1024c 1d layout""" return [n, c // 1024, c % 1024] @@ -123,11 +158,25 @@ def nhwc_8h8w32c_2d(n, h, w, c): return [n, h // 8, w // 8, c // 32, IndexMap.AXIS_SEPARATOR, h % 8, w % 8, c % 32] +def nhwc_8h8w32c_1d(n, h, w, c): + """Return index map for nhwc_8h8w32c 1d layout""" + return [n, h // 8, w // 8, c // 32, h % 8, w % 8, c % 32] + + +def nchw_8h8w32c_2d(n, c, h, w): + return [n, c // 32, h // 8, w // 8, IndexMap.AXIS_SEPARATOR, h % 8, w % 8, c % 32] + + def n11c_2048c_2d(n, h, w, c): """Return index map for n11c_2048c 2d layout""" return [n, h, w, c // 2048, IndexMap.AXIS_SEPARATOR, c % 2048] +def n11c_2048c_1d(n, h, w, c): + """Return index map for n11c_2048c 1 layout""" + return [n, h, w, c // 2048, c % 2048] + + def iohw_16i32o2i_1d(height, width, in_channel, out_channel): return [ in_channel // 32, @@ -163,12 +212,16 @@ def get_layout_transform_fn(layout): return nhwc_8h2w32c2w_2d if layout == "nhwc-8h2w32c2w-1d": return nhwc_8h2w32c2w_1d + if layout == "nchw-8h2w32c2w-2d": + return nchw_8h2w32c2w_2d if layout == "n11c-1024c-2d": return n11c_1024c_2d if layout == "n11c-1024c-1d": return n11c_1024c_1d if layout == "nhwc-1024c-2d": return nhwc_1024c_2d + if layout == "nc11-1024c-2d": + return nc11_1024c_2d if layout == "nc-1024-2d": return nc_1024_2d if layout == "nhw-32h16w-2d": @@ -201,16 +254,26 @@ def get_layout_transform_fn(layout): return nc_2048c_2d if layout == "nhwc-8h8w32c-2d": return nhwc_8h8w32c_2d + if layout == "nhwc-8h8w32c-1d": + return nhwc_8h8w32c_1d + if layout == "nchw-8h8w32c-2d": + return nchw_8h8w32c_2d if layout == "n11c-2048c-2d": return n11c_2048c_2d + if layout == "n11c-2048c-1d": + return n11c_2048c_1d if layout == "ohwi32o-1d": return ohwi32o_1d + if layout == "nc11-2048c-2d": + return nc11_2048c_2d if layout == "ncw-32c64w-2d": return ncw_32c64w_2d if layout == "nchw-32c8h8w-2d": return nchw_32c8h8w_2d if layout == "nchw-32c8h4w-2d": return nchw_32c8h4w_2d + if layout == 
"nchw-8h8w32c-2d": + return nchw_8h8w32c_2d raise RuntimeError(f"Unexpected layout '{layout}'") diff --git a/src/relay/op/nn/pooling.cc b/src/relay/op/nn/pooling.cc index 2a8c2440519e..1cfbab6e661e 100644 --- a/src/relay/op/nn/pooling.cc +++ b/src/relay/op/nn/pooling.cc @@ -32,6 +32,7 @@ #include #include "../../transforms/infer_layout_utils.h" +#include "pooling_common.h" namespace tvm { namespace relay { @@ -40,43 +41,6 @@ namespace relay { TVM_REGISTER_NODE_TYPE(MaxPool2DAttrs); TVM_REGISTER_NODE_TYPE(AvgPool2DAttrs); -template -InferCorrectLayoutOutput PoolInferCorrectLayout(const Attrs& attrs, - const Array& new_in_layouts, - const Array& old_in_layouts, - const Array& old_in_types) { - const auto* attrs_ptr = attrs.as(); - ICHECK(attrs_ptr); - ObjectPtr params = make_object(*attrs_ptr); - - if (params->out_layout != "") { - // when users specify the out_layout of pooling, follow user's preference - ICHECK_EQ(params->layout, params->out_layout) - << "Pooling input/output layouts mismatch: " << params->layout << " vs. " - << params->out_layout; - } else if (new_in_layouts.defined()) { - // the pooling is using an inferred layout (i.e., new_in_layouts[0]) given by relay caller - ICHECK_EQ(new_in_layouts.size(), 1); - params->layout = new_in_layouts[0].name(); - } - - return InferCorrectLayoutOutput({params->layout}, {params->layout}, Attrs(params)); -} - -IndexExpr calculate_pool_dimension(IndexExpr in_dimension, IndexExpr pad_amount, - IndexExpr pool_size, IndexExpr dilation, IndexExpr stride_size, - bool ceil_mode) { - IndexExpr numerator = in_dimension + pad_amount - ((pool_size - 1) * dilation + 1); - IndexExpr denominator = stride_size; - - // Emulate the behavior of running ceil on numerator / denominator rather than floor - if (ceil_mode) { - numerator += denominator - 1; - } - - return numerator / denominator + 1; -} - template bool Pool2DRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { diff --git a/src/relay/op/nn/pooling.h b/src/relay/op/nn/pooling.h index 32ae464101ab..123cfcd07570 100644 --- a/src/relay/op/nn/pooling.h +++ b/src/relay/op/nn/pooling.h @@ -18,8 +18,8 @@ */ /*! - * \file src/relay/op/nn/convolution.h - * \brief Properties def of convlution operator for sharing. + * \file src/relay/op/nn/pooling.h + * \brief utilities for creating pool ops */ #ifndef TVM_RELAY_OP_NN_POOLING_H_ #define TVM_RELAY_OP_NN_POOLING_H_ diff --git a/src/relay/op/nn/pooling_common.h b/src/relay/op/nn/pooling_common.h new file mode 100644 index 000000000000..1193d36ebe88 --- /dev/null +++ b/src/relay/op/nn/pooling_common.h @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file src/relay/op/nn/pooling_common.h + * \brief Common functions for pooling operator definition. + */ +#ifndef TVM_RELAY_OP_NN_POOLING_COMMON_H_ +#define TVM_RELAY_OP_NN_POOLING_COMMON_H_ + +#include +#include +#include + +#include +#include +#include + +#include "../op_common.h" + +namespace tvm { +namespace relay { + +inline IndexExpr calculate_pool_dimension(IndexExpr in_dimension, IndexExpr pad_amount, + IndexExpr pool_size, IndexExpr dilation, + IndexExpr stride_size, bool ceil_mode) { + IndexExpr numerator = in_dimension + pad_amount - ((pool_size - 1) * dilation + 1); + IndexExpr denominator = stride_size; + + // Emulate the behavior of running ceil on numerator / denominator rather than floor + if (ceil_mode) { + numerator += denominator - 1; + } + + return numerator / denominator + 1; +} + +template +InferCorrectLayoutOutput PoolInferCorrectLayout(const Attrs& attrs, + const Array& new_in_layouts, + const Array& old_in_layouts, + const Array& old_in_types) { + const auto* attrs_ptr = attrs.as(); + ICHECK(attrs_ptr); + ObjectPtr params = make_object(*attrs_ptr); + + if (params->out_layout != "") { + // when users specify the out_layout of pooling, follow user's preference + ICHECK_EQ(params->layout, params->out_layout) + << "Pooling input/output layouts mismatch: " << params->layout << " vs. " + << params->out_layout; + } else if (new_in_layouts.defined()) { + // the pooling is using an inferred layout (i.e., new_in_layouts[0]) given by relay caller + // ICHECK_EQ(new_in_layouts.size(), 1); + params->layout = new_in_layouts[0].name(); + } + + return InferCorrectLayoutOutput({params->layout}, {params->layout}, Attrs(params)); +} +} // namespace relay +} // namespace tvm +#endif // TVM_RELAY_OP_NN_POOLING_COMMON_H_ diff --git a/src/relay/qnn/op/avg_pool2d.cc b/src/relay/qnn/op/avg_pool2d.cc new file mode 100644 index 000000000000..b2dc08b85686 --- /dev/null +++ b/src/relay/qnn/op/avg_pool2d.cc @@ -0,0 +1,223 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file src/relay/qnn/op/avg_pool2d.cc + * \brief Quantized avg_pool2d operator + */ + +#include +#include +#include +#include +#include +#include + +#include "../../op/nn/nn.h" +#include "../../op/nn/pooling.h" +#include "../../op/nn/pooling_common.h" +#include "../../op/tensor/transform.h" +#include "../../transforms/infer_layout_utils.h" +#include "../../transforms/pattern_utils.h" +#include "../utils.h" +#include "op_common.h" + +namespace tvm { +namespace relay { +namespace qnn { + +// relay.op.qnn.avg_pool2d +bool QnnAvgPool2DRel(const Array& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { + // Expected Types: data, input_zero_point, input_scale, output_zero_point, output_scale + // out_type + + ICHECK_EQ(types.size(), 6); + + const auto* data = types[0].as(); + if (data == nullptr) return false; + ICHECK(data->dtype == DataType::Int(8) || data->dtype == DataType::UInt(8)) + << "Expected quantized avg_pool2d type(int8, uint8) for input but was " << data->dtype; + + const auto* param = attrs.as(); + ICHECK(param != nullptr) << "AvgPool2DAttrs cannot be nullptr."; + + // Check the types of scale and zero points. + for (size_t i = 1; i < 5; ++i) { + if (types[i].as()) { + return false; + } + } + + ICHECK(IsScalarType(types[1], DataType::Float(32))); // input_scale + ICHECK(IsScalarType(types[2], DataType::Int(32))); // input_zero_point + ICHECK(IsScalarType(types[3], DataType::Float(32))); // output_scale + ICHECK(IsScalarType(types[4], DataType::Int(32))); // output_zero_point + + // Find the output shape and data type + const auto dshape = data->shape; + ICHECK_GE(dshape.size(), 2U) + << "Pool2D only support input >= 2-D: input must have height and width"; + + // Check input and output layout + Layout layout(param->layout); + // The Layout is always NHWC + ICHECK(layout.Contains(LayoutAxis::Get('H')) && layout.Contains(LayoutAxis::Get('W')) && + !layout.Contains(LayoutAxis::Get('h')) && !layout.Contains(LayoutAxis::Get('w'))) + << "Invalid input layout " << layout + << ". qnn_avg_pool2d inut layout must have H and W, which cannot be split"; + + // Find the output shape and data type + const auto hidx = layout.IndexOf(LayoutAxis::Get('H')); + const auto widx = layout.IndexOf(LayoutAxis::Get('W')); + + IndexExpr pad_h, pad_w; + if (param->padding.size() == 1) { + pad_h = param->padding[0] * 2; + pad_w = param->padding[0] * 2; + } else if (param->padding.size() == 2) { + // (top, left) + pad_h = param->padding[0] * 2; + pad_w = param->padding[1] * 2; + } else if (param->padding.size() == 4) { + // (top, left, bottom, right) + pad_h = param->padding[0] + param->padding[2]; + pad_w = param->padding[1] + param->padding[3]; + } else { + return false; + } + + std::vector oshape(dshape.begin(), dshape.end()); + if (dshape[hidx].as()) { + oshape[hidx] = dshape[hidx]; + } else { + oshape[hidx] = + calculate_pool_dimension(dshape[hidx], pad_h, param->pool_size[0], param->dilation[0], + param->strides[0], param->ceil_mode); + } + if (dshape[widx].as()) { + oshape[widx] = dshape[widx]; + } else { + oshape[widx] = + calculate_pool_dimension(dshape[widx], pad_w, param->pool_size[1], param->dilation[1], + param->strides[1], param->ceil_mode); + } + + // assign output type + reporter->Assign(types[5], TensorType(oshape, data->dtype)); + return true; +} + +InferCorrectLayoutOutput QnnAvgPoolInferCorrectLayout(const Attrs& attrs, + const Array& new_in_layouts, + const Array& old_in_layouts, + const Array& old_in_types) { + // Use Relay AvgPool2D Infer correct layout. 
+InferCorrectLayoutOutput QnnAvgPoolInferCorrectLayout(const Attrs& attrs,
+                                                      const Array<Layout>& new_in_layouts,
+                                                      const Array<Layout>& old_in_layouts,
+                                                      const Array<tvm::relay::Type>& old_in_types) {
+  // Use the Relay AvgPool2D layout inference.
+  auto avgpool_new_layouts =
+      PoolInferCorrectLayout<AvgPool2DAttrs>(attrs, new_in_layouts, old_in_layouts, old_in_types);
+
+  // Scales and zero points are scalars, use the "undef" layout for them.
+  Array<Layout> input_layouts = {avgpool_new_layouts->input_layouts[0], Layout::Undef(),
+                                 Layout::Undef(), Layout::Undef(), Layout::Undef()};
+  Array<Layout> output_layouts = avgpool_new_layouts->output_layouts;
+  return InferCorrectLayoutOutput(input_layouts, output_layouts, attrs);
+}
+
+/*
+ * \brief Forward rewrite the qnn avg_pool2d op.
+ * \param attrs The QNN avg_pool2d attrs.
+ * \param new_args The new mutated args to the call node.
+ * \param arg_types The types of input and output.
+ * \return The sequence of Relay ops for the qnn avg_pool2d op.
+ * \note Lowering of the qnn.avg_pool2d operator.
+ *
+ * Quantized avg_pool2d takes one quantized input tensor and returns another
+ * quantized tensor. Since the input qnn params can differ from the output
+ * qnn params, we first requantize the input tensor with the output qnn params and
+ * cast the result to Int32. Then we call relay.nn.avg_pool2d on the requantized
+ * input. Finally, the result is clipped and cast to the quantized output data type.
+ *
+ * Note: The RequantizeOrUpcast function only performs requantization if the input
+ * and output qnn params differ; otherwise it only casts to Int32.
+ */
+Expr QnnAvgPoolCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
+                            const Array<tvm::relay::Type>& arg_types) {
+  ICHECK_EQ(new_args.size(), 5);
+  Expr input_data = new_args[0];
+  Expr input_scale = new_args[1];
+  Expr input_zero_point = new_args[2];
+  Expr output_scale = new_args[3];
+  Expr output_zero_point = new_args[4];
+  const auto in_shape = get_shape(arg_types[0]);
+  const auto* avgpool_attrs = attrs.as<AvgPool2DAttrs>();
+  auto requantized_input = RequantizeOrUpcast(input_data, input_scale, input_zero_point,
+                                              output_scale, output_zero_point, in_shape);
+  Expr nn_avg = AvgPool2D(requantized_input, avgpool_attrs->pool_size, avgpool_attrs->strides,
+                          avgpool_attrs->dilation, avgpool_attrs->padding, avgpool_attrs->layout,
+                          avgpool_attrs->out_layout, avgpool_attrs->ceil_mode,
+                          avgpool_attrs->count_include_pad);
+
+  const auto* data = arg_types[5].as<TensorTypeNode>();
+  const int32_t min_val = GetQmin(data->dtype);
+  const int32_t max_val = GetQmax(data->dtype);
+  return Cast(Clip(nn_avg, min_val, max_val), data->dtype);
+}
+
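The canonicalization above decomposes qnn.avg_pool2d into existing Relay ops: requantize to the output qnn params (upcasting to int32), run the regular average pool, then clip and cast back to the quantized dtype. A rough Relay-Python sketch of the same decomposition (the function and its defaults are illustrative; the real rewrite is the C++ pass above):

```python
from tvm import relay


def decompose_qnn_avg_pool2d(data, input_scale, input_zero_point, output_scale, output_zero_point,
                             pool_size, strides, dilation, padding, layout="NHWC",
                             ceil_mode=False, count_include_pad=True, out_dtype="uint8"):
    """Sketch of the lowering: requantize to output qnn params, pool in int32, clip, cast."""
    # RequantizeOrUpcast in the C++ pass skips the requantize when the input and output
    # qnn params match and only upcasts to int32; here we always requantize for simplicity.
    requantized = relay.qnn.op.requantize(
        data, input_scale, input_zero_point, output_scale, output_zero_point, out_dtype="int32"
    )
    pooled = relay.nn.avg_pool2d(
        requantized, pool_size=pool_size, strides=strides, dilation=dilation, padding=padding,
        layout=layout, ceil_mode=ceil_mode, count_include_pad=count_include_pad,
    )
    # Clip to the quantized range of the output dtype (int8/uint8 are the only types accepted
    # by the type relation above), then cast back.
    qmin, qmax = (0, 255) if out_dtype == "uint8" else (-128, 127)
    return relay.cast(relay.clip(pooled, qmin, qmax), out_dtype)
```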
+// Positional relay function to create quantized avg_pool2d operator used by frontend FFI.
+Expr MakeQuantizedAvgPool2D(Expr data, Expr input_scale, Expr input_zero_point, Expr output_scale,
+                            Expr output_zero_point, Array<IndexExpr> pool_size,
+                            Array<IndexExpr> strides, Array<IndexExpr> padding,
+                            Array<IndexExpr> dilation, bool ceil_mode, bool count_include_pad,
+                            String layout, String output_layout) {
+  auto attrs = make_object<AvgPool2DAttrs>();
+  attrs->pool_size = std::move(pool_size);
+  attrs->strides = std::move(strides);
+  attrs->padding = std::move(padding);
+  attrs->dilation = std::move(dilation);
+  attrs->layout = std::move(layout);
+  attrs->out_layout = std::move(output_layout);
+  attrs->ceil_mode = ceil_mode;
+  attrs->count_include_pad = count_include_pad;
+  static const Op& op = Op::Get("qnn.avg_pool2d");
+  return Call(op, {data, input_scale, input_zero_point, output_scale, output_zero_point},
+              Attrs(attrs), {});
+}
+
+RELAY_REGISTER_OP("qnn.avg_pool2d")
+    .describe("Customized qnn_avg_pool2d for quantized tensors.")
+    .set_attrs_type<AvgPool2DAttrs>()
+    .set_num_inputs(5)
+    .add_argument("data", "Quantized Tensor", "The input data.")
+    .add_argument("input_scale", "Tensor", "The quantization scale of the input tensor.")
+    .add_argument("input_zero_point", "Tensor", "The quantization zero_point of the input tensor.")
+    .add_argument("output_scale", "Tensor", "The quantization scale of the output tensor.")
+    .add_argument("output_zero_point", "Tensor",
+                  "The quantization zero_point of the output tensor.")
+    .set_support_level(11)
+    .add_type_rel("QnnAvgPool2D", QnnAvgPool2DRel)
+    .set_attr<TOpPattern>("TOpPattern", kOutEWiseFusable)
+    .set_attr<FInferCorrectLayout>("FInferCorrectLayout", QnnAvgPoolInferCorrectLayout)
+    .set_attr<FTVMLegalize>("FTVMQnnCanonicalize", QnnAvgPoolCanonicalize);
+
+TVM_REGISTER_GLOBAL("relay.qnn.op._make.avg_pool2d").set_body_typed(MakeQuantizedAvgPool2D);
+
+}  // namespace qnn
+}  // namespace relay
+}  // namespace tvm
diff --git a/tests/python/contrib/test_hexagon/test_qnn_op_integration.py b/tests/python/contrib/test_hexagon/test_qnn_op_integration.py
new file mode 100644
index 000000000000..8cff4ed626bc
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/test_qnn_op_integration.py
@@ -0,0 +1,576 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name
+
+"""Tests for QNN operations on Hexagon"""
+
+import numpy as np
+
+import tvm.testing
+import tvm.topi.testing
+from tvm import relay
+from tvm.contrib.hexagon.session import Session
+from tvm.contrib.hexagon.pytest_plugin import HEXAGON_AOT_LLVM_TARGET
+from tvm.relay.backend import Executor
+from tvm.relay.testing import run_opt_pass, run_infer_type
+
+from .infrastructure import quantize_np
+
+
+@tvm.testing.requires_hexagon
+def test_disable_qnn_legalize_pass():
+    """No QNN pass test."""
+    x = relay.var("x", shape=(4, 8), dtype="float32")
+    op0 = relay.qnn.op.quantize(x, relay.const(2.0), relay.const(10), out_dtype="uint8")
+    op1 = relay.qnn.op.dequantize(op0, relay.const(0.5), relay.const(5))
+    relay_mod = tvm.IRModule.from_expr(op1)
+
+    target_hexagon = tvm.target.hexagon("v68")
+    # Default compilation flow
+    with tvm.transform.PassContext(opt_level=3):
+        opt_with_legalize, _ = relay.optimize(
+            relay_mod, tvm.target.Target(target_hexagon, host=target_hexagon)
+        )
+
+    # Disable QNN legalization and canonicalization passes
+    with tvm.transform.PassContext(opt_level=3, disabled_pass=["qnn.Legalize"]):
+        opt_without_legalize, _ = relay.optimize(
+            relay_mod, tvm.target.Target(target_hexagon, host=target_hexagon)
+        )
+
+    # Check that QNN ops are absent with default compilation flow.
+ text_with_legalize = opt_with_legalize.astext(show_meta_data=False) + assert "qnn.quantize" not in text_with_legalize and "qnn.dequantize" not in text_with_legalize + + # Check that QNN ops are present without "qnn.Legalize" passes. + text_without_legalize = opt_without_legalize.astext(show_meta_data=False) + assert "qnn.quantize" in text_without_legalize and "qnn.dequantize" in text_without_legalize + + +def build_hexagon_module(relay_mod): + with tvm.transform.PassContext(opt_level=3, disabled_pass=["QnnCanonicalize"]): + exe_mod = tvm.relay.build( + relay_mod, + tvm.target.Target(HEXAGON_AOT_LLVM_TARGET, host=HEXAGON_AOT_LLVM_TARGET), + executor=Executor("aot"), + ) + + return exe_mod + + +def build_ref_module(relay_mod): + target_llvm = tvm.target.Target("llvm") + with tvm.transform.PassContext(opt_level=3): + exe_mod = tvm.relay.build( + relay_mod, tvm.target.Target(target_llvm, host=target_llvm), executor=Executor("aot") + ) + return exe_mod + + +def execute(mod_executor, inputs: dict): + for input_name, input_data in inputs.items(): + mod_executor.set_input(input_name, input_data) + mod_executor.run() + return [mod_executor.get_output(i).numpy() for i in range(mod_executor.get_num_outputs())] + + +def execute_on_hexagon(hexagon_session, exe_mod, inputs: dict): + return execute(hexagon_session.get_executor_from_factory(exe_mod), inputs) + + +def execute_on_cpu(exe_mod, inputs: dict): + return execute(tvm.runtime.executor.AotModule(exe_mod["default"](tvm.cpu(0))), inputs) + + +def assert_allclose(actuals, desireds, rtol=1e-07, atol=0.01): + return [tvm.testing.assert_allclose(a, d, rtol, atol) for a, d in zip(actuals, desireds)] + + +def run_and_compare(hexagon_session, relay_mod, inputs, rtol=None, atol=None): + """Compile and execute given relay module on CPU and Hexagon, and compare + results""" + hexagon_mod = build_hexagon_module(relay_mod) + cpu_mod = build_ref_module(relay_mod) + + hexagon_outs = execute_on_hexagon(hexagon_session, hexagon_mod, inputs) + cpu_outs = execute_on_cpu(cpu_mod, inputs) + + # Do not pass rtol/atol if not present to use default values from assert_allclose + tolerances = dict() + if rtol is not None: + tolerances["rtol"] = rtol + if atol is not None: + tolerances["atol"] = atol + + assert_allclose(hexagon_outs, cpu_outs, **tolerances) + + +# First test basic QNN ops: quantize, dequantize, requantize +# +class TestQnnQuantize: + """QNN Quantize test class.""" + + input_shape = tvm.testing.parameter([1, 8, 8, 32], [1, 10, 10, 32], [1, 12, 12, 128]) + odtype = tvm.testing.parameter("int8", "uint8") + + @tvm.testing.requires_hexagon + def test_qnn_quantize(self, hexagon_session: Session, odtype, input_shape): + """Test qnn.quantize""" + + def gen_relay_expr_qnn(output_scale, output_zero_point): + data = relay.var("data", shape=input_shape, dtype="float32") + qnn_quantize = relay.qnn.op.quantize( + data, + output_scale=relay.const(output_scale), + output_zero_point=relay.const(output_zero_point), + axis=-1, + out_dtype=odtype, + ) + return qnn_quantize + + inputs = {"data": np.random.random(input_shape)} + # Use quantize_np to obtain reasonable quantization parameters. 
+ ref_out, scale, zero_point = quantize_np(inputs["data"], odtype) + + relay_mod = tvm.IRModule.from_expr(gen_relay_expr_qnn(scale, zero_point)) + + hexagon_mod = build_hexagon_module(relay_mod) + hexagon_outs = execute_on_hexagon(hexagon_session, hexagon_mod, inputs) + assert_allclose(hexagon_outs, [ref_out], atol=1) + + +class TestQnnDequantize: + """QNN Dequantize test class.""" + + input_shape = tvm.testing.parameter( + [1, 12, 32, 128], [1, 10, 10, 32], [1, 6, 6, 2048], [1, 1000] + ) + idtype = tvm.testing.parameter("int8", "uint8") + + @tvm.testing.requires_hexagon + def test_qnn_dequantize(self, hexagon_session: Session, idtype, input_shape): + """Test qnn.dequantize""" + + def gen_relay_expr_qnn(dtype, input_scale, input_zero_point): + data = relay.var("data", shape=input_shape, dtype=dtype) + qnn_dequantize = relay.qnn.op.dequantize( + data, + input_scale=relay.const(input_scale), + input_zero_point=relay.const(input_zero_point), + ) + return qnn_dequantize + + # Generate float data, then quantize it to produce input. + ref_out = np.random.random(input_shape) + data, scale, zero_point = quantize_np(ref_out, idtype) + inputs = {"data": data} + + relay_mod = tvm.IRModule.from_expr(gen_relay_expr_qnn(idtype, scale, zero_point)) + + hexagon_mod = build_hexagon_module(relay_mod) + hexagon_outs = execute_on_hexagon(hexagon_session, hexagon_mod, inputs) + # We do + # original -[quantize]-> input -[dequantize]-> output + # then compare "original" with "output". Use rtol=1 because of the quantized + # format in the middle. + assert_allclose(hexagon_outs, [ref_out], rtol=1, atol=1e-2) # rtol = 1 + + +class TestQnnRequantize: + """QNN requantize test class""" + + @tvm.testing.requires_hexagon + def test_qnn_requantize(self, hexagon_session: Session): + """Test qnn.requantize""" + data_shape = [256] + data = relay.var("data", shape=data_shape, dtype="int32") + + op = relay.qnn.op.requantize( + data, + input_scale=relay.const(0.156), + input_zero_point=relay.const(2), + output_scale=relay.const(0.212), + output_zero_point=relay.const(1), + out_dtype="int8", + ) + relay_mod = tvm.IRModule.from_expr(op) + + inputs = {"data": np.arange(-256, 256, 2, dtype="int32")} + + run_and_compare(hexagon_session, relay_mod, inputs, rtol=0, atol=0) # equal + + +class TestQnnAvgPool2d: + """QNN AvgPool2d test class.""" + + _multitest_params = [ + ([1, 12, 12, 32], "NHWC", [3, 3], [1, 1], [2, 3], [1, 2, 3, 4], False, False), + ([1, 18, 18, 32], "NCHW", [3, 3], [2, 2], [2, 1], [1, 2, 3, 4], False, True), + ] + + ( + input_shape, + layout, + kernel, + stride, + dilation, + padding, + ceil_mode, + count_include_pad, + ) = tvm.testing.parameters(*_multitest_params) + + idtype, odtype = tvm.testing.parameters(("uint8", "uint8")) + + @tvm.testing.requires_hexagon + def test_qnn_avg_pool2d( + self, + hexagon_session: Session, + idtype, + odtype, + input_shape, + kernel, + stride, + dilation, + padding, + ceil_mode, + count_include_pad, + layout, + ): + """Test qnn.avg_pool2d""" + + def gen_relay_expr_qnn( + dtype, input_scale, input_zero_point, output_scale, output_zero_point + ): + data = relay.var("data", shape=input_shape, dtype=dtype) + qnn_avg_pool = relay.qnn.op.avg_pool2d( + data, + input_scale=relay.const(input_scale), + input_zero_point=relay.const(input_zero_point), + output_scale=relay.const(output_scale), + output_zero_point=relay.const(output_zero_point), + pool_size=kernel, + strides=stride, + dilation=dilation, + padding=padding, + ceil_mode=ceil_mode, + count_include_pad=count_include_pad, + 
layout=layout, + ) + + return qnn_avg_pool + + # Generate inputs and reference data first. + fp_input = np.random.random(input_shape) + fp_output = tvm.topi.testing.poolnd_python( + fp_input, + kernel, + stride, + dilation, + padding_before=padding[:2], + padding_after=padding[2:], + pool_type="avg", + count_include_pad=count_include_pad, + ceil_mode=ceil_mode, + layout=layout, + ) + input_data, input_scale, input_zero_point = quantize_np(fp_input, idtype) + ref_out, output_scale, output_zero_point = quantize_np(fp_output, odtype) + inputs = {"data": input_data} + + relay_mod = tvm.IRModule.from_expr( + gen_relay_expr_qnn( + idtype, input_scale, input_zero_point, output_scale, output_zero_point + ) + ) + + hexagon_mod = build_hexagon_module(relay_mod) + hexagon_outs = execute_on_hexagon(hexagon_session, hexagon_mod, inputs) + assert_allclose(hexagon_outs, [ref_out], rtol=0, atol=2) + + +class TestQnnBinaryOp: + """QNN binary op test class""" + + operation = tvm.testing.parameter(relay.qnn.op.add, relay.qnn.op.subtract, relay.qnn.op.mul) + dtype = tvm.testing.parameter("uint8", "int8") + input_shape = tvm.testing.parameter([256], [4, 256]) + + @tvm.testing.requires_hexagon + def test_qnn_binary_op(self, hexagon_session: Session, operation, dtype, input_shape): + """Test binary qnn ops""" + lhs_shape = [4, 256] + rhs_shape = input_shape + lhs = relay.var("lhs", shape=lhs_shape, dtype=dtype) + rhs = relay.var("rhs", shape=rhs_shape, dtype=dtype) + lhs_zp = 1 + rhs_zp = 3 + + op = operation( + lhs, + rhs, + lhs_scale=relay.const(0.041, "float32"), + lhs_zero_point=relay.const(lhs_zp, "int32"), + rhs_scale=relay.const(0.017, "float32"), + rhs_zero_point=relay.const(rhs_zp, "int32"), + output_scale=relay.const(0.039, "float32"), + output_zero_point=relay.const(2, "int32"), + ) + relay_mod = tvm.IRModule.from_expr(op) + + inputs = { + "lhs": np.random.randint(np.iinfo(dtype).min + lhs_zp, np.iinfo(dtype).max, lhs_shape), + "rhs": np.random.randint(np.iinfo(dtype).min + rhs_zp, np.iinfo(dtype).max, rhs_shape), + } + + run_and_compare(hexagon_session, relay_mod, inputs, atol=1) # diff by 1 is ok + + @tvm.testing.requires_hexagon + def test_qnn_binary_op_broadcasting(self, hexagon_session: Session, operation): + """Test binary qnn ops (with argument broadcast)""" + lhs_shape = [4, 256] + lhs = relay.var("lhs", shape=lhs_shape, dtype="uint8") + rhs = relay.const(11, dtype="uint8") + + op = operation( + lhs, + rhs, + lhs_scale=relay.const(0.049, "float32"), + lhs_zero_point=relay.const(1, "int32"), + rhs_scale=relay.const(0.067, "float32"), + rhs_zero_point=relay.const(3, "int32"), + output_scale=relay.const(0.041, "float32"), + output_zero_point=relay.const(2, "int32"), + ) + relay_mod = tvm.IRModule.from_expr(op) + + inputs = {"lhs": np.random.randint(1, 255, size=lhs_shape)} + + run_and_compare(hexagon_session, relay_mod, inputs, atol=1) # diff by 1 is ok + + +class TestQnnConcatenate: + """QNN concatenate test class""" + + @tvm.testing.requires_hexagon + def test_qnn_concatenate(self, hexagon_session: Session): + """Test qnn.concatenate""" + x_shape = [1, 64] + y_shape = [2, 64] + z_shape = [3, 64] + input_x = relay.var("x", shape=x_shape, dtype="uint8") + input_y = relay.var("y", shape=y_shape, dtype="uint8") + input_z = relay.var("z", shape=z_shape, dtype="uint8") + + op = relay.qnn.op.concatenate( + (input_x, input_y, input_z), + input_scales=(relay.const(0.3), relay.const(0.7), relay.const(1.3)), + input_zero_points=(relay.const(0), relay.const(1), relay.const(2)), + 
output_scale=relay.const(0.8), + output_zero_point=relay.const(5), + axis=0, + ) + relay_mod = tvm.IRModule.from_expr(op) + + inputs = { + "x": np.arange(0, 64, 1, dtype="uint8").reshape(x_shape), + "y": np.arange(0, 128, 1, dtype="uint8").reshape(y_shape), + "z": np.arange(0, 192, 1, dtype="uint8").reshape(z_shape), + } + + run_and_compare(hexagon_session, relay_mod, inputs, atol=1) # diff by 1 is ok + + +class TestQnnConv2D: + """QNN conv2d op test class.""" + + @tvm.testing.requires_hexagon + def test_qnn_quantize_conv2d_requantize(self, hexagon_session: Session): + """Tast qnn.conv2d""" + data_shape = [1, 8, 32, 32] + weight_shape = [16, 8, 3, 3] + data = relay.var("data", shape=data_shape, dtype="float32") + weight = relay.var("weight", shape=weight_shape, dtype="float32") + op0 = relay.qnn.op.quantize(data, relay.const(0.078), relay.const(0), out_dtype="uint8") + op1 = relay.qnn.op.quantize(weight, relay.const(0.07), relay.const(0), out_dtype="int8") + op2 = relay.qnn.op.conv2d( + op0, + op1, + input_zero_point=relay.const(0), + kernel_zero_point=relay.const(0), + input_scale=relay.const(0.078), + kernel_scale=relay.const(0.07), + padding=[0, 0, 0, 0], + channels=16, + kernel_size=[3, 3], + ) + op5 = relay.qnn.op.requantize( + op2, + input_scale=relay.const(0.05), + input_zero_point=relay.const(0), + output_scale=relay.const(0.21), + output_zero_point=relay.const(61), + out_dtype="int8", + ) + relay_mod = tvm.IRModule.from_expr(op5) + + inputs = { + "data": np.random.rand(*data_shape), + "weight": np.random.rand(*weight_shape) - 0.5, + } + + run_and_compare(hexagon_session, relay_mod, inputs, rtol=0, atol=0) # equal + + +class TestQnnDense: + """QNN dense op test class.""" + + @tvm.testing.requires_hexagon + def test_alter_layout_qnn_dense(self): + """Test weights layout transformation of qnn.dense with int8 weights""" + data = relay.var("data", shape=(128, 16), dtype="uint8") + weight = relay.var("weight", shape=(64, 16), dtype="int8") + zero = relay.const(0) + iscale = relay.const(0.15) + wscale = relay.const(0.37) + + def before(): + return relay.qnn.op.dense(data, weight, zero, zero, iscale, wscale, units=None) + + def expected(): + op0 = relay.layout_transform(weight, src_layout="NC", dst_layout="NC32n4c") + return relay.qnn.op.contrib_dense_pack(data, op0, zero, zero, iscale, wscale, "NC32n4c") + + target = tvm.target.hexagon("v68") + with tvm.target.Target(target): + a = run_opt_pass(before(), tvm.relay.transform.AlterOpLayout()) + b = run_infer_type(expected()) + tvm.ir.assert_structural_equal(a, b) + + # Dense + bias_add + requantize + # + dtype = tvm.testing.parameter("uint8", "int8") + n_dim = tvm.testing.parameter(64, 60) + + @tvm.testing.requires_hexagon + def test_qnn_dense_biasadd_requantize(self, hexagon_session: Session, dtype, n_dim): + """Check lowering of qnn.dense + bias_add + qnn.requantize + dtype: type of weights + n_dim: N dimension of weights, need to check cases when it is multiple of 32 and not. 
+ """ + data_shape = [128, 32] + weight_shape = [n_dim, 32] + bias_shape = [n_dim] + data = relay.var("data", shape=data_shape, dtype="uint8") + weight = relay.var("weight", shape=weight_shape, dtype=dtype) + bias = relay.var("bias", shape=bias_shape, dtype="int32") + + op0 = relay.qnn.op.dense( + data, + weight, + input_zero_point=relay.const(2), + kernel_zero_point=relay.const(0), + input_scale=relay.const(0.08), + kernel_scale=relay.const(0.07), + units=None, + ) + op1 = relay.nn.bias_add(op0, bias) + op2 = relay.qnn.op.requantize( + op1, + input_scale=relay.const(1.3), + input_zero_point=relay.const(4), + output_scale=relay.const(3.7), + output_zero_point=relay.const(1), + out_dtype="uint8", + ) + relay_mod = tvm.IRModule.from_expr(op2) + + np.random.seed(0) + + inputs = { + "data": np.random.randint(2, 8, size=data_shape, dtype="uint8"), + "weight": np.random.randint(0, 8, size=weight_shape, dtype=dtype), + "bias": np.random.randint(-10, 10, size=bias_shape, dtype="int32"), + } + + run_and_compare(hexagon_session, relay_mod, inputs, atol=1) # diff by 1 is ok + + # Dense + requantize + # + @tvm.testing.requires_hexagon + def test_qnn_dense_requantize(self, hexagon_session: Session): + """Check lowering of qnn.dense + qnn.requantize + Checkint the case: data type = "uint8", weight type = "int8", input zp = 0 and kernel zp = 0 + """ + data_shape = [128, 32] + weight_shape = [64, 32] + data = relay.var("data", shape=data_shape, dtype="uint8") + weight = relay.var("weight", shape=weight_shape, dtype="int8") + + op0 = relay.qnn.op.dense( + data, + weight, + input_zero_point=relay.const(0), + kernel_zero_point=relay.const(0), + input_scale=relay.const(0.06), + kernel_scale=relay.const(0.19), + units=64, + ) + op1 = relay.qnn.op.requantize( + op0, + input_scale=relay.const(0.1), + input_zero_point=relay.const(0), + output_scale=relay.const(0.24), + output_zero_point=relay.const(64), + out_dtype="uint8", + ) + relay_mod = tvm.IRModule.from_expr(op1) + + np.random.seed(0) + + inputs = { + "data": np.random.randint(0, 8, size=data_shape, dtype="uint8"), + "weight": np.random.randint(-4, 4, size=weight_shape, dtype="int8"), + } + + run_and_compare(hexagon_session, relay_mod, inputs, atol=1) # diff by 1 is ok + + +class TestQnnTanh: + """QNN tanh test class""" + + @tvm.testing.requires_hexagon + def test_qnn_tanh(self, hexagon_session: Session): + """Test qnn.tanh""" + data_shape = [256] + data = relay.var("data", shape=data_shape, dtype="uint8") + + op = relay.qnn.op.tanh( + data, + scale=relay.const(0.518), + zero_point=relay.const(137), + output_scale=relay.const(0.207), + output_zero_point=relay.const(128), + ) + relay_mod = tvm.IRModule.from_expr(op) + + inputs = {"data": np.arange(0, 256, 1, dtype="uint8")} + + run_and_compare(hexagon_session, relay_mod, inputs, rtol=0, atol=0) # equal + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/python/contrib/test_hexagon/test_wo_qnn_canonicalization.py b/tests/python/contrib/test_hexagon/test_wo_qnn_canonicalization.py deleted file mode 100644 index f4342f5814df..000000000000 --- a/tests/python/contrib/test_hexagon/test_wo_qnn_canonicalization.py +++ /dev/null @@ -1,475 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""No QNN canonicalization tests.""" - -import numpy as np - -import tvm.testing -from tvm import relay -from tvm.contrib.hexagon.session import Session -from tvm.contrib.hexagon.pytest_plugin import HEXAGON_AOT_LLVM_TARGET -from tvm.relay.backend import Executor -from tvm.relay.testing import run_opt_pass, run_infer_type - - -@tvm.testing.requires_hexagon -def test_no_qnn_pass(): - """No QNN pass test.""" - x = relay.var("x", shape=(4, 8), dtype="float32") - op0 = relay.qnn.op.quantize(x, relay.const(2.0), relay.const(10), out_dtype="uint8") - op1 = relay.qnn.op.dequantize(op0, relay.const(0.5), relay.const(5)) - mod = tvm.IRModule.from_expr(op1) - - target_hexagon = tvm.target.hexagon("v68") - # Default compilation flow - with tvm.transform.PassContext(opt_level=3): - opt_mod_1, _ = relay.optimize(mod, tvm.target.Target(target_hexagon, host=target_hexagon)) - - # Disable QNN legalization and canonicalization passes - with tvm.transform.PassContext(opt_level=3, disabled_pass=["qnn.Legalize"]): - opt_mod_2, _ = relay.optimize(mod, tvm.target.Target(target_hexagon, host=target_hexagon)) - - # Check that QNN ops are absent with default compilation flow. - assert "qnn.quantize" not in opt_mod_1.astext(show_meta_data=False) - assert "qnn.dequantize" not in opt_mod_1.astext(show_meta_data=False) - - # Check that QNN ops are present without "qnn.Legalize" passes. 
- assert "qnn.quantize" in opt_mod_2.astext(show_meta_data=False) - assert "qnn.dequantize" in opt_mod_2.astext(show_meta_data=False) - - -def test_alter_layout_qnn_dense(): - """Test weights layout transformation of qnn.dense with int8 weights""" - data = relay.var("data", shape=(128, 16), dtype="uint8") - weight = relay.var("weight", shape=(64, 16), dtype="int8") - zero = relay.const(0) - iscale = relay.const(0.15) - wscale = relay.const(0.37) - - def before(): - return relay.qnn.op.dense(data, weight, zero, zero, iscale, wscale, units=None) - - def expected(): - op0 = relay.layout_transform(weight, src_layout="NC", dst_layout="NC32n4c") - return relay.qnn.op.contrib_dense_pack(data, op0, zero, zero, iscale, wscale, "NC32n4c") - - target = tvm.target.hexagon("v68") - with tvm.target.Target(target): - a = run_opt_pass(before(), tvm.relay.transform.AlterOpLayout()) - b = run_infer_type(expected()) - tvm.ir.assert_structural_equal(a, b) - - -def execute(mod_executor, inputs: dict): - for input_name, input_data in inputs.items(): - mod_executor.set_input(input_name, input_data) - mod_executor.run() - return mod_executor.get_output(0).numpy() - - -def build_hexagon_module(mod): - with tvm.transform.PassContext(opt_level=3, disabled_pass=["QnnCanonicalize"]): - hexagon_lowered = tvm.relay.build( - mod, - tvm.target.Target(HEXAGON_AOT_LLVM_TARGET, host=HEXAGON_AOT_LLVM_TARGET), - executor=Executor("aot"), - ) - - return hexagon_lowered - - -def build_ref_module(mod): - target_llvm = tvm.target.Target("llvm") - with tvm.transform.PassContext(opt_level=3): - llvm_lowered = tvm.relay.build( - mod, - tvm.target.Target(target_llvm, host=target_llvm), - executor=Executor("aot"), - ) - return llvm_lowered - - -@tvm.testing.requires_hexagon -def test_qnn_conv2d_rq(hexagon_session: Session): - """QNN conv2d test.""" - data_shape = [1, 8, 32, 32] - weight_shape = [16, 8, 3, 3] - data = relay.var("data", shape=data_shape, dtype="float32") - weight = relay.var("weight", shape=weight_shape, dtype="float32") - op0 = relay.qnn.op.quantize(data, relay.const(0.078), relay.const(0), out_dtype="uint8") - op1 = relay.qnn.op.quantize(weight, relay.const(0.07), relay.const(0), out_dtype="int8") - op2 = relay.qnn.op.conv2d( - op0, - op1, - input_zero_point=relay.const(0), - kernel_zero_point=relay.const(0), - input_scale=relay.const(0.078), - kernel_scale=relay.const(0.07), - padding=[0, 0, 0, 0], - channels=16, - kernel_size=[3, 3], - ) - op5 = relay.qnn.op.requantize( - op2, - input_scale=relay.const(0.05), - input_zero_point=relay.const(0), - output_scale=relay.const(0.21), - output_zero_point=relay.const(61), - out_dtype="int8", - ) - relay_mod = tvm.IRModule.from_expr(op5) - - # Compile for Hexagon - hexagon_lowered = build_hexagon_module(relay_mod) - - # Reference compilation - llvm_lowered = build_ref_module(relay_mod) - - data_np = np.random.rand(*data_shape) - weight_np = np.random.rand(*weight_shape) - 0.5 - inputs = {"data": data_np, "weight": weight_np} - - hx_m = hexagon_session.get_executor_from_factory(hexagon_lowered) - hexagon_output = execute(hx_m, inputs) - - dev = tvm.cpu(0) - llvm_m = tvm.runtime.executor.AotModule(llvm_lowered["default"](dev)) - llvm_out = execute(llvm_m, inputs) - - np.testing.assert_equal(hexagon_output, llvm_out) - - -class TestQnnDense: - """QNN dense op test class.""" - - dtype = tvm.testing.parameter("uint8", "int8") - n_dim = tvm.testing.parameter(64, 60) - - @tvm.testing.requires_hexagon - def test_qnn_dense_add_requantize(self, hexagon_session: Session, dtype, n_dim): - 
"""Check lowering of qnn.dense + bias_add + qnn.requantize - dtype: type of weights - n_dim: N dimension of weights, need to check cases when it is multiple of 32 and not. - """ - data_shape = [128, 32] - weight_shape = [n_dim, 32] - bias_shape = [n_dim] - data = relay.var("data", shape=data_shape, dtype="uint8") - weight = relay.var("weight", shape=weight_shape, dtype=dtype) - bias = relay.var("bias", shape=bias_shape, dtype="int32") - - op0 = relay.qnn.op.dense( - data, - weight, - input_zero_point=relay.const(2), - kernel_zero_point=relay.const(0), - input_scale=relay.const(0.08), - kernel_scale=relay.const(0.07), - units=None, - ) - op1 = relay.nn.bias_add(op0, bias) - op2 = relay.qnn.op.requantize( - op1, - input_scale=relay.const(1.3), - input_zero_point=relay.const(4), - output_scale=relay.const(3.7), - output_zero_point=relay.const(1), - out_dtype="uint8", - ) - relay_mod = tvm.IRModule.from_expr(op2) - - # Compile for Hexagon - hexagon_lowered = build_hexagon_module(relay_mod) - - # Reference compilation - llvm_lowered = build_ref_module(relay_mod) - - np.random.seed(0) - - data_np = np.random.randint(2, 8, size=data_shape, dtype="uint8") - weight_np = np.random.randint(0, 8, size=weight_shape, dtype=dtype) - bias_np = np.random.randint(-10, 10, size=bias_shape, dtype="int32") - inputs = {"data": data_np, "weight": weight_np, "bias": bias_np} - - hx_m = hexagon_session.get_executor_from_factory(hexagon_lowered) - hexagon_output = execute(hx_m, inputs) - - llvm_m = tvm.runtime.executor.AotModule(llvm_lowered["default"](tvm.cpu(0))) - llvm_out = execute(llvm_m, inputs) - - # Diff by 1 is Ok. - tvm.testing.assert_allclose(hexagon_output, llvm_out, atol=1) - - @tvm.testing.requires_hexagon - def test_qnn_dense_requantize(self, hexagon_session: Session): - """Check lowering of qnn.dense + qnn.requantize - Checkint the case: data type = "uint8", weight type = "int8", input zp = 0 and kernel zp = 0 - """ - data_shape = [128, 32] - weight_shape = [64, 32] - data = relay.var("data", shape=data_shape, dtype="uint8") - weight = relay.var("weight", shape=weight_shape, dtype="int8") - - op0 = relay.qnn.op.dense( - data, - weight, - input_zero_point=relay.const(0), - kernel_zero_point=relay.const(0), - input_scale=relay.const(0.06), - kernel_scale=relay.const(0.19), - units=64, - ) - op1 = relay.qnn.op.requantize( - op0, - input_scale=relay.const(0.1), - input_zero_point=relay.const(0), - output_scale=relay.const(0.24), - output_zero_point=relay.const(64), - out_dtype="uint8", - ) - relay_mod = tvm.IRModule.from_expr(op1) - - # Compile for Hexagon - hexagon_lowered = build_hexagon_module(relay_mod) - - # Reference compilation - llvm_lowered = build_ref_module(relay_mod) - - np.random.seed(0) - - data_np = np.random.randint(0, 8, size=data_shape, dtype="uint8") - weight_np = np.random.randint(-4, 4, size=weight_shape, dtype="int8") - inputs = {"data": data_np, "weight": weight_np} - - hx_m = hexagon_session.get_executor_from_factory(hexagon_lowered) - hexagon_output = execute(hx_m, inputs) - - llvm_m = tvm.runtime.executor.AotModule(llvm_lowered["default"](tvm.cpu(0))) - llvm_out = execute(llvm_m, inputs) - - # Diff by 1 is Ok. 
- tvm.testing.assert_allclose(hexagon_output, llvm_out, atol=1) - - -class TestQnnBinaryOp: - """QNN binary op test class""" - - operation = tvm.testing.parameter( - relay.qnn.op.add, - relay.qnn.op.subtract, - relay.qnn.op.mul, - ) - dtype = tvm.testing.parameter("uint8", "int8") - input_shape = tvm.testing.parameter([256], [4, 256]) - - @tvm.testing.requires_hexagon - def test_qnn_binary_op_broadcasting( - self, hexagon_session: Session, operation, dtype, input_shape - ): - """qnn binary op test without QNN canonicalization.""" - lhs_shape = [4, 256] - rhs_shape = input_shape - lhs = relay.var("lhs", shape=lhs_shape, dtype=dtype) - rhs = relay.var("rhs", shape=rhs_shape, dtype=dtype) - zp_const1 = 1 - zp_const2 = 3 - - op = operation( - lhs, - rhs, - lhs_scale=relay.const(0.041, "float32"), - lhs_zero_point=relay.const(zp_const1, "int32"), - rhs_scale=relay.const(0.017, "float32"), - rhs_zero_point=relay.const(zp_const2, "int32"), - output_scale=relay.const(0.039, "float32"), - output_zero_point=relay.const(2, "int32"), - ) - mod = tvm.IRModule.from_expr(op) - - # Compile for Hexagon - hexagon_lowered = build_hexagon_module(mod) - - # Reference compilation - llvm_lowered = build_ref_module(mod) - - lhs_np = np.random.randint(np.iinfo(dtype).min + zp_const1, np.iinfo(dtype).max, lhs_shape) - rhs_np = np.random.randint(np.iinfo(dtype).min + zp_const2, np.iinfo(dtype).max, rhs_shape) - inputs = {"lhs": lhs_np, "rhs": rhs_np} - - hx_m = hexagon_session.get_executor_from_factory(hexagon_lowered) - hexagon_output = execute(hx_m, inputs) - - dev = tvm.cpu(0) - llvm_m = tvm.runtime.executor.AotModule(llvm_lowered["default"](dev)) - llvm_output = execute(llvm_m, inputs) - - # Diff by 1 is Ok. - tvm.testing.assert_allclose(hexagon_output, llvm_output, atol=1) - - @tvm.testing.requires_hexagon - def test_qnn_binary_op_scalar(self, hexagon_session: Session, operation): - """qnn binary op test without QNN canonicalization.""" - lhs_shape = [4, 256] - lhs = relay.var("lhs", shape=lhs_shape, dtype="uint8") - rhs = relay.const(11, dtype="uint8") - - op = operation( - lhs, - rhs, - lhs_scale=relay.const(0.049, "float32"), - lhs_zero_point=relay.const(1, "int32"), - rhs_scale=relay.const(0.067, "float32"), - rhs_zero_point=relay.const(3, "int32"), - output_scale=relay.const(0.041, "float32"), - output_zero_point=relay.const(2, "int32"), - ) - mod = tvm.IRModule.from_expr(op) - - # Compile for Hexagon - hexagon_lowered = build_hexagon_module(mod) - - # Reference compilation - llvm_lowered = build_ref_module(mod) - - lhs_np = np.random.randint(1, 255, size=lhs_shape) - inputs = {"lhs": lhs_np} - - hx_m = hexagon_session.get_executor_from_factory(hexagon_lowered) - hexagon_output = execute(hx_m, inputs) - - dev = tvm.cpu(0) - llvm_m = tvm.runtime.executor.AotModule(llvm_lowered["default"](dev)) - llvm_output = execute(llvm_m, inputs) - - # Diff by 1 is Ok. 
- tvm.testing.assert_allclose(hexagon_output, llvm_output, atol=1) - - -class TestQnnOp: - """QNN op test class""" - - @tvm.testing.requires_hexagon - def test_qnn_requantize(self, hexagon_session: Session): - """qnn.requantize test without QNN canonicalization.""" - data_shape = [256] - data = relay.var("data", shape=data_shape, dtype="int32") - - op = relay.qnn.op.requantize( - data, - input_scale=relay.const(0.156), - input_zero_point=relay.const(2), - output_scale=relay.const(0.212), - output_zero_point=relay.const(1), - out_dtype="int8", - ) - mod = tvm.IRModule.from_expr(op) - - # Compile for Hexagon - hexagon_lowered = build_hexagon_module(mod) - - # Reference compilation - llvm_lowered = build_ref_module(mod) - - data_np = np.arange(-256, 256, 2, dtype="int32") - inputs = {"data": data_np} - - hx_m = hexagon_session.get_executor_from_factory(hexagon_lowered) - hexagon_output = execute(hx_m, inputs) - - dev = tvm.cpu(0) - llvm_m = tvm.runtime.executor.AotModule(llvm_lowered["default"](dev)) - llvm_output = execute(llvm_m, inputs) - - np.testing.assert_equal(hexagon_output, llvm_output) - - @tvm.testing.requires_hexagon - def test_qnn_concatenate(self, hexagon_session: Session): - """qnn.concatenate op test without QNN canonicalization.""" - x_shape = [1, 64] - y_shape = [2, 64] - z_shape = [3, 64] - input_x = relay.var("x", shape=x_shape, dtype="uint8") - input_y = relay.var("y", shape=y_shape, dtype="uint8") - input_z = relay.var("z", shape=z_shape, dtype="uint8") - - op = relay.qnn.op.concatenate( - (input_x, input_y, input_z), - input_scales=(relay.const(0.3), relay.const(0.7), relay.const(1.3)), - input_zero_points=(relay.const(0), relay.const(1), relay.const(2)), - output_scale=relay.const(0.8), - output_zero_point=relay.const(5), - axis=0, - ) - mod = tvm.IRModule.from_expr(op) - - # Compile for Hexagon - hexagon_lowered = build_hexagon_module(mod) - - # Reference compilation - llvm_lowered = build_ref_module(mod) - - x_np = np.arange(0, 64, 1, dtype="uint8").reshape(x_shape) - y_np = np.arange(0, 128, 1, dtype="uint8").reshape(y_shape) - z_np = np.arange(0, 192, 1, dtype="uint8").reshape(z_shape) - inputs = {"x": x_np, "y": y_np, "z": z_np} - - hx_m = hexagon_session.get_executor_from_factory(hexagon_lowered) - hexagon_output = execute(hx_m, inputs) - - dev = tvm.cpu(0) - llvm_m = tvm.runtime.executor.AotModule(llvm_lowered["default"](dev)) - llvm_output = execute(llvm_m, inputs) - - # Diff by 1 is Ok. 
- tvm.testing.assert_allclose(hexagon_output, llvm_output, atol=1) - - @tvm.testing.requires_hexagon - def test_qnn_tanh(self, hexagon_session: Session): - """qnn.tanh op test without QNN canonicalization.""" - data_shape = [256] - data = relay.var("data", shape=data_shape, dtype="uint8") - - op = relay.qnn.op.tanh( - data, - scale=relay.const(0.518), - zero_point=relay.const(137), - output_scale=relay.const(0.207), - output_zero_point=relay.const(128), - ) - mod = tvm.IRModule.from_expr(op) - - # Compile for Hexagon - hexagon_lowered = build_hexagon_module(mod) - - # Reference compilation - llvm_lowered = build_ref_module(mod) - - data_np = np.arange(0, 256, 1, dtype="uint8") - inputs = {"data": data_np} - - hx_m = hexagon_session.get_executor_from_factory(hexagon_lowered) - hexagon_output = execute(hx_m, inputs) - - dev = tvm.cpu(0) - llvm_m = tvm.runtime.executor.AotModule(llvm_lowered["default"](dev)) - llvm_output = execute(llvm_m, inputs) - - np.testing.assert_equal(hexagon_output, llvm_output) - - -if __name__ == "__main__": - tvm.testing.main() diff --git a/tests/python/contrib/test_hexagon/topi/slice_op/test_avg_pool2d_slice.py b/tests/python/contrib/test_hexagon/topi/slice_op/test_avg_pool2d_slice.py index 0eedfdbf8da1..712d5b303eeb 100644 --- a/tests/python/contrib/test_hexagon/topi/slice_op/test_avg_pool2d_slice.py +++ b/tests/python/contrib/test_hexagon/topi/slice_op/test_avg_pool2d_slice.py @@ -20,74 +20,70 @@ from tvm import te import tvm.testing +from tvm.topi.testing import poolnd_python from tvm.contrib.hexagon.session import Session import tvm.topi.hexagon.slice_ops as sl import tvm.topi.hexagon.qnn as qn from tvm.contrib.hexagon import allocate_hexagon_array -from ...infrastructure import ( - transform_numpy, - quantize_np, - get_hexagon_target, -) -from ...pytest_util import ( - get_multitest_ids, - create_populated_numpy_ndarray, - TensorContentRandom, -) +import pytest +from ...infrastructure import transform_numpy, quantize_np, get_hexagon_target +from ...pytest_util import get_multitest_ids, create_populated_numpy_ndarray, TensorContentRandom -input_layout = tvm.testing.parameter( - "nhwc-8h2w32c2w-2d", -) -dtype = tvm.testing.parameter("float16", "uint8") +dtype = tvm.testing.parameter("uint8", "float16") @tvm.testing.fixture -def output_layout(output_shape, dtype): - o_b, o_h, o_w, o_c = output_shape - if dtype == "float16": - if o_h == 1 and o_w == 1: - return "n11c-1024c-2d" +def output_layout(output_shape, op_layout, dtype): + if op_layout == "NHWC": + o_b, o_h, o_w, o_c = output_shape + if dtype == "float16": + if o_h == 1 and o_w == 1: + return "n11c-1024c-2d" + else: + return "nhwc-8h2w32c2w-2d" + elif dtype == "int8" or "uint8": + if o_h == 1 and o_w == 1: + return "n11c-2048c-2d" + else: + return "nhwc-8h8w32c-2d" else: - assert o_h % 8 == 0 and o_w % 4 == 0, "Invalid output shape" - return "nhwc-8h2w32c2w-2d" - elif dtype == "int8" or "uint8": - if o_h == 1 and o_w == 1: - return "n11c-2048c-2d" + raise RuntimeError(f"Unsupported data type '{dtype}'") + + elif op_layout == "NCHW": + o_b, o_c, o_h, o_w = output_shape + if dtype == "float16": + if o_h == 1 and o_w == 1: + return "nc11-1024c-2d" + else: + return "nchw-8h2w32c2w-2d" + elif dtype == "int8" or "uint8": + if o_h == 1 and o_w == 1: + return "nc11-2048c-2d" + else: + return "nchw-8h8w32c-2d" else: - assert o_h % 8 == 0 and o_w % 8 == 0, "Invalid output shape" - return "nhwc-8h8w32c-2d" + raise RuntimeError(f"Unsupported data type '{dtype}'") else: - raise RuntimeError(f"Unsupported data type 
'{dtype}'") + raise RuntimeError(f"Unsupported layout for qnn.avg_pool2d '{op_layout}'") @tvm.testing.fixture -def input_np(input_shape, dtype: str, input_tensor_populator): - if dtype == "uint8": - dtype = "float32" # Use "float32" input which will be quantized later - return create_populated_numpy_ndarray(input_shape, dtype, input_tensor_populator) - - -@tvm.testing.fixture -def transformed_expected_output_np(expected_output_np, output_layout, dtype): +def input_layout(op_layout, dtype): + in_layout = op_layout.lower() if dtype == "float16": - return transform_numpy(expected_output_np, "nhwc", output_layout) - elif dtype in ("uint8", "int8"): - quant_arr, scale, zero_point = quantize_np(expected_output_np, dtype) - return [transform_numpy(quant_arr, "nhwc", output_layout), scale, zero_point] + return in_layout + "-8h2w32c2w-2d" + elif dtype == "int8" or "uint8": + return in_layout + "-8h8w32c-2d" else: raise RuntimeError(f"Unsupported data type '{dtype}'") @tvm.testing.fixture -def transformed_input_np_padded(input_np_padded, input_layout, dtype): - if dtype == "float16": - return transform_numpy(input_np_padded, "nhwc", input_layout) - elif dtype in ("uint8", "int8"): - quant_arr, scale, zero_point = quantize_np(input_np_padded, dtype) - return [transform_numpy(quant_arr, "nhwc", input_layout), scale, zero_point] - else: - raise RuntimeError(f"Unsupported data type '{dtype}'") +def input_np(input_shape, dtype: str, input_tensor_populator): + if dtype == "uint8": + dtype = "float32" # Use "float32" input which will be quantized later + return create_populated_numpy_ndarray(input_shape, dtype, input_tensor_populator) class TestAvgPool2dSlice: @@ -99,110 +95,143 @@ class TestAvgPool2dSlice: "pad", # padding "ceil", # ceil_mode "cnt_padded", # count_include_pad + "op_layout", # input output 4D layout None, # input_tensor_populator ] - _multitest_params = [ ( - [1, 8, 8, 32], + [1, 7, 11, 32], + [3, 3], + [3, 2], + [2, 3], + [1, 2, 3, 4], + False, + False, + "NHWC", + TensorContentRandom(), + ), + ( + [1, 1, 1, 2048], + [4, 4], + [2, 2], + [2, 3], + [0, 2, 1, 4], + False, + False, + "NHWC", + TensorContentRandom(), + ), + # Test default stride,dilation, and padding with different layouts + ( + [1, 10, 10, 32], [3, 3], [1, 1], [1, 1], [0, 0, 0, 0], False, True, + "NHWC", TensorContentRandom(), ), ( - [1, 16, 16, 32], + [1, 12, 12, 32], [3, 3], [1, 1], [1, 1], [0, 0, 0, 0], False, True, + "NHWC", TensorContentRandom(), ), ( - [1, 8, 8, 32], - [8, 8], + [1, 32, 14, 14], + [3, 3], [1, 1], [1, 1], [0, 0, 0, 0], False, True, + "NCHW", TensorContentRandom(), ), - # Test non-one stride and dilation ( - [1, 8, 8, 32], - [3, 3], - [2, 3], + [1, 32, 15, 15], + [8, 8], + [1, 1], [1, 1], [0, 0, 0, 0], False, True, + "NCHW", TensorContentRandom(), ), + # Test non-one stride and dilation with different layouts ( - [1, 8, 8, 32], + [1, 18, 24, 32], [3, 3], - [2, 2], + [2, 3], [2, 2], [0, 0, 0, 0], False, True, + "NHWC", TensorContentRandom(), ), ( - [1, 8, 8, 32], - [3, 3], + [1, 32, 18, 18], + [5, 5], [2, 2], [2, 3], [0, 0, 0, 0], False, True, + "NCHW", TensorContentRandom(), ), - # Test non-zero padding + # Test non-zero padding with count include and exclude pad and different layouts ( - [1, 8, 8, 32], + [1, 6, 6, 32], [3, 3], [1, 1], [1, 1], [1, 1, 1, 1], False, - True, + False, + "NHWC", TensorContentRandom(), ), ( [1, 8, 8, 32], [3, 3], - [1, 1], - [1, 1], - [1, 2, 3, 4], + [1, 2], + [2, 3], + [2, 2, 3, 3], False, - True, + False, + "NHWC", TensorContentRandom(), ), ( - [1, 8, 8, 32], + [1, 32, 6, 6], 
[3, 3], [1, 1], [1, 1], [1, 2, 3, 4], False, - True, + False, + "NCHW", TensorContentRandom(), ), ( - [1, 8, 8, 32], + [1, 32, 15, 22], [3, 3], [3, 2], [2, 3], [1, 2, 3, 4], False, - True, + False, + "NCHW", TensorContentRandom(), ), # Test n11c-1024c-2d layout which will require input and output to have different layout @@ -214,6 +243,7 @@ class TestAvgPool2dSlice: [0, 0, 0, 0], False, True, + "NHWC", TensorContentRandom(), ), ( @@ -221,9 +251,21 @@ class TestAvgPool2dSlice: [6, 6], [1, 1], [1, 1], - [0, 0, 0, 0], + [2, 2, 2, 2], False, - True, + False, + "NHWC", + TensorContentRandom(), + ), + ( + [1, 1, 1, 2048], + [4, 4], + [2, 2], + [2, 3], + [0, 2, 1, 4], + False, + False, + "NHWC", TensorContentRandom(), ), ( @@ -234,23 +276,24 @@ class TestAvgPool2dSlice: [0, 0, 0, 0], False, True, + "NHWC", TensorContentRandom(), ), ( - [1, 1, 1, 2048], + [1, 2048, 1, 1], [4, 4], [2, 2], [2, 3], [0, 0, 0, 0], False, True, + "NCHW", TensorContentRandom(), ), ] _param_ids = get_multitest_ids(_multitest_params, _param_descs) - # NOTE: input_layout is always assumed to be "nhwc-8h2w32c2w-2d" ( output_shape, kernel, @@ -259,23 +302,17 @@ class TestAvgPool2dSlice: padding, ceil_mode, count_include_pad, + op_layout, input_tensor_populator, ) = tvm.testing.parameters(*_multitest_params, ids=_param_ids) @tvm.testing.fixture def expected_output_np( - self, - input_np, - kernel, - stride, - dilation, - padding, - ceil_mode, - count_include_pad, + self, input_np, kernel, stride, dilation, padding, ceil_mode, count_include_pad, op_layout ): pad_before = padding[:2] pad_after = padding[2:] - ref_np = tvm.topi.testing.poolnd_python( + ref_np = poolnd_python( input_np, kernel, stride, @@ -285,150 +322,129 @@ def expected_output_np( "avg", # pool_type count_include_pad, False, # ceil_mode, - layout="NHWC", + layout=op_layout, ) return ref_np @tvm.testing.fixture - def input_shape(self, output_shape, kernel, padding, stride, dilation, output_layout): + def input_shape( + self, output_shape, kernel, padding, stride, dilation, op_layout, output_layout + ): # Input shape without any padding; 'ceil' is being ignored from calculation: - o_b, o_h, o_w, o_c = output_shape + if op_layout == "NHWC": + o_b, o_h, o_w, o_c = output_shape + else: + o_b, o_c, o_h, o_w = output_shape d_h, d_w = dilation s_h, s_w = stride k_h, k_w = kernel pad_before_h, pad_before_w = padding[:2] pad_after_h, pad_after_w = padding[2:] - if output_layout == "n11c-1024c-2d": - assert ( - pad_before_w == 0 and pad_after_w == 0 and pad_before_h == 0 and pad_after_h == 0 - ), "Padding must be zero for n11c-1024c-2d layout" + if ( + output_layout == "n11c-2048c-2d" + or output_layout == "nc11-2048c-2d" + or output_layout == "n11c-1024c-2d" + or output_layout == "nc11-1024c-2d" + ): assert o_h == 1 and o_w == 1, "Output height and width must be 1" in_h = (o_h - 1) * s_h + d_h * (k_h - 1) + 1 - pad_before_h - pad_after_h in_w = (o_w - 1) * s_w + d_w * (k_w - 1) + 1 - pad_before_w - pad_after_w - return [o_b, in_h, in_w, o_c] - - @tvm.testing.fixture - def input_shape_padded(self, input_shape, padding, output_layout, dtype): - # Input shape is adjusted to account for 'padding'. Also, due to the physical - # layout of the buffer, height and width are adjusted so that they are a - # multiple of the buffer size dictated by the layout. - # NOTE: For float16, the input layout is always assumed to be nhwc-8h2w32c2w-2d and - # for int8/uint8, it's nhwc-8h8w32c-2d. - # For both nhwc-8h2w32c2w-2d and nhwc-8h8w32c-2d, the height should be a multiple - # of 8. 
However, the width should be a multiple of 4 for the first case and 8 for - # the second case. - - height_mult = 8 - if dtype == "float16": - width_mult = 4 # input layout : nhwc-8h2w32c2w-2d - elif dtype in ("uint8", "int8"): - width_mult = 8 # input layout : nhwc-8h8w32c-2d + if op_layout == "NHWC": + return [o_b, in_h, in_w, o_c] else: - raise RuntimeError(f"Unsupport dtype '{dtype}'") - - pad_before_h, pad_before_w = padding[:2] - pad_after_h, pad_after_w = padding[2:] - padded_input_height = ( - (input_shape[1] + pad_before_h + pad_after_h + height_mult - 1) // height_mult - ) * height_mult - padded_input_width = ( - (input_shape[2] + pad_before_w + pad_after_w + width_mult - 1) // width_mult - ) * width_mult - return [input_shape[0], padded_input_height, padded_input_width, input_shape[3]] - - @tvm.testing.fixture - def input_np_padded(self, input_np, input_shape, input_shape_padded, padding): - pad_before_h, pad_before_w = padding[:2] - pad_after_h = input_shape_padded[1] - input_shape[1] - pad_before_h - pad_after_w = input_shape_padded[2] - input_shape[2] - pad_before_w - input_padded = np.pad( - input_np, - ((0, 0), (pad_before_h, pad_after_h), (pad_before_w, pad_after_w), (0, 0)), - "constant", - ) - return input_padded + return [o_b, o_c, in_h, in_w] @tvm.testing.fixture def schedule_args( self, - stride, kernel, - dtype, + stride, + padding, dilation, - input_layout, + count_include_pad, output_layout, output_shape, - input_shape_padded, - transformed_input_np_padded, - transformed_expected_output_np, + input_np, + input_shape, + input_layout, + expected_output_np, + dtype, + op_layout, ): - """ - Construct schedule args based on dtype - """ - A = te.placeholder(input_shape_padded, name="A", dtype=dtype) - + """Construct schedule args based on dtype""" + A = te.placeholder(input_shape, name="A", dtype=dtype) if dtype == "float16": - M = sl.avg_pool2d_compute(A, kernel, stride, dilation, output_shape) + if op_layout == "NHWC": + M = sl.avg_pool2d_NHWC( + A, kernel, stride, padding, dilation, count_include_pad, output_shape + ) + elif op_layout == "NCHW": + M = sl.avg_pool2d_NCHW( + A, kernel, stride, padding, dilation, count_include_pad, output_shape + ) + else: + raise RuntimeError(f"Unsupported layout for slice_op.avg_pool2d '{op_layout}'") tir_schedule = sl.avg_pool2d_schedule(M, A, output_layout, input_layout) elif dtype in ("uint8", "int8"): - in_data, in_scale, in_zero_point = transformed_input_np_padded - _, out_scale, out_zero_point = transformed_expected_output_np - M = qn.qnn_avg_pool2d_compute( - A, - kernel, - stride, - dilation, - output_shape, - dtype, - in_zero_point, - in_scale, - out_zero_point, - out_scale, - ) + _, in_scale, in_zero_point = quantize_np(input_np, dtype) + _, out_scale, out_zero_point = quantize_np(expected_output_np, dtype) + if op_layout == "NHWC": + M = qn.qnn_avg_pool2d_NHWC( + A, + kernel, + stride, + padding, + dilation, + count_include_pad, + output_shape, + dtype, + in_scale, + in_zero_point, + out_scale, + out_zero_point, + ) + elif op_layout == "NCHW": + M = qn.qnn_avg_pool2d_NCHW( + A, + kernel, + stride, + padding, + dilation, + count_include_pad, + output_shape, + dtype, + in_scale, + in_zero_point, + out_scale, + out_zero_point, + ) + else: + raise RuntimeError(f"Unsupported layout for qnn.avg_pool2d '{op_layout}'") + tir_schedule = qn.qnn_avg_pool2d_schedule(M, A, output_layout, input_layout) return [tir_schedule.mod, [A, M]] @tvm.testing.requires_hexagon def test_avg_pool2d_slice( - self, - dtype, - output_layout, - 
output_shape, - transformed_input_np_padded, - transformed_expected_output_np, - schedule_args, - hexagon_session: Session, + self, dtype, input_np, expected_output_np, schedule_args, hexagon_session: Session ): - in_data = transformed_input_np_padded - + print("schedule_args : ", schedule_args) with tvm.transform.PassContext(opt_level=3): - func = tvm.build( - *schedule_args, - get_hexagon_target("v69"), - name="avg_pool2d", - ) - - input_axis_separator = [4] - if output_layout in ( - "nhwc-8h2w32c2w-2d", - "nhwc-8h8w32c-2d", - "n11c-1024c-2d", - "n11c-2048c-2d", - ): - output_axis_separator = [4] - else: - raise RuntimeError(f"Unexpected layout '{output_layout}'") + func = tvm.build(*schedule_args, get_hexagon_target("v69"), name="avg_pool2d") + + input_axis_separator = [] + output_axis_separator = [] if dtype == "float16": - in_data_np = transformed_input_np_padded - out_data_np = transformed_expected_output_np + in_data_np = input_np + out_data_np = expected_output_np elif dtype in ("uint8", "int8"): - in_data_np, _, _ = transformed_input_np_padded - out_data_np, _, _ = transformed_expected_output_np + in_data_np, _, _ = quantize_np(input_np, dtype) + out_data_np, _, _ = quantize_np(expected_output_np, dtype) else: raise RuntimeError(f"Unsupport dtype '{dtype}'") @@ -436,34 +452,26 @@ def test_avg_pool2d_slice( hexagon_session.device, data=in_data_np, axis_separators=input_axis_separator, - mem_scope="global.vtcm", + mem_scope="global.ddr", ) output_arr = allocate_hexagon_array( hexagon_session.device, out_data_np.shape, dtype, axis_separators=output_axis_separator, - mem_scope="global.vtcm", + mem_scope="global.ddr", ) mod = hexagon_session.load_module(func) mod(input_arr, output_arr) - b, h, w, c = output_shape - if output_layout == "nhwc-8h2w32c2w-2d": - output_np = output_arr.numpy().reshape([b, h // 8, w // 4, c // 32, 8, 2, 32, 2]) - elif output_layout == "nhwc-8h8w32c-2d": - output_np = output_arr.numpy().reshape([b, h // 8, w // 8, c // 32, 8, 8, 32]) - elif output_layout == "n11c-2048c-2d": - output_np = output_arr.numpy().reshape([b, 1, 1, c // 2048, 2048]) - elif output_layout == "n11c-1024c-2d": - output_np = output_arr.numpy().reshape([b, 1, 1, c // 1024, 1024]) - else: - raise RuntimeError(f"Unexpected layout '{output_layout}'") + + output_np = output_arr.numpy() if dtype == "float16": np.testing.assert_allclose(output_np, out_data_np, rtol=1e-3, atol=1e-3) else: - np.testing.assert_allclose(output_np, out_data_np, rtol=1, atol=1) + output_np = output_arr.numpy() + np.testing.assert_allclose(output_np, out_data_np, rtol=0, atol=2) if __name__ == "__main__":