From 949554731acf1221312175a2105e656779cb62a2 Mon Sep 17 00:00:00 2001 From: Fateme Hosseini Date: Fri, 24 Mar 2023 11:27:48 -0500 Subject: [PATCH] [QNN] Implement quantized avg_pool2d * qnn.avg_pool2d is integrated into Relay and integration tests are added * FQ2I is modified to pick up qnn.avg_pool2d * Canonicalization and layout conversion for qnn.avg_pool2d are implemented * Dynamic PoolArea computation is implemented for both qnn and slice_op avg_pool2d to exclude padding area in averaging calculation when count_include_pad=False Co-authored-by: Fateme Hosseini Co-authored-by: Jyotsna Verma Co-authored-by: Anirudh Sundar Co-authored-by: Venkat Rasagna Reddy Komatireddy --- python/tvm/relay/qnn/op/_qnn.py | 3 + python/tvm/relay/qnn/op/layout_conversions.py | 35 ++ python/tvm/relay/qnn/op/qnn.py | 66 +++ python/tvm/relay/qnn/strategy/generic.py | 39 ++ python/tvm/relay/qnn/strategy/hexagon.py | 24 + .../transform/fake_quantization_to_integer.py | 57 +-- python/tvm/topi/hexagon/compute_poolarea.py | 143 ++++++ python/tvm/topi/hexagon/qnn/__init__.py | 2 +- python/tvm/topi/hexagon/qnn/avg_pool2d.py | 408 ++++++++++++---- python/tvm/topi/hexagon/slice_ops/__init__.py | 2 +- .../tvm/topi/hexagon/slice_ops/avg_pool2d.py | 224 ++++++--- python/tvm/topi/hexagon/utils.py | 65 ++- src/relay/op/nn/pooling.cc | 38 +- src/relay/op/nn/pooling.h | 4 +- src/relay/op/nn/pooling_common.h | 78 +++ src/relay/qnn/op/avg_pool2d.cc | 222 +++++++++ .../test_hexagon/test_qnn_op_integration.py | 456 ++++++++++++++++++ .../topi/slice_op/test_avg_pool2d_slice.py | 412 ++++++++-------- 18 files changed, 1837 insertions(+), 441 deletions(-) create mode 100644 python/tvm/topi/hexagon/compute_poolarea.py create mode 100644 src/relay/op/nn/pooling_common.h create mode 100644 src/relay/qnn/op/avg_pool2d.cc create mode 100644 tests/python/contrib/test_hexagon/test_qnn_op_integration.py diff --git a/python/tvm/relay/qnn/op/_qnn.py b/python/tvm/relay/qnn/op/_qnn.py index 2ad2445cd3651..f036e6cf840d2 100644 --- a/python/tvm/relay/qnn/op/_qnn.py +++ b/python/tvm/relay/qnn/op/_qnn.py @@ -167,3 +167,6 @@ def alter_op_layout_qnn_dense(attrs, inputs, tinfos, out_type): # qnn.batch_matmul register_strategy("qnn.batch_matmul", strategy.qnn_batch_matmul_strategy) register_pattern("qnn.batch_matmul", OpPattern.OUT_ELEMWISE_FUSABLE) + +# qnn.avg_pool2d +register_strategy("qnn.avg_pool2d", strategy.qnn_avg_pool2d_strategy) diff --git a/python/tvm/relay/qnn/op/layout_conversions.py b/python/tvm/relay/qnn/op/layout_conversions.py index 668cafb8ae342..587993603139f 100644 --- a/python/tvm/relay/qnn/op/layout_conversions.py +++ b/python/tvm/relay/qnn/op/layout_conversions.py @@ -126,3 +126,38 @@ def convert_qnn_conv2d_transpose(attrs, inputs, tinfos, desired_layouts): return relay.qnn.op.conv2d_transpose(*inputs, **new_attrs) raise ValueError(f"Layout {desired_data_layout} is not yet supported") + + +@reg.register_convert_op_layout("qnn.avg_pool2d") +def convert_qnn_avg_pool2d(attrs, inputs, tinfos, desired_layouts): + """Convert Layout pass registration for QNN avg_pool2d op. + + Parameters + ---------- + attrs : tvm.ir.Attrs + Attributes of current avg_pool2d + inputs : list of tvm.relay.Expr + The args of the Relay expr to be legalized + tinfos : list of types + List of input and output types + desired_layouts : list of layout strings + List of layouts defining our desired + layout for the data input. 
+ + Returns + ------- + result : tvm.relay.Expr + The transformed expr + """ + # pylint: disable=import-outside-toplevel + from tvm import relay + + assert len(desired_layouts) == 1, "A desired layout is expected for qnn.avg_pool2d's input" + desired_data_layout = desired_layouts[0] + if desired_data_layout == "NCHW" or desired_data_layout == "NHWC": + new_attrs = dict(attrs) + new_attrs["layout"] = str(desired_data_layout) + new_attrs["out_layout"] = str(desired_data_layout) + return relay.qnn.op.avg_pool2d(*inputs, **new_attrs) + + raise ValueError(f"Layout {desired_data_layout} is not yet supported") diff --git a/python/tvm/relay/qnn/op/qnn.py b/python/tvm/relay/qnn/op/qnn.py index 0e73a6889fcd2..eb64b56e829d4 100644 --- a/python/tvm/relay/qnn/op/qnn.py +++ b/python/tvm/relay/qnn/op/qnn.py @@ -1249,3 +1249,69 @@ def leaky_relu(x, alpha, input_scale, input_zero_point, output_scale, output_zer def softmax(x, scale, zero_point, output_scale, output_zero_point, axis=-1): return _make.softmax(x, axis, scale, zero_point, output_scale, output_zero_point) + + +def avg_pool2d( + data, + input_scale, + input_zero_point, + output_scale, + output_zero_point, + pool_size, + strides, + padding, + dilation, + ceil_mode=False, + count_include_pad=True, + layout="NHWC", + out_layout="", +): + + """Quantized avg_pool2d + + Parameters + ---------- + data : relay.Expr + The quantized input tensor. + input_scale: float + The scale of the input quantized expr. + input_zero_point: int + The zero point of input quantized expr. + output_scale: flaot + The scale of the output quantized expr. + output_zero_point: int + The zero point of output quantized expr. + pool_size : relay.Expr + The pool_size + strides : relay.Expr + The strides + padding : relay.Expr + The padding size + dilation : relay.Expr + The dilation size + ceil_mode : bool, optional + Whether to use ceil or floor for calculating the output shape + count_include_pad : bool, optional + Determines if padding should be taken into account in the computation + layout: string, optinal + out_layout: string, optional + Returns + ------- + result : relay.Expr + The computed result. + """ + return _make.avg_pool2d( + data, + input_scale, + input_zero_point, + output_scale, + output_zero_point, + pool_size, + strides, + padding, + dilation, + ceil_mode, + count_include_pad, + layout, + out_layout, + ) diff --git a/python/tvm/relay/qnn/strategy/generic.py b/python/tvm/relay/qnn/strategy/generic.py index 3ebf8edd36654..4c5884ffdc15b 100644 --- a/python/tvm/relay/qnn/strategy/generic.py +++ b/python/tvm/relay/qnn/strategy/generic.py @@ -157,6 +157,36 @@ def wrapper(_attrs, inputs, out_type): return wrapper +def wrap_compute_qnn_avg_pool2d(topi_compute): + """Wrap qnn.avg_pool2d topi compute""" + + def wrapper(attrs, inputs, out_type): + kernel = attrs.pool_size + strides = attrs.strides + padding = attrs.padding + dilation = attrs.dilation + count_include_pad = attrs.count_include_pad + oshape = out_type.shape + odtype = out_type.dtype + args = [ + inputs[0], + kernel, + strides, + padding, + dilation, + count_include_pad, + oshape, + odtype, + inputs[1], + inputs[2], + inputs[3], + inputs[4], + ] + return [topi_compute(*args)] + + return wrapper + + def wrap_topi_concatenate(topi_compute): """Wrap TOPI compute which use qnn.concatenate attrs""" @@ -280,3 +310,12 @@ def qnn_batch_matmul_strategy(attrs, inputs, out_type, target): "qnn.batch_matmul is currently only supported with Hexagon. 
" "Please run QNN Canonicalize pass to decompose this op into supported ops." ) + + +@override_native_generic_func("qnn_avg_pool2d_strategy") +def qnn_avg_pool2d_strategy(attrs, inputs, out_type, target): + """qnn.avg_pool2d generic strategy""" + raise RuntimeError( + "qnn.avg_pool2d is currently only supported with Hexagon. " + "Please run QNN Canonicalize pass to decompose this op into supported ops." + ) diff --git a/python/tvm/relay/qnn/strategy/hexagon.py b/python/tvm/relay/qnn/strategy/hexagon.py index d17b0da6cf0a4..3edbce34e30f9 100644 --- a/python/tvm/relay/qnn/strategy/hexagon.py +++ b/python/tvm/relay/qnn/strategy/hexagon.py @@ -201,3 +201,27 @@ def qnn_batch_matmul_strategy_hexagon(attrs, inputs, out_type, target): name="qnn_batch_matmul.hexagon", ) return strategy + + +@qnn_avg_pool2d_strategy.register(["hexagon"]) +def qnn_avg_pool2d_strategy_hexagon(attrs, inputs, out_type, target): + """qnn.avg_pool2d strategy for Hexagon""" + data_layout = attrs.layout + if data_layout == "NHWC": + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_qnn_avg_pool2d(topi.hexagon.qnn.qnn_avg_pool2d_wrapper_compute_NHWC), + wrap_topi_schedule(topi.hexagon.qnn.schedule_qnn_avg_pool2d), + name="qnn_avg_pool2d.hexagon", + ) + return strategy + elif data_layout == "NCHW": + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_qnn_avg_pool2d(topi.hexagon.qnn.qnn_avg_pool2d_wrapper_compute_NCHW), + wrap_topi_schedule(topi.hexagon.qnn.schedule_qnn_avg_pool2d), + name="qnn_avg_pool2d.hexagon", + ) + return strategy + else: + raise RuntimeError("Unsupported strategy for qnn.avg_pool2d") diff --git a/python/tvm/relay/transform/fake_quantization_to_integer.py b/python/tvm/relay/transform/fake_quantization_to_integer.py index 4c9a3f7cd09cf..5e289b0c9380f 100644 --- a/python/tvm/relay/transform/fake_quantization_to_integer.py +++ b/python/tvm/relay/transform/fake_quantization_to_integer.py @@ -111,6 +111,32 @@ def identity(expr, type_map): register_unary_identity("image.resize2d") +@register_fake_quantization_to_integer("nn.avg_pool2d") +def avgpool2d(expr, type_map): + """Rewrite an avgpool op""" + attrs = {**expr.attrs} + arg = expr.args[0] + t = type_map[arg] + out_t = type_map[expr] + + out = relay.qnn.op.avg_pool2d( + arg, + t.scale, + t.zero_point, + out_t.scale, + out_t.zero_point, + attrs["pool_size"], + attrs["strides"], + attrs["padding"], + attrs["dilation"], + attrs["ceil_mode"], + attrs["count_include_pad"], + attrs["layout"], + ) + + return [out, TensorAffineType(out_t.scale, out_t.zero_point, out_t.dtype, out_t.axis)] + + @register_fake_quantization_to_integer("nn.adaptive_avg_pool1d") def adaptive_avgpool1d(expr, type_map): """Rewrite an adaptive avgpool op""" @@ -138,37 +164,6 @@ def adaptive_avgpool1d(expr, type_map): return [out, TensorAffineType(out_t.scale, out_t.zero_point, "int32", out_t.axis)] -@register_fake_quantization_to_integer("nn.avg_pool2d") -def avgpool2d(expr, type_map): - """Rewrite a avgpool op""" - arg = expr.args[0] - t = type_map[arg] - out_t = type_map[expr] - # Cast (or requantize) to int32. 
- if not ( - approx_equal(t.scale, out_t.scale) - and approx_equal(t.zero_point, out_t.zero_point) - and tvm.ir.structural_equal(t.dtype, out_t.dtype) - ): - arg = relay.qnn.op.requantize( - arg, - t.scale, - t.zero_point, - out_t.scale, - out_t.zero_point, - out_dtype="int32", - axis=t.axis, - ) - else: - arg = relay.op.cast(arg, "int32") - out = relay.op.nn.avg_pool2d(arg, **expr.attrs) - if out_t.dtype != "int32": - # Cast back to output dtype to preserve input dtype == output dtype for AvgPool2d. - out = relay.op.clip(out, a_min=np.iinfo(out_t.dtype).min, a_max=np.iinfo(out_t.dtype).max) - out = relay.op.cast(out, out_t.dtype) - return [out, TensorAffineType(out_t.scale, out_t.zero_point, out_t.dtype, out_t.axis)] - - @register_fake_quantization_to_integer("nn.global_avg_pool2d") def global_avgpool2d(expr, type_map): """Rewrite a global_avgpool op""" diff --git a/python/tvm/topi/hexagon/compute_poolarea.py b/python/tvm/topi/hexagon/compute_poolarea.py new file mode 100644 index 0000000000000..6ba50c4a96e16 --- /dev/null +++ b/python/tvm/topi/hexagon/compute_poolarea.py @@ -0,0 +1,143 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, unused-variable, unused-argument, too-many-locals + +"""Compute PoolArea size which is used to exclude the zero-padding elements in the averaging + calculation. +""" + +from tvm import te, tir + + +def compute_PoolArea(i, j, ih, iw, kh, kw, sh, sw, dh, dw, pad_top, pad_left): + """ + Parameters + ---------- + i,j: + index of output tensor along H and W axis + This is equal to the starting point of the sliding window for which the average is computed + ih, iw: + input data size along H and W axis + kh, kw: + Kernel size along H and W axis + sh, sw: + Stride size along H and W axis + dh, dw: + Dilation size along H and W axis + pad_top, pad_left: + Pad size on Top and left side of input data + + # PoolArea refers to the area of that portion of each sliding window which only includes + # the input data and not the padded area. + + # Motivation: The following example shows the location of the first sliding window (at i=0, j=0) + # on a 6*6 array, with kernel=[3,3] and padding=[1, 1, 1, 1]. + # The input data elements are shown with (X) and padding data with (0). + # As shown, the number of non-padding elements that should be used for computing + # the average of values inside this window is 4, while the windows area is 3*3=9. + # To compute the PoolArea, we have to move the top/left edge of the window down/right + # to exclude zero-padding elements. The edge adjustment can be formulated as + # top_edge = max(i , pad_top) + # left_edge= max(j , pad_left) + # Note that pad_top and pad_left represent point 0 of the input data along i and j direction. 
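+    # For the first window of this example (i = 0, j = 0, pad_top = pad_left = 1), these give
+    # top_edge = max(0, 1) = 1 and left_edge = max(0, 1) = 1, i.e. the top/left edges of the
+    # window move one row down and one column right to skip the padding.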
+ # In this example, bottom_edge and right_edge of the PoolArea do not need any adjustment, + # because there is no padding data on those side of the window. + # However, as we slide the window down and to the right, the window might go + # beyond the input data boundaries (ih and iw). In these cases, bottom/right edge should be + # moved up/left to be located inside the input data. + # This can be formulated as + # bottom_edge = min(i + kh, ih + pad_top) + # left_edge = min(j + kw, iw + pad_left) + # Having all the edges, + # PoolArea = (bottom_edge - top_edge) * (right_edge - left_edge) + + # _______ + # |0 0 0|0 0 0 0 0 0 0 0 0 0 0 0 0 + # | | _______ + # |0 X X|X X X X 0 |0 X X|X X X X 0 + # | | | | + # |0 X X|X X X X 0 ====> |0 X X|X X X X 0 + # |_____| |_____| + # 0 X X X X X X 0 0 X X X X X X 0 + # 0 X X X X X X 0 0 X X X X X X 0 + # 0 X X X X X X 0 0 X X X X X X 0 + # 0 X X X X X X 0 0 X X X X X X 0 + # 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + + + # The above equations are derived under the assumption of having default value (1) + # for stride and dilation. However, we need to expand them to support non-default + # stride and dilation values. + # Stride impacts the starting location of the sliding windows, so i and j should be + # replaced by (i * sh) and j by (j * sw) in the equations. + # Dilation changes the window size, making k kernel elements scattered into a d*(k - 1) + 1 + # window. + # Non-1 dilation means that, we need to divide the adjusted window size by the dilation value + # to find out how many kernel elements inside the sliding window are inside the input data + # boundaries: + # top_edge= max(i * sh , pad_top) + # left_edge= max(j * sw , pad_left) + # bottom_edge = min(i * sh + (kh - 1) * dh + 1, ih + pad_top) + # left_edge = min(j * sw + (kw - 1) * dw + 1, data_w + pad_left) + # PoolArea = ceil_div((bottom_edge - top_edge), dh) * ceil_div((right_edge - left_edge), dw) + # + # Finally, we need to address one corner case related to the non-default dilation: + # Consider the following example along W axis, where iw = 3, kw = 3 and dw = 2. + # The first figure on the left shows the sliding window of size 5 starting at index 0, + # and the first figure on the right shows the same example with sliding window at index 1. + # The second row of figures show the PoolArea after adjusting the edges + # (both left_edge - right_edge = 3) + # The third row of figures show the location of dialated kernel points(*). + # As shown, although the distance between left and right edge in both cases is 3 and + # dilation is 2 and ceil_div(3,2)=2, the right PoolArea only includes 1 kernel point. + + # Sliding Window: |0 0 X X X |0 0 |0 X X X 0| + # PoolArea(after edge adjustment): 0 0|X X X |0 0 0|X X X| 0 + # location of dilated kernel points: * 0|* X * |0 0 *|X * X| 0 + # PoolArea (dilated_point_aware): * 0|* X * |0 0 * X|* X| 0 + + # To address this issue, instead of moving the left_edge to bring it just inside the input + # data boundary, we should move the edge to the right untill we get to the first dilated kernel + # point inside the input data boundary. + # The third row of figures shows how this row adjustment can solve the problem. 
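+    # Numerically, for the window at index 1 above (pad_left = 2, sw = 1, dw = 2, kw = 3),
+    # the dilated kernel points fall on columns 1, 3 and 5, and only the point at column 3
+    # lies inside the input data. Moving the left edge from column 2 (the first data column)
+    # to column 3 (the first dilated kernel point inside the data) gives
+    # ceil_div(5 - 3, 2) = 1, matching the actual number of in-bounds kernel points.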
+ # So the problem is reduced to finding the the first dilated kernel point inside the data + # boundary.# For that, we can find the number of dialted points which are mapped to the padded + # area and find the location of the next one which should be inside the input data: + # num_of_prev_points = (pad_top - i * sh - 1) // dh + # next_point_index = i * sh + (num_prev_points + 1) * dh + # + # With that, Top_edge and left_edge can be reformulated as: + # if i*sh - pad_top < 0: + # top_edge = i * sh + ((pad_top - i * sh - 1) // dh + 1) * dh + # else: + # top_edge = i * sh + # + # if j * sw - pad_left < 0: + # left_edge = j * sw + ((pad_left - j * sw - 1) // dw + 1) * dw + # else: + # left_edge= j * sw + + """ + top_edge = tir.if_then_else( + tir.all(i * sh - pad_top < 0), i * sh + ((pad_top - i * sh - 1) // dh + 1) * dh, i * sh + ) + bottom_edge = te.min(i * sh + (kh - 1) * dh + 1, ih + pad_top) + left_edge = tir.if_then_else( + tir.all(j * sw - pad_left < 0), j * sw + ((pad_left - j * sw - 1) // dw + 1) * dw, j * sw + ) + right_edge = te.min(j * sw + (kw - 1) * dw + 1, iw + pad_left) + return -((bottom_edge - top_edge) // -dh) * -((right_edge - left_edge) // -dw) diff --git a/python/tvm/topi/hexagon/qnn/__init__.py b/python/tvm/topi/hexagon/qnn/__init__.py index ba7d64b6b56d6..f7c4502301c06 100644 --- a/python/tvm/topi/hexagon/qnn/__init__.py +++ b/python/tvm/topi/hexagon/qnn/__init__.py @@ -18,7 +18,7 @@ """ Computes and schedules for Hexagon quantized ops """ from .adaptive_avg_pool1d import * -from .avg_pool2d import qnn_avg_pool2d_compute, qnn_avg_pool2d_schedule +from .avg_pool2d import * from .conv2d_alter_op import * from .dense_alter_op import * from .dequantize import dequantize_compute, dequantize_schedule diff --git a/python/tvm/topi/hexagon/qnn/avg_pool2d.py b/python/tvm/topi/hexagon/qnn/avg_pool2d.py index 4aac15cbdc178..1370ad36e4687 100644 --- a/python/tvm/topi/hexagon/qnn/avg_pool2d.py +++ b/python/tvm/topi/hexagon/qnn/avg_pool2d.py @@ -16,64 +16,144 @@ # under the License. # pylint: disable=invalid-name, unused-variable, unused-argument, too-many-locals -""" Compute and schedule for quantized avg_pool2d op - -Please note the following assumptions made by the implementation: - -1) The input must be padded in advance to account for 'padding'. In addition, - both input and output must be padded as per the physical buffer layout. -2) The current implementation assumes 'count_include_pad' to be 'True'. It can be - modified to support 'False' case but the element count for the pooling window - must be pre-computed and provided as an input to reduce the run-time overhead. -3) 'padding' is ignored. It must be handled outside of the sliced op. -4) Please note that this implementation will not work if the output includes any - physical layout related padding as it can result into out-of-bound access - for the input. 
-""" +""" Compute and schedule for quantized avg_pool2d op """ +import tvm from tvm import te from tvm import tir -from ..utils import get_layout_transform_fn, get_fixed_point_value +from ..utils import ( + get_layout_transform_fn, + get_fixed_point_value, + is_scalar, + get_const_int_value, + get_const_float_value, +) +from ...utils import get_const_tuple +from ...nn.utils import get_pad_tuple +from ...nn.pad import pad +from ..compute_poolarea import compute_PoolArea + + +def saturate(x: te.Tensor, dtype: str): + """Saturate value for the specified data type""" + return te.max(te.min_value(dtype), te.min(x, te.max_value(dtype))) -def validate_out_shape(out_shape: list, in_shape: list, kernel: list, stride: list, dilation: list): - """Validate output shape""" - _, oh, ow, _ = out_shape - _, ih, iw, _ = in_shape +def qnn_avg_pool2d_NCHW( + data: te.Tensor, + kernel: list, + stride: list, + padding: list, + dilation: list, + count_include_pad: bool, + oshape: list, + odtype: str, + # quantization params: + input_scale: float, + input_zero_point: int, + output_scale: float, + output_zero_point: int, +): + """Compute for quantized avg_pool2d""" kh, kw = kernel + rh = te.reduce_axis((0, kh), name="rh") + rw = te.reduce_axis((0, kw), name="rw") + + if odtype == "uint8": + temp_dtype = "uint16" + elif odtype == "int8": + temp_dtype = "int16" + else: + raise RuntimeError(f"Unsupported output dtype, {odtype}'") + sh, sw = stride dh, dw = dilation - if ih < (oh - 1) * sh + dh * (kh - 1) + 1: - raise RuntimeError("Output height is too large") - if iw < (ow - 1) * sw + dw * (kw - 1) + 1: - raise RuntimeError("Output width is too large") + scale = input_scale / output_scale + scale_fixed_point, rsh = get_fixed_point_value(scale, "int16") + corr = (output_zero_point << rsh) - input_zero_point * scale_fixed_point -def saturate(x: te.Tensor, dtype: str): - """Saturate value for the specified data type""" - return te.max(te.min_value(dtype), te.min(x, te.max_value(dtype))) + dilated_kh = (kh - 1) * dh + 1 + dilated_kw = (kw - 1) * dw + 1 + + pad_top, pad_left, pad_down, pad_right = get_pad_tuple( + get_const_tuple(padding), (dilated_kh, dilated_kw) + ) + + # DOPAD + if pad_top != 0 or pad_down != 0 or pad_left != 0 or pad_right != 0: + pad_before = (0, 0, pad_top, pad_left) + pad_after = (0, 0, pad_down, pad_right) + data_pad = pad(data, pad_before, pad_after, pad_value=input_zero_point, name="data_pad") + else: + # By definition when True, zero-padding will be included in the averaging calculation + # This is equivalent to PoolArea = (kh * kw) + count_include_pad = True + data_pad = data + + Sum = te.compute( + oshape, + lambda b, c, h, w: te.sum( + data_pad[b, c, h * sh + dh * rh, w * sw + dw * rw].astype(temp_dtype), axis=[rh, rw] + ), + name="pool_sum", + ) + + if not count_include_pad: + # Compute PoolArea using unpadded input tensor + _, _, oh, ow = oshape + _, _, ih, iw = data.shape + PoolArea = te.compute( + (oh, ow), + lambda i, j: compute_PoolArea(i, j, ih, iw, kh, kw, sh, sw, dh, dw, pad_top, pad_left), + name="pool_area", + ) -def qnn_avg_pool2d_compute( + ScaleWithArea = te.compute( + (oh, ow), + lambda i, j: (scale_fixed_point // PoolArea[i, j]).astype("int32"), + name="scale_with_area", + ) + + Avg = te.compute( + oshape, + lambda b, c, h, w: saturate( + ((Sum[b, c, h, w] * ScaleWithArea[h, w]) + corr + (1 << (rsh - 1))) >> rsh, odtype + ).astype(odtype), + name="pool_avg", + ) + else: + ScaleWithArea = scale_fixed_point // (kh * kw) + Avg = te.compute( + oshape, + lambda b, c, h, w: 
saturate( + ((Sum[b, c, h, w] * ScaleWithArea) + corr + (1 << (rsh - 1))) >> rsh, odtype + ).astype(odtype), + name="pool_avg", + ) + return Avg + + +def qnn_avg_pool2d_NHWC( data: te.Tensor, kernel: list, stride: list, + padding: list, dilation: list, + count_include_pad: bool, oshape: list, odtype: str, # quantization params: - input_zero_point: int, input_scale: float, - output_zero_point: int, + input_zero_point: int, output_scale: float, + output_zero_point: int, ): """Compute for quantized avg_pool2d""" kh, kw = kernel rh = te.reduce_axis((0, kh), name="rh") rw = te.reduce_axis((0, kw), name="rw") - ob, oh, ow, oc = oshape - if isinstance(ob, int): - validate_out_shape(oshape, data.shape, kernel, stride, dilation) if odtype == "uint8": temp_dtype = "uint16" @@ -85,86 +165,217 @@ def qnn_avg_pool2d_compute( sh, sw = stride dh, dw = dilation - PoolArea = kh * kw - scale = input_scale / output_scale scale_fixed_point, rsh = get_fixed_point_value(scale, "int16") - scale_with_area = scale_fixed_point // PoolArea corr = (output_zero_point << rsh) - input_zero_point * scale_fixed_point + dilated_kh = (kh - 1) * dh + 1 + dilated_kw = (kw - 1) * dw + 1 + # Compute Area + + pad_top, pad_left, pad_down, pad_right = get_pad_tuple( + get_const_tuple(padding), (dilated_kh, dilated_kw) + ) + # DOPAD + if pad_top != 0 or pad_down != 0 or pad_left != 0 or pad_right != 0: + pad_before = (0, pad_top, pad_left, 0) + pad_after = (0, pad_down, pad_right, 0) + data_pad = pad(data, pad_before, pad_after, pad_value=input_zero_point, name="data_pad") + else: + # By definition when True, zero-padding will be included in the averaging calculation + # This is equivalent to PoolArea = (kh * kw) + count_include_pad = True + data_pad = data + Sum = te.compute( oshape, lambda b, h, w, c: te.sum( - data[b, h * sh + dh * rh, w * sw + dw * rw, c].astype(temp_dtype), axis=[rh, rw] + data_pad[b, h * sh + dh * rh, w * sw + dw * rw, c].astype(temp_dtype), axis=[rh, rw] ), - name="sum", + name="pool_sum", ) - Avg = te.compute( - oshape, - lambda b, h, w, c: saturate( - ((Sum[b, h, w, c] * scale_with_area) + corr) >> rsh, odtype - ).astype(odtype), - name="avg", - ) + if not count_include_pad: + # Compute PoolArea using unpadded input tensor + _, oh, ow, _ = oshape + _, ih, iw, _ = data.shape + + PoolArea = te.compute( + (oh, ow), + lambda i, j: compute_PoolArea(i, j, ih, iw, kh, kw, sh, sw, dh, dw, pad_top, pad_left), + name="pool_area", + ) + + ScaleWithArea = te.compute( + (oh, ow), + lambda i, j: tir.if_then_else( + tir.all(PoolArea[i, j] > 0), + (scale_fixed_point // PoolArea[i, j]).astype("int32"), + 0, + ), + name="scale_with_area", + ) + + Avg = te.compute( + oshape, + lambda b, h, w, c: saturate( + ((Sum[b, h, w, c] * ScaleWithArea[h, w]) + corr + (1 << (rsh - 1))) >> rsh, odtype + ).astype(odtype), + name="pool_avg", + ) + else: + ScaleWithArea = scale_fixed_point // (kh * kw) + Avg = te.compute( + oshape, + lambda b, h, w, c: saturate( + ((Sum[b, h, w, c] * ScaleWithArea) + corr + (1 << (rsh - 1))) >> rsh, odtype + ).astype(odtype), + name="pool_avg", + ) + return Avg -def schedule_nhwc_8h8w32c(outs: te.Tensor, ins: te.Tensor, output_layout: str, input_layout: str): - """Schedule for input and output layout nhwc-8h8w32c""" +def qnn_avg_pool2d_wrapper_compute_NCHW( + data: te.Tensor, + kernel: list, + stride: list, + padding: list, + dilation: list, + count_include_pad: bool, + oshape: list, + odtype: str, + # quantization params: + input_scale: float, + input_zero_point: int, + output_scale: float, + 
output_zero_point: int, +): + """Extract qnn params""" + if ( + is_scalar(input_scale) + and is_scalar(output_scale) + and is_scalar(input_zero_point) + and is_scalar(output_zero_point) + ): + iscale = get_const_float_value(input_scale) + oscale = get_const_float_value(output_scale) + izero_point = get_const_int_value(input_zero_point) + ozero_point = get_const_int_value(output_zero_point) + return qnn_avg_pool2d_NCHW( + data, + kernel, + stride, + padding, + dilation, + count_include_pad, + oshape, + odtype, + iscale, + izero_point, + oscale, + ozero_point, + ) + else: + raise RuntimeError("quantization parameters should be scalar tensors") + + +def qnn_avg_pool2d_wrapper_compute_NHWC( + data: te.Tensor, + kernel: list, + stride: list, + padding: list, + dilation: list, + count_include_pad: bool, + oshape: list, + odtype: str, + # quantization params: + input_scale: float, + input_zero_point: int, + output_scale: float, + output_zero_point: int, +): + """Extract qnn params""" + if ( + is_scalar(input_scale) + and is_scalar(output_scale) + and is_scalar(input_zero_point) + and is_scalar(output_zero_point) + ): + iscale = get_const_float_value(input_scale) + oscale = get_const_float_value(output_scale) + izero_point = get_const_int_value(input_zero_point) + ozero_point = get_const_int_value(output_zero_point) + return qnn_avg_pool2d_NHWC( + data, + kernel, + stride, + padding, + dilation, + count_include_pad, + oshape, + odtype, + iscale, + izero_point, + oscale, + ozero_point, + ) + else: + raise RuntimeError("quantization parameters should be scalar tensors") + + +def schedule_qnn_avg_pool2d(outs): + """Schedule for qnn.avg_pool2d + Parameters + ---------- + outs: Array of Tensor + The computation graph description of qnn.avg_pool2d + in the format of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. + """ + outs = [outs] if isinstance(outs, tvm.te.tensor.Tensor) else outs + s = tvm.te.create_schedule([x.op for x in outs]) + tvm.te.schedule.AutoInlineInjective(s) + return s + + +def schedule_8h8w32c(outs: te.Tensor, ins: te.Tensor, output_layout: str, input_layout: str): + """Schedule for input and output layout 8h8w32c""" + func = te.create_prim_func([ins, outs]) s = tir.Schedule(func) - Sum = s.get_block("sum") - Avg = s.get_block("avg") - + Sum = s.get_block("pool_sum") + Avg = s.get_block("pool_avg") + mem_scope = "global.vtcm" + sum_read = s.cache_read(Sum, 0, mem_scope) + avg_read = s.cache_read(Avg, 0, mem_scope) + avg_write = s.cache_write(Avg, 0, mem_scope) input_transform_fn = get_layout_transform_fn(input_layout) output_transform_fn = get_layout_transform_fn(output_layout) - s.transform_layout(Sum, ("read", 0), input_transform_fn) - s.transform_layout(Avg, ("write", 0), output_transform_fn) - - # Schedule 'Avg' - # Split and reorder the axes to iterate over the output tensor chunks. - # Each chunk consists for 2048 bytes with 32 channels being the fastest - # changing axis, followed by 8 width and then 8 height. - # The width is split by a factor of 4 and then fused with 32 channels - # to provide full vector length of data for the output tensor chunks. - # NOTE: These schedules are a work in progress and may require - # adjustments in future as some of the missing features for 2-d tensors - # become available. 
- n, h, w, c = s.get_loops(Avg) - ho, hi = s.split(h, [None, 8]) - wo, wi = s.split(w, [None, 8]) - wio, wii = s.split(wi, [None, 4]) - co, ci = s.split(c, [None, 32]) - s.reorder(n, ho, wo, co, hi, wio, wii, ci) - wii_ci = s.fuse(wii, ci) - s.vectorize(wii_ci) - - # Schedule 'Sum' - s.compute_at(Sum, wio) - Sum_axis = s.get_loops(Sum) - # Compute for 'Sum' includes reduction along height and width. The axes - # are being reordered so that 4 width and 32 channels become the - # inner-most loops which then can be fused and vectorized. However, - # vectorization of the 2-d tensors doesn't work when reduction is - # involved and requires codegen support that is yet to be added. - s.reorder(Sum_axis[-2], Sum_axis[-1], Sum_axis[-4], Sum_axis[-3]) - ci_wii = s.fuse(Sum_axis[-4], Sum_axis[-3]) - # s.vectorize(ci_wii) # Doesn't work + s.transform_layout(Sum, ("read", 0), input_transform_fn, pad_value=0) + s.transform_layout(Avg, ("read", 0), input_transform_fn, pad_value=0) + s.transform_layout(Avg, ("write", 0), output_transform_fn, pad_value=0) return s -def schedule_n11c_2048c(outs: te.Tensor, ins: te.Tensor, output_layout: str, input_layout: str): - """Schedule for output layout: n11c-2048c, input layout: nhwc-8h8w32c""" +def schedule_2048c(outs: te.Tensor, ins: te.Tensor, output_layout: str, input_layout: str): + """Schedule for output layout: 2048c, input layout: 8h8w32c""" func = te.create_prim_func([ins, outs]) s = tir.Schedule(func) - Sum = s.get_block("sum") - Avg = s.get_block("avg") + Sum = s.get_block("pool_sum") + Avg = s.get_block("pool_avg") + mem_scope = "global.vtcm" + sum_read = s.cache_read(Sum, 0, mem_scope) + avg_write = s.cache_write(Avg, 0, mem_scope) input_transform_fn = get_layout_transform_fn(input_layout) output_transform_fn = get_layout_transform_fn(output_layout) - s.transform_layout(Sum, ("read", 0), input_transform_fn) - s.transform_layout(Avg, ("write", 0), output_transform_fn) + s.transform_layout(Sum, ("read", 0), input_transform_fn, pad_value=0) + s.transform_layout(Avg, ("write", 0), output_transform_fn, pad_value=0) # Schedule 'Avg' # Split and reorder the axes to iterate over the output tensor chunks. @@ -173,7 +384,13 @@ def schedule_n11c_2048c(outs: te.Tensor, ins: te.Tensor, output_layout: str, inp # NOTE: These schedules are a work in progress and may require # adjustments in future as some of the missing features for 2-d tensors # become available. - n, h, w, c = s.get_loops(Avg) + + if output_layout == "n11c-2048c-2d": + _, _, _, c = s.get_loops(Avg) + else: + _, c, _, _ = s.get_loops(Avg) + + # n, h, w, c = s.get_loops(Avg) co, ci = s.split(c, [None, 2048]) cio, cii = s.split(ci, [None, 128]) s.vectorize(cii) @@ -191,15 +408,10 @@ def schedule_n11c_2048c(outs: te.Tensor, ins: te.Tensor, output_layout: str, inp def qnn_avg_pool2d_schedule(outs: te.Tensor, ins: te.Tensor, output_layout: str, input_layout: str): - """Quantized avg_pool2d schedule + """Quantized avg_pool2d schedule""" + if output_layout == "nhwc-8h8w32c-2d" or output_layout == "nchw-8h8w32c-2d": + return schedule_8h8w32c(outs, ins, output_layout, input_layout) + if output_layout == "n11c-2048c-2d" or output_layout == "nc11-2048c-2d": + return schedule_2048c(outs, ins, output_layout, input_layout) - NOTE: This schedule assumes that both input and output tensors are in the form of - 2d discontiguous buffer and data is already arranged as per the input and output layout - respectively. 
- - """ - if output_layout == "nhwc-8h8w32c-2d": - return schedule_nhwc_8h8w32c(outs, ins, output_layout, input_layout) - if output_layout == "n11c-2048c-2d": - return schedule_n11c_2048c(outs, ins, output_layout, input_layout) raise RuntimeError(f"Unexpected layout '{output_layout}'") diff --git a/python/tvm/topi/hexagon/slice_ops/__init__.py b/python/tvm/topi/hexagon/slice_ops/__init__.py index 46ae0c53200f8..b38dd5ecb3c1e 100644 --- a/python/tvm/topi/hexagon/slice_ops/__init__.py +++ b/python/tvm/topi/hexagon/slice_ops/__init__.py @@ -17,7 +17,7 @@ """ Computes and Schedules for Hexagon slice ops. """ -from .avg_pool2d import avg_pool2d_compute, avg_pool2d_schedule +from .avg_pool2d import avg_pool2d_NHWC, avg_pool2d_NCHW, avg_pool2d_schedule from .max_pool2d import max_pool2d_compute, max_pool2d_STIR_schedule from .add_subtract_multiply import * from .argmax import argmax_compute, argmax_schedule diff --git a/python/tvm/topi/hexagon/slice_ops/avg_pool2d.py b/python/tvm/topi/hexagon/slice_ops/avg_pool2d.py index bf6d57b8f7f84..9f78cfce28380 100644 --- a/python/tvm/topi/hexagon/slice_ops/avg_pool2d.py +++ b/python/tvm/topi/hexagon/slice_ops/avg_pool2d.py @@ -16,118 +16,206 @@ # under the License. # pylint: disable=invalid-name, unused-variable, unused-argument, too-many-locals, pointless-exception-statement -""" Compute and schedule for avg_pool2d slice op - -Please note the following assumptions made by the implementation: - -1) The input must be padded in advance to account for 'padding'. In addition, - both input and output must be padded as per the physical buffer layout. -2) The current implementation assumes 'count_include_pad' to be 'True'. It can be - modified to support 'False' case but the element count for the pooling window - must be pre-computed and provided as an input to reduce the run-time overhead. -3) 'padding' is ignored. It must be handled outside of the sliced op. -4) Please note that this implementation will not work if the output includes any - physical layout related padding as it can result into out-of-bound access - for the input. 
-""" +""" Compute and schedule for avg_pool2d slice op """ from tvm import te from tvm import tir from ..utils import get_layout_transform_fn +from ...utils import get_const_tuple +from ...nn.utils import get_pad_tuple +from ...nn.pad import pad +from ..compute_poolarea import compute_PoolArea -def validate_out_shape(out_shape, in_shape, kernel, stride, dilation): - """Validate output shape""" - _, oh, ow, _ = out_shape - _, ih, iw, _ = in_shape +def avg_pool2d_NCHW( + data, kernel, stride, padding, dilation, count_include_pad, oshape, odtype="float16" +): + """avg_pool2d compute""" + if odtype != "float16": + raise RuntimeError(f"Unsupported output dtype '{odtype}'") kh, kw = kernel + rh = te.reduce_axis((0, kh), name="rh") + rw = te.reduce_axis((0, kw), name="rw") sh, sw = stride dh, dw = dilation - if ih < (oh - 1) * sh + dh * (kh - 1) + 1: - raise RuntimeError("Output height is too large") - if iw < (ow - 1) * sw + dw * (kw - 1) + 1: - raise RuntimeError("Output width is too large") + dilated_kh = (kh - 1) * dh + 1 + dilated_kw = (kw - 1) * dw + 1 + + pad_top, pad_left, pad_down, pad_right = get_pad_tuple( + get_const_tuple(padding), (dilated_kh, dilated_kw) + ) + + # DOPAD -def avg_pool2d_compute(A, kernel, stride, dilation, oshape, odtype="float16"): + if pad_top != 0 or pad_down != 0 or pad_left != 0 or pad_right != 0: + pad_before = (0, 0, pad_top, pad_left) + pad_after = (0, 0, pad_down, pad_right) + data_pad = pad(data, pad_before, pad_after, name="data_pad") + else: + # By definition when True, zero-padding will be included in the averaging calculation + # This is equivalent to PoolArea = (kh * kw) + count_include_pad = True + data_pad = data + + Sum = te.compute( + oshape, + lambda b, c, h, w: te.sum( + data_pad[b, c, h * sh + dh * rh, w * sw + dw * rw].astype("float32"), axis=[rh, rw] + ), + name="pool_sum", + ) + + if not count_include_pad: + # Compute PoolArea using unpadded input tensor + _, _, oh, ow = oshape + _, _, ih, iw = data.shape + + PoolArea = te.compute( + (oh, ow), + lambda i, j: compute_PoolArea(i, j, ih, iw, kh, kw, sh, sw, dh, dw, pad_top, pad_left), + name="pool_area", + ) + + InvArea = te.compute( + (oh, ow), + lambda i, j: tir.if_then_else( + tir.all(PoolArea[i, j] > 0), (float(1) / PoolArea[i, j]), 0 + ), + name="inverse_area", + ) + + Avg = te.compute( + oshape, + lambda b, c, h, w: (Sum[b, c, h, w] * InvArea[h, w]).astype(odtype), + name="pool_avg", + ) + else: + InvArea = float(1) / (kh * kw) + Avg = te.compute( + oshape, lambda b, c, h, w: (Sum[b, c, h, w] * InvArea).astype(odtype), name="pool_avg" + ) + + return Avg + + +def avg_pool2d_NHWC( + data, kernel, stride, padding, dilation, count_include_pad, oshape, odtype="float16" +): """avg_pool2d compute""" if odtype != "float16": - RuntimeError(f"Unsupported output dtype '{odtype}'") + raise RuntimeError(f"Unsupported output dtype '{odtype}'") kh, kw = kernel rh = te.reduce_axis((0, kh), name="rh") rw = te.reduce_axis((0, kw), name="rw") - ob, oh, ow, oc = oshape - if isinstance(ob, int): - validate_out_shape(oshape, A.shape, kernel, stride, dilation) sh, sw = stride dh, dw = dilation InvArea = float(1) / (kh * kw) + dilated_kh = (kh - 1) * dh + 1 + dilated_kw = (kw - 1) * dw + 1 + + pad_top, pad_left, pad_down, pad_right = get_pad_tuple( + get_const_tuple(padding), (dilated_kh, dilated_kw) + ) + + # DOPAD + if pad_top != 0 or pad_down != 0 or pad_left != 0 or pad_right != 0: + pad_before = (0, pad_top, pad_left, 0) + pad_after = (0, pad_down, pad_right, 0) + data_pad = pad(data, pad_before, pad_after, 
name="data_pad") + else: + # By definition when True, zero-padding will be included in the averaging calculation + # This is equivalent to PoolArea = (kh * kw) + count_include_pad = True + data_pad = data + Sum = te.compute( oshape, lambda b, h, w, c: te.sum( - A[b, h * sh + dh * rh, w * sw + dw * rw, c].astype("float32"), axis=[rh, rw] + data_pad[b, h * sh + dh * rh, w * sw + dw * rw, c].astype("float32"), axis=[rh, rw] ), - name="sum", - ) - Avg = te.compute( - oshape, lambda b, h, w, c: (Sum[b, h, w, c] * InvArea).astype(A.dtype), name="avg" + name="pool_sum", ) + + if not count_include_pad: + # Compute PoolArea using unpadded input tensor + _, oh, ow, _ = oshape + _, ih, iw, _ = data.shape + + PoolArea = te.compute( + (oh, ow), + lambda i, j: compute_PoolArea(i, j, ih, iw, kh, kw, sh, sw, dh, dw, pad_top, pad_left), + name="pool_area", + ) + + InvArea = te.compute( + (oh, ow), + lambda i, j: tir.if_then_else( + tir.all(PoolArea[i, j] > 0), (float(1) / PoolArea[i, j]), 0 + ), + name="inverse_area", + ) + + Avg = te.compute( + oshape, + lambda b, h, w, c: (Sum[b, h, w, c] * InvArea[h, w]).astype(odtype), + name="pool_avg", + ) + else: + InvArea = float(1) / (kh * kw) + Avg = te.compute( + oshape, lambda b, h, w, c: (Sum[b, h, w, c] * InvArea).astype(odtype), name="pool_avg" + ) + return Avg -def schedule_nhwc_8h2w32c2w(outs, ins, output_layout: str, input_layout: str): - """Schedule for input and output layout nhwc-8h2w32c2w""" +def schedule_8h2w32c2w(outs, ins, output_layout: str, input_layout: str): + """Schedule for input and output layout 8h2w32c2w""" func = te.create_prim_func([ins, outs]) + print(func) s = tir.Schedule(func) - Sum = s.get_block("sum") - Avg = s.get_block("avg") + Sum = s.get_block("pool_sum") + Avg = s.get_block("pool_avg") + mem_scope = "global.vtcm" + sum_read = s.cache_read(Sum, 0, mem_scope) + avg_write = s.cache_write(Avg, 0, mem_scope) input_transform_fn = get_layout_transform_fn(input_layout) output_transform_fn = get_layout_transform_fn(output_layout) - s.transform_layout(Sum, ("read", 0), input_transform_fn) - s.transform_layout(Avg, ("write", 0), output_transform_fn) - - # Schedule 'Avg' - n, h, w, c = s.get_loops(Avg) - ho, hi = s.split(h, [None, 8]) - wo, wi = s.split(w, [None, 4]) - wio, wii = s.split(wi, [None, 2]) - co, ci = s.split(c, [None, 32]) - s.reorder(n, ho, wo, co, hi, wio, ci, wii) - ci_wii = s.fuse(ci, wii) - s.vectorize(ci_wii) - - # Schedule 'Sum' - s.compute_at(Sum, wio) - Sum_axis = s.get_loops(Sum) - s.reorder(Sum_axis[-2], Sum_axis[-1], Sum_axis[-4], Sum_axis[-3]) - ci_wii = s.fuse(Sum_axis[-4], Sum_axis[-3]) - # s.vectorize(ci_wii) # Doesn't work + s.transform_layout(Sum, ("read", 0), input_transform_fn, pad_value=0.0) + s.transform_layout(Avg, ("write", 0), output_transform_fn, pad_value=0.0) return s -def schedule_n11c_1024c(outs, ins, output_layout: str, input_layout: str): - """Schedule for output layout: n11c-1024c, input layout: nhwc-8h2w32c2w""" +def schedule_1024c(outs, ins, output_layout: str, input_layout: str): + """Schedule for output layout: 1024c, input layout: 8h2w32c2w""" func = te.create_prim_func([ins, outs]) s = tir.Schedule(func) - Sum = s.get_block("sum") - Avg = s.get_block("avg") + Sum = s.get_block("pool_sum") + Avg = s.get_block("pool_avg") + mem_scope = "global.vtcm" + sum_read = s.cache_read(Sum, 0, mem_scope) + avg_write = s.cache_write(Avg, 0, mem_scope) input_transform_fn = get_layout_transform_fn(input_layout) output_transform_fn = get_layout_transform_fn(output_layout) - s.transform_layout(Sum, 
("read", 0), input_transform_fn) - s.transform_layout(Avg, ("write", 0), output_transform_fn) + s.transform_layout(Sum, ("read", 0), input_transform_fn, pad_value=0.0) + s.transform_layout(Avg, ("write", 0), output_transform_fn, pad_value=0.0) # Schedule 'Avg' - n, h, w, c = s.get_loops(Avg) - co, ci = s.split(c, [None, 1024]) + if output_layout == "n11c-1024c-2d": + n, h, w, c = s.get_loops(Avg) + else: + n, c, h, w = s.get_loops(Avg) + _, ci = s.split(c, [None, 1024]) cio, cii = s.split(ci, [None, 64]) s.vectorize(cii) # Schedule 'Sum' - s.compute_at(Sum, cio) + # s.compute_at(Sum, cio) Sum_axis = s.get_loops(Sum) s.reorder(Sum_axis[-2], Sum_axis[-1], Sum_axis[-3]) # s.vectorize(Sum_axis[-3]) # Doesn't work @@ -136,8 +224,8 @@ def schedule_n11c_1024c(outs, ins, output_layout: str, input_layout: str): def avg_pool2d_schedule(outs, ins, output_layout: str, input_layout: str): """avg_pool2d schedule""" - if output_layout == "nhwc-8h2w32c2w-2d": - return schedule_nhwc_8h2w32c2w(outs, ins, output_layout, input_layout) - if output_layout == "n11c-1024c-2d": - return schedule_n11c_1024c(outs, ins, output_layout, input_layout) + if output_layout == "nhwc-8h2w32c2w-2d" or output_layout == "nchw-8h2w32c2w-2d": + return schedule_8h2w32c2w(outs, ins, output_layout, input_layout) + if output_layout == "n11c-1024c-2d" or output_layout == "nc11-1024c-2d": + return schedule_1024c(outs, ins, output_layout, input_layout) raise RuntimeError(f"Unexpected layout '{output_layout}'") diff --git a/python/tvm/topi/hexagon/utils.py b/python/tvm/topi/hexagon/utils.py index f017aaebbdeb4..4bec64e9dbceb 100644 --- a/python/tvm/topi/hexagon/utils.py +++ b/python/tvm/topi/hexagon/utils.py @@ -24,10 +24,30 @@ from typing import Dict, Tuple, Union import tvm -from tvm import IRModule, te +from tvm import IRModule, te, tir from tvm.tir import IndexMap, PrimFunc +def is_scalar(expr): + if isinstance(expr, te.Tensor): + return expr.ndim == 0 and (isinstance(expr.op.body[0], (tir.FloatImm, tir.IntImm))) + return isinstance(expr, (tir.FloatImm, tir.IntImm)) + + +def get_const_int_value(expr): + if isinstance(expr, te.Tensor): + assert isinstance(expr.op.body[0], tir.IntImm) + return expr.op.body[0].value + return get_const_int(expr) + + +def get_const_float_value(expr): + if isinstance(expr, te.Tensor): + assert isinstance(expr.op.body[0], tir.FloatImm) + return expr.op.body[0].value + return get_const_float(expr) + + def n11c_1024c_2d(n, h, w, c): """Return index map for n11c_1024 2d layout""" return [n, h, w, c // 1024, IndexMap.AXIS_SEPARATOR, c % 1024] @@ -38,6 +58,11 @@ def n11c_1024c_1d(n, h, w, c): return [n, h, w, c // 1024, c % 1024] +def nc11_1024c_2d(n, c, h, w): + """Return index map for nc11_1024 2d layout""" + return [n, c // 1024, IndexMap.AXIS_SEPARATOR, c % 1024, h, w] + + def nhwc_8h2w32c2w_2d(n, h, w, c): """Return index map for nhwc_8h2w32c2w 2d layout""" return [n, h // 8, w // 4, c // 32, IndexMap.AXIS_SEPARATOR, h % 8, (w % 4) // 2, c % 32, w % 2] @@ -48,6 +73,11 @@ def nhwc_8h2w32c2w_1d(n, h, w, c): return [n, h // 8, w // 4, c // 32, h % 8, (w % 4) // 2, c % 32, w % 2] +def nchw_8h2w32c2w_2d(n, c, h, w): + """Return index map for nchw_8h2w32c2w 2d layout""" + return [n, c // 32, h // 8, w // 4, IndexMap.AXIS_SEPARATOR, h % 8, (w % 4) // 2, c % 32, w % 2] + + def nhw_32h16w_2d(n, h, w): """Return index map for nhw_32h16w 2d layout""" return [n, h // 32, w // 16, IndexMap.AXIS_SEPARATOR, h % 32, w % 16] @@ -88,6 +118,11 @@ def nc_2048c_2d(n, c): return [n, c // 2048, IndexMap.AXIS_SEPARATOR, c % 2048] 
+def nc11_2048c_2d(n, c, h, w): + """Return index map for nc11_2048c 2d layout""" + return [n, c // 2048, IndexMap.AXIS_SEPARATOR, h, w, c % 2048] + + def nc_1024c_1d(n, c): """Return index map for nc_1024c 1d layout""" return [n, c // 1024, c % 1024] @@ -123,11 +158,25 @@ def nhwc_8h8w32c_2d(n, h, w, c): return [n, h // 8, w // 8, c // 32, IndexMap.AXIS_SEPARATOR, h % 8, w % 8, c % 32] +def nhwc_8h8w32c_1d(n, h, w, c): + """Return index map for nhwc_8h8w32c 1d layout""" + return [n, h // 8, w // 8, c // 32, h % 8, w % 8, c % 32] + + +def nchw_8h8w32c_2d(n, c, h, w): + return [n, c // 32, h // 8, w // 8, IndexMap.AXIS_SEPARATOR, h % 8, w % 8, c % 32] + + def n11c_2048c_2d(n, h, w, c): """Return index map for n11c_2048c 2d layout""" return [n, h, w, c // 2048, IndexMap.AXIS_SEPARATOR, c % 2048] +def n11c_2048c_1d(n, h, w, c): + """Return index map for n11c_2048c 1 layout""" + return [n, h, w, c // 2048, c % 2048] + + def iohw_16i32o2i_1d(height, width, in_channel, out_channel): return [ in_channel // 32, @@ -163,12 +212,16 @@ def get_layout_transform_fn(layout): return nhwc_8h2w32c2w_2d if layout == "nhwc-8h2w32c2w-1d": return nhwc_8h2w32c2w_1d + if layout == "nchw-8h2w32c2w-2d": + return nchw_8h2w32c2w_2d if layout == "n11c-1024c-2d": return n11c_1024c_2d if layout == "n11c-1024c-1d": return n11c_1024c_1d if layout == "nhwc-1024c-2d": return nhwc_1024c_2d + if layout == "nc11-1024c-2d": + return nc11_1024c_2d if layout == "nc-1024-2d": return nc_1024_2d if layout == "nhw-32h16w-2d": @@ -201,16 +254,26 @@ def get_layout_transform_fn(layout): return nc_2048c_2d if layout == "nhwc-8h8w32c-2d": return nhwc_8h8w32c_2d + if layout == "nhwc-8h8w32c-1d": + return nhwc_8h8w32c_1d + if layout == "nchw-8h8w32c-2d": + return nchw_8h8w32c_2d if layout == "n11c-2048c-2d": return n11c_2048c_2d + if layout == "n11c-2048c-1d": + return n11c_2048c_1d if layout == "ohwi32o-1d": return ohwi32o_1d + if layout == "nc11-2048c-2d": + return nc11_2048c_2d if layout == "ncw-32c64w-2d": return ncw_32c64w_2d if layout == "nchw-32c8h8w-2d": return nchw_32c8h8w_2d if layout == "nchw-32c8h4w-2d": return nchw_32c8h4w_2d + if layout == "nchw-8h8w32c-2d": + return nchw_8h8w32c_2d raise RuntimeError(f"Unexpected layout '{layout}'") diff --git a/src/relay/op/nn/pooling.cc b/src/relay/op/nn/pooling.cc index 2a8c2440519ee..1cfbab6e661ee 100644 --- a/src/relay/op/nn/pooling.cc +++ b/src/relay/op/nn/pooling.cc @@ -32,6 +32,7 @@ #include #include "../../transforms/infer_layout_utils.h" +#include "pooling_common.h" namespace tvm { namespace relay { @@ -40,43 +41,6 @@ namespace relay { TVM_REGISTER_NODE_TYPE(MaxPool2DAttrs); TVM_REGISTER_NODE_TYPE(AvgPool2DAttrs); -template -InferCorrectLayoutOutput PoolInferCorrectLayout(const Attrs& attrs, - const Array& new_in_layouts, - const Array& old_in_layouts, - const Array& old_in_types) { - const auto* attrs_ptr = attrs.as(); - ICHECK(attrs_ptr); - ObjectPtr params = make_object(*attrs_ptr); - - if (params->out_layout != "") { - // when users specify the out_layout of pooling, follow user's preference - ICHECK_EQ(params->layout, params->out_layout) - << "Pooling input/output layouts mismatch: " << params->layout << " vs. 
" - << params->out_layout; - } else if (new_in_layouts.defined()) { - // the pooling is using an inferred layout (i.e., new_in_layouts[0]) given by relay caller - ICHECK_EQ(new_in_layouts.size(), 1); - params->layout = new_in_layouts[0].name(); - } - - return InferCorrectLayoutOutput({params->layout}, {params->layout}, Attrs(params)); -} - -IndexExpr calculate_pool_dimension(IndexExpr in_dimension, IndexExpr pad_amount, - IndexExpr pool_size, IndexExpr dilation, IndexExpr stride_size, - bool ceil_mode) { - IndexExpr numerator = in_dimension + pad_amount - ((pool_size - 1) * dilation + 1); - IndexExpr denominator = stride_size; - - // Emulate the behavior of running ceil on numerator / denominator rather than floor - if (ceil_mode) { - numerator += denominator - 1; - } - - return numerator / denominator + 1; -} - template bool Pool2DRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { diff --git a/src/relay/op/nn/pooling.h b/src/relay/op/nn/pooling.h index 32ae464101ab1..123cfcd075705 100644 --- a/src/relay/op/nn/pooling.h +++ b/src/relay/op/nn/pooling.h @@ -18,8 +18,8 @@ */ /*! - * \file src/relay/op/nn/convolution.h - * \brief Properties def of convlution operator for sharing. + * \file src/relay/op/nn/pooling.h + * \brief utilities for creating pool ops */ #ifndef TVM_RELAY_OP_NN_POOLING_H_ #define TVM_RELAY_OP_NN_POOLING_H_ diff --git a/src/relay/op/nn/pooling_common.h b/src/relay/op/nn/pooling_common.h new file mode 100644 index 0000000000000..9e30cadddd583 --- /dev/null +++ b/src/relay/op/nn/pooling_common.h @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/op/nn/pooling_common.h + * \brief Properties def of pooling operator for sharing. 
+ */ +#ifndef TVM_RELAY_OP_NN_POOLING_COMMON_H_ +#define TVM_RELAY_OP_NN_POOLING_COMMON_H_ + +#include +#include +#include + +#include +#include +#include + +#include "../op_common.h" + +namespace tvm { +namespace relay { + +inline IndexExpr calculate_pool_dimension(IndexExpr in_dimension, IndexExpr pad_amount, + IndexExpr pool_size, IndexExpr dilation, + IndexExpr stride_size, bool ceil_mode) { + IndexExpr numerator = in_dimension + pad_amount - ((pool_size - 1) * dilation + 1); + IndexExpr denominator = stride_size; + + // Emulate the behavior of running ceil on numerator / denominator rather than floor + if (ceil_mode) { + numerator += denominator - 1; + } + + return numerator / denominator + 1; +} + +template +InferCorrectLayoutOutput PoolInferCorrectLayout(const Attrs& attrs, + const Array& new_in_layouts, + const Array& old_in_layouts, + const Array& old_in_types) { + const auto* attrs_ptr = attrs.as(); + ICHECK(attrs_ptr); + ObjectPtr params = make_object(*attrs_ptr); + + if (params->out_layout != "") { + // when users specify the out_layout of pooling, follow user's preference + ICHECK_EQ(params->layout, params->out_layout) + << "Pooling input/output layouts mismatch: " << params->layout << " vs. " + << params->out_layout; + } else if (new_in_layouts.defined()) { + // the pooling is using an inferred layout (i.e., new_in_layouts[0]) given by relay caller + // ICHECK_EQ(new_in_layouts.size(), 1); + params->layout = new_in_layouts[0].name(); + } + + return InferCorrectLayoutOutput({params->layout}, {params->layout}, Attrs(params)); +} +} // namespace relay +} // namespace tvm +#endif // TVM_RELAY_OP_NN_POOLING_COMMON_H_ diff --git a/src/relay/qnn/op/avg_pool2d.cc b/src/relay/qnn/op/avg_pool2d.cc new file mode 100644 index 0000000000000..b0ba9ff028fe7 --- /dev/null +++ b/src/relay/qnn/op/avg_pool2d.cc @@ -0,0 +1,222 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/qnn/op/Avg_pool2d.cc + * \brief Property def of qnn Avg_pool2d operator. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include "../../op/nn/nn.h" +#include "../../op/nn/pooling.h" +#include "../../op/nn/pooling_common.h" +#include "../../op/tensor/transform.h" +#include "../../transforms/infer_layout_utils.h" +#include "../../transforms/pattern_utils.h" +#include "../utils.h" +#include "op_common.h" + +namespace tvm { +namespace relay { +namespace qnn { + +// relay.op.qnn.avg_pool2d +bool QnnAvgPool2DRel(const Array& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { + // Expected Types: data, input_zero_point, input_scale, output_zero_point, output_scale + // out_type + + ICHECK_EQ(types.size(), 6); + + const auto* data = types[0].as(); + if (data == nullptr) return false; + ICHECK(data->dtype == DataType::Int(8) || data->dtype == DataType::UInt(8)) + << "Expected quantized avg_pool2d type(int8, uint8) for input but was " << data->dtype; + + const auto* param = attrs.as(); + ICHECK(param != nullptr) << "AvgPool2DAttrs cannot be nullptr."; + + // Check the types of scale and zero points. + for (size_t i = 1; i < 5; ++i) { + if (types[i].as()) { + return false; + } + } + + ICHECK(IsScalarType(types[1], DataType::Float(32))); // input_scale + ICHECK(IsScalarType(types[2], DataType::Int(32))); // input_zero_point + ICHECK(IsScalarType(types[3], DataType::Float(32))); // output_scale + ICHECK(IsScalarType(types[4], DataType::Int(32))); // output_zero_point + + // Find the output shape and data type + const auto dshape = data->shape; + ICHECK_GE(dshape.size(), 2U) + << "Pool2D only support input >= 2-D: input must have height and width"; + + // Check input and output layout + Layout layout(param->layout); + // The Layout is always NHWC + ICHECK(layout.Contains(LayoutAxis::Get('H')) && layout.Contains(LayoutAxis::Get('W')) && + !layout.Contains(LayoutAxis::Get('h')) && !layout.Contains(LayoutAxis::Get('w'))) + << "Invalid input layout " << layout + << ". qnn_avg_pool2d inut layout must have H and W, which cannot be split"; + + // Find the output shape and data type + const auto hidx = layout.IndexOf(LayoutAxis::Get('H')); + const auto widx = layout.IndexOf(LayoutAxis::Get('W')); + + IndexExpr pad_h, pad_w; + if (param->padding.size() == 1) { + pad_h = param->padding[0] * 2; + pad_w = param->padding[0] * 2; + } else if (param->padding.size() == 2) { + // (top, left) + pad_h = param->padding[0] * 2; + pad_w = param->padding[1] * 2; + } else if (param->padding.size() == 4) { + // (top, left, bottom, right) + pad_h = param->padding[0] + param->padding[2]; + pad_w = param->padding[1] + param->padding[3]; + } else { + return false; + } + + std::vector oshape(dshape.begin(), dshape.end()); + if (dshape[hidx].as()) { + oshape[hidx] = dshape[hidx]; + } else { + oshape[hidx] = + calculate_pool_dimension(dshape[hidx], pad_h, param->pool_size[0], param->dilation[0], + param->strides[0], param->ceil_mode); + } + if (dshape[widx].as()) { + oshape[widx] = dshape[widx]; + } else { + oshape[widx] = + calculate_pool_dimension(dshape[widx], pad_w, param->pool_size[1], param->dilation[1], + param->strides[1], param->ceil_mode); + } + + // assign output type + reporter->Assign(types[5], TensorType(oshape, data->dtype)); + return true; +} + +InferCorrectLayoutOutput QnnAvgPoolInferCorrectLayout(const Attrs& attrs, + const Array& new_in_layouts, + const Array& old_in_layouts, + const Array& old_in_types) { + // Use Relay AvgPool2D Infer correct layout. 
+  auto avgpool_new_layouts =
+      PoolInferCorrectLayout<AvgPool2DAttrs>(attrs, new_in_layouts, old_in_layouts, old_in_types);
+
+  // Fill the layouts of remaining input tensors - scales and zero points. The layouts of these
+  // tensors can be treated as channel layout.
+  Layout channel_layout = Layout("C");
+  Array<Layout> input_layouts = {avgpool_new_layouts->input_layouts[0], channel_layout,
+                                 channel_layout, channel_layout, channel_layout};
+  Array<Layout> output_layouts = avgpool_new_layouts->output_layouts;
+  return InferCorrectLayoutOutput(input_layouts, output_layouts, attrs);
+}
+
+/*
+ * \brief Forward rewrite the qnn avg_pool2d op.
+ * \param attrs The QNN avg_pool2d attrs.
+ * \param new_args The new mutated args to the call node.
+ * \param arg_types The types of input and output.
+ * \return The sequence of Relay ops for the qnn avg_pool2d op.
+ * \note Lowering of the qnn.avg_pool2d operator
+
+ * Quantized avg_pool2d takes one quantized input tensor and returns another
+ * quantized tensor. Since the input qnn params can be different from the output
+ * qnn params, we first requantize the input tensor with the output qnn params and
+ * cast the result into Int32. Then we call relay.nn.avg_pool2d on that requantized
+ * input. Finally, the result is cast into the quantized output data type.
+
+ * Note: The RequantizeOrUpcast function only performs requantization if the input
+ * and output qnn params are different; otherwise it only casts to Int32.
+ */
+
+Expr QnnAvgPoolCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
+                            const Array<tvm::relay::Type>& arg_types) {
+  ICHECK_EQ(new_args.size(), 5);
+  Expr input_data = new_args[0];
+  Expr input_scale = new_args[1];
+  Expr input_zero_point = new_args[2];
+  Expr output_scale = new_args[3];
+  Expr output_zero_point = new_args[4];
+  const auto in_shape = get_shape(arg_types[0]);
+  const auto* avgpool_attrs = attrs.as<AvgPool2DAttrs>();
+  auto requantized_input = RequantizeOrUpcast(input_data, input_scale, input_zero_point,
+                                              output_scale, output_zero_point, in_shape);
+  Expr nnAvg = AvgPool2D(requantized_input, avgpool_attrs->pool_size, avgpool_attrs->strides,
+                         avgpool_attrs->dilation, avgpool_attrs->padding, avgpool_attrs->layout,
+                         avgpool_attrs->out_layout, avgpool_attrs->ceil_mode,
+                         avgpool_attrs->count_include_pad);
+  const auto* data = arg_types[5].as<TensorTypeNode>();
+  return Cast(nnAvg, data->dtype);
+}
+
+// Positional relay function to create the quantized avg_pool2d operator, used by the frontend FFI.
+Expr MakeQuantizedAvgPool2D(Expr data, Expr input_scale, Expr input_zero_point, Expr output_scale,
+                            Expr output_zero_point, Array<IndexExpr> pool_size,
+                            Array<IndexExpr> strides, Array<IndexExpr> padding,
+                            Array<IndexExpr> dilation, bool ceil_mode, bool count_include_pad,
+                            String layout, String output_layout) {
+  auto attrs = make_object<AvgPool2DAttrs>();
+  attrs->pool_size = std::move(pool_size);
+  attrs->strides = std::move(strides);
+  attrs->padding = std::move(padding);
+  attrs->dilation = std::move(dilation);
+  attrs->layout = std::move(layout);
+  attrs->out_layout = std::move(output_layout);
+  attrs->ceil_mode = ceil_mode;
+  attrs->count_include_pad = count_include_pad;
+  static const Op& op = Op::Get("qnn.avg_pool2d");
+  return Call(op, {data, input_scale, input_zero_point, output_scale, output_zero_point},
+              Attrs(attrs), {});
+}
+
+RELAY_REGISTER_OP("qnn.avg_pool2d")
+    .describe("Quantized avg_pool2d operator.")
+    .set_attrs_type<AvgPool2DAttrs>()
+    .set_num_inputs(5)
+    .add_argument("data", "Quantized Tensor", "The input data.")
+    .add_argument("input_scale", "Tensor", "The quantization scale of the input tensor.")
+    .add_argument("input_zero_point", "Tensor", "The quantization zero_point of the input tensor.")
+    .add_argument("output_scale", "Tensor", "The quantization scale of the output tensor.")
+    .add_argument("output_zero_point", "Tensor",
+                  "The quantization zero_point of the output tensor.")
+    .set_support_level(11)
+    .add_type_rel("QnnAvgPool2D", QnnAvgPool2DRel)
+    .set_attr<TOpPattern>("TOpPattern", kOutEWiseFusable)
+    .set_attr<FInferCorrectLayout>("FInferCorrectLayout", QnnAvgPoolInferCorrectLayout)
+    .set_attr<FTVMLegalize>("FTVMQnnCanonicalize", QnnAvgPoolCanonicalize);
+
+TVM_REGISTER_GLOBAL("relay.qnn.op._make.avg_pool2d").set_body_typed(MakeQuantizedAvgPool2D);
+
+}  // namespace qnn
+}  // namespace relay
+}  // namespace tvm
diff --git a/tests/python/contrib/test_hexagon/test_qnn_op_integration.py b/tests/python/contrib/test_hexagon/test_qnn_op_integration.py
new file mode 100644
index 0000000000000..d62dd0c04673c
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/test_qnn_op_integration.py
@@ -0,0 +1,456 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name,missing-function-docstring,redefined-outer-name
+
+"""Test Relay-integrated qnn ops.
+There are two types of tests for qnn ops in this file: one verifies the
+correctness of the Relay integration, and the other verifies that the fake
+quantization to integer pass picks up the qnn op.
+The former is only executed when qnn canonicalization is disabled.
+The latter is executed both with and without canonicalization.
+"""
+# TODO: We might want to distribute these test cases into other test files such as
+# test_wo_qnn_canonicalization and test_pass_fake_quantization_to_integer in the future.
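For reference, the lowering that QnnAvgPoolCanonicalize performs (described in the note above) can be sketched at the Python level roughly as follows. This is an illustrative sketch only: the shape, scales, and zero points are made up, and relay.qnn.op.requantize stands in for the C++ RequantizeOrUpcast helper, which additionally skips the requantize step and only upcasts when the input and output qnn params already match.

from tvm import relay

data = relay.var("data", shape=(1, 12, 12, 32), dtype="uint8")
in_scale, in_zp = relay.const(0.02), relay.const(0)
out_scale, out_zp = relay.const(0.04), relay.const(0)

# 1. Requantize the input into the output quantization domain, upcast to int32.
requantized = relay.qnn.op.requantize(
    data, in_scale, in_zp, out_scale, out_zp, out_dtype="int32"
)
# 2. Run the regular average pool on the requantized tensor.
pooled = relay.nn.avg_pool2d(
    requantized,
    pool_size=(3, 3),
    strides=(1, 1),
    padding=(0, 0, 0, 0),
    layout="NHWC",
    count_include_pad=True,
)
# 3. Cast the result back to the quantized output dtype.
lowered = relay.cast(pooled, "uint8")

The tests below compare this canonicalized path (QnnCanonicalize enabled) against the direct Hexagon lowering of qnn.avg_pool2d (QnnCanonicalize disabled).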
+ +import numpy as np + +import tvm.testing +import tvm.topi.testing +from tvm import relay +from tvm.contrib.hexagon.session import Session +from tvm.relay.backend import Executor, Runtime +from tvm.contrib.hexagon import allocate_hexagon_array +from .infrastructure import quantize_np + +from .pytest_util import get_multitest_ids, create_populated_numpy_ndarray, TensorContentRandom + + +def compile_for_target(mod, target="hexagon", disable_canonicalization=False): + runtime = Runtime("cpp") + executor = Executor("graph", {"link-params": True}) + if target == "hexagon": + target_hexagon = tvm.target.hexagon("v68") + target = tvm.target.Target(target_hexagon, host=target_hexagon) + print("Trying relay.build for ...", target) + dis_passes = [] + if disable_canonicalization: + dis_passes = ["QnnCanonicalize"] + with tvm.transform.PassContext(opt_level=3, disabled_pass=dis_passes): + lib = relay.build(mod, target=target, runtime=runtime, executor=executor) + print(lib.function_metadata) + print("Finished relay.build for...", target) + elif target == "llvm": + target = tvm.target.Target("llvm") + print("Trying relay.build for ...", target) + with tvm.transform.PassContext(opt_level=3): + lib = relay.build(mod, target=target, runtime=runtime, executor=executor) + print(lib.function_metadata) + print("Finished relay.build for...", target) + return lib + + +def run_model_on_hexagon(hexagon_session, mod, inputs, params=None, disable_canonicalization=True): + hexagon_lowered = compile_for_target(mod, "hexagon", disable_canonicalization) + graph_mod = hexagon_session.get_executor_from_factory(hexagon_lowered) + if params is None: + params = {} + graph_mod.set_input(**params) + graph_mod.run(**inputs) + return graph_mod.get_output(0).numpy() + + +def run_model_on_llvm(mod, inputs, params=None): + llvm_lowered = compile_for_target(mod, "llvm") + llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(llvm_lowered["default"](tvm.cpu(0))) + if params is None: + params = {} + llvm_graph_mod.set_input(**params) + llvm_graph_mod.run(**inputs) + return llvm_graph_mod.get_output(0).numpy() + + +def compare_fq_to_int(hexagon_session, expr, input_np_quant, params=None): + working_scope = "global" + inputs_llvm = {"data": input_np_quant} + input_arr = allocate_hexagon_array( + hexagon_session.device, data=input_np_quant, mem_scope=working_scope + ) + inputs_hex = {"data": input_arr} + mod = tvm.IRModule.from_expr(expr) + mod = tvm.relay.transform.InferType()(mod) + mod_int = tvm.relay.transform.FakeQuantizationToInteger()(mod) + assert not tvm.ir.structural_equal(mod, mod_int) + + ref_out_llvm = run_model_on_llvm(mod, inputs_llvm, params) + + # Compare the Hexagon and LLVM results with and without the qnn canonicalization + print("Comparing Hexagon and LLVM reusults (canonicalization disabled)...") + hexagon_output_fq_wo_qnn_can = run_model_on_hexagon( + hexagon_session, mod_int, inputs_hex, params, True + ) + tvm.testing.assert_allclose(ref_out_llvm, hexagon_output_fq_wo_qnn_can, rtol=0, atol=2) + print("Comparing Hexagon and LLVM reusults (canonicalization enabled)...") + hexagon_output_fq_w_qnn_can = run_model_on_hexagon( + hexagon_session, mod_int, inputs_hex, params, False + ) + assert np.all( + np.abs(ref_out_llvm.astype("int32") - hexagon_output_fq_w_qnn_can.astype("int32")) <= 1 + ) + + +@tvm.testing.fixture +def input_np(input_shape, idtype, input_tensor_populator): + if idtype in ("int8", "uint8"): + idtype = "float32" # Use "float32" input which will be quantized later + return 
create_populated_numpy_ndarray(input_shape, idtype, input_tensor_populator) + + +@tvm.testing.fixture +def transformed_expected_output_np(expected_output_np, odtype): + scale = None + zero_point = None + if odtype in ("int8", "uint8"): + quant_arr, scale, zero_point = quantize_np(expected_output_np, odtype) + else: + quant_arr = expected_output_np + return quant_arr, scale, zero_point + + +@tvm.testing.fixture +def transformed_input_np(input_np, idtype): + scale = None + zero_point = None + if idtype in ("int8", "uint8"): + quant_arr, scale, zero_point = quantize_np(input_np, idtype) + else: + quant_arr = input_np + return quant_arr, scale, zero_point + + +input_layout = tvm.testing.parameter("nhwc") +output_layout = tvm.testing.parameter("nhwc") + + +class TestQnnAvgPool2d: + """QNN AvgPool2d test class.""" + + _param_descs = [ + "in_shape", # input_shape + "layout", # NHWC or NCHW + "kernel", # kernel + "stride", # stride + "dil", # dilation + "pad", # padding + "ceil", # ceil_mode + "cnt_padded", # count_include_pad + None, # input_tensor_populator + ] + + _multitest_params = [ + ( + [1, 12, 12, 32], + "NHWC", + [3, 3], + [1, 1], + [2, 3], + [1, 2, 3, 4], + False, + False, + TensorContentRandom(), + ), + ( + [1, 18, 18, 32], # output shape: [1, 16, 16, 32] + "NCHW", + [3, 3], + [2, 2], + [2, 1], + [1, 2, 3, 4], + False, + True, + TensorContentRandom(), + ), + ] + + _param_ids = get_multitest_ids(_multitest_params, _param_descs) + idtype, odtype = tvm.testing.parameters(("uint8", "uint8")) + + ( + input_shape, + layout, + kernel, + stride, + dilation, + padding, + ceil_mode, + count_include_pad, + input_tensor_populator, + ) = tvm.testing.parameters(*_multitest_params, ids=_param_ids) + + @tvm.testing.fixture + def expected_output_np( + self, input_np, kernel, stride, dilation, padding, ceil_mode, count_include_pad, layout + ): + pad_before = padding[:2] + pad_after = padding[2:] + ref_np = tvm.topi.testing.poolnd_python( + input_np, + kernel, + stride, + dilation, + pad_before, + pad_after, + "avg", # pool_type + count_include_pad, + ceil_mode, + layout=layout, + ) + + return ref_np + + @tvm.testing.requires_hexagon + def test_integrated_qnn_avg_pool2d( + self, + idtype, + input_shape, + kernel, + stride, + dilation, + padding, + ceil_mode, + count_include_pad, + layout, + transformed_input_np, + transformed_expected_output_np, + hexagon_session: Session, + ): + working_scope = "global" + + if idtype in ("uint8"): + input_np_quant, input_scale, input_zero_point = transformed_input_np + golden_out_np, output_scale, output_zero_point = transformed_expected_output_np + else: + raise RuntimeError(f"Unsupport input dtype '{idtype}'") + + input_arr = allocate_hexagon_array( + hexagon_session.device, data=input_np_quant, mem_scope=working_scope + ) + inputs_hex = {"data": input_arr} + + def gen_relay_expr_qnn(dtype): + data = relay.var("data", shape=input_shape, dtype=dtype) + qnn_avg_pool = relay.qnn.op.avg_pool2d( + data, + input_scale=relay.const(input_scale), + input_zero_point=relay.const(input_zero_point), + output_scale=relay.const(output_scale), + output_zero_point=relay.const(output_zero_point), + pool_size=kernel, + strides=stride, + dilation=dilation, + padding=padding, + ceil_mode=ceil_mode, + count_include_pad=count_include_pad, + layout=layout, + ) + + return qnn_avg_pool + + op_hex = gen_relay_expr_qnn(idtype) + mod = tvm.IRModule.from_expr(op_hex) + mod = relay.transform.InferType()(mod) + hexagon_out = run_model_on_hexagon(hexagon_session, mod, inputs_hex) + 
np.testing.assert_allclose(hexagon_out, golden_out_np, rtol=0, atol=2) + + @tvm.testing.requires_hexagon + def test_fake_quantize_avg_pool2d( + self, + idtype, + input_shape, + kernel, + stride, + dilation, + padding, + layout, + ceil_mode, + count_include_pad, + transformed_input_np, + transformed_expected_output_np, + hexagon_session: Session, + ): + if idtype in ("uint8"): + input_np_quant, input_scale, input_zero_point = transformed_input_np + _, output_scale, output_zero_point = transformed_expected_output_np + else: + raise RuntimeError(f"Unsupport input dtype '{idtype}'") + + def gen_relay_expr(dtype): + data = relay.var("data", shape=input_shape, dtype=dtype) + data_deq = relay.qnn.op.dequantize( + data, relay.const(input_scale), relay.const(input_zero_point) + ) + op = relay.op.nn.avg_pool2d( + data=data_deq, + pool_size=kernel, + strides=stride, + dilation=dilation, + padding=padding, + layout=layout, + ceil_mode=ceil_mode, + count_include_pad=count_include_pad, + ) + out_quant = relay.qnn.op.quantize( + op, relay.const(output_scale), relay.const(output_zero_point), out_dtype=dtype + ) + return out_quant + + op_llvm = gen_relay_expr(idtype) + compare_fq_to_int(hexagon_session, op_llvm, input_np_quant) + + +class TestQnnQuantize: + """QNN Quantize test class.""" + + _param_descs = ["in_shape", None] # input_shape # input_tensor_populator + + _multitest_params = [ + ([1, 8, 8, 32], TensorContentRandom()), + ([1, 10, 10, 32], TensorContentRandom()), + ([1, 12, 12, 128], TensorContentRandom()), + ] + + _param_ids = get_multitest_ids(_multitest_params, _param_descs) + + (input_shape, input_tensor_populator) = tvm.testing.parameters( + *_multitest_params, ids=_param_ids + ) + + idtype, odtype = tvm.testing.parameters(("float32", "int8"), ("float32", "uint8")) + + @tvm.testing.fixture + def expected_output_np(self, input_np): + # The expected output is of the same shape as input. + # The only computation applied on the input is quanization. 
+ # Since transform_expected_output quantizes the data, + # here, we return the orignal input array in float + return input_np + + @tvm.testing.requires_hexagon + def test_integrated_qnn_quantize( + self, + idtype, + odtype, + input_shape, + input_np, + transformed_input_np, + transformed_expected_output_np, + hexagon_session: Session, + ): + working_scope = "global" + if odtype in ("int8", "uint8"): + golden_out_np, output_scale, output_zero_point = transformed_expected_output_np + else: + raise RuntimeError(f"Unsupport output dtype '{odtype}'") + + input_arr = allocate_hexagon_array( + hexagon_session.device, data=input_np, mem_scope=working_scope + ) + inputs_hex = {"data": input_arr} + + def gen_relay_expr_qnn(dtype): + data = relay.var("data", shape=input_shape, dtype=dtype) + qnn_quantize = relay.qnn.op.quantize( + data, + output_scale=relay.const(output_scale), + output_zero_point=relay.const(output_zero_point), + axis=-1, + out_dtype=odtype, + ) + return qnn_quantize + + op_hex = gen_relay_expr_qnn(idtype) + mod = tvm.IRModule.from_expr(op_hex) + mod = relay.transform.InferType()(mod) + hexagon_out = run_model_on_hexagon(hexagon_session, mod, inputs_hex) + np.testing.assert_allclose(hexagon_out, golden_out_np, rtol=0, atol=1) + + +class TestQnnDequantize: + """QNN Dequantize test class.""" + + _param_descs = ["in_shape", None] # input_shape # input_tensor_populator + + _multitest_params = [ + ([1, 12, 32, 128], TensorContentRandom()), + ([1, 10, 10, 32], TensorContentRandom()), + ([1, 6, 6, 2048], TensorContentRandom()), + ([1, 1000], TensorContentRandom()), + ] + + _param_ids = get_multitest_ids(_multitest_params, _param_descs) + + (input_shape, input_tensor_populator) = tvm.testing.parameters( + *_multitest_params, ids=_param_ids + ) + + idtype, odtype = tvm.testing.parameters(("int8", "float32"), ("uint8", "float32")) + + @tvm.testing.fixture + def expected_output_np(self, input_np, idtype): + quant_np, scale, zero_point = quantize_np(input_np, idtype) + ref_np = (scale * (quant_np.astype("int32") - zero_point)).astype("float32") + return ref_np + + @tvm.testing.requires_hexagon + def test_integrated_qnn_dequantize( + self, + idtype, + odtype, + input_shape, + transformed_input_np, + transformed_expected_output_np, + hexagon_session: Session, + ): + working_scope = "global" + if odtype in ("float32"): + input_np_quant, input_scale, input_zero_point = transformed_input_np + golden_out_np, _, _ = transformed_expected_output_np + else: + raise RuntimeError(f"Unsupport odtype '{odtype}'") + + input_arr = allocate_hexagon_array( + hexagon_session.device, data=input_np_quant, mem_scope=working_scope + ) + inputs_hex = {"data": input_arr} + + def gen_relay_expr_qnn(dtype): + data = relay.var("data", shape=input_shape, dtype=dtype) + qnn_quantize = relay.qnn.op.dequantize( + data, + input_scale=relay.const(input_scale), + input_zero_point=relay.const(input_zero_point), + ) + return qnn_quantize + + op_hex = gen_relay_expr_qnn(idtype) + mod = tvm.IRModule.from_expr(op_hex) + mod = relay.transform.InferType()(mod) + hexagon_out = run_model_on_hexagon(hexagon_session, mod, inputs_hex) + np.testing.assert_allclose(hexagon_out, golden_out_np, rtol=1e-3, atol=1e-3) + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/python/contrib/test_hexagon/topi/slice_op/test_avg_pool2d_slice.py b/tests/python/contrib/test_hexagon/topi/slice_op/test_avg_pool2d_slice.py index 0eedfdbf8da1d..712d5b303eeb7 100644 --- 
a/tests/python/contrib/test_hexagon/topi/slice_op/test_avg_pool2d_slice.py +++ b/tests/python/contrib/test_hexagon/topi/slice_op/test_avg_pool2d_slice.py @@ -20,74 +20,70 @@ from tvm import te import tvm.testing +from tvm.topi.testing import poolnd_python from tvm.contrib.hexagon.session import Session import tvm.topi.hexagon.slice_ops as sl import tvm.topi.hexagon.qnn as qn from tvm.contrib.hexagon import allocate_hexagon_array -from ...infrastructure import ( - transform_numpy, - quantize_np, - get_hexagon_target, -) -from ...pytest_util import ( - get_multitest_ids, - create_populated_numpy_ndarray, - TensorContentRandom, -) +import pytest +from ...infrastructure import transform_numpy, quantize_np, get_hexagon_target +from ...pytest_util import get_multitest_ids, create_populated_numpy_ndarray, TensorContentRandom -input_layout = tvm.testing.parameter( - "nhwc-8h2w32c2w-2d", -) -dtype = tvm.testing.parameter("float16", "uint8") +dtype = tvm.testing.parameter("uint8", "float16") @tvm.testing.fixture -def output_layout(output_shape, dtype): - o_b, o_h, o_w, o_c = output_shape - if dtype == "float16": - if o_h == 1 and o_w == 1: - return "n11c-1024c-2d" +def output_layout(output_shape, op_layout, dtype): + if op_layout == "NHWC": + o_b, o_h, o_w, o_c = output_shape + if dtype == "float16": + if o_h == 1 and o_w == 1: + return "n11c-1024c-2d" + else: + return "nhwc-8h2w32c2w-2d" + elif dtype == "int8" or "uint8": + if o_h == 1 and o_w == 1: + return "n11c-2048c-2d" + else: + return "nhwc-8h8w32c-2d" else: - assert o_h % 8 == 0 and o_w % 4 == 0, "Invalid output shape" - return "nhwc-8h2w32c2w-2d" - elif dtype == "int8" or "uint8": - if o_h == 1 and o_w == 1: - return "n11c-2048c-2d" + raise RuntimeError(f"Unsupported data type '{dtype}'") + + elif op_layout == "NCHW": + o_b, o_c, o_h, o_w = output_shape + if dtype == "float16": + if o_h == 1 and o_w == 1: + return "nc11-1024c-2d" + else: + return "nchw-8h2w32c2w-2d" + elif dtype == "int8" or "uint8": + if o_h == 1 and o_w == 1: + return "nc11-2048c-2d" + else: + return "nchw-8h8w32c-2d" else: - assert o_h % 8 == 0 and o_w % 8 == 0, "Invalid output shape" - return "nhwc-8h8w32c-2d" + raise RuntimeError(f"Unsupported data type '{dtype}'") else: - raise RuntimeError(f"Unsupported data type '{dtype}'") + raise RuntimeError(f"Unsupported layout for qnn.avg_pool2d '{op_layout}'") @tvm.testing.fixture -def input_np(input_shape, dtype: str, input_tensor_populator): - if dtype == "uint8": - dtype = "float32" # Use "float32" input which will be quantized later - return create_populated_numpy_ndarray(input_shape, dtype, input_tensor_populator) - - -@tvm.testing.fixture -def transformed_expected_output_np(expected_output_np, output_layout, dtype): +def input_layout(op_layout, dtype): + in_layout = op_layout.lower() if dtype == "float16": - return transform_numpy(expected_output_np, "nhwc", output_layout) - elif dtype in ("uint8", "int8"): - quant_arr, scale, zero_point = quantize_np(expected_output_np, dtype) - return [transform_numpy(quant_arr, "nhwc", output_layout), scale, zero_point] + return in_layout + "-8h2w32c2w-2d" + elif dtype == "int8" or "uint8": + return in_layout + "-8h8w32c-2d" else: raise RuntimeError(f"Unsupported data type '{dtype}'") @tvm.testing.fixture -def transformed_input_np_padded(input_np_padded, input_layout, dtype): - if dtype == "float16": - return transform_numpy(input_np_padded, "nhwc", input_layout) - elif dtype in ("uint8", "int8"): - quant_arr, scale, zero_point = quantize_np(input_np_padded, dtype) - return 
[transform_numpy(quant_arr, "nhwc", input_layout), scale, zero_point] - else: - raise RuntimeError(f"Unsupported data type '{dtype}'") +def input_np(input_shape, dtype: str, input_tensor_populator): + if dtype == "uint8": + dtype = "float32" # Use "float32" input which will be quantized later + return create_populated_numpy_ndarray(input_shape, dtype, input_tensor_populator) class TestAvgPool2dSlice: @@ -99,110 +95,143 @@ class TestAvgPool2dSlice: "pad", # padding "ceil", # ceil_mode "cnt_padded", # count_include_pad + "op_layout", # input output 4D layout None, # input_tensor_populator ] - _multitest_params = [ ( - [1, 8, 8, 32], + [1, 7, 11, 32], + [3, 3], + [3, 2], + [2, 3], + [1, 2, 3, 4], + False, + False, + "NHWC", + TensorContentRandom(), + ), + ( + [1, 1, 1, 2048], + [4, 4], + [2, 2], + [2, 3], + [0, 2, 1, 4], + False, + False, + "NHWC", + TensorContentRandom(), + ), + # Test default stride,dilation, and padding with different layouts + ( + [1, 10, 10, 32], [3, 3], [1, 1], [1, 1], [0, 0, 0, 0], False, True, + "NHWC", TensorContentRandom(), ), ( - [1, 16, 16, 32], + [1, 12, 12, 32], [3, 3], [1, 1], [1, 1], [0, 0, 0, 0], False, True, + "NHWC", TensorContentRandom(), ), ( - [1, 8, 8, 32], - [8, 8], + [1, 32, 14, 14], + [3, 3], [1, 1], [1, 1], [0, 0, 0, 0], False, True, + "NCHW", TensorContentRandom(), ), - # Test non-one stride and dilation ( - [1, 8, 8, 32], - [3, 3], - [2, 3], + [1, 32, 15, 15], + [8, 8], + [1, 1], [1, 1], [0, 0, 0, 0], False, True, + "NCHW", TensorContentRandom(), ), + # Test non-one stride and dilation with different layouts ( - [1, 8, 8, 32], + [1, 18, 24, 32], [3, 3], - [2, 2], + [2, 3], [2, 2], [0, 0, 0, 0], False, True, + "NHWC", TensorContentRandom(), ), ( - [1, 8, 8, 32], - [3, 3], + [1, 32, 18, 18], + [5, 5], [2, 2], [2, 3], [0, 0, 0, 0], False, True, + "NCHW", TensorContentRandom(), ), - # Test non-zero padding + # Test non-zero padding with count include and exclude pad and different layouts ( - [1, 8, 8, 32], + [1, 6, 6, 32], [3, 3], [1, 1], [1, 1], [1, 1, 1, 1], False, - True, + False, + "NHWC", TensorContentRandom(), ), ( [1, 8, 8, 32], [3, 3], - [1, 1], - [1, 1], - [1, 2, 3, 4], + [1, 2], + [2, 3], + [2, 2, 3, 3], False, - True, + False, + "NHWC", TensorContentRandom(), ), ( - [1, 8, 8, 32], + [1, 32, 6, 6], [3, 3], [1, 1], [1, 1], [1, 2, 3, 4], False, - True, + False, + "NCHW", TensorContentRandom(), ), ( - [1, 8, 8, 32], + [1, 32, 15, 22], [3, 3], [3, 2], [2, 3], [1, 2, 3, 4], False, - True, + False, + "NCHW", TensorContentRandom(), ), # Test n11c-1024c-2d layout which will require input and output to have different layout @@ -214,6 +243,7 @@ class TestAvgPool2dSlice: [0, 0, 0, 0], False, True, + "NHWC", TensorContentRandom(), ), ( @@ -221,9 +251,21 @@ class TestAvgPool2dSlice: [6, 6], [1, 1], [1, 1], - [0, 0, 0, 0], + [2, 2, 2, 2], False, - True, + False, + "NHWC", + TensorContentRandom(), + ), + ( + [1, 1, 1, 2048], + [4, 4], + [2, 2], + [2, 3], + [0, 2, 1, 4], + False, + False, + "NHWC", TensorContentRandom(), ), ( @@ -234,23 +276,24 @@ class TestAvgPool2dSlice: [0, 0, 0, 0], False, True, + "NHWC", TensorContentRandom(), ), ( - [1, 1, 1, 2048], + [1, 2048, 1, 1], [4, 4], [2, 2], [2, 3], [0, 0, 0, 0], False, True, + "NCHW", TensorContentRandom(), ), ] _param_ids = get_multitest_ids(_multitest_params, _param_descs) - # NOTE: input_layout is always assumed to be "nhwc-8h2w32c2w-2d" ( output_shape, kernel, @@ -259,23 +302,17 @@ class TestAvgPool2dSlice: padding, ceil_mode, count_include_pad, + op_layout, input_tensor_populator, ) = 
tvm.testing.parameters(*_multitest_params, ids=_param_ids) @tvm.testing.fixture def expected_output_np( - self, - input_np, - kernel, - stride, - dilation, - padding, - ceil_mode, - count_include_pad, + self, input_np, kernel, stride, dilation, padding, ceil_mode, count_include_pad, op_layout ): pad_before = padding[:2] pad_after = padding[2:] - ref_np = tvm.topi.testing.poolnd_python( + ref_np = poolnd_python( input_np, kernel, stride, @@ -285,150 +322,129 @@ def expected_output_np( "avg", # pool_type count_include_pad, False, # ceil_mode, - layout="NHWC", + layout=op_layout, ) return ref_np @tvm.testing.fixture - def input_shape(self, output_shape, kernel, padding, stride, dilation, output_layout): + def input_shape( + self, output_shape, kernel, padding, stride, dilation, op_layout, output_layout + ): # Input shape without any padding; 'ceil' is being ignored from calculation: - o_b, o_h, o_w, o_c = output_shape + if op_layout == "NHWC": + o_b, o_h, o_w, o_c = output_shape + else: + o_b, o_c, o_h, o_w = output_shape d_h, d_w = dilation s_h, s_w = stride k_h, k_w = kernel pad_before_h, pad_before_w = padding[:2] pad_after_h, pad_after_w = padding[2:] - if output_layout == "n11c-1024c-2d": - assert ( - pad_before_w == 0 and pad_after_w == 0 and pad_before_h == 0 and pad_after_h == 0 - ), "Padding must be zero for n11c-1024c-2d layout" + if ( + output_layout == "n11c-2048c-2d" + or output_layout == "nc11-2048c-2d" + or output_layout == "n11c-1024c-2d" + or output_layout == "nc11-1024c-2d" + ): assert o_h == 1 and o_w == 1, "Output height and width must be 1" in_h = (o_h - 1) * s_h + d_h * (k_h - 1) + 1 - pad_before_h - pad_after_h in_w = (o_w - 1) * s_w + d_w * (k_w - 1) + 1 - pad_before_w - pad_after_w - return [o_b, in_h, in_w, o_c] - - @tvm.testing.fixture - def input_shape_padded(self, input_shape, padding, output_layout, dtype): - # Input shape is adjusted to account for 'padding'. Also, due to the physical - # layout of the buffer, height and width are adjusted so that they are a - # multiple of the buffer size dictated by the layout. - # NOTE: For float16, the input layout is always assumed to be nhwc-8h2w32c2w-2d and - # for int8/uint8, it's nhwc-8h8w32c-2d. - # For both nhwc-8h2w32c2w-2d and nhwc-8h8w32c-2d, the height should be a multiple - # of 8. However, the width should be a multiple of 4 for the first case and 8 for - # the second case. 
- - height_mult = 8 - if dtype == "float16": - width_mult = 4 # input layout : nhwc-8h2w32c2w-2d - elif dtype in ("uint8", "int8"): - width_mult = 8 # input layout : nhwc-8h8w32c-2d + if op_layout == "NHWC": + return [o_b, in_h, in_w, o_c] else: - raise RuntimeError(f"Unsupport dtype '{dtype}'") - - pad_before_h, pad_before_w = padding[:2] - pad_after_h, pad_after_w = padding[2:] - padded_input_height = ( - (input_shape[1] + pad_before_h + pad_after_h + height_mult - 1) // height_mult - ) * height_mult - padded_input_width = ( - (input_shape[2] + pad_before_w + pad_after_w + width_mult - 1) // width_mult - ) * width_mult - return [input_shape[0], padded_input_height, padded_input_width, input_shape[3]] - - @tvm.testing.fixture - def input_np_padded(self, input_np, input_shape, input_shape_padded, padding): - pad_before_h, pad_before_w = padding[:2] - pad_after_h = input_shape_padded[1] - input_shape[1] - pad_before_h - pad_after_w = input_shape_padded[2] - input_shape[2] - pad_before_w - input_padded = np.pad( - input_np, - ((0, 0), (pad_before_h, pad_after_h), (pad_before_w, pad_after_w), (0, 0)), - "constant", - ) - return input_padded + return [o_b, o_c, in_h, in_w] @tvm.testing.fixture def schedule_args( self, - stride, kernel, - dtype, + stride, + padding, dilation, - input_layout, + count_include_pad, output_layout, output_shape, - input_shape_padded, - transformed_input_np_padded, - transformed_expected_output_np, + input_np, + input_shape, + input_layout, + expected_output_np, + dtype, + op_layout, ): - """ - Construct schedule args based on dtype - """ - A = te.placeholder(input_shape_padded, name="A", dtype=dtype) - + """Construct schedule args based on dtype""" + A = te.placeholder(input_shape, name="A", dtype=dtype) if dtype == "float16": - M = sl.avg_pool2d_compute(A, kernel, stride, dilation, output_shape) + if op_layout == "NHWC": + M = sl.avg_pool2d_NHWC( + A, kernel, stride, padding, dilation, count_include_pad, output_shape + ) + elif op_layout == "NCHW": + M = sl.avg_pool2d_NCHW( + A, kernel, stride, padding, dilation, count_include_pad, output_shape + ) + else: + raise RuntimeError(f"Unsupported layout for slice_op.avg_pool2d '{op_layout}'") tir_schedule = sl.avg_pool2d_schedule(M, A, output_layout, input_layout) elif dtype in ("uint8", "int8"): - in_data, in_scale, in_zero_point = transformed_input_np_padded - _, out_scale, out_zero_point = transformed_expected_output_np - M = qn.qnn_avg_pool2d_compute( - A, - kernel, - stride, - dilation, - output_shape, - dtype, - in_zero_point, - in_scale, - out_zero_point, - out_scale, - ) + _, in_scale, in_zero_point = quantize_np(input_np, dtype) + _, out_scale, out_zero_point = quantize_np(expected_output_np, dtype) + if op_layout == "NHWC": + M = qn.qnn_avg_pool2d_NHWC( + A, + kernel, + stride, + padding, + dilation, + count_include_pad, + output_shape, + dtype, + in_scale, + in_zero_point, + out_scale, + out_zero_point, + ) + elif op_layout == "NCHW": + M = qn.qnn_avg_pool2d_NCHW( + A, + kernel, + stride, + padding, + dilation, + count_include_pad, + output_shape, + dtype, + in_scale, + in_zero_point, + out_scale, + out_zero_point, + ) + else: + raise RuntimeError(f"Unsupported layout for qnn.avg_pool2d '{op_layout}'") + tir_schedule = qn.qnn_avg_pool2d_schedule(M, A, output_layout, input_layout) return [tir_schedule.mod, [A, M]] @tvm.testing.requires_hexagon def test_avg_pool2d_slice( - self, - dtype, - output_layout, - output_shape, - transformed_input_np_padded, - transformed_expected_output_np, - schedule_args, - 
hexagon_session: Session, + self, dtype, input_np, expected_output_np, schedule_args, hexagon_session: Session ): - in_data = transformed_input_np_padded - + print("schedule_args : ", schedule_args) with tvm.transform.PassContext(opt_level=3): - func = tvm.build( - *schedule_args, - get_hexagon_target("v69"), - name="avg_pool2d", - ) - - input_axis_separator = [4] - if output_layout in ( - "nhwc-8h2w32c2w-2d", - "nhwc-8h8w32c-2d", - "n11c-1024c-2d", - "n11c-2048c-2d", - ): - output_axis_separator = [4] - else: - raise RuntimeError(f"Unexpected layout '{output_layout}'") + func = tvm.build(*schedule_args, get_hexagon_target("v69"), name="avg_pool2d") + + input_axis_separator = [] + output_axis_separator = [] if dtype == "float16": - in_data_np = transformed_input_np_padded - out_data_np = transformed_expected_output_np + in_data_np = input_np + out_data_np = expected_output_np elif dtype in ("uint8", "int8"): - in_data_np, _, _ = transformed_input_np_padded - out_data_np, _, _ = transformed_expected_output_np + in_data_np, _, _ = quantize_np(input_np, dtype) + out_data_np, _, _ = quantize_np(expected_output_np, dtype) else: raise RuntimeError(f"Unsupport dtype '{dtype}'") @@ -436,34 +452,26 @@ def test_avg_pool2d_slice( hexagon_session.device, data=in_data_np, axis_separators=input_axis_separator, - mem_scope="global.vtcm", + mem_scope="global.ddr", ) output_arr = allocate_hexagon_array( hexagon_session.device, out_data_np.shape, dtype, axis_separators=output_axis_separator, - mem_scope="global.vtcm", + mem_scope="global.ddr", ) mod = hexagon_session.load_module(func) mod(input_arr, output_arr) - b, h, w, c = output_shape - if output_layout == "nhwc-8h2w32c2w-2d": - output_np = output_arr.numpy().reshape([b, h // 8, w // 4, c // 32, 8, 2, 32, 2]) - elif output_layout == "nhwc-8h8w32c-2d": - output_np = output_arr.numpy().reshape([b, h // 8, w // 8, c // 32, 8, 8, 32]) - elif output_layout == "n11c-2048c-2d": - output_np = output_arr.numpy().reshape([b, 1, 1, c // 2048, 2048]) - elif output_layout == "n11c-1024c-2d": - output_np = output_arr.numpy().reshape([b, 1, 1, c // 1024, 1024]) - else: - raise RuntimeError(f"Unexpected layout '{output_layout}'") + + output_np = output_arr.numpy() if dtype == "float16": np.testing.assert_allclose(output_np, out_data_np, rtol=1e-3, atol=1e-3) else: - np.testing.assert_allclose(output_np, out_data_np, rtol=1, atol=1) + output_np = output_arr.numpy() + np.testing.assert_allclose(output_np, out_data_np, rtol=0, atol=2) if __name__ == "__main__":
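The integration tests above exercise qnn.avg_pool2d along two paths: building the qnn op directly, and letting the FakeQuantizationToInteger pass recover it from a dequantize -> nn.avg_pool2d -> quantize chain, as compare_fq_to_int does. A minimal sketch of that second path, with illustrative shape and quantization parameters, could look like this:

import tvm
from tvm import relay

x = relay.var("data", shape=(1, 8, 8, 32), dtype="uint8")
deq = relay.qnn.op.dequantize(x, relay.const(0.05), relay.const(0))
pool = relay.nn.avg_pool2d(deq, pool_size=(3, 3), layout="NHWC")
quant = relay.qnn.op.quantize(pool, relay.const(0.05), relay.const(0), out_dtype="uint8")

mod = relay.transform.InferType()(tvm.IRModule.from_expr(quant))
mod_int = relay.transform.FakeQuantizationToInteger()(mod)
# The pass is expected to replace the dequantize/avg_pool2d/quantize chain with a
# single qnn.avg_pool2d call, so the two modules should differ structurally.
assert not tvm.ir.structural_equal(mod, mod_int)

The slice-op tests in test_avg_pool2d_slice.py then cover the Hexagon TIR schedules for both NHWC and NCHW layouts, with and without count_include_pad.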