diff --git a/python/tvm/relay/qnn/op/_qnn.py b/python/tvm/relay/qnn/op/_qnn.py index 2ad2445cd365..f036e6cf840d 100644 --- a/python/tvm/relay/qnn/op/_qnn.py +++ b/python/tvm/relay/qnn/op/_qnn.py @@ -167,3 +167,6 @@ def alter_op_layout_qnn_dense(attrs, inputs, tinfos, out_type): # qnn.batch_matmul register_strategy("qnn.batch_matmul", strategy.qnn_batch_matmul_strategy) register_pattern("qnn.batch_matmul", OpPattern.OUT_ELEMWISE_FUSABLE) + +# qnn.avg_pool2d +register_strategy("qnn.avg_pool2d", strategy.qnn_avg_pool2d_strategy) diff --git a/python/tvm/relay/qnn/op/layout_conversions.py b/python/tvm/relay/qnn/op/layout_conversions.py index 668cafb8ae34..587993603139 100644 --- a/python/tvm/relay/qnn/op/layout_conversions.py +++ b/python/tvm/relay/qnn/op/layout_conversions.py @@ -126,3 +126,38 @@ def convert_qnn_conv2d_transpose(attrs, inputs, tinfos, desired_layouts): return relay.qnn.op.conv2d_transpose(*inputs, **new_attrs) raise ValueError(f"Layout {desired_data_layout} is not yet supported") + + +@reg.register_convert_op_layout("qnn.avg_pool2d") +def convert_qnn_avg_pool2d(attrs, inputs, tinfos, desired_layouts): + """Convert Layout pass registration for QNN avg_pool2d op. + + Parameters + ---------- + attrs : tvm.ir.Attrs + Attributes of current avg_pool2d + inputs : list of tvm.relay.Expr + The args of the Relay expr to be legalized + tinfos : list of types + List of input and output types + desired_layouts : list of layout strings + List of layouts defining our desired + layout for the data input. + + Returns + ------- + result : tvm.relay.Expr + The transformed expr + """ + # pylint: disable=import-outside-toplevel + from tvm import relay + + assert len(desired_layouts) == 1, "A desired layout is expected for qnn.avg_pool2d's input" + desired_data_layout = desired_layouts[0] + if desired_data_layout == "NCHW" or desired_data_layout == "NHWC": + new_attrs = dict(attrs) + new_attrs["layout"] = str(desired_data_layout) + new_attrs["out_layout"] = str(desired_data_layout) + return relay.qnn.op.avg_pool2d(*inputs, **new_attrs) + + raise ValueError(f"Layout {desired_data_layout} is not yet supported") diff --git a/python/tvm/relay/qnn/op/qnn.py b/python/tvm/relay/qnn/op/qnn.py index 0e73a6889fcd..eb64b56e829d 100644 --- a/python/tvm/relay/qnn/op/qnn.py +++ b/python/tvm/relay/qnn/op/qnn.py @@ -1249,3 +1249,69 @@ def leaky_relu(x, alpha, input_scale, input_zero_point, output_scale, output_zer def softmax(x, scale, zero_point, output_scale, output_zero_point, axis=-1): return _make.softmax(x, axis, scale, zero_point, output_scale, output_zero_point) + + +def avg_pool2d( + data, + input_scale, + input_zero_point, + output_scale, + output_zero_point, + pool_size, + strides, + padding, + dilation, + ceil_mode=False, + count_include_pad=True, + layout="NHWC", + out_layout="", +): + + """Quantized avg_pool2d + + Parameters + ---------- + data : relay.Expr + The quantized input tensor. + input_scale: float + The scale of the input quantized expr. + input_zero_point: int + The zero point of input quantized expr. + output_scale: flaot + The scale of the output quantized expr. + output_zero_point: int + The zero point of output quantized expr. 
+ pool_size : relay.Expr + The pool_size + strides : relay.Expr + The strides + padding : relay.Expr + The padding size + dilation : relay.Expr + The dilation size + ceil_mode : bool, optional + Whether to use ceil or floor for calculating the output shape + count_include_pad : bool, optional + Determines if padding should be taken into account in the computation + layout: string, optinal + out_layout: string, optional + Returns + ------- + result : relay.Expr + The computed result. + """ + return _make.avg_pool2d( + data, + input_scale, + input_zero_point, + output_scale, + output_zero_point, + pool_size, + strides, + padding, + dilation, + ceil_mode, + count_include_pad, + layout, + out_layout, + ) diff --git a/python/tvm/relay/qnn/strategy/generic.py b/python/tvm/relay/qnn/strategy/generic.py index 3ebf8edd3665..4c5884ffdc15 100644 --- a/python/tvm/relay/qnn/strategy/generic.py +++ b/python/tvm/relay/qnn/strategy/generic.py @@ -157,6 +157,36 @@ def wrapper(_attrs, inputs, out_type): return wrapper +def wrap_compute_qnn_avg_pool2d(topi_compute): + """Wrap qnn.avg_pool2d topi compute""" + + def wrapper(attrs, inputs, out_type): + kernel = attrs.pool_size + strides = attrs.strides + padding = attrs.padding + dilation = attrs.dilation + count_include_pad = attrs.count_include_pad + oshape = out_type.shape + odtype = out_type.dtype + args = [ + inputs[0], + kernel, + strides, + padding, + dilation, + count_include_pad, + oshape, + odtype, + inputs[1], + inputs[2], + inputs[3], + inputs[4], + ] + return [topi_compute(*args)] + + return wrapper + + def wrap_topi_concatenate(topi_compute): """Wrap TOPI compute which use qnn.concatenate attrs""" @@ -280,3 +310,12 @@ def qnn_batch_matmul_strategy(attrs, inputs, out_type, target): "qnn.batch_matmul is currently only supported with Hexagon. " "Please run QNN Canonicalize pass to decompose this op into supported ops." ) + + +@override_native_generic_func("qnn_avg_pool2d_strategy") +def qnn_avg_pool2d_strategy(attrs, inputs, out_type, target): + """qnn.avg_pool2d generic strategy""" + raise RuntimeError( + "qnn.avg_pool2d is currently only supported with Hexagon. " + "Please run QNN Canonicalize pass to decompose this op into supported ops." 
+ ) diff --git a/python/tvm/relay/qnn/strategy/hexagon.py b/python/tvm/relay/qnn/strategy/hexagon.py index d17b0da6cf0a..3edbce34e30f 100644 --- a/python/tvm/relay/qnn/strategy/hexagon.py +++ b/python/tvm/relay/qnn/strategy/hexagon.py @@ -201,3 +201,27 @@ def qnn_batch_matmul_strategy_hexagon(attrs, inputs, out_type, target): name="qnn_batch_matmul.hexagon", ) return strategy + + +@qnn_avg_pool2d_strategy.register(["hexagon"]) +def qnn_avg_pool2d_strategy_hexagon(attrs, inputs, out_type, target): + """qnn.avg_pool2d strategy for Hexagon""" + data_layout = attrs.layout + if data_layout == "NHWC": + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_qnn_avg_pool2d(topi.hexagon.qnn.qnn_avg_pool2d_wrapper_compute_NHWC), + wrap_topi_schedule(topi.hexagon.qnn.schedule_qnn_avg_pool2d), + name="qnn_avg_pool2d.hexagon", + ) + return strategy + elif data_layout == "NCHW": + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_qnn_avg_pool2d(topi.hexagon.qnn.qnn_avg_pool2d_wrapper_compute_NCHW), + wrap_topi_schedule(topi.hexagon.qnn.schedule_qnn_avg_pool2d), + name="qnn_avg_pool2d.hexagon", + ) + return strategy + else: + raise RuntimeError("Unsupported strategy for qnn.avg_pool2d") diff --git a/python/tvm/relay/transform/fake_quantization_to_integer.py b/python/tvm/relay/transform/fake_quantization_to_integer.py index 4c9a3f7cd09c..5e289b0c9380 100644 --- a/python/tvm/relay/transform/fake_quantization_to_integer.py +++ b/python/tvm/relay/transform/fake_quantization_to_integer.py @@ -111,6 +111,32 @@ def identity(expr, type_map): register_unary_identity("image.resize2d") +@register_fake_quantization_to_integer("nn.avg_pool2d") +def avgpool2d(expr, type_map): + """Rewrite an avgpool op""" + attrs = {**expr.attrs} + arg = expr.args[0] + t = type_map[arg] + out_t = type_map[expr] + + out = relay.qnn.op.avg_pool2d( + arg, + t.scale, + t.zero_point, + out_t.scale, + out_t.zero_point, + attrs["pool_size"], + attrs["strides"], + attrs["padding"], + attrs["dilation"], + attrs["ceil_mode"], + attrs["count_include_pad"], + attrs["layout"], + ) + + return [out, TensorAffineType(out_t.scale, out_t.zero_point, out_t.dtype, out_t.axis)] + + @register_fake_quantization_to_integer("nn.adaptive_avg_pool1d") def adaptive_avgpool1d(expr, type_map): """Rewrite an adaptive avgpool op""" @@ -138,37 +164,6 @@ def adaptive_avgpool1d(expr, type_map): return [out, TensorAffineType(out_t.scale, out_t.zero_point, "int32", out_t.axis)] -@register_fake_quantization_to_integer("nn.avg_pool2d") -def avgpool2d(expr, type_map): - """Rewrite a avgpool op""" - arg = expr.args[0] - t = type_map[arg] - out_t = type_map[expr] - # Cast (or requantize) to int32. - if not ( - approx_equal(t.scale, out_t.scale) - and approx_equal(t.zero_point, out_t.zero_point) - and tvm.ir.structural_equal(t.dtype, out_t.dtype) - ): - arg = relay.qnn.op.requantize( - arg, - t.scale, - t.zero_point, - out_t.scale, - out_t.zero_point, - out_dtype="int32", - axis=t.axis, - ) - else: - arg = relay.op.cast(arg, "int32") - out = relay.op.nn.avg_pool2d(arg, **expr.attrs) - if out_t.dtype != "int32": - # Cast back to output dtype to preserve input dtype == output dtype for AvgPool2d. 
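The avgpool2d rewrite registered above maps a fake-quantized nn.avg_pool2d directly onto relay.qnn.op.avg_pool2d. Below is a minimal usage sketch of the Python wrapper added in python/tvm/relay/qnn/op/qnn.py; it is illustrative only (it assumes this patch is applied, and the shape and quantization parameters are arbitrary values, not taken from the patch):

    import tvm
    from tvm import relay

    # NHWC uint8 input; scales and zero points must be scalar constants,
    # matching the checks in QnnAvgPool2DRel.
    data = relay.var("data", shape=(1, 16, 16, 32), dtype="uint8")
    pool = relay.qnn.op.avg_pool2d(
        data,
        input_scale=relay.const(0.02, "float32"),
        input_zero_point=relay.const(3, "int32"),
        output_scale=relay.const(0.02, "float32"),
        output_zero_point=relay.const(3, "int32"),
        pool_size=(3, 3),
        strides=(1, 1),
        padding=(1, 1, 1, 1),
        dilation=(1, 1),
        ceil_mode=False,
        count_include_pad=False,
        layout="NHWC",
    )
    mod = relay.transform.InferType()(tvm.IRModule.from_expr(relay.Function([data], pool)))
    print(mod)  # output keeps shape (1, 16, 16, 32) for pool 3x3, stride 1, padding 1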
- out = relay.op.clip(out, a_min=np.iinfo(out_t.dtype).min, a_max=np.iinfo(out_t.dtype).max) - out = relay.op.cast(out, out_t.dtype) - return [out, TensorAffineType(out_t.scale, out_t.zero_point, out_t.dtype, out_t.axis)] - - @register_fake_quantization_to_integer("nn.global_avg_pool2d") def global_avgpool2d(expr, type_map): """Rewrite a global_avgpool op""" diff --git a/python/tvm/topi/hexagon/compute_poolarea.py b/python/tvm/topi/hexagon/compute_poolarea.py new file mode 100644 index 000000000000..6ba50c4a96e1 --- /dev/null +++ b/python/tvm/topi/hexagon/compute_poolarea.py @@ -0,0 +1,143 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, unused-variable, unused-argument, too-many-locals + +"""Compute PoolArea size which is used to exclude the zero-padding elements in the averaging + calculation. +""" + +from tvm import te, tir + + +def compute_PoolArea(i, j, ih, iw, kh, kw, sh, sw, dh, dw, pad_top, pad_left): + """ + Parameters + ---------- + i,j: + index of output tensor along H and W axis + This is equal to the starting point of the sliding window for which the average is computed + ih, iw: + input data size along H and W axis + kh, kw: + Kernel size along H and W axis + sh, sw: + Stride size along H and W axis + dh, dw: + Dilation size along H and W axis + pad_top, pad_left: + Pad size on Top and left side of input data + + # PoolArea refers to the area of that portion of each sliding window which only includes + # the input data and not the padded area. + + # Motivation: The following example shows the location of the first sliding window (at i=0, j=0) + # on a 6*6 array, with kernel=[3,3] and padding=[1, 1, 1, 1]. + # The input data elements are shown with (X) and padding data with (0). + # As shown, the number of non-padding elements that should be used for computing + # the average of values inside this window is 4, while the windows area is 3*3=9. + # To compute the PoolArea, we have to move the top/left edge of the window down/right + # to exclude zero-padding elements. The edge adjustment can be formulated as + # top_edge = max(i , pad_top) + # left_edge= max(j , pad_left) + # Note that pad_top and pad_left represent point 0 of the input data along i and j direction. + # In this example, bottom_edge and right_edge of the PoolArea do not need any adjustment, + # because there is no padding data on those side of the window. + # However, as we slide the window down and to the right, the window might go + # beyond the input data boundaries (ih and iw). In these cases, bottom/right edge should be + # moved up/left to be located inside the input data. 
+    # This can be formulated as
+    #    bottom_edge = min(i + kh, ih + pad_top)
+    #    right_edge = min(j + kw, iw + pad_left)
+    # Having all the edges,
+    #    PoolArea = (bottom_edge - top_edge) * (right_edge - left_edge)
+
+    #     _______
+    #    |0 0 0|0 0 0 0 0                 0 0 0 0 0 0 0 0
+    #    |     |                           _______
+    #    |0 X X|X X X X 0                 |0 X X|X X X X 0
+    #    |     |                          |     |
+    #    |0 X X|X X X X 0     ====>       |0 X X|X X X X 0
+    #    |_____|                          |_____|
+    #     0 X X X X X X 0                  0 X X X X X X 0
+    #     0 X X X X X X 0                  0 X X X X X X 0
+    #     0 X X X X X X 0                  0 X X X X X X 0
+    #     0 X X X X X X 0                  0 X X X X X X 0
+    #     0 0 0 0 0 0 0 0                  0 0 0 0 0 0 0 0
+
+
+    # The above equations are derived under the assumption of having the default value (1)
+    # for stride and dilation. However, we need to expand them to support non-default
+    # stride and dilation values.
+    # Stride impacts the starting location of the sliding windows, so i and j should be
+    # replaced by (i * sh) and (j * sw), respectively, in the equations.
+    # Dilation changes the window size, scattering the k kernel elements over a d * (k - 1) + 1
+    # window.
+    # With a non-unit dilation, we also need to divide the adjusted window extent by the
+    # dilation value to find out how many kernel elements of the sliding window fall
+    # inside the input data boundaries:
+    #    top_edge = max(i * sh, pad_top)
+    #    left_edge = max(j * sw, pad_left)
+    #    bottom_edge = min(i * sh + (kh - 1) * dh + 1, ih + pad_top)
+    #    right_edge = min(j * sw + (kw - 1) * dw + 1, iw + pad_left)
+    #    PoolArea = ceil_div((bottom_edge - top_edge), dh) * ceil_div((right_edge - left_edge), dw)
+    #
+    # Finally, we need to address one corner case related to the non-default dilation:
+    # Consider the following example along the W axis, where iw = 3, kw = 3, and dw = 2.
+    # The first figure on the left shows the sliding window of size 5 starting at index 0,
+    # and the first figure on the right shows the same example with the sliding window at index 1.
+    # The second row of figures shows the PoolArea after adjusting the edges
+    # (in both cases right_edge - left_edge = 3).
+    # The third row of figures shows the location of the dilated kernel points (*).
+    # As shown, although the distance between the left and right edges is 3 in both cases,
+    # dilation is 2, and ceil_div(3, 2) = 2, the right-hand PoolArea only includes 1 kernel point.
+
+    # Sliding Window:                        |0 0 X X X |0 0        |0 X X X 0|
+    # PoolArea(after edge adjustment):        0 0|X X X |0 0        0|X X X| 0
+    # location of dilated kernel points:      * 0|* X * |0 0        *|X * X| 0
+    # PoolArea (dilated_point_aware):         * 0|* X * |0 0        * X|* X| 0
+
+    # To address this issue, instead of moving the left_edge to bring it just inside the input
+    # data boundary, we should move the edge to the right until we reach the first dilated kernel
+    # point inside the input data boundary.
+    # The fourth row of figures shows how this adjustment solves the problem.
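A small standalone check of the W-axis edge formulas above, using the values from the example in this comment block (iw = 3, kw = 3, dw = 2, sw = 1, pad_left = 2). This is plain Python for illustration only; the function names are ad hoc, and the "naive" variant is included just to show why the dilated-point-aware left edge is needed:

    from math import ceil

    def w_axis_pool_count(j, iw, kw, sw, dw, pad_left):
        # dilation-aware left edge: the first dilated kernel point inside the input data
        if j * sw - pad_left < 0:
            left_edge = j * sw + ((pad_left - j * sw - 1) // dw + 1) * dw
        else:
            left_edge = j * sw
        right_edge = min(j * sw + (kw - 1) * dw + 1, iw + pad_left)
        return ceil((right_edge - left_edge) / dw)

    def w_axis_pool_count_naive(j, iw, kw, sw, dw, pad_left):
        # edge adjustment without the dilated-point correction
        left_edge = max(j * sw, pad_left)
        right_edge = min(j * sw + (kw - 1) * dw + 1, iw + pad_left)
        return ceil((right_edge - left_edge) / dw)

    for j in (0, 1):
        print(j, w_axis_pool_count(j, 3, 3, 1, 2, 2), w_axis_pool_count_naive(j, 3, 3, 1, 2, 2))
    # prints "0 2 2" and "1 1 2": the naive edge over-counts the window at j = 1,
    # which is exactly the corner case illustrated above.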
+    # So the problem is reduced to finding the first dilated kernel point inside the data
+    # boundary. For that, we can find the number of dilated points which are mapped to the padded
+    # area and then the location of the next one, which should be inside the input data:
+    #    num_prev_points = (pad_top - i * sh - 1) // dh
+    #    next_point_index = i * sh + (num_prev_points + 1) * dh
+    #
+    # With that, top_edge and left_edge can be reformulated as:
+    #    if i * sh - pad_top < 0:
+    #        top_edge = i * sh + ((pad_top - i * sh - 1) // dh + 1) * dh
+    #    else:
+    #        top_edge = i * sh
+    #
+    #    if j * sw - pad_left < 0:
+    #        left_edge = j * sw + ((pad_left - j * sw - 1) // dw + 1) * dw
+    #    else:
+    #        left_edge = j * sw
+
+    """
+    top_edge = tir.if_then_else(
+        tir.all(i * sh - pad_top < 0), i * sh + ((pad_top - i * sh - 1) // dh + 1) * dh, i * sh
+    )
+    bottom_edge = te.min(i * sh + (kh - 1) * dh + 1, ih + pad_top)
+    left_edge = tir.if_then_else(
+        tir.all(j * sw - pad_left < 0), j * sw + ((pad_left - j * sw - 1) // dw + 1) * dw, j * sw
+    )
+    right_edge = te.min(j * sw + (kw - 1) * dw + 1, iw + pad_left)
+    return -((bottom_edge - top_edge) // -dh) * -((right_edge - left_edge) // -dw)
diff --git a/python/tvm/topi/hexagon/qnn/__init__.py b/python/tvm/topi/hexagon/qnn/__init__.py
index ba7d64b6b56d..f7c4502301c0 100644
--- a/python/tvm/topi/hexagon/qnn/__init__.py
+++ b/python/tvm/topi/hexagon/qnn/__init__.py
@@ -18,7 +18,7 @@
 """ Computes and schedules for Hexagon quantized ops """
 
 from .adaptive_avg_pool1d import *
-from .avg_pool2d import qnn_avg_pool2d_compute, qnn_avg_pool2d_schedule
+from .avg_pool2d import *
 from .conv2d_alter_op import *
 from .dense_alter_op import *
 from .dequantize import dequantize_compute, dequantize_schedule
diff --git a/python/tvm/topi/hexagon/qnn/avg_pool2d.py b/python/tvm/topi/hexagon/qnn/avg_pool2d.py
index 4aac15cbdc17..1370ad36e468 100644
--- a/python/tvm/topi/hexagon/qnn/avg_pool2d.py
+++ b/python/tvm/topi/hexagon/qnn/avg_pool2d.py
@@ -16,64 +16,144 @@
 # under the License.
 # pylint: disable=invalid-name, unused-variable, unused-argument, too-many-locals
 
-""" Compute and schedule for quantized avg_pool2d op
-
-Please note the following assumptions made by the implementation:
-
-1) The input must be padded in advance to account for 'padding'. In addition,
-   both input and output must be padded as per the physical buffer layout.
-2) The current implementation assumes 'count_include_pad' to be 'True'. It can be
-   modified to support 'False' case but the element count for the pooling window
-   must be pre-computed and provided as an input to reduce the run-time overhead.
-3) 'padding' is ignored. It must be handled outside of the sliced op.
-4) Please note that this implementation will not work if the output includes any
-   physical layout related padding as it can result into out-of-bound access
-   for the input.
-""" +""" Compute and schedule for quantized avg_pool2d op """ +import tvm from tvm import te from tvm import tir -from ..utils import get_layout_transform_fn, get_fixed_point_value +from ..utils import ( + get_layout_transform_fn, + get_fixed_point_value, + is_scalar, + get_const_int_value, + get_const_float_value, +) +from ...utils import get_const_tuple +from ...nn.utils import get_pad_tuple +from ...nn.pad import pad +from ..compute_poolarea import compute_PoolArea + + +def saturate(x: te.Tensor, dtype: str): + """Saturate value for the specified data type""" + return te.max(te.min_value(dtype), te.min(x, te.max_value(dtype))) -def validate_out_shape(out_shape: list, in_shape: list, kernel: list, stride: list, dilation: list): - """Validate output shape""" - _, oh, ow, _ = out_shape - _, ih, iw, _ = in_shape +def qnn_avg_pool2d_NCHW( + data: te.Tensor, + kernel: list, + stride: list, + padding: list, + dilation: list, + count_include_pad: bool, + oshape: list, + odtype: str, + # quantization params: + input_scale: float, + input_zero_point: int, + output_scale: float, + output_zero_point: int, +): + """Compute for quantized avg_pool2d""" kh, kw = kernel + rh = te.reduce_axis((0, kh), name="rh") + rw = te.reduce_axis((0, kw), name="rw") + + if odtype == "uint8": + temp_dtype = "uint16" + elif odtype == "int8": + temp_dtype = "int16" + else: + raise RuntimeError(f"Unsupported output dtype, {odtype}'") + sh, sw = stride dh, dw = dilation - if ih < (oh - 1) * sh + dh * (kh - 1) + 1: - raise RuntimeError("Output height is too large") - if iw < (ow - 1) * sw + dw * (kw - 1) + 1: - raise RuntimeError("Output width is too large") + scale = input_scale / output_scale + scale_fixed_point, rsh = get_fixed_point_value(scale, "int16") + corr = (output_zero_point << rsh) - input_zero_point * scale_fixed_point -def saturate(x: te.Tensor, dtype: str): - """Saturate value for the specified data type""" - return te.max(te.min_value(dtype), te.min(x, te.max_value(dtype))) + dilated_kh = (kh - 1) * dh + 1 + dilated_kw = (kw - 1) * dw + 1 + + pad_top, pad_left, pad_down, pad_right = get_pad_tuple( + get_const_tuple(padding), (dilated_kh, dilated_kw) + ) + + # DOPAD + if pad_top != 0 or pad_down != 0 or pad_left != 0 or pad_right != 0: + pad_before = (0, 0, pad_top, pad_left) + pad_after = (0, 0, pad_down, pad_right) + data_pad = pad(data, pad_before, pad_after, pad_value=input_zero_point, name="data_pad") + else: + # By definition when True, zero-padding will be included in the averaging calculation + # This is equivalent to PoolArea = (kh * kw) + count_include_pad = True + data_pad = data + + Sum = te.compute( + oshape, + lambda b, c, h, w: te.sum( + data_pad[b, c, h * sh + dh * rh, w * sw + dw * rw].astype(temp_dtype), axis=[rh, rw] + ), + name="pool_sum", + ) + + if not count_include_pad: + # Compute PoolArea using unpadded input tensor + _, _, oh, ow = oshape + _, _, ih, iw = data.shape + PoolArea = te.compute( + (oh, ow), + lambda i, j: compute_PoolArea(i, j, ih, iw, kh, kw, sh, sw, dh, dw, pad_top, pad_left), + name="pool_area", + ) -def qnn_avg_pool2d_compute( + ScaleWithArea = te.compute( + (oh, ow), + lambda i, j: (scale_fixed_point // PoolArea[i, j]).astype("int32"), + name="scale_with_area", + ) + + Avg = te.compute( + oshape, + lambda b, c, h, w: saturate( + ((Sum[b, c, h, w] * ScaleWithArea[h, w]) + corr + (1 << (rsh - 1))) >> rsh, odtype + ).astype(odtype), + name="pool_avg", + ) + else: + ScaleWithArea = scale_fixed_point // (kh * kw) + Avg = te.compute( + oshape, + lambda b, c, h, w: 
saturate( + ((Sum[b, c, h, w] * ScaleWithArea) + corr + (1 << (rsh - 1))) >> rsh, odtype + ).astype(odtype), + name="pool_avg", + ) + return Avg + + +def qnn_avg_pool2d_NHWC( data: te.Tensor, kernel: list, stride: list, + padding: list, dilation: list, + count_include_pad: bool, oshape: list, odtype: str, # quantization params: - input_zero_point: int, input_scale: float, - output_zero_point: int, + input_zero_point: int, output_scale: float, + output_zero_point: int, ): """Compute for quantized avg_pool2d""" kh, kw = kernel rh = te.reduce_axis((0, kh), name="rh") rw = te.reduce_axis((0, kw), name="rw") - ob, oh, ow, oc = oshape - if isinstance(ob, int): - validate_out_shape(oshape, data.shape, kernel, stride, dilation) if odtype == "uint8": temp_dtype = "uint16" @@ -85,86 +165,217 @@ def qnn_avg_pool2d_compute( sh, sw = stride dh, dw = dilation - PoolArea = kh * kw - scale = input_scale / output_scale scale_fixed_point, rsh = get_fixed_point_value(scale, "int16") - scale_with_area = scale_fixed_point // PoolArea corr = (output_zero_point << rsh) - input_zero_point * scale_fixed_point + dilated_kh = (kh - 1) * dh + 1 + dilated_kw = (kw - 1) * dw + 1 + # Compute Area + + pad_top, pad_left, pad_down, pad_right = get_pad_tuple( + get_const_tuple(padding), (dilated_kh, dilated_kw) + ) + # DOPAD + if pad_top != 0 or pad_down != 0 or pad_left != 0 or pad_right != 0: + pad_before = (0, pad_top, pad_left, 0) + pad_after = (0, pad_down, pad_right, 0) + data_pad = pad(data, pad_before, pad_after, pad_value=input_zero_point, name="data_pad") + else: + # By definition when True, zero-padding will be included in the averaging calculation + # This is equivalent to PoolArea = (kh * kw) + count_include_pad = True + data_pad = data + Sum = te.compute( oshape, lambda b, h, w, c: te.sum( - data[b, h * sh + dh * rh, w * sw + dw * rw, c].astype(temp_dtype), axis=[rh, rw] + data_pad[b, h * sh + dh * rh, w * sw + dw * rw, c].astype(temp_dtype), axis=[rh, rw] ), - name="sum", + name="pool_sum", ) - Avg = te.compute( - oshape, - lambda b, h, w, c: saturate( - ((Sum[b, h, w, c] * scale_with_area) + corr) >> rsh, odtype - ).astype(odtype), - name="avg", - ) + if not count_include_pad: + # Compute PoolArea using unpadded input tensor + _, oh, ow, _ = oshape + _, ih, iw, _ = data.shape + + PoolArea = te.compute( + (oh, ow), + lambda i, j: compute_PoolArea(i, j, ih, iw, kh, kw, sh, sw, dh, dw, pad_top, pad_left), + name="pool_area", + ) + + ScaleWithArea = te.compute( + (oh, ow), + lambda i, j: tir.if_then_else( + tir.all(PoolArea[i, j] > 0), + (scale_fixed_point // PoolArea[i, j]).astype("int32"), + 0, + ), + name="scale_with_area", + ) + + Avg = te.compute( + oshape, + lambda b, h, w, c: saturate( + ((Sum[b, h, w, c] * ScaleWithArea[h, w]) + corr + (1 << (rsh - 1))) >> rsh, odtype + ).astype(odtype), + name="pool_avg", + ) + else: + ScaleWithArea = scale_fixed_point // (kh * kw) + Avg = te.compute( + oshape, + lambda b, h, w, c: saturate( + ((Sum[b, h, w, c] * ScaleWithArea) + corr + (1 << (rsh - 1))) >> rsh, odtype + ).astype(odtype), + name="pool_avg", + ) + return Avg -def schedule_nhwc_8h8w32c(outs: te.Tensor, ins: te.Tensor, output_layout: str, input_layout: str): - """Schedule for input and output layout nhwc-8h8w32c""" +def qnn_avg_pool2d_wrapper_compute_NCHW( + data: te.Tensor, + kernel: list, + stride: list, + padding: list, + dilation: list, + count_include_pad: bool, + oshape: list, + odtype: str, + # quantization params: + input_scale: float, + input_zero_point: int, + output_scale: float, + 
output_zero_point: int, +): + """Extract qnn params""" + if ( + is_scalar(input_scale) + and is_scalar(output_scale) + and is_scalar(input_zero_point) + and is_scalar(output_zero_point) + ): + iscale = get_const_float_value(input_scale) + oscale = get_const_float_value(output_scale) + izero_point = get_const_int_value(input_zero_point) + ozero_point = get_const_int_value(output_zero_point) + return qnn_avg_pool2d_NCHW( + data, + kernel, + stride, + padding, + dilation, + count_include_pad, + oshape, + odtype, + iscale, + izero_point, + oscale, + ozero_point, + ) + else: + raise RuntimeError("quantization parameters should be scalar tensors") + + +def qnn_avg_pool2d_wrapper_compute_NHWC( + data: te.Tensor, + kernel: list, + stride: list, + padding: list, + dilation: list, + count_include_pad: bool, + oshape: list, + odtype: str, + # quantization params: + input_scale: float, + input_zero_point: int, + output_scale: float, + output_zero_point: int, +): + """Extract qnn params""" + if ( + is_scalar(input_scale) + and is_scalar(output_scale) + and is_scalar(input_zero_point) + and is_scalar(output_zero_point) + ): + iscale = get_const_float_value(input_scale) + oscale = get_const_float_value(output_scale) + izero_point = get_const_int_value(input_zero_point) + ozero_point = get_const_int_value(output_zero_point) + return qnn_avg_pool2d_NHWC( + data, + kernel, + stride, + padding, + dilation, + count_include_pad, + oshape, + odtype, + iscale, + izero_point, + oscale, + ozero_point, + ) + else: + raise RuntimeError("quantization parameters should be scalar tensors") + + +def schedule_qnn_avg_pool2d(outs): + """Schedule for qnn.avg_pool2d + Parameters + ---------- + outs: Array of Tensor + The computation graph description of qnn.avg_pool2d + in the format of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. + """ + outs = [outs] if isinstance(outs, tvm.te.tensor.Tensor) else outs + s = tvm.te.create_schedule([x.op for x in outs]) + tvm.te.schedule.AutoInlineInjective(s) + return s + + +def schedule_8h8w32c(outs: te.Tensor, ins: te.Tensor, output_layout: str, input_layout: str): + """Schedule for input and output layout 8h8w32c""" + func = te.create_prim_func([ins, outs]) s = tir.Schedule(func) - Sum = s.get_block("sum") - Avg = s.get_block("avg") - + Sum = s.get_block("pool_sum") + Avg = s.get_block("pool_avg") + mem_scope = "global.vtcm" + sum_read = s.cache_read(Sum, 0, mem_scope) + avg_read = s.cache_read(Avg, 0, mem_scope) + avg_write = s.cache_write(Avg, 0, mem_scope) input_transform_fn = get_layout_transform_fn(input_layout) output_transform_fn = get_layout_transform_fn(output_layout) - s.transform_layout(Sum, ("read", 0), input_transform_fn) - s.transform_layout(Avg, ("write", 0), output_transform_fn) - - # Schedule 'Avg' - # Split and reorder the axes to iterate over the output tensor chunks. - # Each chunk consists for 2048 bytes with 32 channels being the fastest - # changing axis, followed by 8 width and then 8 height. - # The width is split by a factor of 4 and then fused with 32 channels - # to provide full vector length of data for the output tensor chunks. - # NOTE: These schedules are a work in progress and may require - # adjustments in future as some of the missing features for 2-d tensors - # become available. 
- n, h, w, c = s.get_loops(Avg) - ho, hi = s.split(h, [None, 8]) - wo, wi = s.split(w, [None, 8]) - wio, wii = s.split(wi, [None, 4]) - co, ci = s.split(c, [None, 32]) - s.reorder(n, ho, wo, co, hi, wio, wii, ci) - wii_ci = s.fuse(wii, ci) - s.vectorize(wii_ci) - - # Schedule 'Sum' - s.compute_at(Sum, wio) - Sum_axis = s.get_loops(Sum) - # Compute for 'Sum' includes reduction along height and width. The axes - # are being reordered so that 4 width and 32 channels become the - # inner-most loops which then can be fused and vectorized. However, - # vectorization of the 2-d tensors doesn't work when reduction is - # involved and requires codegen support that is yet to be added. - s.reorder(Sum_axis[-2], Sum_axis[-1], Sum_axis[-4], Sum_axis[-3]) - ci_wii = s.fuse(Sum_axis[-4], Sum_axis[-3]) - # s.vectorize(ci_wii) # Doesn't work + s.transform_layout(Sum, ("read", 0), input_transform_fn, pad_value=0) + s.transform_layout(Avg, ("read", 0), input_transform_fn, pad_value=0) + s.transform_layout(Avg, ("write", 0), output_transform_fn, pad_value=0) return s -def schedule_n11c_2048c(outs: te.Tensor, ins: te.Tensor, output_layout: str, input_layout: str): - """Schedule for output layout: n11c-2048c, input layout: nhwc-8h8w32c""" +def schedule_2048c(outs: te.Tensor, ins: te.Tensor, output_layout: str, input_layout: str): + """Schedule for output layout: 2048c, input layout: 8h8w32c""" func = te.create_prim_func([ins, outs]) s = tir.Schedule(func) - Sum = s.get_block("sum") - Avg = s.get_block("avg") + Sum = s.get_block("pool_sum") + Avg = s.get_block("pool_avg") + mem_scope = "global.vtcm" + sum_read = s.cache_read(Sum, 0, mem_scope) + avg_write = s.cache_write(Avg, 0, mem_scope) input_transform_fn = get_layout_transform_fn(input_layout) output_transform_fn = get_layout_transform_fn(output_layout) - s.transform_layout(Sum, ("read", 0), input_transform_fn) - s.transform_layout(Avg, ("write", 0), output_transform_fn) + s.transform_layout(Sum, ("read", 0), input_transform_fn, pad_value=0) + s.transform_layout(Avg, ("write", 0), output_transform_fn, pad_value=0) # Schedule 'Avg' # Split and reorder the axes to iterate over the output tensor chunks. @@ -173,7 +384,13 @@ def schedule_n11c_2048c(outs: te.Tensor, ins: te.Tensor, output_layout: str, inp # NOTE: These schedules are a work in progress and may require # adjustments in future as some of the missing features for 2-d tensors # become available. - n, h, w, c = s.get_loops(Avg) + + if output_layout == "n11c-2048c-2d": + _, _, _, c = s.get_loops(Avg) + else: + _, c, _, _ = s.get_loops(Avg) + + # n, h, w, c = s.get_loops(Avg) co, ci = s.split(c, [None, 2048]) cio, cii = s.split(ci, [None, 128]) s.vectorize(cii) @@ -191,15 +408,10 @@ def schedule_n11c_2048c(outs: te.Tensor, ins: te.Tensor, output_layout: str, inp def qnn_avg_pool2d_schedule(outs: te.Tensor, ins: te.Tensor, output_layout: str, input_layout: str): - """Quantized avg_pool2d schedule + """Quantized avg_pool2d schedule""" + if output_layout == "nhwc-8h8w32c-2d" or output_layout == "nchw-8h8w32c-2d": + return schedule_8h8w32c(outs, ins, output_layout, input_layout) + if output_layout == "n11c-2048c-2d" or output_layout == "nc11-2048c-2d": + return schedule_2048c(outs, ins, output_layout, input_layout) - NOTE: This schedule assumes that both input and output tensors are in the form of - 2d discontiguous buffer and data is already arranged as per the input and output layout - respectively. 
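For reference, a pure-Python model of the fixed-point averaging used by qnn_avg_pool2d_NCHW/qnn_avg_pool2d_NHWC above (count_include_pad=True path). It is only a sketch: rsh is hard-coded here to stand in for get_fixed_point_value, which in the real compute chooses the shift so that the scale fits an int16, and the input values are arbitrary:

    import numpy as np

    in_scale, in_zp = 0.02, 3
    out_scale, out_zp = 0.04, 5
    kh = kw = 3
    window = np.random.randint(0, 256, size=(kh, kw)).astype(np.int64)  # one uint8 pooling window

    # float reference: dequantize, average, requantize
    real = (window - in_zp) * in_scale
    ref = int(np.clip(np.round(real.mean() / out_scale) + out_zp, 0, 255))

    # fixed-point path mirroring the TE compute
    scale = in_scale / out_scale
    rsh = 15                                   # assumed shift; stands in for get_fixed_point_value
    scale_fp = int(round(scale * (1 << rsh)))  # fixed-point representation of scale
    corr = (out_zp << rsh) - in_zp * scale_fp
    scale_with_area = scale_fp // (kh * kw)
    acc = int(window.sum())
    q = (acc * scale_with_area + corr + (1 << (rsh - 1))) >> rsh
    q = int(np.clip(q, 0, 255))                # saturate() for uint8

    print(ref, q)  # the two values agree to within one quantization step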
- - """ - if output_layout == "nhwc-8h8w32c-2d": - return schedule_nhwc_8h8w32c(outs, ins, output_layout, input_layout) - if output_layout == "n11c-2048c-2d": - return schedule_n11c_2048c(outs, ins, output_layout, input_layout) raise RuntimeError(f"Unexpected layout '{output_layout}'") diff --git a/python/tvm/topi/hexagon/qnn/nn.py b/python/tvm/topi/hexagon/qnn/nn.py index e60314b82757..286c33f53f42 100644 --- a/python/tvm/topi/hexagon/qnn/nn.py +++ b/python/tvm/topi/hexagon/qnn/nn.py @@ -22,8 +22,14 @@ import tvm from tvm import te, topi -from ..utils import saturate, get_fixed_point_value -from ...utils import get_const_tuple, get_const_int, get_const_float +from ..utils import ( + saturate, + is_scalar, + get_const_int_value, + get_const_float_value, + get_fixed_point_value, +) +from ...utils import get_const_tuple from ...nn.utils import get_pad_tuple from ...nn.pad import pad from ... import tag, nn @@ -38,27 +44,6 @@ def clip_cast(val, dtype): return te.max(tvm.te.min(val, const_max), const_min).astype(dtype) -# Return True if given expression is scalar constant value. -def is_scalar(expr): - if isinstance(expr, te.Tensor): - return expr.ndim == 0 and (isinstance(expr.op.body[0], (tvm.tir.FloatImm, tvm.tir.IntImm))) - return isinstance(expr, (tvm.tir.FloatImm, tvm.tir.IntImm)) - - -def get_const_int_value(expr): - if isinstance(expr, te.Tensor): - assert isinstance(expr.op.body[0], tvm.tir.IntImm) - return expr.op.body[0].value - return get_const_int(expr) - - -def get_const_float_value(expr): - if isinstance(expr, te.Tensor): - assert isinstance(expr.op.body[0], tvm.tir.FloatImm) - return expr.op.body[0].value - return get_const_float(expr) - - def get_qnn_param(param, indices, axis): # Account scalar and 1D quantization parameters: if len(param.shape) == 0: @@ -68,11 +53,7 @@ def get_qnn_param(param, indices, axis): return param[param_idx] -def subtract_zero_point( - tensor: te.Tensor, - zero_point: Union[te.Tensor, tvm.tir.IntImm], - name: str, -): +def subtract_zero_point(tensor: te.Tensor, zero_point: Union[te.Tensor, tvm.tir.IntImm], name: str): """ Subtract zero point from given tensor. If zero point is scalar constant and is equal to 0, then it can be optimized and return tensor as it is. @@ -590,10 +571,7 @@ def qnn_conv2d( # Conv2d inputs oshape, lambda n, oc, oh, ow: te.sum( data_pad[ - n, - ic, - oh * height_stride + kh * dilation_h, - ow * width_stride + kw * dilation_w, + n, ic, oh * height_stride + kh * dilation_h, ow * width_stride + kw * dilation_w ].astype("int32") * weight[oc, ic, kh, kw].astype("int32"), axis=[ic, kh, kw], @@ -777,10 +755,7 @@ def qnn_depthwise_conv2d( # Conv2d inputs oshape, lambda n, oc, oh, ow: te.sum( data_pad[ - n, - oc, - oh * height_stride + kh * dilation_h, - ow * width_stride + kw * dilation_w, + n, oc, oh * height_stride + kh * dilation_h, ow * width_stride + kw * dilation_w ].astype("int32") * te.subtract(weight[oc, 0, kh, kw], kernel_zero_point).astype("int32"), axis=[kh, kw], diff --git a/python/tvm/topi/hexagon/slice_ops/__init__.py b/python/tvm/topi/hexagon/slice_ops/__init__.py index 46ae0c53200f..b38dd5ecb3c1 100644 --- a/python/tvm/topi/hexagon/slice_ops/__init__.py +++ b/python/tvm/topi/hexagon/slice_ops/__init__.py @@ -17,7 +17,7 @@ """ Computes and Schedules for Hexagon slice ops. 
""" -from .avg_pool2d import avg_pool2d_compute, avg_pool2d_schedule +from .avg_pool2d import avg_pool2d_NHWC, avg_pool2d_NCHW, avg_pool2d_schedule from .max_pool2d import max_pool2d_compute, max_pool2d_STIR_schedule from .add_subtract_multiply import * from .argmax import argmax_compute, argmax_schedule diff --git a/python/tvm/topi/hexagon/slice_ops/avg_pool2d.py b/python/tvm/topi/hexagon/slice_ops/avg_pool2d.py index bf6d57b8f7f8..0c7b00e287c3 100644 --- a/python/tvm/topi/hexagon/slice_ops/avg_pool2d.py +++ b/python/tvm/topi/hexagon/slice_ops/avg_pool2d.py @@ -16,128 +16,214 @@ # under the License. # pylint: disable=invalid-name, unused-variable, unused-argument, too-many-locals, pointless-exception-statement -""" Compute and schedule for avg_pool2d slice op - -Please note the following assumptions made by the implementation: - -1) The input must be padded in advance to account for 'padding'. In addition, - both input and output must be padded as per the physical buffer layout. -2) The current implementation assumes 'count_include_pad' to be 'True'. It can be - modified to support 'False' case but the element count for the pooling window - must be pre-computed and provided as an input to reduce the run-time overhead. -3) 'padding' is ignored. It must be handled outside of the sliced op. -4) Please note that this implementation will not work if the output includes any - physical layout related padding as it can result into out-of-bound access - for the input. -""" +""" Compute and schedule for avg_pool2d slice op """ from tvm import te from tvm import tir from ..utils import get_layout_transform_fn +from ...utils import get_const_tuple +from ...nn.utils import get_pad_tuple +from ...nn.pad import pad +from ..compute_poolarea import compute_PoolArea -def validate_out_shape(out_shape, in_shape, kernel, stride, dilation): - """Validate output shape""" - _, oh, ow, _ = out_shape - _, ih, iw, _ = in_shape +def avg_pool2d_NCHW( + data, kernel, stride, padding, dilation, count_include_pad, oshape, odtype="float16" +): + """avg_pool2d compute""" + if odtype != "float16": + raise RuntimeError(f"Unsupported output dtype '{odtype}'") kh, kw = kernel + rh = te.reduce_axis((0, kh), name="rh") + rw = te.reduce_axis((0, kw), name="rw") sh, sw = stride dh, dw = dilation - if ih < (oh - 1) * sh + dh * (kh - 1) + 1: - raise RuntimeError("Output height is too large") - if iw < (ow - 1) * sw + dw * (kw - 1) + 1: - raise RuntimeError("Output width is too large") + dilated_kh = (kh - 1) * dh + 1 + dilated_kw = (kw - 1) * dw + 1 + + pad_top, pad_left, pad_down, pad_right = get_pad_tuple( + get_const_tuple(padding), (dilated_kh, dilated_kw) + ) + + # DOPAD -def avg_pool2d_compute(A, kernel, stride, dilation, oshape, odtype="float16"): + if pad_top != 0 or pad_down != 0 or pad_left != 0 or pad_right != 0: + pad_before = (0, 0, pad_top, pad_left) + pad_after = (0, 0, pad_down, pad_right) + data_pad = pad(data, pad_before, pad_after, name="data_pad") + else: + # By definition when True, zero-padding will be included in the averaging calculation + # This is equivalent to PoolArea = (kh * kw) + count_include_pad = True + data_pad = data + + Sum = te.compute( + oshape, + lambda b, c, h, w: te.sum( + data_pad[b, c, h * sh + dh * rh, w * sw + dw * rw].astype("float32"), axis=[rh, rw] + ), + name="pool_sum", + ) + + if not count_include_pad: + # Compute PoolArea using unpadded input tensor + _, _, oh, ow = oshape + _, _, ih, iw = data.shape + + PoolArea = te.compute( + (oh, ow), + lambda i, j: compute_PoolArea(i, j, ih, 
iw, kh, kw, sh, sw, dh, dw, pad_top, pad_left), + name="pool_area", + ) + + InvArea = te.compute( + (oh, ow), + lambda i, j: tir.if_then_else( + tir.all(PoolArea[i, j] > 0), (float(1) / PoolArea[i, j]), 0 + ), + name="inverse_area", + ) + + Avg = te.compute( + oshape, + lambda b, c, h, w: (Sum[b, c, h, w] * InvArea[h, w]).astype(odtype), + name="pool_avg", + ) + else: + InvArea = float(1) / (kh * kw) + Avg = te.compute( + oshape, lambda b, c, h, w: (Sum[b, c, h, w] * InvArea).astype(odtype), name="pool_avg" + ) + + return Avg + + +def avg_pool2d_NHWC( + data, kernel, stride, padding, dilation, count_include_pad, oshape, odtype="float16" +): """avg_pool2d compute""" if odtype != "float16": - RuntimeError(f"Unsupported output dtype '{odtype}'") + raise RuntimeError(f"Unsupported output dtype '{odtype}'") kh, kw = kernel rh = te.reduce_axis((0, kh), name="rh") rw = te.reduce_axis((0, kw), name="rw") - ob, oh, ow, oc = oshape - if isinstance(ob, int): - validate_out_shape(oshape, A.shape, kernel, stride, dilation) sh, sw = stride dh, dw = dilation InvArea = float(1) / (kh * kw) + dilated_kh = (kh - 1) * dh + 1 + dilated_kw = (kw - 1) * dw + 1 + + pad_top, pad_left, pad_down, pad_right = get_pad_tuple( + get_const_tuple(padding), (dilated_kh, dilated_kw) + ) + + # DOPAD + if pad_top != 0 or pad_down != 0 or pad_left != 0 or pad_right != 0: + pad_before = (0, pad_top, pad_left, 0) + pad_after = (0, pad_down, pad_right, 0) + data_pad = pad(data, pad_before, pad_after, name="data_pad") + else: + # By definition when True, zero-padding will be included in the averaging calculation + # This is equivalent to PoolArea = (kh * kw) + count_include_pad = True + data_pad = data + Sum = te.compute( oshape, lambda b, h, w, c: te.sum( - A[b, h * sh + dh * rh, w * sw + dw * rw, c].astype("float32"), axis=[rh, rw] + data_pad[b, h * sh + dh * rh, w * sw + dw * rw, c].astype("float32"), axis=[rh, rw] ), - name="sum", - ) - Avg = te.compute( - oshape, lambda b, h, w, c: (Sum[b, h, w, c] * InvArea).astype(A.dtype), name="avg" + name="pool_sum", ) + + if not count_include_pad: + # Compute PoolArea using unpadded input tensor + _, oh, ow, _ = oshape + _, ih, iw, _ = data.shape + + PoolArea = te.compute( + (oh, ow), + lambda i, j: compute_PoolArea(i, j, ih, iw, kh, kw, sh, sw, dh, dw, pad_top, pad_left), + name="pool_area", + ) + + InvArea = te.compute( + (oh, ow), + lambda i, j: tir.if_then_else( + tir.all(PoolArea[i, j] > 0), (float(1) / PoolArea[i, j]), 0 + ), + name="inverse_area", + ) + + Avg = te.compute( + oshape, + lambda b, h, w, c: (Sum[b, h, w, c] * InvArea[h, w]).astype(odtype), + name="pool_avg", + ) + else: + InvArea = float(1) / (kh * kw) + Avg = te.compute( + oshape, lambda b, h, w, c: (Sum[b, h, w, c] * InvArea).astype(odtype), name="pool_avg" + ) + return Avg -def schedule_nhwc_8h2w32c2w(outs, ins, output_layout: str, input_layout: str): - """Schedule for input and output layout nhwc-8h2w32c2w""" +def schedule_8h2w32c2w(outs, ins, output_layout: str, input_layout: str): + """Schedule for input and output layout 8h2w32c2w""" func = te.create_prim_func([ins, outs]) + print(func) s = tir.Schedule(func) - Sum = s.get_block("sum") - Avg = s.get_block("avg") + Sum = s.get_block("pool_sum") + Avg = s.get_block("pool_avg") + mem_scope = "global.vtcm" + sum_read = s.cache_read(Sum, 0, mem_scope) + avg_write = s.cache_write(Avg, 0, mem_scope) input_transform_fn = get_layout_transform_fn(input_layout) output_transform_fn = get_layout_transform_fn(output_layout) - s.transform_layout(Sum, ("read", 0), 
input_transform_fn) - s.transform_layout(Avg, ("write", 0), output_transform_fn) - - # Schedule 'Avg' - n, h, w, c = s.get_loops(Avg) - ho, hi = s.split(h, [None, 8]) - wo, wi = s.split(w, [None, 4]) - wio, wii = s.split(wi, [None, 2]) - co, ci = s.split(c, [None, 32]) - s.reorder(n, ho, wo, co, hi, wio, ci, wii) - ci_wii = s.fuse(ci, wii) - s.vectorize(ci_wii) - - # Schedule 'Sum' - s.compute_at(Sum, wio) - Sum_axis = s.get_loops(Sum) - s.reorder(Sum_axis[-2], Sum_axis[-1], Sum_axis[-4], Sum_axis[-3]) - ci_wii = s.fuse(Sum_axis[-4], Sum_axis[-3]) - # s.vectorize(ci_wii) # Doesn't work + s.transform_layout(Sum, ("read", 0), input_transform_fn, pad_value=0.0) + s.transform_layout(Avg, ("write", 0), output_transform_fn, pad_value=0.0) return s -def schedule_n11c_1024c(outs, ins, output_layout: str, input_layout: str): - """Schedule for output layout: n11c-1024c, input layout: nhwc-8h2w32c2w""" +def schedule_1024c(outs, ins, output_layout: str, input_layout: str): + """Schedule for output layout: 1024c, input layout: 8h2w32c2w""" func = te.create_prim_func([ins, outs]) s = tir.Schedule(func) - Sum = s.get_block("sum") - Avg = s.get_block("avg") + Sum = s.get_block("pool_sum") + Avg = s.get_block("pool_avg") + mem_scope = "global.vtcm" + sum_read = s.cache_read(Sum, 0, mem_scope) + avg_write = s.cache_write(Avg, 0, mem_scope) input_transform_fn = get_layout_transform_fn(input_layout) output_transform_fn = get_layout_transform_fn(output_layout) - s.transform_layout(Sum, ("read", 0), input_transform_fn) - s.transform_layout(Avg, ("write", 0), output_transform_fn) + s.transform_layout(Sum, ("read", 0), input_transform_fn, pad_value=0.0) + s.transform_layout(Avg, ("write", 0), output_transform_fn, pad_value=0.0) # Schedule 'Avg' - n, h, w, c = s.get_loops(Avg) - co, ci = s.split(c, [None, 1024]) + if output_layout == "n11c-1024c-2d": + n, h, w, c = s.get_loops(Avg) + else: + n, c, h, w = s.get_loops(Avg) + _, ci = s.split(c, [None, 1024]) cio, cii = s.split(ci, [None, 64]) s.vectorize(cii) # Schedule 'Sum' - s.compute_at(Sum, cio) Sum_axis = s.get_loops(Sum) s.reorder(Sum_axis[-2], Sum_axis[-1], Sum_axis[-3]) - # s.vectorize(Sum_axis[-3]) # Doesn't work return s def avg_pool2d_schedule(outs, ins, output_layout: str, input_layout: str): """avg_pool2d schedule""" - if output_layout == "nhwc-8h2w32c2w-2d": - return schedule_nhwc_8h2w32c2w(outs, ins, output_layout, input_layout) - if output_layout == "n11c-1024c-2d": - return schedule_n11c_1024c(outs, ins, output_layout, input_layout) + if output_layout == "nhwc-8h2w32c2w-2d" or output_layout == "nchw-8h2w32c2w-2d": + return schedule_8h2w32c2w(outs, ins, output_layout, input_layout) + if output_layout == "n11c-1024c-2d" or output_layout == "nc11-1024c-2d": + return schedule_1024c(outs, ins, output_layout, input_layout) raise RuntimeError(f"Unexpected layout '{output_layout}'") diff --git a/python/tvm/topi/hexagon/utils.py b/python/tvm/topi/hexagon/utils.py index f017aaebbdeb..aa1af5de43db 100644 --- a/python/tvm/topi/hexagon/utils.py +++ b/python/tvm/topi/hexagon/utils.py @@ -24,10 +24,30 @@ from typing import Dict, Tuple, Union import tvm -from tvm import IRModule, te +from tvm import IRModule, te, tir from tvm.tir import IndexMap, PrimFunc +def is_scalar(expr): + if isinstance(expr, te.Tensor): + return expr.ndim == 0 and (isinstance(expr.op.body[0], (tir.FloatImm, tir.IntImm))) + return isinstance(expr, (tir.FloatImm, tir.IntImm)) + + +def get_const_int_value(expr): + if isinstance(expr, te.Tensor): + assert isinstance(expr.op.body[0], tir.IntImm) 
+ return expr.op.body[0].value + return tvm.topi.utils.get_const_int(expr) + + +def get_const_float_value(expr): + if isinstance(expr, te.Tensor): + assert isinstance(expr.op.body[0], tir.FloatImm) + return expr.op.body[0].value + return tvm.topi.utils.get_const_float(expr) + + def n11c_1024c_2d(n, h, w, c): """Return index map for n11c_1024 2d layout""" return [n, h, w, c // 1024, IndexMap.AXIS_SEPARATOR, c % 1024] @@ -38,6 +58,11 @@ def n11c_1024c_1d(n, h, w, c): return [n, h, w, c // 1024, c % 1024] +def nc11_1024c_2d(n, c, h, w): + """Return index map for nc11_1024 2d layout""" + return [n, c // 1024, IndexMap.AXIS_SEPARATOR, c % 1024, h, w] + + def nhwc_8h2w32c2w_2d(n, h, w, c): """Return index map for nhwc_8h2w32c2w 2d layout""" return [n, h // 8, w // 4, c // 32, IndexMap.AXIS_SEPARATOR, h % 8, (w % 4) // 2, c % 32, w % 2] @@ -48,6 +73,11 @@ def nhwc_8h2w32c2w_1d(n, h, w, c): return [n, h // 8, w // 4, c // 32, h % 8, (w % 4) // 2, c % 32, w % 2] +def nchw_8h2w32c2w_2d(n, c, h, w): + """Return index map for nchw_8h2w32c2w 2d layout""" + return [n, c // 32, h // 8, w // 4, IndexMap.AXIS_SEPARATOR, h % 8, (w % 4) // 2, c % 32, w % 2] + + def nhw_32h16w_2d(n, h, w): """Return index map for nhw_32h16w 2d layout""" return [n, h // 32, w // 16, IndexMap.AXIS_SEPARATOR, h % 32, w % 16] @@ -88,6 +118,11 @@ def nc_2048c_2d(n, c): return [n, c // 2048, IndexMap.AXIS_SEPARATOR, c % 2048] +def nc11_2048c_2d(n, c, h, w): + """Return index map for nc11_2048c 2d layout""" + return [n, c // 2048, IndexMap.AXIS_SEPARATOR, h, w, c % 2048] + + def nc_1024c_1d(n, c): """Return index map for nc_1024c 1d layout""" return [n, c // 1024, c % 1024] @@ -123,11 +158,25 @@ def nhwc_8h8w32c_2d(n, h, w, c): return [n, h // 8, w // 8, c // 32, IndexMap.AXIS_SEPARATOR, h % 8, w % 8, c % 32] +def nhwc_8h8w32c_1d(n, h, w, c): + """Return index map for nhwc_8h8w32c 1d layout""" + return [n, h // 8, w // 8, c // 32, h % 8, w % 8, c % 32] + + +def nchw_8h8w32c_2d(n, c, h, w): + return [n, c // 32, h // 8, w // 8, IndexMap.AXIS_SEPARATOR, h % 8, w % 8, c % 32] + + def n11c_2048c_2d(n, h, w, c): """Return index map for n11c_2048c 2d layout""" return [n, h, w, c // 2048, IndexMap.AXIS_SEPARATOR, c % 2048] +def n11c_2048c_1d(n, h, w, c): + """Return index map for n11c_2048c 1 layout""" + return [n, h, w, c // 2048, c % 2048] + + def iohw_16i32o2i_1d(height, width, in_channel, out_channel): return [ in_channel // 32, @@ -163,12 +212,16 @@ def get_layout_transform_fn(layout): return nhwc_8h2w32c2w_2d if layout == "nhwc-8h2w32c2w-1d": return nhwc_8h2w32c2w_1d + if layout == "nchw-8h2w32c2w-2d": + return nchw_8h2w32c2w_2d if layout == "n11c-1024c-2d": return n11c_1024c_2d if layout == "n11c-1024c-1d": return n11c_1024c_1d if layout == "nhwc-1024c-2d": return nhwc_1024c_2d + if layout == "nc11-1024c-2d": + return nc11_1024c_2d if layout == "nc-1024-2d": return nc_1024_2d if layout == "nhw-32h16w-2d": @@ -201,16 +254,26 @@ def get_layout_transform_fn(layout): return nc_2048c_2d if layout == "nhwc-8h8w32c-2d": return nhwc_8h8w32c_2d + if layout == "nhwc-8h8w32c-1d": + return nhwc_8h8w32c_1d + if layout == "nchw-8h8w32c-2d": + return nchw_8h8w32c_2d if layout == "n11c-2048c-2d": return n11c_2048c_2d + if layout == "n11c-2048c-1d": + return n11c_2048c_1d if layout == "ohwi32o-1d": return ohwi32o_1d + if layout == "nc11-2048c-2d": + return nc11_2048c_2d if layout == "ncw-32c64w-2d": return ncw_32c64w_2d if layout == "nchw-32c8h8w-2d": return nchw_32c8h8w_2d if layout == "nchw-32c8h4w-2d": return nchw_32c8h4w_2d + if layout == 
"nchw-8h8w32c-2d": + return nchw_8h8w32c_2d raise RuntimeError(f"Unexpected layout '{layout}'") diff --git a/src/relay/op/nn/pooling.cc b/src/relay/op/nn/pooling.cc index 2a8c2440519e..1cfbab6e661e 100644 --- a/src/relay/op/nn/pooling.cc +++ b/src/relay/op/nn/pooling.cc @@ -32,6 +32,7 @@ #include #include "../../transforms/infer_layout_utils.h" +#include "pooling_common.h" namespace tvm { namespace relay { @@ -40,43 +41,6 @@ namespace relay { TVM_REGISTER_NODE_TYPE(MaxPool2DAttrs); TVM_REGISTER_NODE_TYPE(AvgPool2DAttrs); -template -InferCorrectLayoutOutput PoolInferCorrectLayout(const Attrs& attrs, - const Array& new_in_layouts, - const Array& old_in_layouts, - const Array& old_in_types) { - const auto* attrs_ptr = attrs.as(); - ICHECK(attrs_ptr); - ObjectPtr params = make_object(*attrs_ptr); - - if (params->out_layout != "") { - // when users specify the out_layout of pooling, follow user's preference - ICHECK_EQ(params->layout, params->out_layout) - << "Pooling input/output layouts mismatch: " << params->layout << " vs. " - << params->out_layout; - } else if (new_in_layouts.defined()) { - // the pooling is using an inferred layout (i.e., new_in_layouts[0]) given by relay caller - ICHECK_EQ(new_in_layouts.size(), 1); - params->layout = new_in_layouts[0].name(); - } - - return InferCorrectLayoutOutput({params->layout}, {params->layout}, Attrs(params)); -} - -IndexExpr calculate_pool_dimension(IndexExpr in_dimension, IndexExpr pad_amount, - IndexExpr pool_size, IndexExpr dilation, IndexExpr stride_size, - bool ceil_mode) { - IndexExpr numerator = in_dimension + pad_amount - ((pool_size - 1) * dilation + 1); - IndexExpr denominator = stride_size; - - // Emulate the behavior of running ceil on numerator / denominator rather than floor - if (ceil_mode) { - numerator += denominator - 1; - } - - return numerator / denominator + 1; -} - template bool Pool2DRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { diff --git a/src/relay/op/nn/pooling.h b/src/relay/op/nn/pooling.h index 32ae464101ab..123cfcd07570 100644 --- a/src/relay/op/nn/pooling.h +++ b/src/relay/op/nn/pooling.h @@ -18,8 +18,8 @@ */ /*! - * \file src/relay/op/nn/convolution.h - * \brief Properties def of convlution operator for sharing. + * \file src/relay/op/nn/pooling.h + * \brief utilities for creating pool ops */ #ifndef TVM_RELAY_OP_NN_POOLING_H_ #define TVM_RELAY_OP_NN_POOLING_H_ diff --git a/src/relay/op/nn/pooling_common.h b/src/relay/op/nn/pooling_common.h new file mode 100644 index 000000000000..1193d36ebe88 --- /dev/null +++ b/src/relay/op/nn/pooling_common.h @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file src/relay/op/nn/pooling_common.h + * \brief Common functions for pooling operator definition. + */ +#ifndef TVM_RELAY_OP_NN_POOLING_COMMON_H_ +#define TVM_RELAY_OP_NN_POOLING_COMMON_H_ + +#include +#include +#include + +#include +#include +#include + +#include "../op_common.h" + +namespace tvm { +namespace relay { + +inline IndexExpr calculate_pool_dimension(IndexExpr in_dimension, IndexExpr pad_amount, + IndexExpr pool_size, IndexExpr dilation, + IndexExpr stride_size, bool ceil_mode) { + IndexExpr numerator = in_dimension + pad_amount - ((pool_size - 1) * dilation + 1); + IndexExpr denominator = stride_size; + + // Emulate the behavior of running ceil on numerator / denominator rather than floor + if (ceil_mode) { + numerator += denominator - 1; + } + + return numerator / denominator + 1; +} + +template +InferCorrectLayoutOutput PoolInferCorrectLayout(const Attrs& attrs, + const Array& new_in_layouts, + const Array& old_in_layouts, + const Array& old_in_types) { + const auto* attrs_ptr = attrs.as(); + ICHECK(attrs_ptr); + ObjectPtr params = make_object(*attrs_ptr); + + if (params->out_layout != "") { + // when users specify the out_layout of pooling, follow user's preference + ICHECK_EQ(params->layout, params->out_layout) + << "Pooling input/output layouts mismatch: " << params->layout << " vs. " + << params->out_layout; + } else if (new_in_layouts.defined()) { + // the pooling is using an inferred layout (i.e., new_in_layouts[0]) given by relay caller + // ICHECK_EQ(new_in_layouts.size(), 1); + params->layout = new_in_layouts[0].name(); + } + + return InferCorrectLayoutOutput({params->layout}, {params->layout}, Attrs(params)); +} +} // namespace relay +} // namespace tvm +#endif // TVM_RELAY_OP_NN_POOLING_COMMON_H_ diff --git a/src/relay/qnn/op/avg_pool2d.cc b/src/relay/qnn/op/avg_pool2d.cc new file mode 100644 index 000000000000..b2dc08b85686 --- /dev/null +++ b/src/relay/qnn/op/avg_pool2d.cc @@ -0,0 +1,223 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file src/relay/qnn/op/avg_pool2d.cc + * \brief Quantized avg_pool2d operator + */ + +#include +#include +#include +#include +#include +#include + +#include "../../op/nn/nn.h" +#include "../../op/nn/pooling.h" +#include "../../op/nn/pooling_common.h" +#include "../../op/tensor/transform.h" +#include "../../transforms/infer_layout_utils.h" +#include "../../transforms/pattern_utils.h" +#include "../utils.h" +#include "op_common.h" + +namespace tvm { +namespace relay { +namespace qnn { + +// relay.op.qnn.avg_pool2d +bool QnnAvgPool2DRel(const Array& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { + // Expected Types: data, input_zero_point, input_scale, output_zero_point, output_scale + // out_type + + ICHECK_EQ(types.size(), 6); + + const auto* data = types[0].as(); + if (data == nullptr) return false; + ICHECK(data->dtype == DataType::Int(8) || data->dtype == DataType::UInt(8)) + << "Expected quantized avg_pool2d type(int8, uint8) for input but was " << data->dtype; + + const auto* param = attrs.as(); + ICHECK(param != nullptr) << "AvgPool2DAttrs cannot be nullptr."; + + // Check the types of scale and zero points. + for (size_t i = 1; i < 5; ++i) { + if (types[i].as()) { + return false; + } + } + + ICHECK(IsScalarType(types[1], DataType::Float(32))); // input_scale + ICHECK(IsScalarType(types[2], DataType::Int(32))); // input_zero_point + ICHECK(IsScalarType(types[3], DataType::Float(32))); // output_scale + ICHECK(IsScalarType(types[4], DataType::Int(32))); // output_zero_point + + // Find the output shape and data type + const auto dshape = data->shape; + ICHECK_GE(dshape.size(), 2U) + << "Pool2D only support input >= 2-D: input must have height and width"; + + // Check input and output layout + Layout layout(param->layout); + // The Layout is always NHWC + ICHECK(layout.Contains(LayoutAxis::Get('H')) && layout.Contains(LayoutAxis::Get('W')) && + !layout.Contains(LayoutAxis::Get('h')) && !layout.Contains(LayoutAxis::Get('w'))) + << "Invalid input layout " << layout + << ". qnn_avg_pool2d inut layout must have H and W, which cannot be split"; + + // Find the output shape and data type + const auto hidx = layout.IndexOf(LayoutAxis::Get('H')); + const auto widx = layout.IndexOf(LayoutAxis::Get('W')); + + IndexExpr pad_h, pad_w; + if (param->padding.size() == 1) { + pad_h = param->padding[0] * 2; + pad_w = param->padding[0] * 2; + } else if (param->padding.size() == 2) { + // (top, left) + pad_h = param->padding[0] * 2; + pad_w = param->padding[1] * 2; + } else if (param->padding.size() == 4) { + // (top, left, bottom, right) + pad_h = param->padding[0] + param->padding[2]; + pad_w = param->padding[1] + param->padding[3]; + } else { + return false; + } + + std::vector oshape(dshape.begin(), dshape.end()); + if (dshape[hidx].as()) { + oshape[hidx] = dshape[hidx]; + } else { + oshape[hidx] = + calculate_pool_dimension(dshape[hidx], pad_h, param->pool_size[0], param->dilation[0], + param->strides[0], param->ceil_mode); + } + if (dshape[widx].as()) { + oshape[widx] = dshape[widx]; + } else { + oshape[widx] = + calculate_pool_dimension(dshape[widx], pad_w, param->pool_size[1], param->dilation[1], + param->strides[1], param->ceil_mode); + } + + // assign output type + reporter->Assign(types[5], TensorType(oshape, data->dtype)); + return true; +} + +InferCorrectLayoutOutput QnnAvgPoolInferCorrectLayout(const Attrs& attrs, + const Array& new_in_layouts, + const Array& old_in_layouts, + const Array& old_in_types) { + // Use Relay AvgPool2D Infer correct layout. 
+InferCorrectLayoutOutput QnnAvgPoolInferCorrectLayout(const Attrs& attrs,
+                                                      const Array<Layout>& new_in_layouts,
+                                                      const Array<Layout>& old_in_layouts,
+                                                      const Array<tvm::relay::Type>& old_in_types) {
+  // Use the Relay AvgPool2D layout inference.
+  auto avgpool_new_layouts =
+      PoolInferCorrectLayout<AvgPool2DAttrs>(attrs, new_in_layouts, old_in_layouts, old_in_types);
+
+  // Scales and zero points are scalars, use the "undef" layout for them.
+  Array<Layout> input_layouts = {avgpool_new_layouts->input_layouts[0], Layout::Undef(),
+                                 Layout::Undef(), Layout::Undef(), Layout::Undef()};
+  Array<Layout> output_layouts = avgpool_new_layouts->output_layouts;
+  return InferCorrectLayoutOutput(input_layouts, output_layouts, attrs);
+}
+
+/*
+ * \brief Forward rewrite the qnn avg_pool2d op.
+ * \param attrs The QNN avg_pool2d attrs.
+ * \param new_args The new mutated args to the call node.
+ * \param arg_types The types of input and output.
+ * \return The sequence of Relay ops for the qnn avg_pool2d op.
+ * \note Lowering of the qnn.avg_pool2d operator.
+ *
+ * Quantized avg_pool2d takes one quantized input tensor and returns another
+ * quantized tensor. Since the input qnn params can differ from the output
+ * qnn params, we first requantize the input tensor with the output qnn params and
+ * cast the result to Int32. Then we call relay.nn.avg_pool2d on the requantized
+ * input. Finally, the result is clipped and cast to the quantized output data type.
+ *
+ * Note: The RequantizeOrUpcast function only performs requantization if the input
+ * and output qnn params differ; otherwise it only casts to Int32.
+ */
+Expr QnnAvgPoolCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
+                            const Array<tvm::relay::Type>& arg_types) {
+  ICHECK_EQ(new_args.size(), 5);
+  Expr input_data = new_args[0];
+  Expr input_scale = new_args[1];
+  Expr input_zero_point = new_args[2];
+  Expr output_scale = new_args[3];
+  Expr output_zero_point = new_args[4];
+  const auto in_shape = get_shape(arg_types[0]);
+  const auto* avgpool_attrs = attrs.as<AvgPool2DAttrs>();
+  auto requantized_input = RequantizeOrUpcast(input_data, input_scale, input_zero_point,
+                                              output_scale, output_zero_point, in_shape);
+  Expr nn_avg = AvgPool2D(requantized_input, avgpool_attrs->pool_size, avgpool_attrs->strides,
+                          avgpool_attrs->dilation, avgpool_attrs->padding, avgpool_attrs->layout,
+                          avgpool_attrs->out_layout, avgpool_attrs->ceil_mode,
+                          avgpool_attrs->count_include_pad);
+
+  const auto* data = arg_types[5].as<TensorTypeNode>();
+  const int32_t min_val = GetQmin(data->dtype);
+  const int32_t max_val = GetQmax(data->dtype);
+  return Cast(Clip(nn_avg, min_val, max_val), data->dtype);
+}
+
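The canonicalization above decomposes qnn.avg_pool2d into existing Relay ops: requantize to the output qnn params (upcasting to int32), run the regular average pool, then clip and cast back to the quantized dtype. A rough Relay-Python sketch of the same decomposition (the function and its defaults are illustrative; the real rewrite is the C++ pass above):

```python
from tvm import relay


def decompose_qnn_avg_pool2d(data, input_scale, input_zero_point, output_scale, output_zero_point,
                             pool_size, strides, dilation, padding, layout="NHWC",
                             ceil_mode=False, count_include_pad=True, out_dtype="uint8"):
    """Sketch of the lowering: requantize to output qnn params, pool in int32, clip, cast."""
    # RequantizeOrUpcast in the C++ pass skips the requantize when the input and output
    # qnn params match and only upcasts to int32; here we always requantize for simplicity.
    requantized = relay.qnn.op.requantize(
        data, input_scale, input_zero_point, output_scale, output_zero_point, out_dtype="int32"
    )
    pooled = relay.nn.avg_pool2d(
        requantized, pool_size=pool_size, strides=strides, dilation=dilation, padding=padding,
        layout=layout, ceil_mode=ceil_mode, count_include_pad=count_include_pad,
    )
    # Clip to the quantized range of the output dtype (int8/uint8 are the only types accepted
    # by the type relation above), then cast back.
    qmin, qmax = (0, 255) if out_dtype == "uint8" else (-128, 127)
    return relay.cast(relay.clip(pooled, qmin, qmax), out_dtype)
```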
+// Positional relay function to create quantized avg_pool2d operator used by frontend FFI.
+Expr MakeQuantizedAvgPool2D(Expr data, Expr input_scale, Expr input_zero_point, Expr output_scale,
+                            Expr output_zero_point, Array<IndexExpr> pool_size,
+                            Array<IndexExpr> strides, Array<IndexExpr> padding,
+                            Array<IndexExpr> dilation, bool ceil_mode, bool count_include_pad,
+                            String layout, String output_layout) {
+  auto attrs = make_object<AvgPool2DAttrs>();
+  attrs->pool_size = std::move(pool_size);
+  attrs->strides = std::move(strides);
+  attrs->padding = std::move(padding);
+  attrs->dilation = std::move(dilation);
+  attrs->layout = std::move(layout);
+  attrs->out_layout = std::move(output_layout);
+  attrs->ceil_mode = ceil_mode;
+  attrs->count_include_pad = count_include_pad;
+  static const Op& op = Op::Get("qnn.avg_pool2d");
+  return Call(op, {data, input_scale, input_zero_point, output_scale, output_zero_point},
+              Attrs(attrs), {});
+}
+
+RELAY_REGISTER_OP("qnn.avg_pool2d")
+    .describe("Customized qnn_avg_pool2d for quantized tensors.")
+    .set_attrs_type<AvgPool2DAttrs>()
+    .set_num_inputs(5)
+    .add_argument("data", "Quantized Tensor", "The input data.")
+    .add_argument("input_scale", "Tensor", "The quantization scale of the input tensor.")
+    .add_argument("input_zero_point", "Tensor", "The quantization zero_point of the input tensor.")
+    .add_argument("output_scale", "Tensor", "The quantization scale of the output tensor.")
+    .add_argument("output_zero_point", "Tensor",
+                  "The quantization zero_point of the output tensor.")
+    .set_support_level(11)
+    .add_type_rel("QnnAvgPool2D", QnnAvgPool2DRel)
+    .set_attr<TOpPattern>("TOpPattern", kOutEWiseFusable)
+    .set_attr<FInferCorrectLayout>("FInferCorrectLayout", QnnAvgPoolInferCorrectLayout)
+    .set_attr<FTVMLegalize>("FTVMQnnCanonicalize", QnnAvgPoolCanonicalize);
+
+TVM_REGISTER_GLOBAL("relay.qnn.op._make.avg_pool2d").set_body_typed(MakeQuantizedAvgPool2D);
+
+}  // namespace qnn
+}  // namespace relay
+}  // namespace tvm
diff --git a/tests/python/contrib/test_hexagon/test_qnn_op_integration.py b/tests/python/contrib/test_hexagon/test_qnn_op_integration.py
new file mode 100644
index 000000000000..8cff4ed626bc
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/test_qnn_op_integration.py
@@ -0,0 +1,576 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name
+
+"""Tests for QNN operations on Hexagon"""
+
+import numpy as np
+
+import tvm.testing
+import tvm.topi.testing
+from tvm import relay
+from tvm.contrib.hexagon.session import Session
+from tvm.contrib.hexagon.pytest_plugin import HEXAGON_AOT_LLVM_TARGET
+from tvm.relay.backend import Executor
+from tvm.relay.testing import run_opt_pass, run_infer_type
+
+from .infrastructure import quantize_np
+
+
+@tvm.testing.requires_hexagon
+def test_disable_qnn_legalize_pass():
+    """No QNN pass test."""
+    x = relay.var("x", shape=(4, 8), dtype="float32")
+    op0 = relay.qnn.op.quantize(x, relay.const(2.0), relay.const(10), out_dtype="uint8")
+    op1 = relay.qnn.op.dequantize(op0, relay.const(0.5), relay.const(5))
+    relay_mod = tvm.IRModule.from_expr(op1)
+
+    target_hexagon = tvm.target.hexagon("v68")
+    # Default compilation flow
+    with tvm.transform.PassContext(opt_level=3):
+        opt_with_legalize, _ = relay.optimize(
+            relay_mod, tvm.target.Target(target_hexagon, host=target_hexagon)
+        )
+
+    # Disable QNN legalization and canonicalization passes
+    with tvm.transform.PassContext(opt_level=3, disabled_pass=["qnn.Legalize"]):
+        opt_without_legalize, _ = relay.optimize(
+            relay_mod, tvm.target.Target(target_hexagon, host=target_hexagon)
+        )
+
+    # Check that QNN ops are absent with default compilation flow.
+ text_with_legalize = opt_with_legalize.astext(show_meta_data=False) + assert "qnn.quantize" not in text_with_legalize and "qnn.dequantize" not in text_with_legalize + + # Check that QNN ops are present without "qnn.Legalize" passes. + text_without_legalize = opt_without_legalize.astext(show_meta_data=False) + assert "qnn.quantize" in text_without_legalize and "qnn.dequantize" in text_without_legalize + + +def build_hexagon_module(relay_mod): + with tvm.transform.PassContext(opt_level=3, disabled_pass=["QnnCanonicalize"]): + exe_mod = tvm.relay.build( + relay_mod, + tvm.target.Target(HEXAGON_AOT_LLVM_TARGET, host=HEXAGON_AOT_LLVM_TARGET), + executor=Executor("aot"), + ) + + return exe_mod + + +def build_ref_module(relay_mod): + target_llvm = tvm.target.Target("llvm") + with tvm.transform.PassContext(opt_level=3): + exe_mod = tvm.relay.build( + relay_mod, tvm.target.Target(target_llvm, host=target_llvm), executor=Executor("aot") + ) + return exe_mod + + +def execute(mod_executor, inputs: dict): + for input_name, input_data in inputs.items(): + mod_executor.set_input(input_name, input_data) + mod_executor.run() + return [mod_executor.get_output(i).numpy() for i in range(mod_executor.get_num_outputs())] + + +def execute_on_hexagon(hexagon_session, exe_mod, inputs: dict): + return execute(hexagon_session.get_executor_from_factory(exe_mod), inputs) + + +def execute_on_cpu(exe_mod, inputs: dict): + return execute(tvm.runtime.executor.AotModule(exe_mod["default"](tvm.cpu(0))), inputs) + + +def assert_allclose(actuals, desireds, rtol=1e-07, atol=0.01): + return [tvm.testing.assert_allclose(a, d, rtol, atol) for a, d in zip(actuals, desireds)] + + +def run_and_compare(hexagon_session, relay_mod, inputs, rtol=None, atol=None): + """Compile and execute given relay module on CPU and Hexagon, and compare + results""" + hexagon_mod = build_hexagon_module(relay_mod) + cpu_mod = build_ref_module(relay_mod) + + hexagon_outs = execute_on_hexagon(hexagon_session, hexagon_mod, inputs) + cpu_outs = execute_on_cpu(cpu_mod, inputs) + + # Do not pass rtol/atol if not present to use default values from assert_allclose + tolerances = dict() + if rtol is not None: + tolerances["rtol"] = rtol + if atol is not None: + tolerances["atol"] = atol + + assert_allclose(hexagon_outs, cpu_outs, **tolerances) + + +# First test basic QNN ops: quantize, dequantize, requantize +# +class TestQnnQuantize: + """QNN Quantize test class.""" + + input_shape = tvm.testing.parameter([1, 8, 8, 32], [1, 10, 10, 32], [1, 12, 12, 128]) + odtype = tvm.testing.parameter("int8", "uint8") + + @tvm.testing.requires_hexagon + def test_qnn_quantize(self, hexagon_session: Session, odtype, input_shape): + """Test qnn.quantize""" + + def gen_relay_expr_qnn(output_scale, output_zero_point): + data = relay.var("data", shape=input_shape, dtype="float32") + qnn_quantize = relay.qnn.op.quantize( + data, + output_scale=relay.const(output_scale), + output_zero_point=relay.const(output_zero_point), + axis=-1, + out_dtype=odtype, + ) + return qnn_quantize + + inputs = {"data": np.random.random(input_shape)} + # Use quantize_np to obtain reasonable quantization parameters. 
+ ref_out, scale, zero_point = quantize_np(inputs["data"], odtype) + + relay_mod = tvm.IRModule.from_expr(gen_relay_expr_qnn(scale, zero_point)) + + hexagon_mod = build_hexagon_module(relay_mod) + hexagon_outs = execute_on_hexagon(hexagon_session, hexagon_mod, inputs) + assert_allclose(hexagon_outs, [ref_out], atol=1) + + +class TestQnnDequantize: + """QNN Dequantize test class.""" + + input_shape = tvm.testing.parameter( + [1, 12, 32, 128], [1, 10, 10, 32], [1, 6, 6, 2048], [1, 1000] + ) + idtype = tvm.testing.parameter("int8", "uint8") + + @tvm.testing.requires_hexagon + def test_qnn_dequantize(self, hexagon_session: Session, idtype, input_shape): + """Test qnn.dequantize""" + + def gen_relay_expr_qnn(dtype, input_scale, input_zero_point): + data = relay.var("data", shape=input_shape, dtype=dtype) + qnn_dequantize = relay.qnn.op.dequantize( + data, + input_scale=relay.const(input_scale), + input_zero_point=relay.const(input_zero_point), + ) + return qnn_dequantize + + # Generate float data, then quantize it to produce input. + ref_out = np.random.random(input_shape) + data, scale, zero_point = quantize_np(ref_out, idtype) + inputs = {"data": data} + + relay_mod = tvm.IRModule.from_expr(gen_relay_expr_qnn(idtype, scale, zero_point)) + + hexagon_mod = build_hexagon_module(relay_mod) + hexagon_outs = execute_on_hexagon(hexagon_session, hexagon_mod, inputs) + # We do + # original -[quantize]-> input -[dequantize]-> output + # then compare "original" with "output". Use rtol=1 because of the quantized + # format in the middle. + assert_allclose(hexagon_outs, [ref_out], rtol=1, atol=1e-2) # rtol = 1 + + +class TestQnnRequantize: + """QNN requantize test class""" + + @tvm.testing.requires_hexagon + def test_qnn_requantize(self, hexagon_session: Session): + """Test qnn.requantize""" + data_shape = [256] + data = relay.var("data", shape=data_shape, dtype="int32") + + op = relay.qnn.op.requantize( + data, + input_scale=relay.const(0.156), + input_zero_point=relay.const(2), + output_scale=relay.const(0.212), + output_zero_point=relay.const(1), + out_dtype="int8", + ) + relay_mod = tvm.IRModule.from_expr(op) + + inputs = {"data": np.arange(-256, 256, 2, dtype="int32")} + + run_and_compare(hexagon_session, relay_mod, inputs, rtol=0, atol=0) # equal + + +class TestQnnAvgPool2d: + """QNN AvgPool2d test class.""" + + _multitest_params = [ + ([1, 12, 12, 32], "NHWC", [3, 3], [1, 1], [2, 3], [1, 2, 3, 4], False, False), + ([1, 18, 18, 32], "NCHW", [3, 3], [2, 2], [2, 1], [1, 2, 3, 4], False, True), + ] + + ( + input_shape, + layout, + kernel, + stride, + dilation, + padding, + ceil_mode, + count_include_pad, + ) = tvm.testing.parameters(*_multitest_params) + + idtype, odtype = tvm.testing.parameters(("uint8", "uint8")) + + @tvm.testing.requires_hexagon + def test_qnn_avg_pool2d( + self, + hexagon_session: Session, + idtype, + odtype, + input_shape, + kernel, + stride, + dilation, + padding, + ceil_mode, + count_include_pad, + layout, + ): + """Test qnn.avg_pool2d""" + + def gen_relay_expr_qnn( + dtype, input_scale, input_zero_point, output_scale, output_zero_point + ): + data = relay.var("data", shape=input_shape, dtype=dtype) + qnn_avg_pool = relay.qnn.op.avg_pool2d( + data, + input_scale=relay.const(input_scale), + input_zero_point=relay.const(input_zero_point), + output_scale=relay.const(output_scale), + output_zero_point=relay.const(output_zero_point), + pool_size=kernel, + strides=stride, + dilation=dilation, + padding=padding, + ceil_mode=ceil_mode, + count_include_pad=count_include_pad, + 
layout=layout, + ) + + return qnn_avg_pool + + # Generate inputs and reference data first. + fp_input = np.random.random(input_shape) + fp_output = tvm.topi.testing.poolnd_python( + fp_input, + kernel, + stride, + dilation, + padding_before=padding[:2], + padding_after=padding[2:], + pool_type="avg", + count_include_pad=count_include_pad, + ceil_mode=ceil_mode, + layout=layout, + ) + input_data, input_scale, input_zero_point = quantize_np(fp_input, idtype) + ref_out, output_scale, output_zero_point = quantize_np(fp_output, odtype) + inputs = {"data": input_data} + + relay_mod = tvm.IRModule.from_expr( + gen_relay_expr_qnn( + idtype, input_scale, input_zero_point, output_scale, output_zero_point + ) + ) + + hexagon_mod = build_hexagon_module(relay_mod) + hexagon_outs = execute_on_hexagon(hexagon_session, hexagon_mod, inputs) + assert_allclose(hexagon_outs, [ref_out], rtol=0, atol=2) + + +class TestQnnBinaryOp: + """QNN binary op test class""" + + operation = tvm.testing.parameter(relay.qnn.op.add, relay.qnn.op.subtract, relay.qnn.op.mul) + dtype = tvm.testing.parameter("uint8", "int8") + input_shape = tvm.testing.parameter([256], [4, 256]) + + @tvm.testing.requires_hexagon + def test_qnn_binary_op(self, hexagon_session: Session, operation, dtype, input_shape): + """Test binary qnn ops""" + lhs_shape = [4, 256] + rhs_shape = input_shape + lhs = relay.var("lhs", shape=lhs_shape, dtype=dtype) + rhs = relay.var("rhs", shape=rhs_shape, dtype=dtype) + lhs_zp = 1 + rhs_zp = 3 + + op = operation( + lhs, + rhs, + lhs_scale=relay.const(0.041, "float32"), + lhs_zero_point=relay.const(lhs_zp, "int32"), + rhs_scale=relay.const(0.017, "float32"), + rhs_zero_point=relay.const(rhs_zp, "int32"), + output_scale=relay.const(0.039, "float32"), + output_zero_point=relay.const(2, "int32"), + ) + relay_mod = tvm.IRModule.from_expr(op) + + inputs = { + "lhs": np.random.randint(np.iinfo(dtype).min + lhs_zp, np.iinfo(dtype).max, lhs_shape), + "rhs": np.random.randint(np.iinfo(dtype).min + rhs_zp, np.iinfo(dtype).max, rhs_shape), + } + + run_and_compare(hexagon_session, relay_mod, inputs, atol=1) # diff by 1 is ok + + @tvm.testing.requires_hexagon + def test_qnn_binary_op_broadcasting(self, hexagon_session: Session, operation): + """Test binary qnn ops (with argument broadcast)""" + lhs_shape = [4, 256] + lhs = relay.var("lhs", shape=lhs_shape, dtype="uint8") + rhs = relay.const(11, dtype="uint8") + + op = operation( + lhs, + rhs, + lhs_scale=relay.const(0.049, "float32"), + lhs_zero_point=relay.const(1, "int32"), + rhs_scale=relay.const(0.067, "float32"), + rhs_zero_point=relay.const(3, "int32"), + output_scale=relay.const(0.041, "float32"), + output_zero_point=relay.const(2, "int32"), + ) + relay_mod = tvm.IRModule.from_expr(op) + + inputs = {"lhs": np.random.randint(1, 255, size=lhs_shape)} + + run_and_compare(hexagon_session, relay_mod, inputs, atol=1) # diff by 1 is ok + + +class TestQnnConcatenate: + """QNN concatenate test class""" + + @tvm.testing.requires_hexagon + def test_qnn_concatenate(self, hexagon_session: Session): + """Test qnn.concatenate""" + x_shape = [1, 64] + y_shape = [2, 64] + z_shape = [3, 64] + input_x = relay.var("x", shape=x_shape, dtype="uint8") + input_y = relay.var("y", shape=y_shape, dtype="uint8") + input_z = relay.var("z", shape=z_shape, dtype="uint8") + + op = relay.qnn.op.concatenate( + (input_x, input_y, input_z), + input_scales=(relay.const(0.3), relay.const(0.7), relay.const(1.3)), + input_zero_points=(relay.const(0), relay.const(1), relay.const(2)), + 
output_scale=relay.const(0.8), + output_zero_point=relay.const(5), + axis=0, + ) + relay_mod = tvm.IRModule.from_expr(op) + + inputs = { + "x": np.arange(0, 64, 1, dtype="uint8").reshape(x_shape), + "y": np.arange(0, 128, 1, dtype="uint8").reshape(y_shape), + "z": np.arange(0, 192, 1, dtype="uint8").reshape(z_shape), + } + + run_and_compare(hexagon_session, relay_mod, inputs, atol=1) # diff by 1 is ok + + +class TestQnnConv2D: + """QNN conv2d op test class.""" + + @tvm.testing.requires_hexagon + def test_qnn_quantize_conv2d_requantize(self, hexagon_session: Session): + """Tast qnn.conv2d""" + data_shape = [1, 8, 32, 32] + weight_shape = [16, 8, 3, 3] + data = relay.var("data", shape=data_shape, dtype="float32") + weight = relay.var("weight", shape=weight_shape, dtype="float32") + op0 = relay.qnn.op.quantize(data, relay.const(0.078), relay.const(0), out_dtype="uint8") + op1 = relay.qnn.op.quantize(weight, relay.const(0.07), relay.const(0), out_dtype="int8") + op2 = relay.qnn.op.conv2d( + op0, + op1, + input_zero_point=relay.const(0), + kernel_zero_point=relay.const(0), + input_scale=relay.const(0.078), + kernel_scale=relay.const(0.07), + padding=[0, 0, 0, 0], + channels=16, + kernel_size=[3, 3], + ) + op5 = relay.qnn.op.requantize( + op2, + input_scale=relay.const(0.05), + input_zero_point=relay.const(0), + output_scale=relay.const(0.21), + output_zero_point=relay.const(61), + out_dtype="int8", + ) + relay_mod = tvm.IRModule.from_expr(op5) + + inputs = { + "data": np.random.rand(*data_shape), + "weight": np.random.rand(*weight_shape) - 0.5, + } + + run_and_compare(hexagon_session, relay_mod, inputs, rtol=0, atol=0) # equal + + +class TestQnnDense: + """QNN dense op test class.""" + + @tvm.testing.requires_hexagon + def test_alter_layout_qnn_dense(self): + """Test weights layout transformation of qnn.dense with int8 weights""" + data = relay.var("data", shape=(128, 16), dtype="uint8") + weight = relay.var("weight", shape=(64, 16), dtype="int8") + zero = relay.const(0) + iscale = relay.const(0.15) + wscale = relay.const(0.37) + + def before(): + return relay.qnn.op.dense(data, weight, zero, zero, iscale, wscale, units=None) + + def expected(): + op0 = relay.layout_transform(weight, src_layout="NC", dst_layout="NC32n4c") + return relay.qnn.op.contrib_dense_pack(data, op0, zero, zero, iscale, wscale, "NC32n4c") + + target = tvm.target.hexagon("v68") + with tvm.target.Target(target): + a = run_opt_pass(before(), tvm.relay.transform.AlterOpLayout()) + b = run_infer_type(expected()) + tvm.ir.assert_structural_equal(a, b) + + # Dense + bias_add + requantize + # + dtype = tvm.testing.parameter("uint8", "int8") + n_dim = tvm.testing.parameter(64, 60) + + @tvm.testing.requires_hexagon + def test_qnn_dense_biasadd_requantize(self, hexagon_session: Session, dtype, n_dim): + """Check lowering of qnn.dense + bias_add + qnn.requantize + dtype: type of weights + n_dim: N dimension of weights, need to check cases when it is multiple of 32 and not. 
+ """ + data_shape = [128, 32] + weight_shape = [n_dim, 32] + bias_shape = [n_dim] + data = relay.var("data", shape=data_shape, dtype="uint8") + weight = relay.var("weight", shape=weight_shape, dtype=dtype) + bias = relay.var("bias", shape=bias_shape, dtype="int32") + + op0 = relay.qnn.op.dense( + data, + weight, + input_zero_point=relay.const(2), + kernel_zero_point=relay.const(0), + input_scale=relay.const(0.08), + kernel_scale=relay.const(0.07), + units=None, + ) + op1 = relay.nn.bias_add(op0, bias) + op2 = relay.qnn.op.requantize( + op1, + input_scale=relay.const(1.3), + input_zero_point=relay.const(4), + output_scale=relay.const(3.7), + output_zero_point=relay.const(1), + out_dtype="uint8", + ) + relay_mod = tvm.IRModule.from_expr(op2) + + np.random.seed(0) + + inputs = { + "data": np.random.randint(2, 8, size=data_shape, dtype="uint8"), + "weight": np.random.randint(0, 8, size=weight_shape, dtype=dtype), + "bias": np.random.randint(-10, 10, size=bias_shape, dtype="int32"), + } + + run_and_compare(hexagon_session, relay_mod, inputs, atol=1) # diff by 1 is ok + + # Dense + requantize + # + @tvm.testing.requires_hexagon + def test_qnn_dense_requantize(self, hexagon_session: Session): + """Check lowering of qnn.dense + qnn.requantize + Checkint the case: data type = "uint8", weight type = "int8", input zp = 0 and kernel zp = 0 + """ + data_shape = [128, 32] + weight_shape = [64, 32] + data = relay.var("data", shape=data_shape, dtype="uint8") + weight = relay.var("weight", shape=weight_shape, dtype="int8") + + op0 = relay.qnn.op.dense( + data, + weight, + input_zero_point=relay.const(0), + kernel_zero_point=relay.const(0), + input_scale=relay.const(0.06), + kernel_scale=relay.const(0.19), + units=64, + ) + op1 = relay.qnn.op.requantize( + op0, + input_scale=relay.const(0.1), + input_zero_point=relay.const(0), + output_scale=relay.const(0.24), + output_zero_point=relay.const(64), + out_dtype="uint8", + ) + relay_mod = tvm.IRModule.from_expr(op1) + + np.random.seed(0) + + inputs = { + "data": np.random.randint(0, 8, size=data_shape, dtype="uint8"), + "weight": np.random.randint(-4, 4, size=weight_shape, dtype="int8"), + } + + run_and_compare(hexagon_session, relay_mod, inputs, atol=1) # diff by 1 is ok + + +class TestQnnTanh: + """QNN tanh test class""" + + @tvm.testing.requires_hexagon + def test_qnn_tanh(self, hexagon_session: Session): + """Test qnn.tanh""" + data_shape = [256] + data = relay.var("data", shape=data_shape, dtype="uint8") + + op = relay.qnn.op.tanh( + data, + scale=relay.const(0.518), + zero_point=relay.const(137), + output_scale=relay.const(0.207), + output_zero_point=relay.const(128), + ) + relay_mod = tvm.IRModule.from_expr(op) + + inputs = {"data": np.arange(0, 256, 1, dtype="uint8")} + + run_and_compare(hexagon_session, relay_mod, inputs, rtol=0, atol=0) # equal + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/python/contrib/test_hexagon/test_wo_qnn_canonicalization.py b/tests/python/contrib/test_hexagon/test_wo_qnn_canonicalization.py deleted file mode 100644 index f4342f5814df..000000000000 --- a/tests/python/contrib/test_hexagon/test_wo_qnn_canonicalization.py +++ /dev/null @@ -1,475 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""No QNN canonicalization tests.""" - -import numpy as np - -import tvm.testing -from tvm import relay -from tvm.contrib.hexagon.session import Session -from tvm.contrib.hexagon.pytest_plugin import HEXAGON_AOT_LLVM_TARGET -from tvm.relay.backend import Executor -from tvm.relay.testing import run_opt_pass, run_infer_type - - -@tvm.testing.requires_hexagon -def test_no_qnn_pass(): - """No QNN pass test.""" - x = relay.var("x", shape=(4, 8), dtype="float32") - op0 = relay.qnn.op.quantize(x, relay.const(2.0), relay.const(10), out_dtype="uint8") - op1 = relay.qnn.op.dequantize(op0, relay.const(0.5), relay.const(5)) - mod = tvm.IRModule.from_expr(op1) - - target_hexagon = tvm.target.hexagon("v68") - # Default compilation flow - with tvm.transform.PassContext(opt_level=3): - opt_mod_1, _ = relay.optimize(mod, tvm.target.Target(target_hexagon, host=target_hexagon)) - - # Disable QNN legalization and canonicalization passes - with tvm.transform.PassContext(opt_level=3, disabled_pass=["qnn.Legalize"]): - opt_mod_2, _ = relay.optimize(mod, tvm.target.Target(target_hexagon, host=target_hexagon)) - - # Check that QNN ops are absent with default compilation flow. - assert "qnn.quantize" not in opt_mod_1.astext(show_meta_data=False) - assert "qnn.dequantize" not in opt_mod_1.astext(show_meta_data=False) - - # Check that QNN ops are present without "qnn.Legalize" passes. 
- assert "qnn.quantize" in opt_mod_2.astext(show_meta_data=False) - assert "qnn.dequantize" in opt_mod_2.astext(show_meta_data=False) - - -def test_alter_layout_qnn_dense(): - """Test weights layout transformation of qnn.dense with int8 weights""" - data = relay.var("data", shape=(128, 16), dtype="uint8") - weight = relay.var("weight", shape=(64, 16), dtype="int8") - zero = relay.const(0) - iscale = relay.const(0.15) - wscale = relay.const(0.37) - - def before(): - return relay.qnn.op.dense(data, weight, zero, zero, iscale, wscale, units=None) - - def expected(): - op0 = relay.layout_transform(weight, src_layout="NC", dst_layout="NC32n4c") - return relay.qnn.op.contrib_dense_pack(data, op0, zero, zero, iscale, wscale, "NC32n4c") - - target = tvm.target.hexagon("v68") - with tvm.target.Target(target): - a = run_opt_pass(before(), tvm.relay.transform.AlterOpLayout()) - b = run_infer_type(expected()) - tvm.ir.assert_structural_equal(a, b) - - -def execute(mod_executor, inputs: dict): - for input_name, input_data in inputs.items(): - mod_executor.set_input(input_name, input_data) - mod_executor.run() - return mod_executor.get_output(0).numpy() - - -def build_hexagon_module(mod): - with tvm.transform.PassContext(opt_level=3, disabled_pass=["QnnCanonicalize"]): - hexagon_lowered = tvm.relay.build( - mod, - tvm.target.Target(HEXAGON_AOT_LLVM_TARGET, host=HEXAGON_AOT_LLVM_TARGET), - executor=Executor("aot"), - ) - - return hexagon_lowered - - -def build_ref_module(mod): - target_llvm = tvm.target.Target("llvm") - with tvm.transform.PassContext(opt_level=3): - llvm_lowered = tvm.relay.build( - mod, - tvm.target.Target(target_llvm, host=target_llvm), - executor=Executor("aot"), - ) - return llvm_lowered - - -@tvm.testing.requires_hexagon -def test_qnn_conv2d_rq(hexagon_session: Session): - """QNN conv2d test.""" - data_shape = [1, 8, 32, 32] - weight_shape = [16, 8, 3, 3] - data = relay.var("data", shape=data_shape, dtype="float32") - weight = relay.var("weight", shape=weight_shape, dtype="float32") - op0 = relay.qnn.op.quantize(data, relay.const(0.078), relay.const(0), out_dtype="uint8") - op1 = relay.qnn.op.quantize(weight, relay.const(0.07), relay.const(0), out_dtype="int8") - op2 = relay.qnn.op.conv2d( - op0, - op1, - input_zero_point=relay.const(0), - kernel_zero_point=relay.const(0), - input_scale=relay.const(0.078), - kernel_scale=relay.const(0.07), - padding=[0, 0, 0, 0], - channels=16, - kernel_size=[3, 3], - ) - op5 = relay.qnn.op.requantize( - op2, - input_scale=relay.const(0.05), - input_zero_point=relay.const(0), - output_scale=relay.const(0.21), - output_zero_point=relay.const(61), - out_dtype="int8", - ) - relay_mod = tvm.IRModule.from_expr(op5) - - # Compile for Hexagon - hexagon_lowered = build_hexagon_module(relay_mod) - - # Reference compilation - llvm_lowered = build_ref_module(relay_mod) - - data_np = np.random.rand(*data_shape) - weight_np = np.random.rand(*weight_shape) - 0.5 - inputs = {"data": data_np, "weight": weight_np} - - hx_m = hexagon_session.get_executor_from_factory(hexagon_lowered) - hexagon_output = execute(hx_m, inputs) - - dev = tvm.cpu(0) - llvm_m = tvm.runtime.executor.AotModule(llvm_lowered["default"](dev)) - llvm_out = execute(llvm_m, inputs) - - np.testing.assert_equal(hexagon_output, llvm_out) - - -class TestQnnDense: - """QNN dense op test class.""" - - dtype = tvm.testing.parameter("uint8", "int8") - n_dim = tvm.testing.parameter(64, 60) - - @tvm.testing.requires_hexagon - def test_qnn_dense_add_requantize(self, hexagon_session: Session, dtype, n_dim): - 
"""Check lowering of qnn.dense + bias_add + qnn.requantize - dtype: type of weights - n_dim: N dimension of weights, need to check cases when it is multiple of 32 and not. - """ - data_shape = [128, 32] - weight_shape = [n_dim, 32] - bias_shape = [n_dim] - data = relay.var("data", shape=data_shape, dtype="uint8") - weight = relay.var("weight", shape=weight_shape, dtype=dtype) - bias = relay.var("bias", shape=bias_shape, dtype="int32") - - op0 = relay.qnn.op.dense( - data, - weight, - input_zero_point=relay.const(2), - kernel_zero_point=relay.const(0), - input_scale=relay.const(0.08), - kernel_scale=relay.const(0.07), - units=None, - ) - op1 = relay.nn.bias_add(op0, bias) - op2 = relay.qnn.op.requantize( - op1, - input_scale=relay.const(1.3), - input_zero_point=relay.const(4), - output_scale=relay.const(3.7), - output_zero_point=relay.const(1), - out_dtype="uint8", - ) - relay_mod = tvm.IRModule.from_expr(op2) - - # Compile for Hexagon - hexagon_lowered = build_hexagon_module(relay_mod) - - # Reference compilation - llvm_lowered = build_ref_module(relay_mod) - - np.random.seed(0) - - data_np = np.random.randint(2, 8, size=data_shape, dtype="uint8") - weight_np = np.random.randint(0, 8, size=weight_shape, dtype=dtype) - bias_np = np.random.randint(-10, 10, size=bias_shape, dtype="int32") - inputs = {"data": data_np, "weight": weight_np, "bias": bias_np} - - hx_m = hexagon_session.get_executor_from_factory(hexagon_lowered) - hexagon_output = execute(hx_m, inputs) - - llvm_m = tvm.runtime.executor.AotModule(llvm_lowered["default"](tvm.cpu(0))) - llvm_out = execute(llvm_m, inputs) - - # Diff by 1 is Ok. - tvm.testing.assert_allclose(hexagon_output, llvm_out, atol=1) - - @tvm.testing.requires_hexagon - def test_qnn_dense_requantize(self, hexagon_session: Session): - """Check lowering of qnn.dense + qnn.requantize - Checkint the case: data type = "uint8", weight type = "int8", input zp = 0 and kernel zp = 0 - """ - data_shape = [128, 32] - weight_shape = [64, 32] - data = relay.var("data", shape=data_shape, dtype="uint8") - weight = relay.var("weight", shape=weight_shape, dtype="int8") - - op0 = relay.qnn.op.dense( - data, - weight, - input_zero_point=relay.const(0), - kernel_zero_point=relay.const(0), - input_scale=relay.const(0.06), - kernel_scale=relay.const(0.19), - units=64, - ) - op1 = relay.qnn.op.requantize( - op0, - input_scale=relay.const(0.1), - input_zero_point=relay.const(0), - output_scale=relay.const(0.24), - output_zero_point=relay.const(64), - out_dtype="uint8", - ) - relay_mod = tvm.IRModule.from_expr(op1) - - # Compile for Hexagon - hexagon_lowered = build_hexagon_module(relay_mod) - - # Reference compilation - llvm_lowered = build_ref_module(relay_mod) - - np.random.seed(0) - - data_np = np.random.randint(0, 8, size=data_shape, dtype="uint8") - weight_np = np.random.randint(-4, 4, size=weight_shape, dtype="int8") - inputs = {"data": data_np, "weight": weight_np} - - hx_m = hexagon_session.get_executor_from_factory(hexagon_lowered) - hexagon_output = execute(hx_m, inputs) - - llvm_m = tvm.runtime.executor.AotModule(llvm_lowered["default"](tvm.cpu(0))) - llvm_out = execute(llvm_m, inputs) - - # Diff by 1 is Ok. 
- tvm.testing.assert_allclose(hexagon_output, llvm_out, atol=1) - - -class TestQnnBinaryOp: - """QNN binary op test class""" - - operation = tvm.testing.parameter( - relay.qnn.op.add, - relay.qnn.op.subtract, - relay.qnn.op.mul, - ) - dtype = tvm.testing.parameter("uint8", "int8") - input_shape = tvm.testing.parameter([256], [4, 256]) - - @tvm.testing.requires_hexagon - def test_qnn_binary_op_broadcasting( - self, hexagon_session: Session, operation, dtype, input_shape - ): - """qnn binary op test without QNN canonicalization.""" - lhs_shape = [4, 256] - rhs_shape = input_shape - lhs = relay.var("lhs", shape=lhs_shape, dtype=dtype) - rhs = relay.var("rhs", shape=rhs_shape, dtype=dtype) - zp_const1 = 1 - zp_const2 = 3 - - op = operation( - lhs, - rhs, - lhs_scale=relay.const(0.041, "float32"), - lhs_zero_point=relay.const(zp_const1, "int32"), - rhs_scale=relay.const(0.017, "float32"), - rhs_zero_point=relay.const(zp_const2, "int32"), - output_scale=relay.const(0.039, "float32"), - output_zero_point=relay.const(2, "int32"), - ) - mod = tvm.IRModule.from_expr(op) - - # Compile for Hexagon - hexagon_lowered = build_hexagon_module(mod) - - # Reference compilation - llvm_lowered = build_ref_module(mod) - - lhs_np = np.random.randint(np.iinfo(dtype).min + zp_const1, np.iinfo(dtype).max, lhs_shape) - rhs_np = np.random.randint(np.iinfo(dtype).min + zp_const2, np.iinfo(dtype).max, rhs_shape) - inputs = {"lhs": lhs_np, "rhs": rhs_np} - - hx_m = hexagon_session.get_executor_from_factory(hexagon_lowered) - hexagon_output = execute(hx_m, inputs) - - dev = tvm.cpu(0) - llvm_m = tvm.runtime.executor.AotModule(llvm_lowered["default"](dev)) - llvm_output = execute(llvm_m, inputs) - - # Diff by 1 is Ok. - tvm.testing.assert_allclose(hexagon_output, llvm_output, atol=1) - - @tvm.testing.requires_hexagon - def test_qnn_binary_op_scalar(self, hexagon_session: Session, operation): - """qnn binary op test without QNN canonicalization.""" - lhs_shape = [4, 256] - lhs = relay.var("lhs", shape=lhs_shape, dtype="uint8") - rhs = relay.const(11, dtype="uint8") - - op = operation( - lhs, - rhs, - lhs_scale=relay.const(0.049, "float32"), - lhs_zero_point=relay.const(1, "int32"), - rhs_scale=relay.const(0.067, "float32"), - rhs_zero_point=relay.const(3, "int32"), - output_scale=relay.const(0.041, "float32"), - output_zero_point=relay.const(2, "int32"), - ) - mod = tvm.IRModule.from_expr(op) - - # Compile for Hexagon - hexagon_lowered = build_hexagon_module(mod) - - # Reference compilation - llvm_lowered = build_ref_module(mod) - - lhs_np = np.random.randint(1, 255, size=lhs_shape) - inputs = {"lhs": lhs_np} - - hx_m = hexagon_session.get_executor_from_factory(hexagon_lowered) - hexagon_output = execute(hx_m, inputs) - - dev = tvm.cpu(0) - llvm_m = tvm.runtime.executor.AotModule(llvm_lowered["default"](dev)) - llvm_output = execute(llvm_m, inputs) - - # Diff by 1 is Ok. 
- tvm.testing.assert_allclose(hexagon_output, llvm_output, atol=1) - - -class TestQnnOp: - """QNN op test class""" - - @tvm.testing.requires_hexagon - def test_qnn_requantize(self, hexagon_session: Session): - """qnn.requantize test without QNN canonicalization.""" - data_shape = [256] - data = relay.var("data", shape=data_shape, dtype="int32") - - op = relay.qnn.op.requantize( - data, - input_scale=relay.const(0.156), - input_zero_point=relay.const(2), - output_scale=relay.const(0.212), - output_zero_point=relay.const(1), - out_dtype="int8", - ) - mod = tvm.IRModule.from_expr(op) - - # Compile for Hexagon - hexagon_lowered = build_hexagon_module(mod) - - # Reference compilation - llvm_lowered = build_ref_module(mod) - - data_np = np.arange(-256, 256, 2, dtype="int32") - inputs = {"data": data_np} - - hx_m = hexagon_session.get_executor_from_factory(hexagon_lowered) - hexagon_output = execute(hx_m, inputs) - - dev = tvm.cpu(0) - llvm_m = tvm.runtime.executor.AotModule(llvm_lowered["default"](dev)) - llvm_output = execute(llvm_m, inputs) - - np.testing.assert_equal(hexagon_output, llvm_output) - - @tvm.testing.requires_hexagon - def test_qnn_concatenate(self, hexagon_session: Session): - """qnn.concatenate op test without QNN canonicalization.""" - x_shape = [1, 64] - y_shape = [2, 64] - z_shape = [3, 64] - input_x = relay.var("x", shape=x_shape, dtype="uint8") - input_y = relay.var("y", shape=y_shape, dtype="uint8") - input_z = relay.var("z", shape=z_shape, dtype="uint8") - - op = relay.qnn.op.concatenate( - (input_x, input_y, input_z), - input_scales=(relay.const(0.3), relay.const(0.7), relay.const(1.3)), - input_zero_points=(relay.const(0), relay.const(1), relay.const(2)), - output_scale=relay.const(0.8), - output_zero_point=relay.const(5), - axis=0, - ) - mod = tvm.IRModule.from_expr(op) - - # Compile for Hexagon - hexagon_lowered = build_hexagon_module(mod) - - # Reference compilation - llvm_lowered = build_ref_module(mod) - - x_np = np.arange(0, 64, 1, dtype="uint8").reshape(x_shape) - y_np = np.arange(0, 128, 1, dtype="uint8").reshape(y_shape) - z_np = np.arange(0, 192, 1, dtype="uint8").reshape(z_shape) - inputs = {"x": x_np, "y": y_np, "z": z_np} - - hx_m = hexagon_session.get_executor_from_factory(hexagon_lowered) - hexagon_output = execute(hx_m, inputs) - - dev = tvm.cpu(0) - llvm_m = tvm.runtime.executor.AotModule(llvm_lowered["default"](dev)) - llvm_output = execute(llvm_m, inputs) - - # Diff by 1 is Ok. 
- tvm.testing.assert_allclose(hexagon_output, llvm_output, atol=1) - - @tvm.testing.requires_hexagon - def test_qnn_tanh(self, hexagon_session: Session): - """qnn.tanh op test without QNN canonicalization.""" - data_shape = [256] - data = relay.var("data", shape=data_shape, dtype="uint8") - - op = relay.qnn.op.tanh( - data, - scale=relay.const(0.518), - zero_point=relay.const(137), - output_scale=relay.const(0.207), - output_zero_point=relay.const(128), - ) - mod = tvm.IRModule.from_expr(op) - - # Compile for Hexagon - hexagon_lowered = build_hexagon_module(mod) - - # Reference compilation - llvm_lowered = build_ref_module(mod) - - data_np = np.arange(0, 256, 1, dtype="uint8") - inputs = {"data": data_np} - - hx_m = hexagon_session.get_executor_from_factory(hexagon_lowered) - hexagon_output = execute(hx_m, inputs) - - dev = tvm.cpu(0) - llvm_m = tvm.runtime.executor.AotModule(llvm_lowered["default"](dev)) - llvm_output = execute(llvm_m, inputs) - - np.testing.assert_equal(hexagon_output, llvm_output) - - -if __name__ == "__main__": - tvm.testing.main() diff --git a/tests/python/contrib/test_hexagon/topi/slice_op/test_avg_pool2d_slice.py b/tests/python/contrib/test_hexagon/topi/slice_op/test_avg_pool2d_slice.py index 0eedfdbf8da1..712d5b303eeb 100644 --- a/tests/python/contrib/test_hexagon/topi/slice_op/test_avg_pool2d_slice.py +++ b/tests/python/contrib/test_hexagon/topi/slice_op/test_avg_pool2d_slice.py @@ -20,74 +20,70 @@ from tvm import te import tvm.testing +from tvm.topi.testing import poolnd_python from tvm.contrib.hexagon.session import Session import tvm.topi.hexagon.slice_ops as sl import tvm.topi.hexagon.qnn as qn from tvm.contrib.hexagon import allocate_hexagon_array -from ...infrastructure import ( - transform_numpy, - quantize_np, - get_hexagon_target, -) -from ...pytest_util import ( - get_multitest_ids, - create_populated_numpy_ndarray, - TensorContentRandom, -) +import pytest +from ...infrastructure import transform_numpy, quantize_np, get_hexagon_target +from ...pytest_util import get_multitest_ids, create_populated_numpy_ndarray, TensorContentRandom -input_layout = tvm.testing.parameter( - "nhwc-8h2w32c2w-2d", -) -dtype = tvm.testing.parameter("float16", "uint8") +dtype = tvm.testing.parameter("uint8", "float16") @tvm.testing.fixture -def output_layout(output_shape, dtype): - o_b, o_h, o_w, o_c = output_shape - if dtype == "float16": - if o_h == 1 and o_w == 1: - return "n11c-1024c-2d" +def output_layout(output_shape, op_layout, dtype): + if op_layout == "NHWC": + o_b, o_h, o_w, o_c = output_shape + if dtype == "float16": + if o_h == 1 and o_w == 1: + return "n11c-1024c-2d" + else: + return "nhwc-8h2w32c2w-2d" + elif dtype == "int8" or "uint8": + if o_h == 1 and o_w == 1: + return "n11c-2048c-2d" + else: + return "nhwc-8h8w32c-2d" else: - assert o_h % 8 == 0 and o_w % 4 == 0, "Invalid output shape" - return "nhwc-8h2w32c2w-2d" - elif dtype == "int8" or "uint8": - if o_h == 1 and o_w == 1: - return "n11c-2048c-2d" + raise RuntimeError(f"Unsupported data type '{dtype}'") + + elif op_layout == "NCHW": + o_b, o_c, o_h, o_w = output_shape + if dtype == "float16": + if o_h == 1 and o_w == 1: + return "nc11-1024c-2d" + else: + return "nchw-8h2w32c2w-2d" + elif dtype == "int8" or "uint8": + if o_h == 1 and o_w == 1: + return "nc11-2048c-2d" + else: + return "nchw-8h8w32c-2d" else: - assert o_h % 8 == 0 and o_w % 8 == 0, "Invalid output shape" - return "nhwc-8h8w32c-2d" + raise RuntimeError(f"Unsupported data type '{dtype}'") else: - raise RuntimeError(f"Unsupported data type 
'{dtype}'") + raise RuntimeError(f"Unsupported layout for qnn.avg_pool2d '{op_layout}'") @tvm.testing.fixture -def input_np(input_shape, dtype: str, input_tensor_populator): - if dtype == "uint8": - dtype = "float32" # Use "float32" input which will be quantized later - return create_populated_numpy_ndarray(input_shape, dtype, input_tensor_populator) - - -@tvm.testing.fixture -def transformed_expected_output_np(expected_output_np, output_layout, dtype): +def input_layout(op_layout, dtype): + in_layout = op_layout.lower() if dtype == "float16": - return transform_numpy(expected_output_np, "nhwc", output_layout) - elif dtype in ("uint8", "int8"): - quant_arr, scale, zero_point = quantize_np(expected_output_np, dtype) - return [transform_numpy(quant_arr, "nhwc", output_layout), scale, zero_point] + return in_layout + "-8h2w32c2w-2d" + elif dtype == "int8" or "uint8": + return in_layout + "-8h8w32c-2d" else: raise RuntimeError(f"Unsupported data type '{dtype}'") @tvm.testing.fixture -def transformed_input_np_padded(input_np_padded, input_layout, dtype): - if dtype == "float16": - return transform_numpy(input_np_padded, "nhwc", input_layout) - elif dtype in ("uint8", "int8"): - quant_arr, scale, zero_point = quantize_np(input_np_padded, dtype) - return [transform_numpy(quant_arr, "nhwc", input_layout), scale, zero_point] - else: - raise RuntimeError(f"Unsupported data type '{dtype}'") +def input_np(input_shape, dtype: str, input_tensor_populator): + if dtype == "uint8": + dtype = "float32" # Use "float32" input which will be quantized later + return create_populated_numpy_ndarray(input_shape, dtype, input_tensor_populator) class TestAvgPool2dSlice: @@ -99,110 +95,143 @@ class TestAvgPool2dSlice: "pad", # padding "ceil", # ceil_mode "cnt_padded", # count_include_pad + "op_layout", # input output 4D layout None, # input_tensor_populator ] - _multitest_params = [ ( - [1, 8, 8, 32], + [1, 7, 11, 32], + [3, 3], + [3, 2], + [2, 3], + [1, 2, 3, 4], + False, + False, + "NHWC", + TensorContentRandom(), + ), + ( + [1, 1, 1, 2048], + [4, 4], + [2, 2], + [2, 3], + [0, 2, 1, 4], + False, + False, + "NHWC", + TensorContentRandom(), + ), + # Test default stride,dilation, and padding with different layouts + ( + [1, 10, 10, 32], [3, 3], [1, 1], [1, 1], [0, 0, 0, 0], False, True, + "NHWC", TensorContentRandom(), ), ( - [1, 16, 16, 32], + [1, 12, 12, 32], [3, 3], [1, 1], [1, 1], [0, 0, 0, 0], False, True, + "NHWC", TensorContentRandom(), ), ( - [1, 8, 8, 32], - [8, 8], + [1, 32, 14, 14], + [3, 3], [1, 1], [1, 1], [0, 0, 0, 0], False, True, + "NCHW", TensorContentRandom(), ), - # Test non-one stride and dilation ( - [1, 8, 8, 32], - [3, 3], - [2, 3], + [1, 32, 15, 15], + [8, 8], + [1, 1], [1, 1], [0, 0, 0, 0], False, True, + "NCHW", TensorContentRandom(), ), + # Test non-one stride and dilation with different layouts ( - [1, 8, 8, 32], + [1, 18, 24, 32], [3, 3], - [2, 2], + [2, 3], [2, 2], [0, 0, 0, 0], False, True, + "NHWC", TensorContentRandom(), ), ( - [1, 8, 8, 32], - [3, 3], + [1, 32, 18, 18], + [5, 5], [2, 2], [2, 3], [0, 0, 0, 0], False, True, + "NCHW", TensorContentRandom(), ), - # Test non-zero padding + # Test non-zero padding with count include and exclude pad and different layouts ( - [1, 8, 8, 32], + [1, 6, 6, 32], [3, 3], [1, 1], [1, 1], [1, 1, 1, 1], False, - True, + False, + "NHWC", TensorContentRandom(), ), ( [1, 8, 8, 32], [3, 3], - [1, 1], - [1, 1], - [1, 2, 3, 4], + [1, 2], + [2, 3], + [2, 2, 3, 3], False, - True, + False, + "NHWC", TensorContentRandom(), ), ( - [1, 8, 8, 32], + [1, 32, 6, 6], 
[3, 3], [1, 1], [1, 1], [1, 2, 3, 4], False, - True, + False, + "NCHW", TensorContentRandom(), ), ( - [1, 8, 8, 32], + [1, 32, 15, 22], [3, 3], [3, 2], [2, 3], [1, 2, 3, 4], False, - True, + False, + "NCHW", TensorContentRandom(), ), # Test n11c-1024c-2d layout which will require input and output to have different layout @@ -214,6 +243,7 @@ class TestAvgPool2dSlice: [0, 0, 0, 0], False, True, + "NHWC", TensorContentRandom(), ), ( @@ -221,9 +251,21 @@ class TestAvgPool2dSlice: [6, 6], [1, 1], [1, 1], - [0, 0, 0, 0], + [2, 2, 2, 2], False, - True, + False, + "NHWC", + TensorContentRandom(), + ), + ( + [1, 1, 1, 2048], + [4, 4], + [2, 2], + [2, 3], + [0, 2, 1, 4], + False, + False, + "NHWC", TensorContentRandom(), ), ( @@ -234,23 +276,24 @@ class TestAvgPool2dSlice: [0, 0, 0, 0], False, True, + "NHWC", TensorContentRandom(), ), ( - [1, 1, 1, 2048], + [1, 2048, 1, 1], [4, 4], [2, 2], [2, 3], [0, 0, 0, 0], False, True, + "NCHW", TensorContentRandom(), ), ] _param_ids = get_multitest_ids(_multitest_params, _param_descs) - # NOTE: input_layout is always assumed to be "nhwc-8h2w32c2w-2d" ( output_shape, kernel, @@ -259,23 +302,17 @@ class TestAvgPool2dSlice: padding, ceil_mode, count_include_pad, + op_layout, input_tensor_populator, ) = tvm.testing.parameters(*_multitest_params, ids=_param_ids) @tvm.testing.fixture def expected_output_np( - self, - input_np, - kernel, - stride, - dilation, - padding, - ceil_mode, - count_include_pad, + self, input_np, kernel, stride, dilation, padding, ceil_mode, count_include_pad, op_layout ): pad_before = padding[:2] pad_after = padding[2:] - ref_np = tvm.topi.testing.poolnd_python( + ref_np = poolnd_python( input_np, kernel, stride, @@ -285,150 +322,129 @@ def expected_output_np( "avg", # pool_type count_include_pad, False, # ceil_mode, - layout="NHWC", + layout=op_layout, ) return ref_np @tvm.testing.fixture - def input_shape(self, output_shape, kernel, padding, stride, dilation, output_layout): + def input_shape( + self, output_shape, kernel, padding, stride, dilation, op_layout, output_layout + ): # Input shape without any padding; 'ceil' is being ignored from calculation: - o_b, o_h, o_w, o_c = output_shape + if op_layout == "NHWC": + o_b, o_h, o_w, o_c = output_shape + else: + o_b, o_c, o_h, o_w = output_shape d_h, d_w = dilation s_h, s_w = stride k_h, k_w = kernel pad_before_h, pad_before_w = padding[:2] pad_after_h, pad_after_w = padding[2:] - if output_layout == "n11c-1024c-2d": - assert ( - pad_before_w == 0 and pad_after_w == 0 and pad_before_h == 0 and pad_after_h == 0 - ), "Padding must be zero for n11c-1024c-2d layout" + if ( + output_layout == "n11c-2048c-2d" + or output_layout == "nc11-2048c-2d" + or output_layout == "n11c-1024c-2d" + or output_layout == "nc11-1024c-2d" + ): assert o_h == 1 and o_w == 1, "Output height and width must be 1" in_h = (o_h - 1) * s_h + d_h * (k_h - 1) + 1 - pad_before_h - pad_after_h in_w = (o_w - 1) * s_w + d_w * (k_w - 1) + 1 - pad_before_w - pad_after_w - return [o_b, in_h, in_w, o_c] - - @tvm.testing.fixture - def input_shape_padded(self, input_shape, padding, output_layout, dtype): - # Input shape is adjusted to account for 'padding'. Also, due to the physical - # layout of the buffer, height and width are adjusted so that they are a - # multiple of the buffer size dictated by the layout. - # NOTE: For float16, the input layout is always assumed to be nhwc-8h2w32c2w-2d and - # for int8/uint8, it's nhwc-8h8w32c-2d. - # For both nhwc-8h2w32c2w-2d and nhwc-8h8w32c-2d, the height should be a multiple - # of 8. 
However, the width should be a multiple of 4 for the first case and 8 for - # the second case. - - height_mult = 8 - if dtype == "float16": - width_mult = 4 # input layout : nhwc-8h2w32c2w-2d - elif dtype in ("uint8", "int8"): - width_mult = 8 # input layout : nhwc-8h8w32c-2d + if op_layout == "NHWC": + return [o_b, in_h, in_w, o_c] else: - raise RuntimeError(f"Unsupport dtype '{dtype}'") - - pad_before_h, pad_before_w = padding[:2] - pad_after_h, pad_after_w = padding[2:] - padded_input_height = ( - (input_shape[1] + pad_before_h + pad_after_h + height_mult - 1) // height_mult - ) * height_mult - padded_input_width = ( - (input_shape[2] + pad_before_w + pad_after_w + width_mult - 1) // width_mult - ) * width_mult - return [input_shape[0], padded_input_height, padded_input_width, input_shape[3]] - - @tvm.testing.fixture - def input_np_padded(self, input_np, input_shape, input_shape_padded, padding): - pad_before_h, pad_before_w = padding[:2] - pad_after_h = input_shape_padded[1] - input_shape[1] - pad_before_h - pad_after_w = input_shape_padded[2] - input_shape[2] - pad_before_w - input_padded = np.pad( - input_np, - ((0, 0), (pad_before_h, pad_after_h), (pad_before_w, pad_after_w), (0, 0)), - "constant", - ) - return input_padded + return [o_b, o_c, in_h, in_w] @tvm.testing.fixture def schedule_args( self, - stride, kernel, - dtype, + stride, + padding, dilation, - input_layout, + count_include_pad, output_layout, output_shape, - input_shape_padded, - transformed_input_np_padded, - transformed_expected_output_np, + input_np, + input_shape, + input_layout, + expected_output_np, + dtype, + op_layout, ): - """ - Construct schedule args based on dtype - """ - A = te.placeholder(input_shape_padded, name="A", dtype=dtype) - + """Construct schedule args based on dtype""" + A = te.placeholder(input_shape, name="A", dtype=dtype) if dtype == "float16": - M = sl.avg_pool2d_compute(A, kernel, stride, dilation, output_shape) + if op_layout == "NHWC": + M = sl.avg_pool2d_NHWC( + A, kernel, stride, padding, dilation, count_include_pad, output_shape + ) + elif op_layout == "NCHW": + M = sl.avg_pool2d_NCHW( + A, kernel, stride, padding, dilation, count_include_pad, output_shape + ) + else: + raise RuntimeError(f"Unsupported layout for slice_op.avg_pool2d '{op_layout}'") tir_schedule = sl.avg_pool2d_schedule(M, A, output_layout, input_layout) elif dtype in ("uint8", "int8"): - in_data, in_scale, in_zero_point = transformed_input_np_padded - _, out_scale, out_zero_point = transformed_expected_output_np - M = qn.qnn_avg_pool2d_compute( - A, - kernel, - stride, - dilation, - output_shape, - dtype, - in_zero_point, - in_scale, - out_zero_point, - out_scale, - ) + _, in_scale, in_zero_point = quantize_np(input_np, dtype) + _, out_scale, out_zero_point = quantize_np(expected_output_np, dtype) + if op_layout == "NHWC": + M = qn.qnn_avg_pool2d_NHWC( + A, + kernel, + stride, + padding, + dilation, + count_include_pad, + output_shape, + dtype, + in_scale, + in_zero_point, + out_scale, + out_zero_point, + ) + elif op_layout == "NCHW": + M = qn.qnn_avg_pool2d_NCHW( + A, + kernel, + stride, + padding, + dilation, + count_include_pad, + output_shape, + dtype, + in_scale, + in_zero_point, + out_scale, + out_zero_point, + ) + else: + raise RuntimeError(f"Unsupported layout for qnn.avg_pool2d '{op_layout}'") + tir_schedule = qn.qnn_avg_pool2d_schedule(M, A, output_layout, input_layout) return [tir_schedule.mod, [A, M]] @tvm.testing.requires_hexagon def test_avg_pool2d_slice( - self, - dtype, - output_layout, - 
output_shape, - transformed_input_np_padded, - transformed_expected_output_np, - schedule_args, - hexagon_session: Session, + self, dtype, input_np, expected_output_np, schedule_args, hexagon_session: Session ): - in_data = transformed_input_np_padded - + print("schedule_args : ", schedule_args) with tvm.transform.PassContext(opt_level=3): - func = tvm.build( - *schedule_args, - get_hexagon_target("v69"), - name="avg_pool2d", - ) - - input_axis_separator = [4] - if output_layout in ( - "nhwc-8h2w32c2w-2d", - "nhwc-8h8w32c-2d", - "n11c-1024c-2d", - "n11c-2048c-2d", - ): - output_axis_separator = [4] - else: - raise RuntimeError(f"Unexpected layout '{output_layout}'") + func = tvm.build(*schedule_args, get_hexagon_target("v69"), name="avg_pool2d") + + input_axis_separator = [] + output_axis_separator = [] if dtype == "float16": - in_data_np = transformed_input_np_padded - out_data_np = transformed_expected_output_np + in_data_np = input_np + out_data_np = expected_output_np elif dtype in ("uint8", "int8"): - in_data_np, _, _ = transformed_input_np_padded - out_data_np, _, _ = transformed_expected_output_np + in_data_np, _, _ = quantize_np(input_np, dtype) + out_data_np, _, _ = quantize_np(expected_output_np, dtype) else: raise RuntimeError(f"Unsupport dtype '{dtype}'") @@ -436,34 +452,26 @@ def test_avg_pool2d_slice( hexagon_session.device, data=in_data_np, axis_separators=input_axis_separator, - mem_scope="global.vtcm", + mem_scope="global.ddr", ) output_arr = allocate_hexagon_array( hexagon_session.device, out_data_np.shape, dtype, axis_separators=output_axis_separator, - mem_scope="global.vtcm", + mem_scope="global.ddr", ) mod = hexagon_session.load_module(func) mod(input_arr, output_arr) - b, h, w, c = output_shape - if output_layout == "nhwc-8h2w32c2w-2d": - output_np = output_arr.numpy().reshape([b, h // 8, w // 4, c // 32, 8, 2, 32, 2]) - elif output_layout == "nhwc-8h8w32c-2d": - output_np = output_arr.numpy().reshape([b, h // 8, w // 8, c // 32, 8, 8, 32]) - elif output_layout == "n11c-2048c-2d": - output_np = output_arr.numpy().reshape([b, 1, 1, c // 2048, 2048]) - elif output_layout == "n11c-1024c-2d": - output_np = output_arr.numpy().reshape([b, 1, 1, c // 1024, 1024]) - else: - raise RuntimeError(f"Unexpected layout '{output_layout}'") + + output_np = output_arr.numpy() if dtype == "float16": np.testing.assert_allclose(output_np, out_data_np, rtol=1e-3, atol=1e-3) else: - np.testing.assert_allclose(output_np, out_data_np, rtol=1, atol=1) + output_np = output_arr.numpy() + np.testing.assert_allclose(output_np, out_data_np, rtol=0, atol=2) if __name__ == "__main__":