diff --git a/python/tvm/topi/hexagon/slice_ops/__init__.py b/python/tvm/topi/hexagon/slice_ops/__init__.py
index fe1180e43985..cda63e2e1c73 100644
--- a/python/tvm/topi/hexagon/slice_ops/__init__.py
+++ b/python/tvm/topi/hexagon/slice_ops/__init__.py
@@ -18,6 +18,7 @@
 """ Computes and Schedules for Hexagon slice ops. """
 from .avg_pool2d import avg_pool2d_compute, avg_pool2d_STIR_schedule
+from .max_pool2d import max_pool2d_compute, max_pool2d_STIR_schedule
 from .add_subtract_multiply import *
 from .argmax import argmax_compute, argmax_schedule
 from .batch_flatten import batch_flatten_compute, batch_flatten_stir_schedule
diff --git a/python/tvm/topi/hexagon/slice_ops/max_pool2d.py b/python/tvm/topi/hexagon/slice_ops/max_pool2d.py
new file mode 100644
index 000000000000..4bf958c11694
--- /dev/null
+++ b/python/tvm/topi/hexagon/slice_ops/max_pool2d.py
@@ -0,0 +1,196 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name, unused-variable, unused-argument, too-many-locals
+
+""" Compute and schedule for max_pool2d slice op
+
+Please note the following assumptions made by the implementation:
+
+1) The input must be padded in advance to account for 'padding'. In addition,
+   both input and output must be padded as per the physical buffer layout.
+
+2) The current implementation assumes 'count_include_pad' to be 'True'. It can be
+   modified to support the 'False' case, but the element count for the pooling window
+   must be pre-computed and provided as an input to reduce the run-time overhead.
+
+3) 'padding' is ignored. It must be handled outside of the sliced op.
+
+4) This implementation will not work if the output includes any physical-layout-related
+   padding, as it can result in out-of-bounds access to the input.
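+
+For illustration only (a hypothetical caller-side sketch, not part of this op): with the
+'nhwc-8h2w32c2w-2d' input layout handled below, assumption 1) means the caller rounds H up
+to a multiple of 8 and W up to a multiple of 4 before invoking the op, e.g. with numpy:
+
+    pad_h = -inp.shape[1] % 8
+    pad_w = -inp.shape[2] % 4
+    inp = np.pad(inp, ((0, 0), (0, pad_h), (0, pad_w), (0, 0)), "constant")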
+""" + +from tvm import te +from tvm import tir +from ..utils import get_layout_transform_fn + + +def validate_out_shape(out_shape, in_shape, kernel, stride, dilation): + """Validate output shape""" + _, oh, ow, _ = out_shape + _, ih, iw, _ = in_shape + kh, kw = kernel + sh, sw = stride + dh, dw = dilation + if ih < (oh - 1) * sh + dh * (kh - 1) + 1: + raise RuntimeError("Output height is too large") + if iw < (ow - 1) * sw + dw * (kw - 1) + 1: + raise RuntimeError("Output width is too large") + + +def max_pool2d_compute(A, out_shape, kernel, stride, dilation): + """max_pool2d compute""" + kh, kw = kernel + rh = te.reduce_axis((0, kh), name="rh") + rw = te.reduce_axis((0, kw), name="rw") + ob, oh, ow, oc = out_shape + if isinstance(ob, int): + validate_out_shape(out_shape, A.shape, kernel, stride, dilation) + + sh, sw = stride + dh, dw = dilation + + Max = te.compute( + out_shape, + lambda b, h, w, c: te.max( + A[b, h * sh + dh * rh, w * sw + dw * rw, c].astype(A.dtype), axis=[rh, rw] + ), + name="max", + ) + return Max + + +def STIR_schedule_nhwc_8h2w32c2w(outs, ins, output_layout: str, input_layout: str): + """Schedule for input and output layout nhwc-8h2w32c2w""" + func = te.create_prim_func([ins, outs]) + s = tir.Schedule(func) + + # NOTE!!! This scheduling logic is a work in progress. + # It is not known to ultimately result in near-optimal Hexagon performance. + # The schedule below strives to implement these heuristics: + # + # (1) For mathematical operations on tensor values, prefer HVX SIMD operations + # over per-element scalar operations. + # + # (2) Minimize the number of memory transfers used to operate on tensor values: + # host-memory <--> Hexagon DDR <--> VTCM <--> HVX registers + # + # As a consequence of (1) + (2), prefer TIR schedules that load each value + # into an HVX SIMD tensor exactly once. + + Max = s.get_block("max") + + input_transform_fn = get_layout_transform_fn(input_layout) + output_transform_fn = get_layout_transform_fn(output_layout) + + s.transform_layout(Max, ("read", 0), input_transform_fn) + s.transform_layout(Max, ("write", 0), output_transform_fn) + + # pylint: disable=line-too-long + # + # Restructure the loop nestings to have this overall structure: + # (loop over different 128-byte output-tensor chunks) : n, ho, wo, co }- the first level of a two-level tensor layout + # (loop within one 128-byte output-tensor chunk) : hi, wio, ci, wii }- the second level of a two-level tensor layout + # (loop over reduction axes) : rh, rw }- loop over multiple elements of the input tensor + # + # Note: This schedule is a work in progress. We *expect* that it's + # crucially important for the loops to have this relative ordering: + # n ... ho ... wo ... co ... hi ... wio ... ci ... wii + # because it lets us visit each of the 128-byte output chunks precisely once. + + ( + n, + h, + w, + c, + rh, + rw, + ) = s.get_loops(Max) + + # Restructure the loops from NHWC to nhwc_8h2w32c2w, with loops for 'max's reduction + # axes at the very end. + ho, hi = s.split(h, [None, 8]) + wo, wi = s.split(w, [None, 4]) + wio, wii = s.split(wi, [None, 2]) + co, ci = s.split(c, [None, 32]) + s.reorder(n, ho, wo, co, hi, wio, ci, wii, rh, rw) + + # TODO: Enable vectorization. + # Hexagon v69's HVX units support SIMD operations on 64-element float16 vectors. + # + # TVM's 'vectorize' schedule primitive is the idiomatic way to encourage lower layers of the + # compiler to generate this kind of SIMD object code. 
+    #
+    # Several requirements must be met to use 'vectorize':
+    #
+    # 1) It can only be applied to a schedule's innermost loop variable.
+    #
+    # 2) Any block-iterator(s) bound to that innermost loop variable must be
+    #    *data-parallel* block iterators.
+    #
+    # 3) Ideally, the innermost loop variable will iterate only over the output
+    #    tensor's fastest-changing indices and nothing else. But in our case,
+    #    our two innermost loops correspond to the max operator's reduction axes.
+    #
+    # Finding a good way to satisfy all of these requirements at the same time is
+    # left for future work.
+
+    # ci_wii = s.fuse(ci, wii)
+    # s.vectorize(ci_wii)
+
+    return s
+
+
+def STIR_schedule_n11c_1024c(outs, ins, output_layout: str, input_layout: str):
+    """Schedule for output layout: n11c-1024c, input layout: nhwc-8h2w32c2w"""
+
+    # NOTE: This function is a variation of the STIR_schedule_nhwc_8h2w32c2w
+    # function. Most of that function's code comments apply to this function
+    # as well, but are omitted for brevity.
+
+    # NOTE: the "n11c-1024c" output layout is shorthand for this axis mapping:
+    # [n, h, w, c // 1024, te.AXIS_SEPARATOR, c % 1024]
+    func = te.create_prim_func([ins, outs])
+
+    s = tir.Schedule(func)
+    Max = s.get_block("max")
+
+    input_transform_fn = get_layout_transform_fn(input_layout)
+    output_transform_fn = get_layout_transform_fn(output_layout)
+    s.transform_layout(Max, ("read", 0), input_transform_fn)
+    s.transform_layout(Max, ("write", 0), output_transform_fn)
+
+    (
+        n,
+        h,
+        w,
+        c,
+        rh,
+        rw,
+    ) = s.get_loops(Max)
+    co, ci = s.split(c, [None, 1024])
+    # s.vectorize(ci)
+
+    return s
+
+
+def max_pool2d_STIR_schedule(outs, ins, output_layout: str, input_layout: str):
+    """STIR based schedule"""
+    if output_layout == "nhwc-8h2w32c2w-2d":
+        return STIR_schedule_nhwc_8h2w32c2w(outs, ins, output_layout, input_layout)
+    if output_layout == "n11c-1024c-2d":
+        return STIR_schedule_n11c_1024c(outs, ins, output_layout, input_layout)
+    raise RuntimeError(f"Unexpected layout '{output_layout}'")
diff --git a/tests/python/contrib/test_hexagon/topi/test_max_pool2d_slice.py b/tests/python/contrib/test_hexagon/topi/test_max_pool2d_slice.py
new file mode 100644
index 000000000000..373a59e0b613
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/topi/test_max_pool2d_slice.py
@@ -0,0 +1,389 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
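+
+# A minimal usage sketch of the op exercised below (shapes are illustrative; the
+# real flow, including layout transforms and VTCM allocation, lives in
+# TestmaxPool2dSlice.test_max_pool2d_slice):
+#
+#     A = te.placeholder((1, 16, 12, 32), name="A", dtype="float16")       # 10x10 input, padded to 16x12
+#     M = sl.max_pool2d_compute(A, [1, 8, 8, 32], [3, 3], [1, 1], [1, 1])  # out_shape, kernel, stride, dilation
+#     s = sl.max_pool2d_STIR_schedule(M, A, "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d")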
+ +import pytest +import numpy as np +from typing import * + +from tvm import te +import tvm.testing +from tvm.topi import testing +from tvm.contrib.hexagon.build import HexagonLauncher +from tvm.contrib.hexagon.session import Session +import tvm.topi.hexagon.slice_ops as sl +from ..infrastructure import allocate_hexagon_array, transform_numpy +from ..pytest_util import ( + get_numpy_dtype_info, + get_test_id, + get_multitest_ids, + create_populated_numpy_ndarray, + TensorContentRandom, +) + + +@tvm.testing.fixture +def input_np(input_shape, dtype: str, input_tensor_populator): + return create_populated_numpy_ndarray(input_shape, dtype, input_tensor_populator) + + +@tvm.testing.fixture +def transformed_expected_output_np(expected_output_np, output_layout): + return transform_numpy(expected_output_np, "nhwc", output_layout) + + +@tvm.testing.fixture +def transformed_input_np_padded(input_np_padded, input_layout): + return transform_numpy(input_np_padded, "nhwc", input_layout) + + +class TestmaxPool2dSlice: + _param_descs = [ + "out_shape", # output_shape + "kernel", # kernel + "stride", # stride + "dil", # dilation + "pad", # padding + "ceil", # ceil_mode + "cnt_padded", # count_include_pad + "out_layout", # output_layout + None, # dtype + None, # input_tensor_populator + ] + + _multitest_params = [ + ( + [1, 8, 8, 32], + [3, 3], + [1, 1], + [1, 1], + [0, 0, 0, 0], + False, + True, + "nhwc-8h2w32c2w-2d", + "float16", + TensorContentRandom(), + ), + ( + [1, 16, 16, 32], + [3, 3], + [1, 1], + [1, 1], + [0, 0, 0, 0], + False, + True, + "nhwc-8h2w32c2w-2d", + "float16", + TensorContentRandom(), + ), + ( + [1, 8, 8, 32], + [8, 8], + [1, 1], + [1, 1], + [0, 0, 0, 0], + False, + True, + "nhwc-8h2w32c2w-2d", + "float16", + TensorContentRandom(), + ), + # Test non-one stride and dilation + ( + [1, 8, 8, 32], + [3, 3], + [2, 3], + [1, 1], + [0, 0, 0, 0], + False, + True, + "nhwc-8h2w32c2w-2d", + "float16", + TensorContentRandom(), + ), + ( + [1, 8, 8, 32], + [3, 3], + [2, 2], + [2, 2], + [0, 0, 0, 0], + False, + True, + "nhwc-8h2w32c2w-2d", + "float16", + TensorContentRandom(), + ), + ( + [1, 8, 8, 32], + [3, 3], + [2, 2], + [2, 3], + [0, 0, 0, 0], + False, + True, + "nhwc-8h2w32c2w-2d", + "float16", + TensorContentRandom(), + ), + # Test non-zero padding + ( + [1, 8, 8, 32], + [3, 3], + [1, 1], + [1, 1], + [1, 1, 1, 1], + False, + True, + "nhwc-8h2w32c2w-2d", + "float16", + TensorContentRandom(), + ), + ( + [1, 8, 8, 32], + [3, 3], + [1, 1], + [1, 1], + [1, 2, 3, 4], + False, + True, + "nhwc-8h2w32c2w-2d", + "float16", + TensorContentRandom(), + ), + # Test n11c-1024c-2d layout which will require input and output to have different layout + ( + [1, 1, 1, 2048], + [8, 8], + [1, 1], + [1, 1], + [0, 0, 0, 0], + False, + True, + "n11c-1024c-2d", + "float16", + TensorContentRandom(), + ), + ( + [1, 1, 1, 2048], + [6, 6], + [1, 1], + [1, 1], + [0, 0, 0, 0], + False, + True, + "n11c-1024c-2d", + "float16", + TensorContentRandom(), + ), + ( + [1, 1, 1, 2048], + [3, 3], + [2, 2], + [1, 1], + [0, 0, 0, 0], + False, + True, + "n11c-1024c-2d", + "float16", + TensorContentRandom(), + ), + ( + [1, 1, 1, 2048], + [4, 4], + [2, 2], + [2, 3], + [0, 0, 0, 0], + False, + True, + "n11c-1024c-2d", + "float16", + TensorContentRandom(), + ), + ] + + _param_ids = get_multitest_ids(_multitest_params, _param_descs) + + input_layout = tvm.testing.parameter( + "nhwc-8h2w32c2w-2d", + ) + + # NOTE: input_layout is always assumed to be "nhwc-8h2w32c2w-2d" + ( + output_shape, + kernel, + stride, + dilation, + padding, + ceil_mode, 
+ count_include_pad, + output_layout, + dtype, + input_tensor_populator, + ) = tvm.testing.parameters(*_multitest_params, ids=_param_ids) + + @tvm.testing.fixture + def expected_output_np( + self, + input_np, + kernel, + stride, + dilation, + padding, + ceil_mode, + count_include_pad, + ): + pad_before = padding[:2] + pad_after = padding[2:] + ref_np = tvm.topi.testing.poolnd_python( + input_np, + kernel, + stride, + dilation, + pad_before, + pad_after, + "max", # pool_type + count_include_pad, + False, # ceil_mode, + layout="NHWC", + ) + return ref_np + + @tvm.testing.fixture + def input_shape(self, output_shape, kernel, padding, stride, dilation, output_layout): + # Input shape without any padding; 'ceil' is being ignored from calculation: + o_b, o_h, o_w, o_c = output_shape + d_h, d_w = dilation + s_h, s_w = stride + k_h, k_w = kernel + pad_before_h, pad_before_w = padding[:2] + pad_after_h, pad_after_w = padding[2:] + + if output_layout == "n11c-1024c-2d": + assert ( + pad_before_w == 0 and pad_after_w == 0 and pad_before_h == 0 and pad_after_h == 0 + ), "Padding must be zero for n11c-1024c-2d layout" + assert o_h == 1 and o_w == 1, "Output height and width must be 1" + + in_h = (o_h - 1) * s_h + d_h * (k_h - 1) + 1 - pad_before_h - pad_after_h + in_w = (o_w - 1) * s_w + d_w * (k_w - 1) + 1 - pad_before_w - pad_after_w + + return [o_b, in_h, in_w, o_c] + + @tvm.testing.fixture + def input_shape_padded(self, input_shape, padding, output_layout): + # Input shape is adjusted to account for 'padding'. Also, due to the physical + # layout of the buffer, height and width are adjusted so that they are a + # multiple of 8 and 4 respectively. + # NOTE: Input layout is always assumed to be nhwc-8h2w32c2w-2d. + pad_before_h, pad_before_w = padding[:2] + pad_after_h, pad_after_w = padding[2:] + padded_input_height = ((input_shape[1] + pad_before_h + pad_after_h + 7) // 8) * 8 + padded_input_width = ((input_shape[2] + pad_before_w + pad_after_w + 3) // 4) * 4 + return [input_shape[0], padded_input_height, padded_input_width, input_shape[3]] + + @tvm.testing.fixture + def input_np_padded(self, input_np, input_shape, input_shape_padded, padding): + pad_before_h, pad_before_w = padding[:2] + pad_after_h = input_shape_padded[1] - input_shape[1] - pad_before_h + pad_after_w = input_shape_padded[2] - input_shape[2] - pad_before_w + + input_padded = np.pad( + input_np, + ((0, 0), (pad_before_h, pad_after_h), (pad_before_w, pad_after_w), (0, 0)), + "constant", + ) + return input_padded + + @tvm.testing.requires_hexagon + def test_max_pool2d_slice( + self, + stride, + kernel, + dtype, + dilation, + padding, + ceil_mode, # only needed for manually obtaining the test id string + input_tensor_populator, # only needed for manually obtaining the test id string + count_include_pad, + input_layout, + output_layout, + output_shape, + input_shape, + input_shape_padded, + input_np, + input_np_padded, + transformed_input_np_padded, + transformed_expected_output_np, + expected_output_np, + hexagon_session: Session, + ): + if hexagon_session._launcher._serial_number != "simulator": + pytest.skip(msg="Due to https://github.com/apache/tvm/issues/11928") + + target_hexagon = tvm.target.hexagon("v69") + A = te.placeholder(input_shape_padded, name="A", dtype=dtype) + + M = sl.max_pool2d_compute(A, output_shape, kernel, stride, dilation) + + # tir schedule + tir_schedule = sl.max_pool2d_STIR_schedule(M, A, output_layout, input_layout) + sch = tir_schedule.mod + + input_axis_separator = [4] + if output_layout == 
"nhwc-8h2w32c2w-2d": + output_axis_separator = [4] + elif output_layout == "n11c-1024c-2d": + output_axis_separator = [4] + else: + raise RuntimeError(f"Unexpected layout '{output_layout}'") + + with tvm.transform.PassContext(opt_level=3): + func = tvm.build( + sch, + [A, M], + tvm.target.Target(target_hexagon, host=target_hexagon), + name="max_pool2d", + ) + + input_arr = allocate_hexagon_array( + hexagon_session.device, + data=transformed_input_np_padded, + axis_separators=input_axis_separator, + mem_scope="global.vtcm", + ) + output_arr = allocate_hexagon_array( + hexagon_session.device, + transformed_expected_output_np.shape, + dtype, + axis_separators=output_axis_separator, + mem_scope="global.vtcm", + ) + + mod = hexagon_session.load_module(func) + mod(input_arr, output_arr) + b, h, w, c = output_shape + if output_layout == "nhwc-8h2w32c2w-2d": + output_np = output_arr.numpy().reshape([b, h // 8, w // 4, c // 32, 8, 2, 32, 2]) + elif output_layout == "n11c-1024c-2d": + output_np = output_arr.numpy().reshape([b, 1, 1, c // 1024, 1024]) + else: + raise RuntimeError(f"Unexpected layout '{output_layout}'") + + np.testing.assert_allclose(output_np, transformed_expected_output_np, rtol=1e-3, atol=1e-3) + + +if __name__ == "__main__": + tvm.testing.main()