From e5196a31187b32249685826402aaea5a7d993ff9 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Tue, 14 Sep 2021 14:21:47 -0700 Subject: [PATCH] [Hexagon] Add contrib tests for blocked conv2d and maxpool2d (#8960) * Add hexagon contrib tests for blocked conv2d and maxpool2d * Restructure based on review comments --- tests/python/contrib/test_hexagon/__init__.py | 18 + tests/python/contrib/test_hexagon/conftest.py | 37 ++ .../contrib/test_hexagon/infrastructure.py | 88 ++++ .../test_hexagon/test_conv2d_blocked.py | 473 ++++++++++++++++++ .../test_hexagon/test_maxpool2d_blocked.py | 155 ++++++ 5 files changed, 771 insertions(+) create mode 100644 tests/python/contrib/test_hexagon/__init__.py create mode 100644 tests/python/contrib/test_hexagon/conftest.py create mode 100644 tests/python/contrib/test_hexagon/infrastructure.py create mode 100644 tests/python/contrib/test_hexagon/test_conv2d_blocked.py create mode 100644 tests/python/contrib/test_hexagon/test_maxpool2d_blocked.py diff --git a/tests/python/contrib/test_hexagon/__init__.py b/tests/python/contrib/test_hexagon/__init__.py new file mode 100644 index 000000000000..58dc4cc1e03d --- /dev/null +++ b/tests/python/contrib/test_hexagon/__init__.py @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" Testing infrastructure for Hexagon """ diff --git a/tests/python/contrib/test_hexagon/conftest.py b/tests/python/contrib/test_hexagon/conftest.py new file mode 100644 index 000000000000..0329328de3df --- /dev/null +++ b/tests/python/contrib/test_hexagon/conftest.py @@ -0,0 +1,37 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +""" Hexagon testing fixtures used to deduce testing argument + values from testing parameters """ + +import tvm +from .infrastructure import get_packed_filter_layout + + +@tvm.testing.fixture +def shape_nhwc(batch, in_channel, in_size): + return (batch, in_size, in_size, in_channel) + + +@tvm.testing.fixture +def shape_oihw(out_channel, in_channel, kernel): + return (out_channel, in_channel, kernel, kernel) + + +@tvm.testing.fixture +def shape_oihw8i32o4i(out_channel, in_channel, kernel): + return get_packed_filter_layout(out_channel, in_channel, kernel, kernel) diff --git a/tests/python/contrib/test_hexagon/infrastructure.py b/tests/python/contrib/test_hexagon/infrastructure.py new file mode 100644 index 000000000000..193a8630c3d2 --- /dev/null +++ b/tests/python/contrib/test_hexagon/infrastructure.py @@ -0,0 +1,88 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" Hexagon testing infrastructure """ + +import tvm +import numpy + + +def ceildiv(o, d): + return tvm.tir.floordiv(o + d - 1, d) + + +def get_packed_activation_layout(shape_nhwc, block_shape, packed_C=True): + assert len(shape_nhwc) == 4 + shape = [shape_nhwc[0]] + off_h, off_w, off_c = block_shape + shape.append(ceildiv(shape_nhwc[1], off_h)) + shape.append(ceildiv(shape_nhwc[2], off_w)) + if packed_C: + shape.append(ceildiv(shape_nhwc[3], off_c)) + shape.extend(block_shape) + else: + shape.extend([off_h, off_w, shape_nhwc[3]]) + return shape + + +def get_packed_filter_layout(out_channel, in_channel, kernel_h, kernel_w): + out_factor, in_first_factor, in_second_factor = 32, 32, 4 + return ( + int(ceildiv(out_channel, out_factor)), + int(ceildiv(in_channel, in_first_factor)), + kernel_h, + kernel_w, + in_first_factor // in_second_factor, + out_factor, + in_second_factor, + ) + + +def build_and_run(inputs, func, target, target_host, *args, **kwargs): + schedule, placeholders, binds = func(*args, **kwargs) + + func = tvm.build(schedule, placeholders, target=target, target_host=target_host, binds=binds) + dev = tvm.device(target) + tensors = [] + for tensor in inputs: + tensors.append(tvm.nd.array(tensor, dev)) + tensors.append( + tvm.nd.array( + numpy.zeros([i.value for i in placeholders[-1].shape], dtype=placeholders[-1].dtype), + dev, + ) + ) + func(*tensors) + + return tensors[-1].asnumpy() + + +def get_block_shape(): + return 8, 8, 32 + + +def get_conv2d_nhwc_shape(shape_nhwc, kernel_size, strides, padding, dilation, out_channels): + assert len(shape_nhwc) == 4 + kernel = [] + kernel.append((kernel_size[0] - 1) * dilation[0] + 1) + kernel.append((kernel_size[1] - 1) * dilation[1] + 1) + return ( + shape_nhwc[0], + (shape_nhwc[1] - kernel[0] + padding[0] + padding[1]) // strides[0] + 1, + (shape_nhwc[2] - kernel[1] + padding[2] + padding[3]) // strides[1] + 1, + out_channels, + ) diff 
--git a/tests/python/contrib/test_hexagon/test_conv2d_blocked.py b/tests/python/contrib/test_hexagon/test_conv2d_blocked.py new file mode 100644 index 000000000000..e0b7fb20ab8e --- /dev/null +++ b/tests/python/contrib/test_hexagon/test_conv2d_blocked.py @@ -0,0 +1,473 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import sys + +import tvm +from tvm import te +from tvm import topi +from tvm.topi import testing +from .infrastructure import ( + ceildiv, + build_and_run, + get_block_shape, + get_conv2d_nhwc_shape, + get_packed_filter_layout, + get_packed_activation_layout, +) + +import numpy as np +import pytest + + +def conv2d_logical( + shape_nhwc, + shape_oihw, + kernel_size, + stride, + padding, + dtype, + storage_scope="global", +): + """ + Conv2d TE wherein both input activation and filter tensors + are defined with their logical NHWC/OIHW shapes, respectively. + The packed physical layout for the activation and filter are: + Activation: nhwc8h8w32c + Filter: oihw8i32o4i + """ + assert kernel_size == tuple(shape_oihw[2:]) + + block_shape = get_block_shape() + block_H, block_W, block_C = block_shape + shape = get_packed_activation_layout(shape_nhwc, block_shape) + logical_output_shape = get_conv2d_nhwc_shape( + shape_nhwc, kernel_size, stride, padding, [1, 1], shape_oihw[0] + ) + output_shape = get_packed_activation_layout(logical_output_shape, block_shape) + + N, H, W, C = shape_nhwc + X = te.placeholder(shape_nhwc, dtype=dtype) + # Combination of padding required by conv2d operator and padding to evenly divisible + # number of blocks. Note that this padding should be inlined in the schedule so + # as to avoid input copying. 
+ pad_h = (block_H - ((H + padding[1]) % block_H)) % block_H + pad_w = (block_W - ((W + padding[3]) % block_W)) % block_W + X_pad = topi.nn.pad(X, [0, padding[0], padding[2], 0], [0, pad_h, pad_w, 0], pad_value=0) + # Calculate packed layout + X_packed = te.compute( + shape, + lambda n, ho, wo, co, hi, wi, ci: X_pad[ + n, ho * block_H + hi, wo * block_W + wi, co * block_C + ci + ], + ) + + # Filter shape using KCRS (OIHW) notation + K, C, R, S = shape_oihw + filter_Ki, filter_Ci, filter_Cii = 32, 32, 4 + shape_filter = get_packed_filter_layout(K, C, R, S) + filt = te.placeholder(shape_oihw, dtype=dtype) + # Channel padding to multiples of 32 + pad_c = (filter_Ci - (C % filter_Ci)) % filter_Ci + pad_k = (filter_Ki - (K % filter_Ki)) % filter_Ki + filt_pad = topi.nn.pad( + filt, [0, 0, 0, 0], [pad_k, pad_c, R, S], pad_value=0, name="padded_filter" + ) + filt_packed = te.compute( + shape_filter, + lambda ko, co, r, s, cio, ki, cii: filt_pad[ + ko * filter_Ki + ki, co * filter_Ci + cio * filter_Cii + cii, r, s + ], + name="packed_filter", + ) + + rh = te.reduce_axis((0, kernel_size[0]), name="rh") + rw = te.reduce_axis((0, kernel_size[1]), name="rw") + rc = te.reduce_axis((0, C), name="rc") + + def compute(n, ho, wo, ko, hi, wi, ki): + # Construct blockized strided conv2d height index + h = ho * block_H + hi + h_contig = h * stride[0] + rh + h_block_id = h_contig // block_H + h_block_offset = h_contig % block_H + + # Construct blockized strided conv2d width index + w = wo * block_W + wi + w_contig = w * stride[1] + rw + w_block_id = w_contig // block_W + w_block_offset = w_contig % block_W + + # Construct blockized conv2d channel index + c_block_id = rc // block_C + c_block_offset = rc % block_C + + # Construct flat filter input channel indices + rco = rc // filter_Ci + rcio = (rc % filter_Ci) // filter_Cii + rcii = rc % filter_Cii + + return te.sum( + X_packed[ + n, + h_block_id, + w_block_id, + c_block_id, + h_block_offset, + w_block_offset, + c_block_offset, + ] + * filt_packed[ko, rco, rh, rw, rcio, ki, rcii], + axis=[rh, rw, rc], + ) + + Y = te.compute(output_shape, compute) + s = te.create_schedule(Y.op) + + # Ensure the padding and array packing is performed inline + s[X_pad].compute_inline() + s[X_packed].compute_inline() + + s[filt_pad].compute_inline() + s[filt_packed].compute_inline() + + binds = {} + if storage_scope and storage_scope != "global": + with tvm.transform.PassContext(): + Xb = tvm.tir.decl_buffer(shape, name="Xb", dtype=dtype, scope=storage_scope) + Yb = tvm.tir.decl_buffer(output_shape, name="Yb", dtype=dtype, scope=storage_scope) + binds = {X: Xb, Y: Yb} + + return (s, [X, filt, Y], binds) + + +def conv2d_packed_filter( + shape_nhwc, + shape_oihw8i32o4i, + kernel_size, + stride, + padding, + dtype, + storage_scope="global", +): + """ + Conv2d TE wherein the input activation is defined by its + logical NHWC shape, but the filter is provided in the + packed layout oihw8i32o4i. 
The physical packed layout used + for the activation is: nhwc8h8w32c + """ + assert kernel_size == tuple(shape_oihw8i32o4i[2:4]) + + block_shape = get_block_shape() + block_H, block_W, block_C = block_shape + shape = get_packed_activation_layout(shape_nhwc, block_shape) + logical_output_shape = get_conv2d_nhwc_shape( + shape_nhwc, + kernel_size, + stride, + padding, + [1, 1], + shape_oihw8i32o4i[0] * shape_oihw8i32o4i[5], + ) + + output_shape = get_packed_activation_layout(logical_output_shape, block_shape) + + N, H, W, C = shape_nhwc + X = te.placeholder(shape_nhwc, dtype=dtype) + # Combination of padding required by conv2d operator and padding to evenly divisible + # number of blocks. Note that this padding should be inlined in the schedule so + # as to avoid input copying. + pad_h = (block_H - ((H + padding[1]) % block_H)) % block_H + pad_w = (block_W - ((W + padding[3]) % block_W)) % block_W + + X_pad = topi.nn.pad(X, [0, padding[0], padding[2], 0], [0, pad_h, pad_w, 0], pad_value=0) + # Calculate packed layout + packed_shape = get_packed_activation_layout(X_pad.shape, block_shape) + + X_packed = te.compute( + packed_shape, + lambda n, ho, wo, co, hi, wi, ci: X_pad[ + n, ho * block_H + hi, wo * block_W + wi, co * block_C + ci + ], + ) + + # Filter shape using KCRS (OIHW) notation + filter_Ki, filter_Ci, filter_Cii = 32, 32, 4 + assert shape_oihw8i32o4i[-1] == filter_Cii + assert shape_oihw8i32o4i[-2] == filter_Ki + assert shape_oihw8i32o4i[-3] == filter_Ci // filter_Cii + + filt_packed = te.placeholder(shape_oihw8i32o4i, dtype=dtype) + + rh = te.reduce_axis((0, kernel_size[0]), name="rh") + rw = te.reduce_axis((0, kernel_size[1]), name="rw") + rc = te.reduce_axis((0, C), name="rc") + + def compute(n, ho, wo, ko, hi, wi, ki): + # Construct blockized strided conv2d height index + h = ho * block_H + hi + h_contig = h * stride[0] + rh + h_block_id = h_contig // block_H + h_block_offset = h_contig % block_H + + # Construct blockized strided conv2d width index + w = wo * block_W + wi + w_contig = w * stride[1] + rw + w_block_id = w_contig // block_W + w_block_offset = w_contig % block_W + + # Construct blockized conv2d channel index + c_block_id = rc // block_C + c_block_offset = rc % block_C + + # Construct flat filter input channel indices + rco = rc // filter_Ci + rcio = (rc % filter_Ci) // filter_Cii + rcii = rc % filter_Cii + + return te.sum( + X_packed[ + n, + h_block_id, + w_block_id, + c_block_id, + h_block_offset, + w_block_offset, + c_block_offset, + ] + * filt_packed[ko, rco, rh, rw, rcio, ki, rcii], + axis=[rh, rw, rc], + ) + + Y = te.compute(output_shape, compute) + s = te.create_schedule(Y.op) + + # Ensure the padding and array packing is performed inline + s[X_pad].compute_inline() + s[X_packed].compute_inline() + + # Perform scheduling + n, hid, wid, cid, hoff, woff, coff = s[Y].op.axis + slice = s[Y].fuse(wid, cid) + Xl = s.cache_read(X_packed, storage_scope, [Y]) + Yl = s.cache_write(Y, storage_scope) + + s[Yl].compute_at(s[Y], hid) + n, hid, slice, hoff, woff, coff = s[Yl].op.axis + s[Xl].compute_at(s[Yl], slice) + + binds = {} + if storage_scope and storage_scope != "global": + with tvm.transform.PassContext(): + Xb = tvm.tir.decl_buffer(shape, name="Xb", dtype=dtype, scope=storage_scope) + Yb = tvm.tir.decl_buffer(output_shape, name="Yb", dtype=dtype, scope=storage_scope) + binds = {X: Xb, Y: Yb} + + return (s, [X, filt_packed, Y], binds) + + +def conv2d_packed_filter_nhwhwc( + shape_nhwc, + shape_oihw8i32o4i, + kernel_size, + stride, + padding, + dtype, + 
storage_scope="global", +): + """ + Conv2d TE wherein the input activation is defined by its + logical NHWC shape, but the filter is provided in the + packed layout oihw8i32o4i. The physical packed layout used + for the activation is: nhw8h8wc + + """ + assert kernel_size == tuple(shape_oihw8i32o4i[2:4]) + + block_shape = get_block_shape() + block_H, block_W, _ = block_shape + shape = get_packed_activation_layout(shape_nhwc, block_shape, packed_C=False) + logical_output_shape = get_conv2d_nhwc_shape( + shape_nhwc, + kernel_size, + stride, + padding, + [1, 1], + shape_oihw8i32o4i[0] * shape_oihw8i32o4i[5], + ) + output_shape = get_packed_activation_layout(logical_output_shape, block_shape, packed_C=False) + + N, H, W, C = shape_nhwc + X = te.placeholder(shape_nhwc, dtype=dtype) + # Combination of padding required by conv2d operator and padding to evenly divisible + # number of blocks. Note that this padding should be inlined in the schedule so + # as to avoid input copying. + pad_h = (block_H - ((H + padding[1]) % block_H)) % block_H + pad_w = (block_W - ((W + padding[3]) % block_W)) % block_W + X_pad = topi.nn.pad(X, [0, padding[0], padding[2], 0], [0, pad_h, pad_w, 0], pad_value=0) + # Calculate packed layout + packed_shape = get_packed_activation_layout(X_pad.shape, block_shape, packed_C=False) + X_packed = te.compute( + packed_shape, lambda n, ho, wo, hi, wi, c: X_pad[n, ho * block_H + hi, wo * block_W + wi, c] + ) + + # Filter shape using KCRS (OIHW) notation + filter_Ki, filter_Ci, filter_Cii = 32, 32, 4 + assert shape_oihw8i32o4i[-1] == filter_Cii + assert shape_oihw8i32o4i[-2] == filter_Ki + assert shape_oihw8i32o4i[-3] == filter_Ci // filter_Cii + + filt_packed = te.placeholder(shape_oihw8i32o4i, dtype=dtype) + + rh = te.reduce_axis((0, kernel_size[0]), name="rh") + rw = te.reduce_axis((0, kernel_size[1]), name="rw") + rc = te.reduce_axis((0, C), name="rc") + + def compute(n, ho, wo, hi, wi, k): + # Construct blockized strided conv2d height index + h = ho * block_H + hi + h_contig = h * stride[0] + rh + h_block_id = h_contig // block_H + h_block_offset = h_contig % block_H + + # Construct blockized strided conv2d width index + w = wo * block_W + wi + w_contig = w * stride[1] + rw + w_block_id = w_contig // block_W + w_block_offset = w_contig % block_W + + # Construct flat filter input channel indices + rco = rc // filter_Ci + rcio = (rc % filter_Ci) // filter_Cii + rcii = rc % filter_Cii + + # Construct split filter output channel index + ko = k // filter_Ki + ki = k % filter_Ki + + return te.sum( + X_packed[n, h_block_id, w_block_id, h_block_offset, w_block_offset, rc] + * filt_packed[ko, rco, rh, rw, rcio, ki, rcii], + axis=[rh, rw, rc], + ) + + Y = te.compute(output_shape, compute) + s = te.create_schedule(Y.op) + + # Ensure the padding and array packing is performed inline + s[X_pad].compute_inline() + s[X_packed].compute_inline() + + n, ho, wo, hi, wi, k = s[Y].op.axis + rh, rw, rc = s[Y].op.reduce_axis + + rco, rci = s[Y].split(rc, factor=32) + s[Y].reorder(n, rco, wo, ho, k, hi, wi) + Xl = s.cache_read(X_packed, storage_scope, [Y]) + s[Xl].compute_at(s[Y], rco) + + ko, ki = s[Y].split(k, factor=32) + s[Y].reorder(n, rco, wo, ho, ko, hi, wi, ki) + Fl = s.cache_read(filt_packed, storage_scope, [Y]) + s[Fl].compute_at(s[Y], ko) + + binds = {} + if storage_scope and storage_scope != "global": + with tvm.transform.PassContext(): + Xb = tvm.tir.decl_buffer(shape, name="Xb", dtype=dtype, scope=storage_scope) + Yb = tvm.tir.decl_buffer(output_shape, name="Yb", dtype=dtype, 
scope=storage_scope) + binds = {X: Xb, Y: Yb} + + return (s, [X, filt_packed, Y], binds) + + +class BaseConv2d: + batch = tvm.testing.parameter(1) + in_size = tvm.testing.parameter(8, 56) + in_channel = tvm.testing.parameter(64) + out_channel = tvm.testing.parameter(64) + kernel = tvm.testing.parameter(3) + stride = tvm.testing.parameter(1) + pad = tvm.testing.parameter(1) + dtype = tvm.testing.parameter("float32") + + +class TestConv2dLogical(BaseConv2d): + @tvm.testing.parametrize_targets("llvm") + def test_conv2d(self, shape_nhwc, shape_oihw, kernel, stride, pad, dtype, target): + inputs = [ + np.random.uniform(0, 255, size=shape_nhwc).astype(dtype), + np.random.uniform(0, 255, size=shape_oihw).astype(dtype), + ] + np_filter = inputs[1].transpose(2, 3, 1, 0) + ref_output = testing.conv2d_nhwc_python(inputs[0], np_filter, stride, pad) + output = build_and_run( + inputs, + conv2d_logical, + target, + target, + shape_nhwc=shape_nhwc, + shape_oihw=shape_oihw, + kernel_size=(kernel, kernel), + stride=(stride, stride), + padding=(pad, pad, pad, pad), + dtype=dtype, + ) + return output, ref_output + + +class TestConv2dPackedFilter(BaseConv2d): + conv2d_impl = tvm.testing.parameter(conv2d_packed_filter, conv2d_packed_filter_nhwhwc) + + @tvm.testing.parametrize_targets("llvm") + def test_conv2d( + self, + conv2d_impl, + shape_nhwc, + shape_oihw, + shape_oihw8i32o4i, + kernel, + stride, + pad, + dtype, + target, + ): + inputs = [ + np.random.uniform(0, 255, size=shape_nhwc).astype(dtype), + np.random.uniform(0, 255, size=shape_oihw8i32o4i).astype(dtype), + ] + np_filter = ( + inputs[1].transpose(0, 5, 1, 4, 6, 2, 3).reshape(shape_oihw).transpose(2, 3, 1, 0) + ) + ref_output = testing.conv2d_nhwc_python(inputs[0], np_filter, stride, pad) + output = build_and_run( + inputs, + conv2d_impl, + target, + target, + shape_nhwc=shape_nhwc, + shape_oihw8i32o4i=shape_oihw8i32o4i, + kernel_size=(kernel, kernel), + stride=(stride, stride), + padding=(pad, pad, pad, pad), + dtype=dtype, + ) + return output, ref_output + + +if __name__ == "__main__": + sys.exit(pytest.main(sys.argv)) diff --git a/tests/python/contrib/test_hexagon/test_maxpool2d_blocked.py b/tests/python/contrib/test_hexagon/test_maxpool2d_blocked.py new file mode 100644 index 000000000000..67af8d87f708 --- /dev/null +++ b/tests/python/contrib/test_hexagon/test_maxpool2d_blocked.py @@ -0,0 +1,155 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import sys + +import tvm +from tvm import te +from tvm import topi +from tvm.topi import testing +from .infrastructure import ( + ceildiv, + build_and_run, + get_block_shape, + get_packed_filter_layout, + get_packed_activation_layout, +) + +import numpy as np +import pytest + +# Blocked layout: NHWC8h8w32c :: [N, H//8, W//8, C//32, 8h, 8w, 32c] +def maxpool2d_logical( + shape_nhwc, + window_shape, + stride, + padding, + dtype, + storage_scope="global", +): + """ + Maxpool2d TE wherein the input activation is defined by its + logical NHWC shape. The packed physical layout for the + activation is nhwc8h8w32c. + """ + + block_shape = get_block_shape() + block_H, block_W, block_C = block_shape + shape = get_packed_activation_layout(shape_nhwc, block_shape) + logical_output_shape = ( + shape_nhwc[0], + (shape_nhwc[1] - window_shape[0] + padding[0] + padding[1]) // stride[0] + 1, + (shape_nhwc[2] - window_shape[1] + padding[2] + padding[3]) // stride[0] + 1, + shape_nhwc[3], + ) + output_shape = get_packed_activation_layout(logical_output_shape, block_shape) + + N, H, W, C = shape_nhwc + X = te.placeholder(shape_nhwc, dtype=dtype) + + # Combination of padding required by maxpool operator and padding to evenly divisible + # number of blocks. Note that this padding should be inlined in the schedule so + # as to avoid input copying. + pad_h = (block_H - ((H + padding[1]) % block_H)) % block_H + pad_w = (block_W - ((W + padding[3]) % block_W)) % block_W + X_pad = topi.nn.pad(X, [0, padding[0], padding[2], 0], [0, pad_h, pad_w, 0], pad_value=0) + + # Calculate packed layout + X_packed = te.compute( + shape, + lambda n, ho, wo, co, hi, wi, ci: X_pad[ + n, ho * block_H + hi, wo * block_W + wi, co * block_C + ci + ], + ) + + rh = te.reduce_axis((0, window_shape[0]), name="rh") + rw = te.reduce_axis((0, window_shape[1]), name="rw") + + def compute(n, ho, wo, co, hi, wi, ci): + # Construct blockized strided maxpool height indices + h = ho * block_H + hi + h_contig = h * stride[0] + rh + h_block_id = h_contig // block_H + h_block_offset = h_contig % block_H + + # Construct blockized strided maxpool width indices + w = wo * block_W + wi + w_contig = w * stride[1] + rw + w_block_id = w_contig // block_W + w_block_offset = w_contig % block_W + + return te.max( + X_packed[n, h_block_id, w_block_id, co, h_block_offset, w_block_offset, ci], + axis=[rh, rw], + ) + + Y = te.compute(output_shape, compute) + s = te.create_schedule(Y.op) + + # Ensure the padding and array packing is performed inline + s[X_pad].compute_inline() + s[X_packed].compute_inline() + + binds = {} + if storage_scope and storage_scope != "global": + with tvm.transform.PassContext(): + Xb = tvm.tir.decl_buffer(shape, name="Xb", dtype=dtype, scope=storage_scope) + Yb = tvm.tir.decl_buffer(output_shape, name="Yb", dtype=dtype, scope=storage_scope) + binds = {X: Xb, Y: Yb} + + return (s, [X, Y], binds) + + +class BaseMaxPooling: + batch = tvm.testing.parameter(1) + in_size = tvm.testing.parameter(8, 112) + in_channel = tvm.testing.parameter(64) + window_size = tvm.testing.parameter(3) + stride = tvm.testing.parameter(2) + pad = tvm.testing.parameter(1) + dtype = tvm.testing.parameter("float32") + + +class TestMaxPooling(BaseMaxPooling): + @tvm.testing.parametrize_targets("llvm") + def test_maxpool(self, shape_nhwc, window_size, stride, pad, dtype, target): + inputs = [np.random.uniform(0, 255, size=shape_nhwc).astype(dtype)] + ref_output = testing.poolnd_python( + inputs[0], + (window_size, window_size), + strides=(stride, stride), + 
dilation=(1, 1), + padding_before=(pad, pad), + padding_after=(pad, pad), + pool_type="max", + ) + output = build_and_run( + inputs, + maxpool2d_logical, + target, + target, + shape_nhwc, + window_shape=(window_size, window_size), + stride=(stride, stride), + padding=(pad, pad, pad, pad), + dtype=dtype, + ) + return output, ref_output + + +if __name__ == "__main__": + sys.exit(pytest.main(sys.argv))
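
Note on the packed layouts exercised by these tests: activations use the blocked nhwc8h8w32c layout (8x8 spatial tiles of 32 channels), and filters use oihw8i32o4i (output channels split by 32, input channels split by 32 and again by 4), as produced by get_packed_activation_layout and get_packed_filter_layout. The NumPy sketch below mirrors the te.compute packing expressions in the patch purely for illustration. It assumes the tensor extents are already multiples of the block sizes (the tests handle the general case with topi.nn.pad), and the helper names pack_nhwc8h8w32c / pack_oihw8i32o4i are hypothetical, not part of the patch.

import numpy as np

def pack_nhwc8h8w32c(x):
    # x: NHWC activation with H % 8 == 0, W % 8 == 0, C % 32 == 0.
    # Mirrors X_packed[n, ho, wo, co, hi, wi, ci] = X[n, ho*8 + hi, wo*8 + wi, co*32 + ci].
    n, h, w, c = x.shape
    return x.reshape(n, h // 8, 8, w // 8, 8, c // 32, 32).transpose(0, 1, 3, 5, 2, 4, 6)

def pack_oihw8i32o4i(f):
    # f: OIHW filter with O % 32 == 0 and I % 32 == 0.
    # Mirrors filt_packed[ko, co, r, s, cio, ki, cii] = F[ko*32 + ki, co*32 + cio*4 + cii, r, s].
    k, c, r, s = f.shape
    return f.reshape(k // 32, 32, c // 32, 8, 4, r, s).transpose(0, 2, 5, 6, 3, 1, 4)

# Round trip against the unpacking used in TestConv2dPackedFilter:
# packed.transpose(0, 5, 1, 4, 6, 2, 3).reshape(O, I, H, W) recovers the OIHW filter.
f = np.random.uniform(size=(64, 64, 3, 3)).astype("float32")
packed = pack_oihw8i32o4i(f)          # shape (2, 2, 3, 3, 8, 32, 4)
assert np.array_equal(packed.transpose(0, 5, 1, 4, 6, 2, 3).reshape(f.shape), f)

The tests themselves run on the llvm target via tvm.testing.parametrize_targets, e.g. pytest tests/python/contrib/test_hexagon/test_conv2d_blocked.py.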